diff --git a/Makefile.inc1 b/Makefile.inc1 index 851d20ba1f60..6b77bb195bfd 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -2442,7 +2442,7 @@ _btxld= usr.sbin/btxld # Rebuild ctfconvert and ctfmerge to avoid difficult-to-diagnose failures # resulting from missing bug fixes or ELF Toolchain updates. .if ${MK_CDDL} != "no" -_dtrace_tools= cddl/lib/libctf cddl/usr.bin/ctfconvert \ +_dtrace_tools= cddl/lib/libctf cddl/lib/libspl cddl/usr.bin/ctfconvert \ cddl/usr.bin/ctfmerge .endif @@ -2756,7 +2756,12 @@ _prebuild_libs= ${_kerberos5_lib_libasn1} \ ${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \ ${_cddl_lib_libuutil} \ ${_cddl_lib_libavl} \ + ${_cddl_lib_libicp} \ + ${_cddl_lib_libicp_rescue} \ + ${_cddl_lib_libspl} \ + ${_cddl_lib_libtpool} \ ${_cddl_lib_libzfs_core} ${_cddl_lib_libzfs} \ + ${_cddl_lib_libzutil} \ ${_cddl_lib_libctf} \ lib/libufs \ lib/libutil lib/libpjdlog ${_lib_libypclnt} lib/libz lib/msun \ @@ -2826,21 +2831,34 @@ _cddl_lib_libumem= cddl/lib/libumem _cddl_lib_libnvpair= cddl/lib/libnvpair _cddl_lib_libavl= cddl/lib/libavl _cddl_lib_libuutil= cddl/lib/libuutil +_cddl_lib_libspl= cddl/lib/libspl + +cddl/lib/libuutil__L: cddl/lib/libavl__L cddl/lib/libspl__L + .if ${MK_ZFS} != "no" +_cddl_lib_libicp= cddl/lib/libicp +_cddl_lib_libicp_rescue= cddl/lib/libicp_rescue +_cddl_lib_libtpool= cddl/lib/libtpool +_cddl_lib_libzutil= cddl/lib/libzutil _cddl_lib_libzfs_core= cddl/lib/libzfs_core _cddl_lib_libzfs= cddl/lib/libzfs +cddl/lib/libtpool__L: cddl/lib/libspl__L + +cddl/lib/libzutil__L: cddl/lib/libavl__L cddl/lib/libtpool__L + cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L cddl/lib/libumem__L cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L +cddl/lib/libzfs__L: cddl/lib/libnvpair__L cddl/lib/libzutil__L lib/libbe__L: cddl/lib/libzfs__L .endif _cddl_lib_libctf= cddl/lib/libctf _cddl_lib= cddl/lib 
-cddl/lib/libctf__L: lib/libz__L +cddl/lib/libctf__L: lib/libz__L cddl/lib/libspl__L .endif # cddl/lib/libdtrace requires lib/libproc and lib/librtld_db _prebuild_libs+= lib/libprocstat lib/libproc lib/librtld_db diff --git a/cddl/compat/opensolaris/include/fcntl.h b/cddl/compat/opensolaris/include/fcntl.h index 548918aaab3a..b962bb4855ee 100644 --- a/cddl/compat/opensolaris/include/fcntl.h +++ b/cddl/compat/opensolaris/include/fcntl.h @@ -32,7 +32,9 @@ #include_next +#ifndef open64 #define open64(...) open(__VA_ARGS__) +#endif #define openat64(...) openat(__VA_ARGS__) #endif diff --git a/cddl/compat/opensolaris/include/mnttab.h b/cddl/compat/opensolaris/include/mnttab.h deleted file mode 100644 index 227196a4017f..000000000000 --- a/cddl/compat/opensolaris/include/mnttab.h +++ /dev/null @@ -1,35 +0,0 @@ -/* $FreeBSD$ */ - -#ifndef _OPENSOLARIS_MNTTAB_H_ -#define _OPENSOLARIS_MNTTAB_H_ - -#include -#include - -#include -#include - -#define MNTTAB _PATH_DEVZERO -#define MNT_LINE_MAX 1024 - -#define MS_OVERLAY 0x0 -#define MS_NOMNTTAB 0x0 -#define MS_RDONLY 0x1 - -#define umount2(p, f) unmount(p, f) - -struct mnttab { - char *mnt_special; - char *mnt_mountp; - char *mnt_fstype; - char *mnt_mntopts; -}; -#define extmnttab mnttab - -int getmntany(FILE *fd, struct mnttab *mgetp, struct mnttab *mrefp); -int getmntent(FILE *fp, struct mnttab *mp); -char *hasmntopt(struct mnttab *mnt, char *opt); - -void statfs2mnttab(struct statfs *sfs, struct mnttab *mp); - -#endif /* !_OPENSOLARIS_MNTTAB_H_ */ diff --git a/cddl/contrib/opensolaris/cmd/lockstat/sym.c b/cddl/contrib/opensolaris/cmd/lockstat/sym.c index f2987a028e74..b5366c566857 100644 --- a/cddl/contrib/opensolaris/cmd/lockstat/sym.c +++ b/cddl/contrib/opensolaris/cmd/lockstat/sym.c @@ -54,6 +54,7 @@ #endif #include + typedef struct syment { uintptr_t addr; char *name; @@ -72,6 +73,11 @@ static char maxsymname[64]; #endif #endif +#define __sElfN(x) typedef __CONCAT(__CONCAT(__CONCAT(Elf,__ELF_WORD_SIZE),_),x) x 
+__sElfN(Sym); +__sElfN(Shdr); +#define elf_getshdr __elfN(getshdr) + static void add_symbol(char *name, uintptr_t addr, size_t size) { diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.8 b/cddl/contrib/opensolaris/cmd/zdb/zdb.8 deleted file mode 100644 index e60c56c7c199..000000000000 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.8 +++ /dev/null @@ -1,414 +0,0 @@ -.\" -.\" This file and its contents are supplied under the terms of the -.\" Common Development and Distribution License ("CDDL"), version 1.0. -.\" You may only use this file in accordance with the terms of version -.\" 1.0 of the CDDL. -.\" -.\" A full copy of the text of the CDDL should have accompanied this -.\" source. A copy of the CDDL is also available via the Internet at -.\" http://www.illumos.org/license/CDDL. -.\" -.\" -.\" Copyright 2012, Richard Lowe. -.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. -.\" Copyright 2017 Nexenta Systems, Inc. -.\" -.Dd February 25, 2020 -.Dt ZDB 8 -.Os -.Sh NAME -.Nm zdb -.Nd display zpool debugging and consistency information -.Sh SYNOPSIS -.Nm -.Op Fl AbcdDFGhikLMPsvX -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... -.Op Fl I Ar inflight I/Os -.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ... -.Op Fl t Ar txg -.Op Fl U Ar cache -.Op Fl x Ar dumpdir -.Op Ar poolname Op Ar object ... -.Nm -.Op Fl AdiPv -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... -.Op Fl U Ar cache -.Ar dataset Op Ar object ... -.Nm -.Fl C -.Op Fl A -.Op Fl U Ar cache -.Nm -.Fl E -.Op Fl A -.Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15 -.Nm -.Fl l -.Op Fl Aqu -.Ar device -.Nm -.Fl m -.Op Fl AFLPX -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... -.Op Fl t Ar txg -.Op Fl U Ar cache -.Ar poolname Op Ar vdev Op Ar metaslab ... -.Nm -.Fl O -.Ar dataset path -.Nm -.Fl R -.Op Fl A -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... -.Op Fl U Ar cache -.Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags -.Nm -.Fl S -.Op Fl AP -.Op Fl e Oo Fl V Oc Op Fl p Ar path ... 
-.Op Fl U Ar cache -.Ar poolname -.Sh DESCRIPTION -The -.Nm -utility displays information about a ZFS pool useful for debugging and performs -some amount of consistency checking. -It is a not a general purpose tool and options -.Pq and facilities -may change. -This is neither a -.Xr fsck 8 -nor an -.Xr fsdb 8 -utility. -.Pp -The output of this command in general reflects the on-disk structure of a ZFS -pool, and is inherently unstable. -The precise output of most invocations is not documented, a knowledge of ZFS -internals is assumed. -.Pp -If the -.Ar dataset -argument does not contain any -.Qq Sy / -or -.Qq Sy @ -characters, it is interpreted as a pool name. -The root dataset can be specified as -.Ar pool Ns / -.Pq pool name followed by a slash . -.Pp -When operating on an imported and active pool it is possible, though unlikely, -that zdb may interpret inconsistent pool data and behave erratically. -.Sh OPTIONS -Display options: -.Bl -tag -width Ds -.It Fl b -Display statistics regarding the number, size -.Pq logical, physical and allocated -and deduplication of blocks. -.It Fl c -Verify the checksum of all metadata blocks while printing block statistics -.Po see -.Fl b -.Pc . -.Pp -If specified multiple times, verify the checksums of all blocks. -.It Fl C -Display information about the configuration. -If specified with no other options, instead display information about the cache -file -.Pq Pa /boot/zfs/zpool.cache . -To specify the cache file to display, see -.Fl U . -.Pp -If specified multiple times, and a pool name is also specified display both the -cached configuration and the on-disk configuration. -If specified multiple times with -.Fl e -also display the configuration that would be used were the pool to be imported. -.It Fl d -Display information about datasets. -Specified once, displays basic dataset information: ID, create transaction, -size, and object count. -.Pp -If specified multiple times provides greater and greater verbosity. 
-.Pp -If object IDs are specified, display information about those specific objects -only. -.It Fl D -Display deduplication statistics, including the deduplication ratio -.Pq Sy dedup , -compression ratio -.Pq Sy compress , -inflation due to the zfs copies property -.Pq Sy copies , -and an overall effective ratio -.Pq Sy dedup No * Sy compress No / Sy copies . -.It Fl DD -Display a histogram of deduplication statistics, showing the allocated -.Pq physically present on disk -and referenced -.Pq logically referenced in the pool -block counts and sizes by reference count. -.It Fl DDD -Display the statistics independently for each deduplication table. -.It Fl DDDD -Dump the contents of the deduplication tables describing duplicate blocks. -.It Fl DDDDD -Also dump the contents of the deduplication tables describing unique blocks. -.It Fl E Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15 -Decode and display block from an embedded block pointer specified by the -.Ar word -arguments. -.It Fl h -Display pool history similar to -.Nm zpool Cm history , -but include internal changes, transaction, and dataset information. -.It Fl i -Display information about intent log -.Pq ZIL -entries relating to each dataset. -If specified multiple times, display counts of each intent log transaction type. -.It Fl k -Examine the checkpointed state of the pool. -Note, the on disk format of the pool is not reverted to the checkpointed state. -.It Fl l Ar device -Read the vdev labels from the specified device. -.Nm Fl l -will return 0 if valid label was found, 1 if error occurred, and 2 if no valid -labels were found. -.Pp -If the -.Fl q -option is also specified, don't print the labels. -.Pp -If the -.Fl u -option is also specified, also display the uberblocks on this device. -.It Fl L -Disable leak detection and the loading of space maps. -By default, -.Nm -verifies that all non-free blocks are referenced, which can be very expensive. 
-.It Fl m -Display the offset, spacemap, and free space of each metaslab. -.It Fl mm -Also display information about the on-disk free space histogram associated with -each metaslab. -.It Fl mmm -Display the maximum contiguous free space, the in-core free space histogram, and -the percentage of free space in each space map. -.It Fl mmmm -Display every spacemap record. -.It Fl M -Display the offset, spacemap, and free space of each metaslab. -.It Fl MM -Also display information about the maximum contiguous free space and the -percentage of free space in each space map. -.It Fl MMM -Display every spacemap record. -.It Fl O Ar dataset path -Look up the specified -.Ar path -inside of the -.Ar dataset -and display its metadata and indirect blocks. -Specified -.Ar path -must be relative to the root of -.Ar dataset . -This option can be combined with -.Fl v -for increasing verbosity. -.It Xo -.Fl R Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar size Ns Op : Ns Ar flags -.Xc -Read and display a block from the specified device. -By default the block is displayed as a hex dump, but see the description of the -.Sy r -flag, below. -.Pp -The block is specified in terms of a colon-separated tuple -.Ar vdev -.Pq an integer vdev identifier -.Ar offset -.Pq the offset within the vdev -.Ar size -.Pq the size of the block to read -and, optionally, -.Ar flags -.Pq a set of flags, described below . -.Pp -.Bl -tag -compact -width "b offset" -.It Sy b Ar offset -Print block pointer -.It Sy d -Decompress the block -.It Sy e -Byte swap the block -.It Sy g -Dump gang block header -.It Sy i -Dump indirect block -.It Sy r -Dump raw uninterpreted block data -.El -.It Fl s -Report statistics on -.Nm zdb -I/O. -Display operation counts, bandwidth, and error counts of I/O to the pool from -.Nm . -.It Fl S -Simulate the effects of deduplication, constructing a DDT and then display -that DDT as with -.Fl DD . -.It Fl u -Display the current uberblock. 
-.El -.Pp -Other options: -.Bl -tag -width Ds -.It Fl A -Do not abort should any assertion fail. -.It Fl AA -Enable panic recovery, certain errors which would otherwise be fatal are -demoted to warnings. -.It Fl AAA -Do not abort if asserts fail and also enable panic recovery. -.It Fl e Op Fl p Ar path ... -Operate on an exported pool, not present in -.Pa /boot/zfs/zpool.cache . -The -.Fl p -flag specifies the path under which devices are to be searched. -.It Fl x Ar dumpdir -All blocks accessed will be copied to files in the specified directory. -The blocks will be placed in sparse files whose name is the same as -that of the file or device read. -.Nm -can be then run on the generated files. -Note that the -.Fl bbc -flags are sufficient to access -.Pq and thus copy -all metadata on the pool. -.It Fl F -Attempt to make an unreadable pool readable by trying progressively older -transactions. -.It Fl G -Dump the contents of the zfs_dbgmsg buffer before exiting -.Nm . -zfs_dbgmsg is a buffer used by ZFS to dump advanced debug information. -.It Fl I Ar inflight I/Os -Limit the number of outstanding checksum I/Os to the specified value. -The default value is 200. -This option affects the performance of the -.Fl c -option. -.It Fl o Ar var Ns = Ns Ar value ... -Set the given global libzpool variable to the provided value. -The value must be an unsigned 32-bit integer. -Currently only little-endian systems are supported to avoid accidentally setting -the high 32 bits of 64-bit variables. -.It Fl P -Print numbers in an unscaled form more amenable to parsing, eg. 1000000 rather -than 1M. -.It Fl t Ar transaction -Specify the highest transaction to use when searching for uberblocks. -See also the -.Fl u -and -.Fl l -options for a means to see the available uberblocks and their associated -transaction numbers. -.It Fl U Ar cachefile -Use a cache file other than -.Pa /boot/zfs/zpool.cache . -.It Fl v -Enable verbosity. -Specify multiple times for increased verbosity. 
-.It Fl V -Attempt verbatim import. -This mimics the behavior of the kernel when loading a pool from a cachefile. -Only usable with -.Fl e . -.It Fl X -Attempt -.Qq extreme -transaction rewind, that is attempt the same recovery as -.Fl F -but read transactions otherwise deemed too old. -.El -.Pp -Specifying a display option more than once enables verbosity for only that -option, with more occurrences enabling more verbosity. -.Pp -If no options are specified, all information about the named pool will be -displayed at default verbosity. -.Sh EXAMPLES -.Bl -tag -width Ds -.It Xo -.Sy Example 1 -Display the configuration of imported pool -.Pa rpool -.Xc -.Bd -literal -# zdb -C rpool - -MOS Configuration: - version: 28 - name: 'rpool' - ... -.Ed -.It Xo -.Sy Example 2 -Display basic dataset information about -.Pa rpool -.Xc -.Bd -literal -# zdb -d rpool -Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects -Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects - ... -.Ed -.It Xo -.Sy Example 3 -Display basic information about object 0 in -.Pa rpool/export/home -.Xc -.Bd -literal -# zdb -d rpool/export/home 0 -Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects - - Object lvl iblk dblk dsize lsize %full type - 0 7 16K 16K 15.0K 16K 25.00 DMU dnode -.Ed -.It Xo -.Sy Example 4 -Display the predicted effect of enabling deduplication on -.Pa rpool -.Xc -.Bd -literal -# zdb -S rpool -Simulated DDT histogram: - -bucket allocated referenced -______ ______________________________ ______________________________ -refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE ------- ------ ----- ----- ----- ------ ----- ----- ----- - 1 694K 27.1G 15.0G 15.0G 694K 27.1G 15.0G 15.0G - 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G - ... -dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00 -.Ed -.El -.Sh SEE ALSO -.Xr zfs 8 , -.Xr zpool 8 -.Sh HISTORY -The -.Nm -utility first appeared in -.Fx 7.0 . 
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c deleted file mode 100644 index d51ddc68908c..000000000000 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ /dev/null @@ -1,5749 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. - * Copyright 2017 RackTop Systems. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#undef verify -#include - -#include "zdb.h" - -#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? 
\ - zio_compress_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ - zio_checksum_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ - dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ - dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") -#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ - (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ - DMU_OT_ZAP_OTHER : \ - (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ - DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) - -#ifndef lint -extern int reference_tracking_enable; -extern boolean_t zfs_recover; -extern uint64_t zfs_arc_max, zfs_arc_meta_limit; -extern int zfs_vdev_async_read_max_active; -extern boolean_t spa_load_verify_dryrun; -extern int aok; -#else -int reference_tracking_enable; -boolean_t zfs_recover; -uint64_t zfs_arc_max, zfs_arc_meta_limit; -int zfs_vdev_async_read_max_active; -boolean_t spa_load_verify_dryrun; -int aok; -#endif - -static const char cmdname[] = "zdb"; -uint8_t dump_opt[256]; - -typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); - -static uint64_t *zopt_object = NULL; -static unsigned zopt_objects = 0; -static libzfs_handle_t *g_zfs; -static uint64_t max_inflight = 1000; -static int leaked_objects = 0; - -static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); -static void mos_obj_refd(uint64_t); - -/* - * These libumem hooks provide a reasonable set of defaults for the allocator's - * debugging facilities. - */ -const char * -_umem_debug_init() -{ - return ("default,verbose"); /* $UMEM_DEBUG setting */ -} - -const char * -_umem_logging_init(void) -{ - return ("fail,contents"); /* $UMEM_LOGGING setting */ -} - -static void -usage(void) -{ - (void) fprintf(stderr, - "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p ...]] " - "[-I ]\n" - "\t\t[-o =]... 
[-t ] [-U ] [-x ]\n" - "\t\t[ [ ...]]\n" - "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] " - "[ ...]\n" - "\t%s -C [-A] [-U ]\n" - "\t%s -l [-Aqu] \n" - "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " - "[-U ]\n\t\t [ [ ...]]\n" - "\t%s -O \n" - "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" - "\t\t ::[:]\n" - "\t%s -E [-A] word0:word1:...:word15\n" - "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " - "\n\n", - cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname); - - (void) fprintf(stderr, " Dataset name must include at least one " - "separator character '/' or '@'\n"); - (void) fprintf(stderr, " If dataset name is specified, only that " - "dataset is dumped\n"); - (void) fprintf(stderr, " If object numbers are specified, only " - "those objects are dumped\n\n"); - (void) fprintf(stderr, " Options to control amount of output:\n"); - (void) fprintf(stderr, " -b block statistics\n"); - (void) fprintf(stderr, " -c checksum all metadata (twice for " - "all data) blocks\n"); - (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); - (void) fprintf(stderr, " -d dataset(s)\n"); - (void) fprintf(stderr, " -D dedup statistics\n"); - (void) fprintf(stderr, " -E decode and display block from an " - "embedded block pointer\n"); - (void) fprintf(stderr, " -h pool history\n"); - (void) fprintf(stderr, " -i intent logs\n"); - (void) fprintf(stderr, " -l read label contents\n"); - (void) fprintf(stderr, " -k examine the checkpointed state " - "of the pool\n"); - (void) fprintf(stderr, " -L disable leak tracking (do not " - "load spacemaps)\n"); - (void) fprintf(stderr, " -m metaslabs\n"); - (void) fprintf(stderr, " -M metaslab groups\n"); - (void) fprintf(stderr, " -O perform object lookups by path\n"); - (void) fprintf(stderr, " -R read and display block from a " - "device\n"); - (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); - (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); - (void) fprintf(stderr, " -v verbose (applies to all " - 
"others)\n\n"); - (void) fprintf(stderr, " Below options are intended for use " - "with other options:\n"); - (void) fprintf(stderr, " -A ignore assertions (-A), enable " - "panic recovery (-AA) or both (-AAA)\n"); - (void) fprintf(stderr, " -e pool is exported/destroyed/" - "has altroot/not in a cachefile\n"); - (void) fprintf(stderr, " -F attempt automatic rewind within " - "safe range of transaction groups\n"); - (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " - "exiting\n"); - (void) fprintf(stderr, " -I -- " - "specify the maximum number of " - "checksumming I/Os [default is 200]\n"); - (void) fprintf(stderr, " -o = set global " - "variable to an unsigned 32-bit integer value\n"); - (void) fprintf(stderr, " -p -- use one or more with " - "-e to specify path to vdev dir\n"); - (void) fprintf(stderr, " -P print numbers in parseable form\n"); - (void) fprintf(stderr, " -q don't print label contents\n"); - (void) fprintf(stderr, " -t -- highest txg to use when " - "searching for uberblocks\n"); - (void) fprintf(stderr, " -u uberblock\n"); - (void) fprintf(stderr, " -U -- use alternate " - "cachefile\n"); - (void) fprintf(stderr, " -V do verbatim import\n"); - (void) fprintf(stderr, " -x -- " - "dump all read blocks into specified directory\n"); - (void) fprintf(stderr, " -X attempt extreme rewind (does not " - "work with dataset)\n\n"); - (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " - "to make only that option verbose\n"); - (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); - exit(1); -} - -static void -dump_debug_buffer() -{ - if (dump_opt['G']) { - (void) printf("\n"); - zfs_dbgmsg_print("zdb"); - } -} - -/* - * Called for usage errors that are discovered after a call to spa_open(), - * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. - */ - -static void -fatal(const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - (void) fprintf(stderr, "%s: ", cmdname); - (void) vfprintf(stderr, fmt, ap); - va_end(ap); - (void) fprintf(stderr, "\n"); - - dump_debug_buffer(); - - exit(1); -} - -/* ARGSUSED */ -static void -dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) -{ - nvlist_t *nv; - size_t nvsize = *(uint64_t *)data; - char *packed = umem_alloc(nvsize, UMEM_NOFAIL); - - VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); - - VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); - - umem_free(packed, nvsize); - - dump_nvlist(nv, 8); - - nvlist_free(nv); -} - -/* ARGSUSED */ -static void -dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) -{ - spa_history_phys_t *shp = data; - - if (shp == NULL) - return; - - (void) printf("\t\tpool_create_len = %llu\n", - (u_longlong_t)shp->sh_pool_create_len); - (void) printf("\t\tphys_max_off = %llu\n", - (u_longlong_t)shp->sh_phys_max_off); - (void) printf("\t\tbof = %llu\n", - (u_longlong_t)shp->sh_bof); - (void) printf("\t\teof = %llu\n", - (u_longlong_t)shp->sh_eof); - (void) printf("\t\trecords_lost = %llu\n", - (u_longlong_t)shp->sh_records_lost); -} - -static void -zdb_nicenum(uint64_t num, char *buf, size_t buflen) -{ - if (dump_opt['P']) - (void) snprintf(buf, buflen, "%llu", (longlong_t)num); - else - nicenum(num, buf, sizeof (buf)); -} - -static const char histo_stars[] = "****************************************"; -static const uint64_t histo_width = sizeof (histo_stars) - 1; - -static void -dump_histogram(const uint64_t *histo, int size, int offset) -{ - int i; - int minidx = size - 1; - int maxidx = 0; - uint64_t max = 0; - - for (i = 0; i < size; i++) { - if (histo[i] > max) - max = histo[i]; - if (histo[i] > 0 && i > maxidx) - maxidx = i; - if (histo[i] > 0 && i < minidx) - minidx = i; - } - - if (max < histo_width) - max = histo_width; - - for (i = minidx; i <= maxidx; i++) { - (void) printf("\t\t\t%3u: %6llu %s\n", - i 
+ offset, (u_longlong_t)histo[i], - &histo_stars[(max - histo[i]) * histo_width / max]); - } -} - -static void -dump_zap_stats(objset_t *os, uint64_t object) -{ - int error; - zap_stats_t zs; - - error = zap_get_stats(os, object, &zs); - if (error) - return; - - if (zs.zs_ptrtbl_len == 0) { - ASSERT(zs.zs_num_blocks == 1); - (void) printf("\tmicrozap: %llu bytes, %llu entries\n", - (u_longlong_t)zs.zs_blocksize, - (u_longlong_t)zs.zs_num_entries); - return; - } - - (void) printf("\tFat ZAP stats:\n"); - - (void) printf("\t\tPointer table:\n"); - (void) printf("\t\t\t%llu elements\n", - (u_longlong_t)zs.zs_ptrtbl_len); - (void) printf("\t\t\tzt_blk: %llu\n", - (u_longlong_t)zs.zs_ptrtbl_zt_blk); - (void) printf("\t\t\tzt_numblks: %llu\n", - (u_longlong_t)zs.zs_ptrtbl_zt_numblks); - (void) printf("\t\t\tzt_shift: %llu\n", - (u_longlong_t)zs.zs_ptrtbl_zt_shift); - (void) printf("\t\t\tzt_blks_copied: %llu\n", - (u_longlong_t)zs.zs_ptrtbl_blks_copied); - (void) printf("\t\t\tzt_nextblk: %llu\n", - (u_longlong_t)zs.zs_ptrtbl_nextblk); - - (void) printf("\t\tZAP entries: %llu\n", - (u_longlong_t)zs.zs_num_entries); - (void) printf("\t\tLeaf blocks: %llu\n", - (u_longlong_t)zs.zs_num_leafs); - (void) printf("\t\tTotal blocks: %llu\n", - (u_longlong_t)zs.zs_num_blocks); - (void) printf("\t\tzap_block_type: 0x%llx\n", - (u_longlong_t)zs.zs_block_type); - (void) printf("\t\tzap_magic: 0x%llx\n", - (u_longlong_t)zs.zs_magic); - (void) printf("\t\tzap_salt: 0x%llx\n", - (u_longlong_t)zs.zs_salt); - - (void) printf("\t\tLeafs with 2^n pointers:\n"); - dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); - - (void) printf("\t\tBlocks with n*5 entries:\n"); - dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); - - (void) printf("\t\tBlocks n/10 full:\n"); - dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); - - (void) printf("\t\tEntries with n chunks:\n"); - dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); - - 
(void) printf("\t\tBuckets with n entries:\n"); - dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); -} - -/*ARGSUSED*/ -static void -dump_none(objset_t *os, uint64_t object, void *data, size_t size) -{ -} - -/*ARGSUSED*/ -static void -dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) -{ - (void) printf("\tUNKNOWN OBJECT TYPE\n"); -} - -/*ARGSUSED*/ -static void -dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) -{ -} - -/*ARGSUSED*/ -static void -dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) -{ -} - -/*ARGSUSED*/ -static void -dump_zap(objset_t *os, uint64_t object, void *data, size_t size) -{ - zap_cursor_t zc; - zap_attribute_t attr; - void *prop; - unsigned i; - - dump_zap_stats(os, object); - (void) printf("\n"); - - for (zap_cursor_init(&zc, os, object); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - (void) printf("\t\t%s = ", attr.za_name); - if (attr.za_num_integers == 0) { - (void) printf("\n"); - continue; - } - prop = umem_zalloc(attr.za_num_integers * - attr.za_integer_length, UMEM_NOFAIL); - (void) zap_lookup(os, object, attr.za_name, - attr.za_integer_length, attr.za_num_integers, prop); - if (attr.za_integer_length == 1) { - (void) printf("%s", (char *)prop); - } else { - for (i = 0; i < attr.za_num_integers; i++) { - switch (attr.za_integer_length) { - case 2: - (void) printf("%u ", - ((uint16_t *)prop)[i]); - break; - case 4: - (void) printf("%u ", - ((uint32_t *)prop)[i]); - break; - case 8: - (void) printf("%lld ", - (u_longlong_t)((int64_t *)prop)[i]); - break; - } - } - } - (void) printf("\n"); - umem_free(prop, attr.za_num_integers * attr.za_integer_length); - } - zap_cursor_fini(&zc); -} - -static void -dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) -{ - bpobj_phys_t *bpop = data; - char bytes[32], comp[32], uncomp[32]; - - /* make sure the output won't get truncated */ - CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); - 
CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); - - if (bpop == NULL) - return; - - zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); - zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); - zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); - - (void) printf("\t\tnum_blkptrs = %llu\n", - (u_longlong_t)bpop->bpo_num_blkptrs); - (void) printf("\t\tbytes = %s\n", bytes); - if (size >= BPOBJ_SIZE_V1) { - (void) printf("\t\tcomp = %s\n", comp); - (void) printf("\t\tuncomp = %s\n", uncomp); - } - if (size >= sizeof (*bpop)) { - (void) printf("\t\tsubobjs = %llu\n", - (u_longlong_t)bpop->bpo_subobjs); - (void) printf("\t\tnum_subobjs = %llu\n", - (u_longlong_t)bpop->bpo_num_subobjs); - } - - if (dump_opt['d'] < 5) - return; - - for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) { - char blkbuf[BP_SPRINTF_LEN]; - blkptr_t bp; - - int err = dmu_read(os, object, - i * sizeof (bp), sizeof (bp), &bp, 0); - if (err != 0) { - (void) printf("got error %u from dmu_read\n", err); - break; - } - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); - (void) printf("\t%s\n", blkbuf); - } -} - -/* ARGSUSED */ -static void -dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) -{ - dmu_object_info_t doi; - - VERIFY0(dmu_object_info(os, object, &doi)); - uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); - - int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); - if (err != 0) { - (void) printf("got error %u from dmu_read\n", err); - kmem_free(subobjs, doi.doi_max_offset); - return; - } - - int64_t last_nonzero = -1; - for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) { - if (subobjs[i] != 0) - last_nonzero = i; - } - - for (int64_t i = 0; i <= last_nonzero; i++) { - (void) printf("\t%llu\n", (longlong_t)subobjs[i]); - } - kmem_free(subobjs, doi.doi_max_offset); -} - -/*ARGSUSED*/ -static void -dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) -{ - dump_zap_stats(os, 
object); - /* contents are printed elsewhere, properly decoded */ -} - -/*ARGSUSED*/ -static void -dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) -{ - zap_cursor_t zc; - zap_attribute_t attr; - - dump_zap_stats(os, object); - (void) printf("\n"); - - for (zap_cursor_init(&zc, os, object); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - (void) printf("\t\t%s = ", attr.za_name); - if (attr.za_num_integers == 0) { - (void) printf("\n"); - continue; - } - (void) printf(" %llx : [%d:%d:%d]\n", - (u_longlong_t)attr.za_first_integer, - (int)ATTR_LENGTH(attr.za_first_integer), - (int)ATTR_BSWAP(attr.za_first_integer), - (int)ATTR_NUM(attr.za_first_integer)); - } - zap_cursor_fini(&zc); -} - -/*ARGSUSED*/ -static void -dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) -{ - zap_cursor_t zc; - zap_attribute_t attr; - uint16_t *layout_attrs; - unsigned i; - - dump_zap_stats(os, object); - (void) printf("\n"); - - for (zap_cursor_init(&zc, os, object); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - (void) printf("\t\t%s = [", attr.za_name); - if (attr.za_num_integers == 0) { - (void) printf("\n"); - continue; - } - - VERIFY(attr.za_integer_length == 2); - layout_attrs = umem_zalloc(attr.za_num_integers * - attr.za_integer_length, UMEM_NOFAIL); - - VERIFY(zap_lookup(os, object, attr.za_name, - attr.za_integer_length, - attr.za_num_integers, layout_attrs) == 0); - - for (i = 0; i != attr.za_num_integers; i++) - (void) printf(" %d ", (int)layout_attrs[i]); - (void) printf("]\n"); - umem_free(layout_attrs, - attr.za_num_integers * attr.za_integer_length); - } - zap_cursor_fini(&zc); -} - -/*ARGSUSED*/ -static void -dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) -{ - zap_cursor_t zc; - zap_attribute_t attr; - const char *typenames[] = { - /* 0 */ "not specified", - /* 1 */ "FIFO", - /* 2 */ "Character Device", - /* 3 */ "3 (invalid)", - /* 4 */ "Directory", - /* 5 */ "5 
(invalid)", - /* 6 */ "Block Device", - /* 7 */ "7 (invalid)", - /* 8 */ "Regular File", - /* 9 */ "9 (invalid)", - /* 10 */ "Symbolic Link", - /* 11 */ "11 (invalid)", - /* 12 */ "Socket", - /* 13 */ "Door", - /* 14 */ "Event Port", - /* 15 */ "15 (invalid)", - }; - - dump_zap_stats(os, object); - (void) printf("\n"); - - for (zap_cursor_init(&zc, os, object); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - (void) printf("\t\t%s = %lld (type: %s)\n", - attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), - typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); - } - zap_cursor_fini(&zc); -} - -static int -get_dtl_refcount(vdev_t *vd) -{ - int refcount = 0; - - if (vd->vdev_ops->vdev_op_leaf) { - space_map_t *sm = vd->vdev_dtl_sm; - - if (sm != NULL && - sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) - return (1); - return (0); - } - - for (unsigned c = 0; c < vd->vdev_children; c++) - refcount += get_dtl_refcount(vd->vdev_child[c]); - return (refcount); -} - -static int -get_metaslab_refcount(vdev_t *vd) -{ - int refcount = 0; - - if (vd->vdev_top == vd) { - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - space_map_t *sm = vd->vdev_ms[m]->ms_sm; - - if (sm != NULL && - sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) - refcount++; - } - } - for (unsigned c = 0; c < vd->vdev_children; c++) - refcount += get_metaslab_refcount(vd->vdev_child[c]); - - return (refcount); -} - -static int -get_obsolete_refcount(vdev_t *vd) -{ - int refcount = 0; - - uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); - if (vd->vdev_top == vd && obsolete_sm_obj != 0) { - dmu_object_info_t doi; - VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, - obsolete_sm_obj, &doi)); - if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { - refcount++; - } - } else { - ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); - ASSERT3U(obsolete_sm_obj, ==, 0); - } - for (unsigned c = 0; c < vd->vdev_children; c++) { - refcount += get_obsolete_refcount(vd->vdev_child[c]); - } 
- - return (refcount); -} - -static int -get_prev_obsolete_spacemap_refcount(spa_t *spa) -{ - uint64_t prev_obj = - spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; - if (prev_obj != 0) { - dmu_object_info_t doi; - VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); - if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { - return (1); - } - } - return (0); -} - -static int -get_checkpoint_refcount(vdev_t *vd) -{ - int refcount = 0; - - if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && - zap_contains(spa_meta_objset(vd->vdev_spa), - vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) - refcount++; - - for (uint64_t c = 0; c < vd->vdev_children; c++) - refcount += get_checkpoint_refcount(vd->vdev_child[c]); - - return (refcount); -} - -static int -verify_spacemap_refcounts(spa_t *spa) -{ - uint64_t expected_refcount = 0; - uint64_t actual_refcount; - - (void) feature_get_refcount(spa, - &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], - &expected_refcount); - actual_refcount = get_dtl_refcount(spa->spa_root_vdev); - actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); - actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); - actual_refcount += get_prev_obsolete_spacemap_refcount(spa); - actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); - - if (expected_refcount != actual_refcount) { - (void) printf("space map refcount mismatch: expected %lld != " - "actual %lld\n", - (longlong_t)expected_refcount, - (longlong_t)actual_refcount); - return (2); - } - return (0); -} - -static void -dump_spacemap(objset_t *os, space_map_t *sm) -{ - char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", - "INVALID", "INVALID", "INVALID", "INVALID" }; - - if (sm == NULL) - return; - - (void) printf("space map object %llu:\n", - (longlong_t)sm->sm_object); - (void) printf(" smp_length = 0x%llx\n", - (longlong_t)sm->sm_phys->smp_length); - (void) printf(" smp_alloc = 0x%llx\n", - (longlong_t)sm->sm_phys->smp_alloc); - - 
if (dump_opt['d'] < 6 && dump_opt['m'] < 4) - return; - - /* - * Print out the freelist entries in both encoded and decoded form. - */ - uint8_t mapshift = sm->sm_shift; - int64_t alloc = 0; - uint64_t word, entry_id = 0; - for (uint64_t offset = 0; offset < space_map_length(sm); - offset += sizeof (word)) { - - VERIFY0(dmu_read(os, space_map_object(sm), offset, - sizeof (word), &word, DMU_READ_PREFETCH)); - - if (sm_entry_is_debug(word)) { - (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", - (u_longlong_t)entry_id, - ddata[SM_DEBUG_ACTION_DECODE(word)], - (u_longlong_t)SM_DEBUG_TXG_DECODE(word), - (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); - entry_id++; - continue; - } - - uint8_t words; - char entry_type; - uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; - - if (sm_entry_is_single_word(word)) { - entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? - 'A' : 'F'; - entry_off = (SM_OFFSET_DECODE(word) << mapshift) + - sm->sm_start; - entry_run = SM_RUN_DECODE(word) << mapshift; - words = 1; - } else { - /* it is a two-word entry so we read another word */ - ASSERT(sm_entry_is_double_word(word)); - - uint64_t extra_word; - offset += sizeof (extra_word); - VERIFY0(dmu_read(os, space_map_object(sm), offset, - sizeof (extra_word), &extra_word, - DMU_READ_PREFETCH)); - - ASSERT3U(offset, <=, space_map_length(sm)); - - entry_run = SM2_RUN_DECODE(word) << mapshift; - entry_vdev = SM2_VDEV_DECODE(word); - entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 
- 'A' : 'F'; - entry_off = (SM2_OFFSET_DECODE(extra_word) << - mapshift) + sm->sm_start; - words = 2; - } - - (void) printf("\t [%6llu] %c range:" - " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", - (u_longlong_t)entry_id, - entry_type, (u_longlong_t)entry_off, - (u_longlong_t)(entry_off + entry_run), - (u_longlong_t)entry_run, - (u_longlong_t)entry_vdev, words); - - if (entry_type == 'A') - alloc += entry_run; - else - alloc -= entry_run; - entry_id++; - } - if (alloc != space_map_allocated(sm)) { - (void) printf("space_map_object alloc (%lld) INCONSISTENT " - "with space map summary (%lld)\n", - (longlong_t)space_map_allocated(sm), (longlong_t)alloc); - } -} - -static void -dump_metaslab_stats(metaslab_t *msp) -{ - char maxbuf[32]; - range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - - /* max sure nicenum has enough space */ - CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); - - zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); - - (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", - "segments", avl_numnodes(t), "maxsize", maxbuf, - "freepct", free_pct); - (void) printf("\tIn-memory histogram:\n"); - dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); -} - -static void -dump_metaslab(metaslab_t *msp) -{ - vdev_t *vd = msp->ms_group->mg_vd; - spa_t *spa = vd->vdev_spa; - space_map_t *sm = msp->ms_sm; - char freebuf[32]; - - zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, - sizeof (freebuf)); - - (void) printf( - "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", - (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, - (u_longlong_t)space_map_object(sm), freebuf); - - if (dump_opt['m'] > 2 && !dump_opt['L']) { - mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp)); - range_tree_stat_verify(msp->ms_allocatable); - dump_metaslab_stats(msp); - metaslab_unload(msp); - mutex_exit(&msp->ms_lock); - } - - if 
(dump_opt['m'] > 1 && sm != NULL && - spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { - /* - * The space map histogram represents free space in chunks - * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). - */ - (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", - (u_longlong_t)msp->ms_fragmentation); - dump_histogram(sm->sm_phys->smp_histogram, - SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); - } - - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); - dump_spacemap(spa->spa_meta_objset, msp->ms_sm); -} - -static void -print_vdev_metaslab_header(vdev_t *vd) -{ - vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; - const char *bias_str; - - bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ? - VDEV_ALLOC_BIAS_LOG : - (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : - (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : - vd->vdev_islog ? "log" : ""; - - (void) printf("\tvdev %10llu %s\n" - "\t%-10s%5llu %-19s %-15s %-12s\n", - (u_longlong_t)vd->vdev_id, bias_str, - "metaslabs", (u_longlong_t)vd->vdev_ms_count, - "offset", "spacemap", "free"); - (void) printf("\t%15s %19s %15s %12s\n", - "---------------", "-------------------", - "---------------", "------------"); -} - -static void -dump_metaslab_groups(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - metaslab_class_t *mc = spa_normal_class(spa); - uint64_t fragmentation; - - metaslab_class_histogram_verify(mc); - - for (unsigned c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - if (mg == NULL || mg->mg_class != mc) - continue; - - metaslab_group_histogram_verify(mg); - mg->mg_fragmentation = metaslab_group_fragmentation(mg); - - (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" - "fragmentation", - (u_longlong_t)tvd->vdev_id, - (u_longlong_t)tvd->vdev_ms_count); - if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { - (void) printf("%3s\n", "-"); - } else { - (void) printf("%3llu%%\n", - 
(u_longlong_t)mg->mg_fragmentation); - } - dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); - } - - (void) printf("\tpool %s\tfragmentation", spa_name(spa)); - fragmentation = metaslab_class_fragmentation(mc); - if (fragmentation == ZFS_FRAG_INVALID) - (void) printf("\t%3s\n", "-"); - else - (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); - dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); -} - -static void -print_vdev_indirect(vdev_t *vd) -{ - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - vdev_indirect_births_t *vib = vd->vdev_indirect_births; - - if (vim == NULL) { - ASSERT3P(vib, ==, NULL); - return; - } - - ASSERT3U(vdev_indirect_mapping_object(vim), ==, - vic->vic_mapping_object); - ASSERT3U(vdev_indirect_births_object(vib), ==, - vic->vic_births_object); - - (void) printf("indirect births obj %llu:\n", - (longlong_t)vic->vic_births_object); - (void) printf(" vib_count = %llu\n", - (longlong_t)vdev_indirect_births_count(vib)); - for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { - vdev_indirect_birth_entry_phys_t *cur_vibe = - &vib->vib_entries[i]; - (void) printf("\toffset %llx -> txg %llu\n", - (longlong_t)cur_vibe->vibe_offset, - (longlong_t)cur_vibe->vibe_phys_birth_txg); - } - (void) printf("\n"); - - (void) printf("indirect mapping obj %llu:\n", - (longlong_t)vic->vic_mapping_object); - (void) printf(" vim_max_offset = 0x%llx\n", - (longlong_t)vdev_indirect_mapping_max_offset(vim)); - (void) printf(" vim_bytes_mapped = 0x%llx\n", - (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); - (void) printf(" vim_count = %llu\n", - (longlong_t)vdev_indirect_mapping_num_entries(vim)); - - if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) - return; - - uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); - - for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { - vdev_indirect_mapping_entry_phys_t *vimep = - 
&vim->vim_entries[i]; - (void) printf("\t<%llx:%llx:%llx> -> " - "<%llx:%llx:%llx> (%x obsolete)\n", - (longlong_t)vd->vdev_id, - (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), - (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), - (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), - (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), - (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), - counts[i]); - } - (void) printf("\n"); - - uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); - if (obsolete_sm_object != 0) { - objset_t *mos = vd->vdev_spa->spa_meta_objset; - (void) printf("obsolete space map object %llu:\n", - (u_longlong_t)obsolete_sm_object); - ASSERT(vd->vdev_obsolete_sm != NULL); - ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, - obsolete_sm_object); - dump_spacemap(mos, vd->vdev_obsolete_sm); - (void) printf("\n"); - } -} - -static void -dump_metaslabs(spa_t *spa) -{ - vdev_t *vd, *rvd = spa->spa_root_vdev; - uint64_t m, c = 0, children = rvd->vdev_children; - - (void) printf("\nMetaslabs:\n"); - - if (!dump_opt['d'] && zopt_objects > 0) { - c = zopt_object[0]; - - if (c >= children) - (void) fatal("bad vdev id: %llu", (u_longlong_t)c); - - if (zopt_objects > 1) { - vd = rvd->vdev_child[c]; - print_vdev_metaslab_header(vd); - - for (m = 1; m < zopt_objects; m++) { - if (zopt_object[m] < vd->vdev_ms_count) - dump_metaslab( - vd->vdev_ms[zopt_object[m]]); - else - (void) fprintf(stderr, "bad metaslab " - "number %llu\n", - (u_longlong_t)zopt_object[m]); - } - (void) printf("\n"); - return; - } - children = c + 1; - } - for (; c < children; c++) { - vd = rvd->vdev_child[c]; - print_vdev_metaslab_header(vd); - - print_vdev_indirect(vd); - - for (m = 0; m < vd->vdev_ms_count; m++) - dump_metaslab(vd->vdev_ms[m]); - (void) printf("\n"); - } -} - -static void -dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) -{ - const ddt_phys_t *ddp = dde->dde_phys; - const ddt_key_t *ddk = &dde->dde_key; - const char *types[4] = { "ditto", "single", "double", 
"triple" }; - char blkbuf[BP_SPRINTF_LEN]; - blkptr_t blk; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); - (void) printf("index %llx refcnt %llu %s %s\n", - (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, - types[p], blkbuf); - } -} - -static void -dump_dedup_ratio(const ddt_stat_t *dds) -{ - double rL, rP, rD, D, dedup, compress, copies; - - if (dds->dds_blocks == 0) - return; - - rL = (double)dds->dds_ref_lsize; - rP = (double)dds->dds_ref_psize; - rD = (double)dds->dds_ref_dsize; - D = (double)dds->dds_dsize; - - dedup = rD / D; - compress = rL / rP; - copies = rD / rP; - - (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " - "dedup * compress / copies = %.2f\n\n", - dedup, compress, copies, dedup * compress / copies); -} - -static void -dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - char name[DDT_NAMELEN]; - ddt_entry_t dde; - uint64_t walk = 0; - dmu_object_info_t doi; - uint64_t count, dspace, mspace; - int error; - - error = ddt_object_info(ddt, type, class, &doi); - - if (error == ENOENT) - return; - ASSERT(error == 0); - - error = ddt_object_count(ddt, type, class, &count); - ASSERT(error == 0); - if (count == 0) - return; - - dspace = doi.doi_physical_blocks_512 << 9; - mspace = doi.doi_fill_count * doi.doi_data_block_size; - - ddt_object_name(ddt, type, class, name); - - (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", - name, - (u_longlong_t)count, - (u_longlong_t)(dspace / count), - (u_longlong_t)(mspace / count)); - - if (dump_opt['D'] < 3) - return; - - zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); - - if (dump_opt['D'] < 4) - return; - - if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) - return; - - (void) printf("%s contents:\n\n", name); - - while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) - dump_dde(ddt, 
&dde, walk); - - ASSERT3U(error, ==, ENOENT); - - (void) printf("\n"); -} - -static void -dump_all_ddts(spa_t *spa) -{ - ddt_histogram_t ddh_total; - ddt_stat_t dds_total; - - bzero(&ddh_total, sizeof (ddh_total)); - bzero(&dds_total, sizeof (dds_total)); - - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - dump_ddt(ddt, type, class); - } - } - } - - ddt_get_dedup_stats(spa, &dds_total); - - if (dds_total.dds_blocks == 0) { - (void) printf("All DDTs are empty\n"); - return; - } - - (void) printf("\n"); - - if (dump_opt['D'] > 1) { - (void) printf("DDT histogram (aggregated over all DDTs):\n"); - ddt_get_dedup_histogram(spa, &ddh_total); - zpool_dump_ddt(&dds_total, &ddh_total); - } - - dump_dedup_ratio(&dds_total); -} - -static void -dump_dtl_seg(void *arg, uint64_t start, uint64_t size) -{ - char *prefix = arg; - - (void) printf("%s [%llu,%llu) length %llu\n", - prefix, - (u_longlong_t)start, - (u_longlong_t)(start + size), - (u_longlong_t)(size)); -} - -static void -dump_dtl(vdev_t *vd, int indent) -{ - spa_t *spa = vd->vdev_spa; - boolean_t required; - const char *name[DTL_TYPES] = { "missing", "partial", "scrub", - "outage" }; - char prefix[256]; - - spa_vdev_state_enter(spa, SCL_NONE); - required = vdev_dtl_required(vd); - (void) spa_vdev_state_exit(spa, NULL, 0); - - if (indent == 0) - (void) printf("\nDirty time logs:\n\n"); - - (void) printf("\t%*s%s [%s]\n", indent, "", - vd->vdev_path ? vd->vdev_path : - vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), - required ? 
"DTL-required" : "DTL-expendable"); - - for (int t = 0; t < DTL_TYPES; t++) { - range_tree_t *rt = vd->vdev_dtl[t]; - if (range_tree_space(rt) == 0) - continue; - (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", - indent + 2, "", name[t]); - range_tree_walk(rt, dump_dtl_seg, prefix); - if (dump_opt['d'] > 5 && vd->vdev_children == 0) - dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); - } - - for (unsigned c = 0; c < vd->vdev_children; c++) - dump_dtl(vd->vdev_child[c], indent + 4); -} - -/* from spa_history.c: spa_history_create_obj() */ -#define HIS_BUF_LEN_DEF (128 << 10) -#define HIS_BUF_LEN_MAX (1 << 30) - -static void -dump_history(spa_t *spa) -{ - nvlist_t **events = NULL; - char *buf = NULL; - uint64_t bufsize = HIS_BUF_LEN_DEF; - uint64_t resid, len, off = 0; - uint_t num = 0; - int error; - time_t tsec; - struct tm t; - char tbuf[30]; - char internalstr[MAXPATHLEN]; - - if ((buf = malloc(bufsize)) == NULL) - (void) fprintf(stderr, "Unable to read history: " - "out of memory\n"); - do { - len = bufsize; - - if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { - (void) fprintf(stderr, "Unable to read history: " - "error %d\n", error); - return; - } - - if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) - break; - off -= resid; - - /* - * If the history block is too big, double the buffer - * size and try again. 
- */ - if (resid == len) { - free(buf); - buf = NULL; - - bufsize <<= 1; - if ((bufsize >= HIS_BUF_LEN_MAX) || - ((buf = malloc(bufsize)) == NULL)) { - (void) fprintf(stderr, "Unable to read history: " - "out of memory\n"); - return; - } - } - } while (len != 0); - free(buf); - - (void) printf("\nHistory:\n"); - for (unsigned i = 0; i < num; i++) { - uint64_t time, txg, ievent; - char *cmd, *intstr; - boolean_t printed = B_FALSE; - - if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, - &time) != 0) - goto next; - if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, - &cmd) != 0) { - if (nvlist_lookup_uint64(events[i], - ZPOOL_HIST_INT_EVENT, &ievent) != 0) - goto next; - verify(nvlist_lookup_uint64(events[i], - ZPOOL_HIST_TXG, &txg) == 0); - verify(nvlist_lookup_string(events[i], - ZPOOL_HIST_INT_STR, &intstr) == 0); - if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) - goto next; - - (void) snprintf(internalstr, - sizeof (internalstr), - "[internal %s txg:%ju] %s", - zfs_history_event_names[ievent], (uintmax_t)txg, - intstr); - cmd = internalstr; - } - tsec = time; - (void) localtime_r(&tsec, &t); - (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); - (void) printf("%s %s\n", tbuf, cmd); - printed = B_TRUE; - -next: - if (dump_opt['h'] > 1) { - if (!printed) - (void) printf("unrecognized record:\n"); - dump_nvlist(events[i], 2); - } - } -} - -/*ARGSUSED*/ -static void -dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) -{ -} - -static uint64_t -blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, - const zbookmark_phys_t *zb) -{ - if (dnp == NULL) { - ASSERT(zb->zb_level < 0); - if (zb->zb_object == 0) - return (zb->zb_blkid); - return (zb->zb_blkid * BP_GET_LSIZE(bp)); - } - - ASSERT(zb->zb_level >= 0); - - return ((zb->zb_blkid << - (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); -} - -static void -snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) -{ - const dva_t 
*dva = bp->blk_dva; - int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; - - if (dump_opt['b'] >= 6) { - snprintf_blkptr(blkbuf, buflen, bp); - return; - } - - if (BP_IS_EMBEDDED(bp)) { - (void) sprintf(blkbuf, - "EMBEDDED et=%u %llxL/%llxP B=%llu", - (int)BPE_GET_ETYPE(bp), - (u_longlong_t)BPE_GET_LSIZE(bp), - (u_longlong_t)BPE_GET_PSIZE(bp), - (u_longlong_t)bp->blk_birth); - return; - } - - blkbuf[0] = '\0'; - for (int i = 0; i < ndvas; i++) - (void) snprintf(blkbuf + strlen(blkbuf), - buflen - strlen(blkbuf), "%llu:%llx:%llx ", - (u_longlong_t)DVA_GET_VDEV(&dva[i]), - (u_longlong_t)DVA_GET_OFFSET(&dva[i]), - (u_longlong_t)DVA_GET_ASIZE(&dva[i])); - - if (BP_IS_HOLE(bp)) { - (void) snprintf(blkbuf + strlen(blkbuf), - buflen - strlen(blkbuf), - "%llxL B=%llu", - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)bp->blk_birth); - } else { - (void) snprintf(blkbuf + strlen(blkbuf), - buflen - strlen(blkbuf), - "%llxL/%llxP F=%llu B=%llu/%llu", - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)BP_GET_FILL(bp), - (u_longlong_t)bp->blk_birth, - (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); - } -} - -static void -print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, - const dnode_phys_t *dnp) -{ - char blkbuf[BP_SPRINTF_LEN]; - int l; - - if (!BP_IS_EMBEDDED(bp)) { - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - } - - (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); - - ASSERT(zb->zb_level >= 0); - - for (l = dnp->dn_nlevels - 1; l >= -1; l--) { - if (l == zb->zb_level) { - (void) printf("L%llx", (u_longlong_t)zb->zb_level); - } else { - (void) printf(" "); - } - } - - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); - (void) printf("%s\n", blkbuf); -} - -static int -visit_indirect(spa_t *spa, const dnode_phys_t *dnp, - blkptr_t *bp, const zbookmark_phys_t *zb) -{ - int err = 0; - - if (bp->blk_birth == 0) - return (0); - - print_indirect(bp, zb, dnp); - - if 
(BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { - arc_flags_t flags = ARC_FLAG_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - arc_buf_t *buf; - uint64_t fill = 0; - - err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) - return (err); - ASSERT(buf->b_data); - - /* recursively visit blocks below this */ - cbp = buf->b_data; - for (i = 0; i < epb; i++, cbp++) { - zbookmark_phys_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - err = visit_indirect(spa, dnp, cbp, &czb); - if (err) - break; - fill += BP_GET_FILL(cbp); - } - if (!err) - ASSERT3U(fill, ==, BP_GET_FILL(bp)); - arc_buf_destroy(buf, &buf); - } - - return (err); -} - -/*ARGSUSED*/ -static void -dump_indirect(dnode_t *dn) -{ - dnode_phys_t *dnp = dn->dn_phys; - int j; - zbookmark_phys_t czb; - - (void) printf("Indirect blocks:\n"); - - SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), - dn->dn_object, dnp->dn_nlevels - 1, 0); - for (j = 0; j < dnp->dn_nblkptr; j++) { - czb.zb_blkid = j; - (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, - &dnp->dn_blkptr[j], &czb); - } - - (void) printf("\n"); -} - -/*ARGSUSED*/ -static void -dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) -{ - dsl_dir_phys_t *dd = data; - time_t crtime; - char nice[32]; - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); - - if (dd == NULL) - return; - - ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); - - crtime = dd->dd_creation_time; - (void) printf("\t\tcreation_time = %s", ctime(&crtime)); - (void) printf("\t\thead_dataset_obj = %llu\n", - (u_longlong_t)dd->dd_head_dataset_obj); - (void) printf("\t\tparent_dir_obj = %llu\n", - (u_longlong_t)dd->dd_parent_obj); - (void) printf("\t\torigin_obj = %llu\n", - (u_longlong_t)dd->dd_origin_obj); - (void) printf("\t\tchild_dir_zapobj = %llu\n", - 
(u_longlong_t)dd->dd_child_dir_zapobj); - zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); - (void) printf("\t\tused_bytes = %s\n", nice); - zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); - (void) printf("\t\tcompressed_bytes = %s\n", nice); - zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); - (void) printf("\t\tuncompressed_bytes = %s\n", nice); - zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); - (void) printf("\t\tquota = %s\n", nice); - zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); - (void) printf("\t\treserved = %s\n", nice); - (void) printf("\t\tprops_zapobj = %llu\n", - (u_longlong_t)dd->dd_props_zapobj); - (void) printf("\t\tdeleg_zapobj = %llu\n", - (u_longlong_t)dd->dd_deleg_zapobj); - (void) printf("\t\tflags = %llx\n", - (u_longlong_t)dd->dd_flags); - -#define DO(which) \ - zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ - sizeof (nice)); \ - (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) - DO(HEAD); - DO(SNAP); - DO(CHILD); - DO(CHILD_RSRV); - DO(REFRSRV); -#undef DO - (void) printf("\t\tclones = %llu\n", - (u_longlong_t)dd->dd_clones); -} - -/*ARGSUSED*/ -static void -dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) -{ - dsl_dataset_phys_t *ds = data; - time_t crtime; - char used[32], compressed[32], uncompressed[32], unique[32]; - char blkbuf[BP_SPRINTF_LEN]; - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); - - if (ds == NULL) - return; - - ASSERT(size == sizeof (*ds)); - crtime = ds->ds_creation_time; - zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); - zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); - zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, - sizeof (uncompressed)); - zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); - 
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); - - (void) printf("\t\tdir_obj = %llu\n", - (u_longlong_t)ds->ds_dir_obj); - (void) printf("\t\tprev_snap_obj = %llu\n", - (u_longlong_t)ds->ds_prev_snap_obj); - (void) printf("\t\tprev_snap_txg = %llu\n", - (u_longlong_t)ds->ds_prev_snap_txg); - (void) printf("\t\tnext_snap_obj = %llu\n", - (u_longlong_t)ds->ds_next_snap_obj); - (void) printf("\t\tsnapnames_zapobj = %llu\n", - (u_longlong_t)ds->ds_snapnames_zapobj); - (void) printf("\t\tnum_children = %llu\n", - (u_longlong_t)ds->ds_num_children); - (void) printf("\t\tuserrefs_obj = %llu\n", - (u_longlong_t)ds->ds_userrefs_obj); - (void) printf("\t\tcreation_time = %s", ctime(&crtime)); - (void) printf("\t\tcreation_txg = %llu\n", - (u_longlong_t)ds->ds_creation_txg); - (void) printf("\t\tdeadlist_obj = %llu\n", - (u_longlong_t)ds->ds_deadlist_obj); - (void) printf("\t\tused_bytes = %s\n", used); - (void) printf("\t\tcompressed_bytes = %s\n", compressed); - (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); - (void) printf("\t\tunique = %s\n", unique); - (void) printf("\t\tfsid_guid = %llu\n", - (u_longlong_t)ds->ds_fsid_guid); - (void) printf("\t\tguid = %llu\n", - (u_longlong_t)ds->ds_guid); - (void) printf("\t\tflags = %llx\n", - (u_longlong_t)ds->ds_flags); - (void) printf("\t\tnext_clones_obj = %llu\n", - (u_longlong_t)ds->ds_next_clones_obj); - (void) printf("\t\tprops_obj = %llu\n", - (u_longlong_t)ds->ds_props_obj); - (void) printf("\t\tbp = %s\n", blkbuf); -} - -/* ARGSUSED */ -static int -dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - char blkbuf[BP_SPRINTF_LEN]; - - if (bp->blk_birth != 0) { - snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); - (void) printf("\t%s\n", blkbuf); - } - return (0); -} - -static void -dump_bptree(objset_t *os, uint64_t obj, const char *name) -{ - char bytes[32]; - bptree_phys_t *bt; - dmu_buf_t *db; - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); - - if 
(dump_opt['d'] < 3) - return; - - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); - (void) printf("\n %s: %llu datasets, %s\n", - name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); - dmu_buf_rele(db, FTAG); - - if (dump_opt['d'] < 5) - return; - - (void) printf("\n"); - - (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); -} - -/* ARGSUSED */ -static int -dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - char blkbuf[BP_SPRINTF_LEN]; - - ASSERT(bp->blk_birth != 0); - snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); - (void) printf("\t%s\n", blkbuf); - return (0); -} - -static void -dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) -{ - char bytes[32]; - char comp[32]; - char uncomp[32]; - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); - - if (dump_opt['d'] < 3) - return; - - zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); - if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { - zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); - zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); - (void) printf(" %*s: object %llu, %llu local blkptrs, " - "%llu subobjs in object %llu, %s (%s/%s comp)\n", - indent * 8, name, - (u_longlong_t)bpo->bpo_object, - (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, - (u_longlong_t)bpo->bpo_phys->bpo_subobjs, - bytes, comp, uncomp); - - for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { - uint64_t subobj; - bpobj_t subbpo; - int error; - VERIFY0(dmu_read(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, - i * sizeof (subobj), sizeof (subobj), &subobj, 0)); - error = bpobj_open(&subbpo, bpo->bpo_os, subobj); - if (error != 0) { - (void) printf("ERROR %u while trying to open " - "subobj id 
%llu\n", - error, (u_longlong_t)subobj); - continue; - } - dump_full_bpobj(&subbpo, "subobj", indent + 1); - bpobj_close(&subbpo); - } - } else { - (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", - indent * 8, name, - (u_longlong_t)bpo->bpo_object, - (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - bytes); - } - - if (dump_opt['d'] < 5) - return; - - - if (indent == 0) { - (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); - (void) printf("\n"); - } -} - -static void -bpobj_count_refd(bpobj_t *bpo) -{ - mos_obj_refd(bpo->bpo_object); - - if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { - mos_obj_refd(bpo->bpo_phys->bpo_subobjs); - for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { - uint64_t subobj; - bpobj_t subbpo; - int error; - VERIFY0(dmu_read(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, - i * sizeof (subobj), sizeof (subobj), &subobj, 0)); - error = bpobj_open(&subbpo, bpo->bpo_os, subobj); - if (error != 0) { - (void) printf("ERROR %u while trying to open " - "subobj id %llu\n", - error, (u_longlong_t)subobj); - continue; - } - bpobj_count_refd(&subbpo); - bpobj_close(&subbpo); - } - } -} - -static void -dump_deadlist(dsl_deadlist_t *dl) -{ - dsl_deadlist_entry_t *dle; - uint64_t unused; - char bytes[32]; - char comp[32]; - char uncomp[32]; - uint64_t empty_bpobj = - dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj; - - /* force the tree to be loaded */ - dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); - - if (dl->dl_oldfmt) { - if (dl->dl_bpobj.bpo_object != empty_bpobj) - bpobj_count_refd(&dl->dl_bpobj); - } else { - mos_obj_refd(dl->dl_object); - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - if (dle->dle_bpobj.bpo_object != empty_bpobj) - bpobj_count_refd(&dle->dle_bpobj); - } - } - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (uncomp) >= 
NN_NUMBUF_SZ); - - if (dump_opt['d'] < 3) - return; - - if (dl->dl_oldfmt) { - dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); - return; - } - - zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); - zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); - zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); - (void) printf("\n Deadlist: %s (%s/%s comp)\n", - bytes, comp, uncomp); - - if (dump_opt['d'] < 4) - return; - - (void) printf("\n"); - - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - if (dump_opt['d'] >= 5) { - char buf[128]; - (void) snprintf(buf, sizeof (buf), - "mintxg %llu -> obj %llu", - (longlong_t)dle->dle_mintxg, - (longlong_t)dle->dle_bpobj.bpo_object); - dump_full_bpobj(&dle->dle_bpobj, buf, 0); - } else { - (void) printf("mintxg %llu -> obj %llu\n", - (longlong_t)dle->dle_mintxg, - (longlong_t)dle->dle_bpobj.bpo_object); - } - } -} - -static avl_tree_t idx_tree; -static avl_tree_t domain_tree; -static boolean_t fuid_table_loaded; -static objset_t *sa_os = NULL; -static sa_attr_type_t *sa_attr_table = NULL; - -static int -open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) -{ - int err; - uint64_t sa_attrs = 0; - uint64_t version = 0; - - VERIFY3P(sa_os, ==, NULL); - err = dmu_objset_own(path, type, B_TRUE, tag, osp); - if (err != 0) { - (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, - strerror(err)); - return (err); - } - - if (dmu_objset_type(*osp) == DMU_OST_ZFS) { - (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &version); - if (version >= ZPL_VERSION_SA) { - (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, - 8, 1, &sa_attrs); - } - err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, - &sa_attr_table); - if (err != 0) { - (void) fprintf(stderr, "sa_setup failed: %s\n", - strerror(err)); - dmu_objset_disown(*osp, tag); - *osp = NULL; - } - } - sa_os = *osp; - - return (0); -} - -static void 
-close_objset(objset_t *os, void *tag) -{ - VERIFY3P(os, ==, sa_os); - if (os->os_sa != NULL) - sa_tear_down(os); - dmu_objset_disown(os, tag); - sa_attr_table = NULL; - sa_os = NULL; -} - -static void -fuid_table_destroy() -{ - if (fuid_table_loaded) { - zfs_fuid_table_destroy(&idx_tree, &domain_tree); - fuid_table_loaded = B_FALSE; - } -} - -/* - * print uid or gid information. - * For normal POSIX id just the id is printed in decimal format. - * For CIFS files with FUID the fuid is printed in hex followed by - * the domain-rid string. - */ -static void -print_idstr(uint64_t id, const char *id_type) -{ - if (FUID_INDEX(id)) { - char *domain; - - domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); - (void) printf("\t%s %llx [%s-%d]\n", id_type, - (u_longlong_t)id, domain, (int)FUID_RID(id)); - } else { - (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); - } - -} - -static void -dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) -{ - uint32_t uid_idx, gid_idx; - - uid_idx = FUID_INDEX(uid); - gid_idx = FUID_INDEX(gid); - - /* Load domain table, if not already loaded */ - if (!fuid_table_loaded && (uid_idx || gid_idx)) { - uint64_t fuid_obj; - - /* first find the fuid object. 
It lives in the master node */ - VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, - 8, 1, &fuid_obj) == 0); - zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); - (void) zfs_fuid_table_load(os, fuid_obj, - &idx_tree, &domain_tree); - fuid_table_loaded = B_TRUE; - } - - print_idstr(uid, "uid"); - print_idstr(gid, "gid"); -} - -/*ARGSUSED*/ -static void -dump_znode(objset_t *os, uint64_t object, void *data, size_t size) -{ - char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ - sa_handle_t *hdl; - uint64_t xattr, rdev, gen; - uint64_t uid, gid, mode, fsize, parent, links; - uint64_t pflags; - uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; - time_t z_crtime, z_atime, z_mtime, z_ctime; - sa_bulk_attr_t bulk[12]; - int idx = 0; - int error; - - VERIFY3P(os, ==, sa_os); - if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { - (void) printf("Failed to get handle for SA znode\n"); - return; - } - - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, - &links, 8); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, - &mode, 8); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], - NULL, &parent, 8); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, - &fsize, 8); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, - acctm, 16); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, - modtm, 16); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, - crtm, 16); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, - chgtm, 16); - SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, - &pflags, 8); - - if (sa_bulk_lookup(hdl, bulk, idx)) { - (void) sa_handle_destroy(hdl); - return; - } - - z_crtime = (time_t)crtm[0]; - z_atime = (time_t)acctm[0]; - z_mtime = 
(time_t)modtm[0]; - z_ctime = (time_t)chgtm[0]; - - if (dump_opt['d'] > 4) { - error = zfs_obj_to_path(os, object, path, sizeof (path)); - if (error == ESTALE) { - (void) snprintf(path, sizeof (path), "on delete queue"); - } else if (error != 0) { - leaked_objects++; - (void) snprintf(path, sizeof (path), - "path not found, possibly leaked"); - } - (void) printf("\tpath %s\n", path); - } - dump_uidgid(os, uid, gid); - (void) printf("\tatime %s", ctime(&z_atime)); - (void) printf("\tmtime %s", ctime(&z_mtime)); - (void) printf("\tctime %s", ctime(&z_ctime)); - (void) printf("\tcrtime %s", ctime(&z_crtime)); - (void) printf("\tgen %llu\n", (u_longlong_t)gen); - (void) printf("\tmode %llo\n", (u_longlong_t)mode); - (void) printf("\tsize %llu\n", (u_longlong_t)fsize); - (void) printf("\tparent %llu\n", (u_longlong_t)parent); - (void) printf("\tlinks %llu\n", (u_longlong_t)links); - (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); - if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, - sizeof (uint64_t)) == 0) - (void) printf("\txattr %llu\n", (u_longlong_t)xattr); - if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, - sizeof (uint64_t)) == 0) - (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); - sa_handle_destroy(hdl); -} - -/*ARGSUSED*/ -static void -dump_acl(objset_t *os, uint64_t object, void *data, size_t size) -{ -} - -/*ARGSUSED*/ -static void -dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) -{ -} - -static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { - dump_none, /* unallocated */ - dump_zap, /* object directory */ - dump_uint64, /* object array */ - dump_none, /* packed nvlist */ - dump_packed_nvlist, /* packed nvlist size */ - dump_none, /* bpobj */ - dump_bpobj, /* bpobj header */ - dump_none, /* SPA space map header */ - dump_none, /* SPA space map */ - dump_none, /* ZIL intent log */ - dump_dnode, /* DMU dnode */ - dump_dmu_objset, /* DMU objset */ - dump_dsl_dir, /* DSL directory */ - dump_zap, /* DSL 
directory child map */ - dump_zap, /* DSL dataset snap map */ - dump_zap, /* DSL props */ - dump_dsl_dataset, /* DSL dataset */ - dump_znode, /* ZFS znode */ - dump_acl, /* ZFS V0 ACL */ - dump_uint8, /* ZFS plain file */ - dump_zpldir, /* ZFS directory */ - dump_zap, /* ZFS master node */ - dump_zap, /* ZFS delete queue */ - dump_uint8, /* zvol object */ - dump_zap, /* zvol prop */ - dump_uint8, /* other uint8[] */ - dump_uint64, /* other uint64[] */ - dump_zap, /* other ZAP */ - dump_zap, /* persistent error log */ - dump_uint8, /* SPA history */ - dump_history_offsets, /* SPA history offsets */ - dump_zap, /* Pool properties */ - dump_zap, /* DSL permissions */ - dump_acl, /* ZFS ACL */ - dump_uint8, /* ZFS SYSACL */ - dump_none, /* FUID nvlist */ - dump_packed_nvlist, /* FUID nvlist size */ - dump_zap, /* DSL dataset next clones */ - dump_zap, /* DSL scrub queue */ - dump_zap, /* ZFS user/group used */ - dump_zap, /* ZFS user/group quota */ - dump_zap, /* snapshot refcount tags */ - dump_ddt_zap, /* DDT ZAP object */ - dump_zap, /* DDT statistics */ - dump_znode, /* SA object */ - dump_zap, /* SA Master Node */ - dump_sa_attrs, /* SA attribute registration */ - dump_sa_layouts, /* SA attribute layouts */ - dump_zap, /* DSL scrub translations */ - dump_none, /* fake dedup BP */ - dump_zap, /* deadlist */ - dump_none, /* deadlist hdr */ - dump_zap, /* dsl clones */ - dump_bpobj_subobjs, /* bpobj subobjs */ - dump_unknown, /* Unknown type, must be last */ -}; - -static void -dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, - uint64_t *dnode_slots_used) -{ - dmu_buf_t *db = NULL; - dmu_object_info_t doi; - dnode_t *dn; - void *bonus = NULL; - size_t bsize = 0; - char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; - char bonus_size[32]; - char aux[50]; - int error; - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (lsize) >= 
NN_NUMBUF_SZ); - CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); - - if (*print_header) { - (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", - "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", - "lsize", "%full", "type"); - *print_header = 0; - } - - if (object == 0) { - dn = DMU_META_DNODE(os); - } else { - error = dmu_bonus_hold(os, object, FTAG, &db); - if (error) - fatal("dmu_bonus_hold(%llu) failed, errno %u", - object, error); - bonus = db->db_data; - bsize = db->db_size; - dn = DB_DNODE((dmu_buf_impl_t *)db); - } - dmu_object_info_from_dnode(dn, &doi); - - if (dnode_slots_used != NULL) - *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; - - zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); - zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); - zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); - zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); - zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); - zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); - (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * - doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / - doi.doi_max_offset); - - aux[0] = '\0'; - - if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { - (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", - ZDB_CHECKSUM_NAME(doi.doi_checksum)); - } - - if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { - (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", - ZDB_COMPRESS_NAME(doi.doi_compress)); - } - - (void) printf("%10" PRIu64 - " %3u %5s %5s %5s %5s %5s %6s %s%s\n", - object, doi.doi_indirection, iblk, dblk, - asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); - - if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { - (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", - "", "", "", "", "", "", bonus_size, "bonus", - ZDB_OT_NAME(doi.doi_bonus_type)); - } - - if (verbosity >= 4) { - (void) printf("\tdnode flags: %s%s%s\n", - (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? - "USED_BYTES " : "", - (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? - "USERUSED_ACCOUNTED " : "", - (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? - "SPILL_BLKPTR" : ""); - (void) printf("\tdnode maxblkid: %llu\n", - (longlong_t)dn->dn_phys->dn_maxblkid); - - object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, - bonus, bsize); - object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); - *print_header = 1; - } - - if (verbosity >= 5) - dump_indirect(dn); - - if (verbosity >= 5) { - /* - * Report the list of segments that comprise the object. 
- */ - uint64_t start = 0; - uint64_t end; - uint64_t blkfill = 1; - int minlvl = 1; - - if (dn->dn_type == DMU_OT_DNODE) { - minlvl = 0; - blkfill = DNODES_PER_BLOCK; - } - - for (;;) { - char segsize[32]; - /* make sure nicenum has enough space */ - CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); - error = dnode_next_offset(dn, - 0, &start, minlvl, blkfill, 0); - if (error) - break; - end = start; - error = dnode_next_offset(dn, - DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); - zdb_nicenum(end - start, segsize, sizeof (segsize)); - (void) printf("\t\tsegment [%016llx, %016llx)" - " size %5s\n", (u_longlong_t)start, - (u_longlong_t)end, segsize); - if (error) - break; - start = end; - } - } - - if (db != NULL) - dmu_buf_rele(db, FTAG); -} - -static void -count_dir_mos_objects(dsl_dir_t *dd) -{ - mos_obj_refd(dd->dd_object); - mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); - mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); - mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); - mos_obj_refd(dsl_dir_phys(dd)->dd_clones); -} - -static void -count_ds_mos_objects(dsl_dataset_t *ds) -{ - mos_obj_refd(ds->ds_object); - mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); - mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); - mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); - mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); - - if (!dsl_dataset_is_snapshot(ds)) { - count_dir_mos_objects(ds->ds_dir); - } -} - -static const char *objset_types[DMU_OST_NUMTYPES] = { - "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; - -static void -dump_dir(objset_t *os) -{ - dmu_objset_stats_t dds; - uint64_t object, object_count; - uint64_t refdbytes, usedobjs, scratch; - char numbuf[32]; - char blkbuf[BP_SPRINTF_LEN + 20]; - char osname[ZFS_MAX_DATASET_NAME_LEN]; - const char *type = "UNKNOWN"; - int verbosity = dump_opt['d']; - int print_header = 1; - unsigned i; - int error; - uint64_t total_slots_used = 0; - uint64_t max_slot_used = 0; - uint64_t dnode_slots; - - /* make sure 
nicenum has enough space */ - CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); - - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - dmu_objset_fast_stat(os, &dds); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - - if (dds.dds_type < DMU_OST_NUMTYPES) - type = objset_types[dds.dds_type]; - - if (dds.dds_type == DMU_OST_META) { - dds.dds_creation_txg = TXG_INITIAL; - usedobjs = BP_GET_FILL(os->os_rootbp); - refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> - dd_used_bytes; - } else { - dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); - } - - ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); - - zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); - - if (verbosity >= 4) { - (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); - (void) snprintf_blkptr(blkbuf + strlen(blkbuf), - sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); - } else { - blkbuf[0] = '\0'; - } - - dmu_objset_name(os, osname); - - (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " - "%s, %llu objects%s%s\n", - osname, type, (u_longlong_t)dmu_objset_id(os), - (u_longlong_t)dds.dds_creation_txg, - numbuf, (u_longlong_t)usedobjs, blkbuf, - (dds.dds_inconsistent) ? 
" (inconsistent)" : ""); - - if (zopt_objects != 0) { - for (i = 0; i < zopt_objects; i++) - dump_object(os, zopt_object[i], verbosity, - &print_header, NULL); - (void) printf("\n"); - return; - } - - if (dump_opt['i'] != 0 || verbosity >= 2) - dump_intent_log(dmu_objset_zil(os)); - - if (dmu_objset_ds(os) != NULL) { - dsl_dataset_t *ds = dmu_objset_ds(os); - dump_deadlist(&ds->ds_deadlist); - - if (dsl_dataset_remap_deadlist_exists(ds)) { - (void) printf("ds_remap_deadlist:\n"); - dump_deadlist(&ds->ds_remap_deadlist); - } - count_ds_mos_objects(ds); - } - - if (verbosity < 2) - return; - - if (BP_IS_HOLE(os->os_rootbp)) - return; - - dump_object(os, 0, verbosity, &print_header, NULL); - object_count = 0; - if (DMU_USERUSED_DNODE(os) != NULL && - DMU_USERUSED_DNODE(os)->dn_type != 0) { - dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, - NULL); - dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, - NULL); - } - - object = 0; - while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - dump_object(os, object, verbosity, &print_header, &dnode_slots); - object_count++; - total_slots_used += dnode_slots; - max_slot_used = object + dnode_slots - 1; - } - - (void) printf("\n"); - - (void) printf(" Dnode slots:\n"); - (void) printf("\tTotal used: %10llu\n", - (u_longlong_t)total_slots_used); - (void) printf("\tMax used: %10llu\n", - (u_longlong_t)max_slot_used); - (void) printf("\tPercent empty: %10lf\n", - (double)(max_slot_used - total_slots_used)*100 / - (double)max_slot_used); - - (void) printf("\n"); - - if (error != ESRCH) { - (void) fprintf(stderr, "dmu_object_next() = %d\n", error); - abort(); - } - - ASSERT3U(object_count, ==, usedobjs); - - if (leaked_objects != 0) { - (void) printf("%d potentially leaked objects detected\n", - leaked_objects); - leaked_objects = 0; - } -} - -static void -dump_uberblock(uberblock_t *ub, const char *header, const char *footer) -{ - time_t timestamp = ub->ub_timestamp; - - (void) printf("%s", 
header ? header : ""); - (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); - (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); - (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); - (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); - (void) printf("\ttimestamp = %llu UTC = %s", - (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp))); - - (void) printf("\tmmp_magic = %016llx\n", - (u_longlong_t)ub->ub_mmp_magic); - if (MMP_VALID(ub)) { - (void) printf("\tmmp_delay = %0llu\n", - (u_longlong_t)ub->ub_mmp_delay); - if (MMP_SEQ_VALID(ub)) - (void) printf("\tmmp_seq = %u\n", - (unsigned int) MMP_SEQ(ub)); - if (MMP_FAIL_INT_VALID(ub)) - (void) printf("\tmmp_fail = %u\n", - (unsigned int) MMP_FAIL_INT(ub)); - if (MMP_INTERVAL_VALID(ub)) - (void) printf("\tmmp_write = %u\n", - (unsigned int) MMP_INTERVAL(ub)); - /* After MMP_* to make summarize_uberblock_mmp cleaner */ - (void) printf("\tmmp_valid = %x\n", - (unsigned int) ub->ub_mmp_config & 0xFF); - } - - if (dump_opt['u'] >= 3) { - char blkbuf[BP_SPRINTF_LEN]; - snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); - (void) printf("\trootbp = %s\n", blkbuf); - } - (void) printf("\tcheckpoint_txg = %llu\n", - (u_longlong_t)ub->ub_checkpoint_txg); - (void) printf("%s", footer ?
footer : ""); -} - -static void -dump_config(spa_t *spa) -{ - dmu_buf_t *db; - size_t nvsize = 0; - int error = 0; - - - error = dmu_bonus_hold(spa->spa_meta_objset, - spa->spa_config_object, FTAG, &db); - - if (error == 0) { - nvsize = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - (void) printf("\nMOS Configuration:\n"); - dump_packed_nvlist(spa->spa_meta_objset, - spa->spa_config_object, (void *)&nvsize, 1); - } else { - (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", - (u_longlong_t)spa->spa_config_object, error); - } -} - -static void -dump_cachefile(const char *cachefile) -{ - int fd; - struct stat64 statbuf; - char *buf; - nvlist_t *config; - - if ((fd = open64(cachefile, O_RDONLY)) < 0) { - (void) fprintf(stderr, "cannot open '%s': %s\n", cachefile, - strerror(errno)); - exit(1); - } - - if (fstat64(fd, &statbuf) != 0) { - (void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile, - strerror(errno)); - exit(1); - } - - if ((buf = malloc(statbuf.st_size)) == NULL) { - (void) fprintf(stderr, "failed to allocate %llu bytes\n", - (u_longlong_t)statbuf.st_size); - exit(1); - } - - if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { - (void) fprintf(stderr, "failed to read %llu bytes\n", - (u_longlong_t)statbuf.st_size); - exit(1); - } - - (void) close(fd); - - if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { - (void) fprintf(stderr, "failed to unpack nvlist\n"); - exit(1); - } - - free(buf); - - dump_nvlist(config, 0); - - nvlist_free(config); -} - -#define ZDB_MAX_UB_HEADER_SIZE 32 - -static void -dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) -{ - vdev_t vd; - vdev_t *vdp = &vd; - char header[ZDB_MAX_UB_HEADER_SIZE]; - - vd.vdev_ashift = ashift; - vdp->vdev_top = vdp; - - for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { - uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); - uberblock_t *ub = (void *)((char *)lbl + uoff); - - if (uberblock_verify(ub)) - continue; - - if ((dump_opt['u'] < 4) && - 
(ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && - (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) - continue; - - (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, - "Uberblock[%d]\n", i); - dump_uberblock(ub, header, ""); - } -} - -static char curpath[PATH_MAX]; - -/* - * Iterate through the path components, recursively passing - * current one's obj and remaining path until we find the obj - * for the last one. - */ -static int -dump_path_impl(objset_t *os, uint64_t obj, char *name) -{ - int err; - int header = 1; - uint64_t child_obj; - char *s; - dmu_buf_t *db; - dmu_object_info_t doi; - - if ((s = strchr(name, '/')) != NULL) - *s = '\0'; - err = zap_lookup(os, obj, name, 8, 1, &child_obj); - - (void) strlcat(curpath, name, sizeof (curpath)); - - if (err != 0) { - (void) fprintf(stderr, "failed to lookup %s: %s\n", - curpath, strerror(err)); - return (err); - } - - child_obj = ZFS_DIRENT_OBJ(child_obj); - err = sa_buf_hold(os, child_obj, FTAG, &db); - if (err != 0) { - (void) fprintf(stderr, - "failed to get SA dbuf for obj %llu: %s\n", - (u_longlong_t)child_obj, strerror(err)); - return (EINVAL); - } - dmu_object_info_from_db(db, &doi); - sa_buf_rele(db, FTAG); - - if (doi.doi_bonus_type != DMU_OT_SA && - doi.doi_bonus_type != DMU_OT_ZNODE) { - (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", - doi.doi_bonus_type, (u_longlong_t)child_obj); - return (EINVAL); - } - - if (dump_opt['v'] > 6) { - (void) printf("obj=%llu %s type=%d bonustype=%d\n", - (u_longlong_t)child_obj, curpath, doi.doi_type, - doi.doi_bonus_type); - } - - (void) strlcat(curpath, "/", sizeof (curpath)); - - switch (doi.doi_type) { - case DMU_OT_DIRECTORY_CONTENTS: - if (s != NULL && *(s + 1) != '\0') - return (dump_path_impl(os, child_obj, s + 1)); - /*FALLTHROUGH*/ - case DMU_OT_PLAIN_FILE_CONTENTS: - dump_object(os, child_obj, dump_opt['v'], &header, NULL); - return (0); - default: - (void) fprintf(stderr, "object %llu has non-file/directory " - "type %d\n", 
(u_longlong_t)obj, doi.doi_type); - break; - } - - return (EINVAL); -} - -/* - * Dump the blocks for the object specified by path inside the dataset. - */ -static int -dump_path(char *ds, char *path) -{ - int err; - objset_t *os; - uint64_t root_obj; - - err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); - if (err != 0) - return (err); - - err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); - if (err != 0) { - (void) fprintf(stderr, "can't lookup root znode: %s\n", - strerror(err)); - dmu_objset_disown(os, FTAG); - return (EINVAL); - } - - (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); - - err = dump_path_impl(os, root_obj, path); - - close_objset(os, FTAG); - return (err); -} - -static int -dump_label(const char *dev) -{ - int fd; - vdev_label_t label; - char path[MAXPATHLEN]; - char *buf = label.vl_vdev_phys.vp_nvlist; - size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); - struct stat64 statbuf; - uint64_t psize, ashift; - boolean_t label_found = B_FALSE; - - (void) strlcpy(path, dev, sizeof (path)); - if (dev[0] == '/') { - if (strncmp(dev, ZFS_DISK_ROOTD, - strlen(ZFS_DISK_ROOTD)) == 0) { - (void) snprintf(path, sizeof (path), "%s%s", - ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD)); - } - } else if (stat64(path, &statbuf) != 0) { - char *s; - - (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, - dev); - if (((s = strrchr(dev, 's')) == NULL && - (s = strchr(dev, 'p')) == NULL) || - !isdigit(*(s + 1))) - (void) strlcat(path, "s0", sizeof (path)); - } - - if ((fd = open64(path, O_RDONLY)) < 0) { - (void) fprintf(stderr, "cannot open '%s': %s\n", path, - strerror(errno)); - exit(1); - } - - if (fstat64(fd, &statbuf) != 0) { - (void) fprintf(stderr, "failed to stat '%s': %s\n", path, - strerror(errno)); - (void) close(fd); - exit(1); - } - - if (S_ISBLK(statbuf.st_mode)) { - (void) fprintf(stderr, - "cannot use '%s': character device required\n", path); - (void) close(fd); - exit(1); - } - - psize = 
statbuf.st_size; - psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); - - for (int l = 0; l < VDEV_LABELS; l++) { - nvlist_t *config = NULL; - - if (!dump_opt['q']) { - (void) printf("------------------------------------\n"); - (void) printf("LABEL %d\n", l); - (void) printf("------------------------------------\n"); - } - - if (pread64(fd, &label, sizeof (label), - vdev_label_offset(psize, l, 0)) != sizeof (label)) { - if (!dump_opt['q']) - (void) printf("failed to read label %d\n", l); - continue; - } - - if (nvlist_unpack(buf, buflen, &config, 0) != 0) { - if (!dump_opt['q']) - (void) printf("failed to unpack label %d\n", l); - ashift = SPA_MINBLOCKSHIFT; - } else { - nvlist_t *vdev_tree = NULL; - - if (!dump_opt['q']) - dump_nvlist(config, 4); - if ((nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || - (nvlist_lookup_uint64(vdev_tree, - ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) - ashift = SPA_MINBLOCKSHIFT; - nvlist_free(config); - label_found = B_TRUE; - } - if (dump_opt['u']) - dump_label_uberblocks(&label, ashift); - } - - (void) close(fd); - - return (label_found ? 0 : 2); -} - -static uint64_t dataset_feature_count[SPA_FEATURES]; -static uint64_t remap_deadlist_count = 0; - -/*ARGSUSED*/ -static int -dump_one_dir(const char *dsname, void *arg) -{ - int error; - objset_t *os; - - error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); - if (error != 0) - return (0); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (!dmu_objset_ds(os)->ds_feature_inuse[f]) - continue; - ASSERT(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET); - dataset_feature_count[f]++; - } - - if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { - remap_deadlist_count++; - } - - dump_dir(os); - close_objset(os, FTAG); - fuid_table_destroy(); - return (0); -} - -/* - * Block statistics. 
- */ -#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) -typedef struct zdb_blkstats { - uint64_t zb_asize; - uint64_t zb_lsize; - uint64_t zb_psize; - uint64_t zb_count; - uint64_t zb_gangs; - uint64_t zb_ditto_samevdev; - uint64_t zb_ditto_same_ms; - uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; -} zdb_blkstats_t; - -/* - * Extended object types to report deferred frees and dedup auto-ditto blocks. - */ -#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) -#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) -#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) -#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) - -static const char *zdb_ot_extname[] = { - "deferred free", - "dedup ditto", - "other", - "Total", -}; - -#define ZB_TOTAL DN_MAX_LEVELS - -typedef struct zdb_cb { - zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; - uint64_t zcb_removing_size; - uint64_t zcb_checkpoint_size; - uint64_t zcb_dedup_asize; - uint64_t zcb_dedup_blocks; - uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; - uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] - [BPE_PAYLOAD_SIZE]; - uint64_t zcb_start; - hrtime_t zcb_lastprint; - uint64_t zcb_totalasize; - uint64_t zcb_errors[256]; - int zcb_readfails; - int zcb_haderrors; - spa_t *zcb_spa; - uint32_t **zcb_vd_obsolete_counts; -} zdb_cb_t; - -/* test if two DVA offsets from same vdev are within the same metaslab */ -static boolean_t -same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) -{ - vdev_t *vd = vdev_lookup_top(spa, vdev); - uint64_t ms_shift = vd->vdev_ms_shift; - - return ((off1 >> ms_shift) == (off2 >> ms_shift)); -} - -static void -zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, - dmu_object_type_t type) -{ - uint64_t refcnt = 0; - - ASSERT(type < ZDB_OT_TOTAL); - - if (zilog && zil_bp_tree_add(zilog, bp) != 0) - return; - - spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); - - for (int i = 0; i < 4; i++) { - int l = (i < 2) ? 
BP_GET_LEVEL(bp) : ZB_TOTAL; - int t = (i & 1) ? type : ZDB_OT_TOTAL; - int equal; - zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; - - zb->zb_asize += BP_GET_ASIZE(bp); - zb->zb_lsize += BP_GET_LSIZE(bp); - zb->zb_psize += BP_GET_PSIZE(bp); - zb->zb_count++; - - /* - * The histogram is only big enough to record blocks up to - * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, - * "other", bucket. - */ - unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; - idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); - zb->zb_psize_histogram[idx]++; - - zb->zb_gangs += BP_COUNT_GANG(bp); - - switch (BP_GET_NDVAS(bp)) { - case 2: - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) { - zb->zb_ditto_samevdev++; - - if (same_metaslab(zcb->zcb_spa, - DVA_GET_VDEV(&bp->blk_dva[0]), - DVA_GET_OFFSET(&bp->blk_dva[0]), - DVA_GET_OFFSET(&bp->blk_dva[1]))) - zb->zb_ditto_same_ms++; - } - break; - case 3: - equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + - (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2])) + - (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal != 0) { - zb->zb_ditto_samevdev++; - - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1]) && - same_metaslab(zcb->zcb_spa, - DVA_GET_VDEV(&bp->blk_dva[0]), - DVA_GET_OFFSET(&bp->blk_dva[0]), - DVA_GET_OFFSET(&bp->blk_dva[1]))) - zb->zb_ditto_same_ms++; - else if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2]) && - same_metaslab(zcb->zcb_spa, - DVA_GET_VDEV(&bp->blk_dva[0]), - DVA_GET_OFFSET(&bp->blk_dva[0]), - DVA_GET_OFFSET(&bp->blk_dva[2]))) - zb->zb_ditto_same_ms++; - else if (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2]) && - same_metaslab(zcb->zcb_spa, - DVA_GET_VDEV(&bp->blk_dva[1]), - DVA_GET_OFFSET(&bp->blk_dva[1]), - DVA_GET_OFFSET(&bp->blk_dva[2]))) - zb->zb_ditto_same_ms++; - } - break; - } - } - - spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); - - if 
(BP_IS_EMBEDDED(bp)) { - zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; - zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] - [BPE_GET_PSIZE(bp)]++; - return; - } - - if (dump_opt['L']) - return; - - if (BP_GET_DEDUP(bp)) { - ddt_t *ddt; - ddt_entry_t *dde; - - ddt = ddt_select(zcb->zcb_spa, bp); - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_FALSE); - - if (dde == NULL) { - refcnt = 0; - } else { - ddt_phys_t *ddp = ddt_phys_select(dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; - if (ddt_phys_total_refcnt(dde) == 0) - ddt_remove(ddt, dde); - } - ddt_exit(ddt); - } - - VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, - refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), - bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); -} - -/* ARGSUSED */ -static void -zdb_blkptr_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - int ioerr = zio->io_error; - zdb_cb_t *zcb = zio->io_private; - zbookmark_phys_t *zb = &zio->io_bookmark; - - abd_free(zio->io_abd); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - spa->spa_load_verify_ios--; - cv_broadcast(&spa->spa_scrub_io_cv); - - if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - char blkbuf[BP_SPRINTF_LEN]; - - zcb->zcb_haderrors = 1; - zcb->zcb_errors[ioerr]++; - - if (dump_opt['b'] >= 2) - snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); - else - blkbuf[0] = '\0'; - - (void) printf("zdb_blkptr_cb: " - "Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- skipping\n", - ioerr, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf); - } - mutex_exit(&spa->spa_scrub_lock); -} - -/* ARGSUSED */ -static int -zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - zdb_cb_t *zcb = arg; - dmu_object_type_t type; - boolean_t is_metadata; - - if (bp == NULL) - return (0); - - if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { - char 
blkbuf[BP_SPRINTF_LEN]; - snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); - (void) printf("objset %llu object %llu " - "level %lld offset 0x%llx %s\n", - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (u_longlong_t)blkid2offset(dnp, bp, zb), - blkbuf); - } - - if (BP_IS_HOLE(bp)) - return (0); - - type = BP_GET_TYPE(bp); - - zdb_count_block(zcb, zilog, bp, - (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); - - is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); - - if (!BP_IS_EMBEDDED(bp) && - (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { - size_t size = BP_GET_PSIZE(bp); - abd_t *abd = abd_alloc(size, B_FALSE); - int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; - - /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == ZB_ZIL_LEVEL) - flags |= ZIO_FLAG_SPECULATIVE; - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_load_verify_ios > max_inflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; - spa->spa_load_verify_ios++; - mutex_exit(&spa->spa_scrub_lock); - - zio_nowait(zio_read(NULL, spa, bp, abd, size, - zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); - } - - zcb->zcb_readfails = 0; - - /* only call gethrtime() every 100 blocks */ - static int iters; - if (++iters > 100) - iters = 0; - else - return (0); - - if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { - uint64_t now = gethrtime(); - char buf[10]; - uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; - int kb_per_sec = - 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); - int sec_remaining = - (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); - - zfs_nicenum(bytes, buf, sizeof (buf)); - (void) fprintf(stderr, - "\r%5s completed (%4dMB/s) " - "estimated time remaining: %uhr %02umin %02usec ", - buf, kb_per_sec / 1024, - 
sec_remaining / 60 / 60, - sec_remaining / 60 % 60, - sec_remaining % 60); - - zcb->zcb_lastprint = now; - } - - return (0); -} - -static void -zdb_leak(void *arg, uint64_t start, uint64_t size) -{ - vdev_t *vd = arg; - - (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", - (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); -} - -static metaslab_ops_t zdb_metaslab_ops = { - NULL /* alloc */ -}; - -static void -zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) -{ - ddt_bookmark_t ddb; - ddt_entry_t dde; - int error; - - ASSERT(!dump_opt['L']); - - bzero(&ddb, sizeof (ddb)); - while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { - blkptr_t blk; - ddt_phys_t *ddp = dde.dde_phys; - - if (ddb.ddb_class == DDT_CLASS_UNIQUE) - return; - - ASSERT(ddt_phys_total_refcnt(&dde) > 1); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(ddb.ddb_checksum, - &dde.dde_key, ddp, &blk); - if (p == DDT_PHYS_DITTO) { - zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); - } else { - zcb->zcb_dedup_asize += - BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); - zcb->zcb_dedup_blocks++; - } - } - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } - - ASSERT(error == ENOENT); -} - -/* ARGSUSED */ -static void -claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - /* - * This callback was called through a remap from - * a device being removed. Therefore, the vdev that - * this callback is applied to is a concrete - * vdev. 
- */ - ASSERT(vdev_is_concrete(vd)); - - VERIFY0(metaslab_claim_impl(vd, offset, size, - spa_min_claim_txg(vd->vdev_spa))); -} - -static void -claim_segment_cb(void *arg, uint64_t offset, uint64_t size) -{ - vdev_t *vd = arg; - - vdev_indirect_ops.vdev_op_remap(vd, offset, size, - claim_segment_impl_cb, NULL); -} - -/* - * After accounting for all allocated blocks that are directly referenced, - * we might have missed a reference to a block from a partially complete - * (and thus unused) indirect mapping object. We perform a secondary pass - * through the metaslabs we have already mapped and claim the destination - * blocks. - */ -static void -zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) -{ - if (dump_opt['L']) - return; - - if (spa->spa_vdev_removal == NULL) - return; - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { - metaslab_t *msp = vd->vdev_ms[msi]; - - if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) - break; - - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - - if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); - - /* - * Clear everything past what has been synced unless - * it's past the spacemap, because we have not allocated - * mappings for it yet. 
- */ - uint64_t vim_max_offset = - vdev_indirect_mapping_max_offset(vim); - uint64_t sm_end = msp->ms_sm->sm_start + - msp->ms_sm->sm_size; - if (sm_end > vim_max_offset) - range_tree_clear(svr->svr_allocd_segs, - vim_max_offset, sm_end - vim_max_offset); - } - - zcb->zcb_removing_size += - range_tree_space(svr->svr_allocd_segs); - range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); - } - - spa_config_exit(spa, SCL_CONFIG, FTAG); -} - -/* ARGSUSED */ -static int -increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - zdb_cb_t *zcb = arg; - spa_t *spa = zcb->zcb_spa; - vdev_t *vd; - const dva_t *dva = &bp->blk_dva[0]; - - ASSERT(!dump_opt['L']); - ASSERT3U(BP_GET_NDVAS(bp), ==, 1); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); - ASSERT3P(vd, !=, NULL); - spa_config_exit(spa, SCL_VDEV, FTAG); - - ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); - - vdev_indirect_mapping_increment_obsolete_count( - vd->vdev_indirect_mapping, - DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), - zcb->zcb_vd_obsolete_counts[vd->vdev_id]); - - return (0); -} - -static uint32_t * -zdb_load_obsolete_counts(vdev_t *vd) -{ - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - spa_t *spa = vd->vdev_spa; - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - uint32_t *counts; - - EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL); - counts = vdev_indirect_mapping_load_obsolete_counts(vim); - if (vd->vdev_obsolete_sm != NULL) { - vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, - vd->vdev_obsolete_sm); - } - if (scip->scip_vdev == vd->vdev_id && - scip->scip_prev_obsolete_sm_object != 0) { - space_map_t *prev_obsolete_sm = NULL; - VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, - scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - 
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, - prev_obsolete_sm); - space_map_close(prev_obsolete_sm); - } - return (counts); -} - -typedef struct checkpoint_sm_exclude_entry_arg { - vdev_t *cseea_vd; - uint64_t cseea_checkpoint_size; -} checkpoint_sm_exclude_entry_arg_t; - -static int -checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) -{ - checkpoint_sm_exclude_entry_arg_t *cseea = arg; - vdev_t *vd = cseea->cseea_vd; - metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; - uint64_t end = sme->sme_offset + sme->sme_run; - - ASSERT(sme->sme_type == SM_FREE); - - /* - * Since the vdev_checkpoint_sm exists in the vdev level - * and the ms_sm space maps exist in the metaslab level, - * an entry in the checkpoint space map could theoretically - * cross the boundaries of the metaslab that it belongs. - * - * In reality, because of the way that we populate and - * manipulate the checkpoint's space maps currently, - * there shouldn't be any entries that cross metaslabs. - * Hence the assertion below. - * - * That said, there is no fundamental requirement that - * the checkpoint's space map entries should not cross - * metaslab boundaries. So if needed we could add code - * that handles metaslab-crossing segments in the future. - */ - VERIFY3U(sme->sme_offset, >=, ms->ms_start); - VERIFY3U(end, <=, ms->ms_start + ms->ms_size); - - /* - * By removing the entry from the allocated segments we - * also verify that the entry is there to begin with. - */ - mutex_enter(&ms->ms_lock); - range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); - mutex_exit(&ms->ms_lock); - - cseea->cseea_checkpoint_size += sme->sme_run; - return (0); -} - -static void -zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) -{ - spa_t *spa = vd->vdev_spa; - space_map_t *checkpoint_sm = NULL; - uint64_t checkpoint_sm_obj; - - /* - * If there is no vdev_top_zap, we are in a pool whose - * version predates the pool checkpoint feature. 
- */ - if (vd->vdev_top_zap == 0) - return; - - /* - * If there is no reference of the vdev_checkpoint_sm in - * the vdev_top_zap, then one of the following scenarios - * is true: - * - * 1] There is no checkpoint - * 2] There is a checkpoint, but no checkpointed blocks - * have been freed yet - * 3] The current vdev is indirect - * - * In these cases we return immediately. - */ - if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) - return; - - VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, - &checkpoint_sm_obj)); - - checkpoint_sm_exclude_entry_arg_t cseea; - cseea.cseea_vd = vd; - cseea.cseea_checkpoint_size = 0; - - VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), - checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - - VERIFY0(space_map_iterate(checkpoint_sm, - space_map_length(checkpoint_sm), - checkpoint_sm_exclude_entry_cb, &cseea)); - space_map_close(checkpoint_sm); - - zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; -} - -static void -zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) -{ - ASSERT(!dump_opt['L']); - - vdev_t *rvd = spa->spa_root_vdev; - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); - zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); - } -} - -static void -load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) -{ - vdev_t *rvd = spa->spa_root_vdev; - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *vd = rvd->vdev_child[i]; - - ASSERT3U(i, ==, vd->vdev_id); - - if (vd->vdev_ops == &vdev_indirect_ops) - continue; - - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - (void) fprintf(stderr, - "\rloading concrete vdev %llu, " - "metaslab %llu of %llu ...", - (longlong_t)vd->vdev_id, - (longlong_t)msp->ms_id, - (longlong_t)vd->vdev_ms_count); - - 
mutex_enter(&msp->ms_lock); - metaslab_unload(msp); - - /* - * We don't want to spend the CPU manipulating the - * size-ordered tree, so clear the range_tree ops. - */ - msp->ms_allocatable->rt_ops = NULL; - - if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, - msp->ms_allocatable, maptype)); - } - if (!msp->ms_loaded) - msp->ms_loaded = B_TRUE; - mutex_exit(&msp->ms_lock); - } - } -} - -/* - * vm_idxp is an in-out parameter which (for indirect vdevs) is the - * index in vim_entries that has the first entry in this metaslab. - * On return, it will be set to the first entry after this metaslab. - */ -static void -load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, - uint64_t *vim_idxp) -{ - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - mutex_enter(&msp->ms_lock); - metaslab_unload(msp); - - /* - * We don't want to spend the CPU manipulating the - * size-ordered tree, so clear the range_tree ops. - */ - msp->ms_allocatable->rt_ops = NULL; - - for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); - (*vim_idxp)++) { - vdev_indirect_mapping_entry_phys_t *vimep = - &vim->vim_entries[*vim_idxp]; - uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); - uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); - ASSERT3U(ent_offset, >=, msp->ms_start); - if (ent_offset >= msp->ms_start + msp->ms_size) - break; - - /* - * Mappings do not cross metaslab boundaries, - * because we create them by walking the metaslabs. 
- */ - ASSERT3U(ent_offset + ent_len, <=, - msp->ms_start + msp->ms_size); - range_tree_add(msp->ms_allocatable, ent_offset, ent_len); - } - - if (!msp->ms_loaded) - msp->ms_loaded = B_TRUE; - mutex_exit(&msp->ms_lock); -} - -static void -zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) -{ - ASSERT(!dump_opt['L']); - - vdev_t *rvd = spa->spa_root_vdev; - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - ASSERT3U(c, ==, vd->vdev_id); - - if (vd->vdev_ops != &vdev_indirect_ops) - continue; - - /* - * Note: we don't check for mapping leaks on - * removing vdevs because their ms_allocatable's - * are used to look for leaks in allocated space. - */ - zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); - - /* - * Normally, indirect vdevs don't have any - * metaslabs. We want to set them up for - * zio_claim(). - */ - VERIFY0(vdev_metaslab_init(vd, 0)); - - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t vim_idx = 0; - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - - (void) fprintf(stderr, - "\rloading indirect vdev %llu, " - "metaslab %llu of %llu ...", - (longlong_t)vd->vdev_id, - (longlong_t)vd->vdev_ms[m]->ms_id, - (longlong_t)vd->vdev_ms_count); - - load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], - &vim_idx); - } - ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); - } -} - -static void -zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) -{ - zcb->zcb_spa = spa; - - if (dump_opt['L']) - return; - - dsl_pool_t *dp = spa->spa_dsl_pool; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * We are going to be changing the meaning of the metaslab's - * ms_allocatable. Ensure that the allocator doesn't try to - * use the tree. 
- */ - spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; - spa->spa_log_class->mc_ops = &zdb_metaslab_ops; - - zcb->zcb_vd_obsolete_counts = - umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), - UMEM_NOFAIL); - - /* - * For leak detection, we overload the ms_allocatable trees - * to contain allocated segments instead of free segments. - * As a result, we can't use the normal metaslab_load/unload - * interfaces. - */ - zdb_leak_init_prepare_indirect_vdevs(spa, zcb); - load_concrete_ms_allocatable_trees(spa, SM_ALLOC); - - /* - * On load_concrete_ms_allocatable_trees() we loaded all the - * allocated entries from the ms_sm to the ms_allocatable for - * each metaslab. If the pool has a checkpoint or is in the - * middle of discarding a checkpoint, some of these blocks - * may have been freed but their ms_sm may not have been - * updated because they are referenced by the checkpoint. In - * order to avoid false-positives during leak-detection, we - * go through the vdev's checkpoint space map and exclude all - * its entries from their relevant ms_allocatable. - * - * We also aggregate the space held by the checkpoint and add - * it to zcb_checkpoint_size. - * - * Note that at this point we are also verifying that all the - * entries on the checkpoint_sm are marked as allocated in - * the ms_sm of their relevant metaslab. 
- * [see comment in checkpoint_sm_exclude_entry_cb()] - */ - zdb_leak_init_exclude_checkpoint(spa, zcb); - ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); - - /* for cleaner progress output */ - (void) fprintf(stderr, "\n"); - - if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { - ASSERT(spa_feature_is_enabled(spa, - SPA_FEATURE_DEVICE_REMOVAL)); - (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, - increment_indirect_mapping_cb, zcb, NULL); - } - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - zdb_ddt_leak_init(spa, zcb); - spa_config_exit(spa, SCL_CONFIG, FTAG); -} - -static boolean_t -zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) -{ - boolean_t leaks = B_FALSE; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t total_leaked = 0; - - ASSERT(vim != NULL); - - for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { - vdev_indirect_mapping_entry_phys_t *vimep = - &vim->vim_entries[i]; - uint64_t obsolete_bytes = 0; - uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); - metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - /* - * This is not very efficient but it's easy to - * verify correctness. 
- */ - for (uint64_t inner_offset = 0; - inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); - inner_offset += 1 << vd->vdev_ashift) { - if (range_tree_contains(msp->ms_allocatable, - offset + inner_offset, 1 << vd->vdev_ashift)) { - obsolete_bytes += 1 << vd->vdev_ashift; - } - } - - int64_t bytes_leaked = obsolete_bytes - - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; - ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); - if (bytes_leaked != 0 && - (vdev_obsolete_counts_are_precise(vd) || - dump_opt['d'] >= 5)) { - (void) printf("obsolete indirect mapping count " - "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", - (u_longlong_t)vd->vdev_id, - (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), - (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), - (u_longlong_t)bytes_leaked); - } - total_leaked += ABS(bytes_leaked); - } - - if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) { - int pct_leaked = total_leaked * 100 / - vdev_indirect_mapping_bytes_mapped(vim); - (void) printf("cannot verify obsolete indirect mapping " - "counts of vdev %llu because precise feature was not " - "enabled when it was removed: %d%% (%llx bytes) of mapping" - "unreferenced\n", - (u_longlong_t)vd->vdev_id, pct_leaked, - (u_longlong_t)total_leaked); - } else if (total_leaked > 0) { - (void) printf("obsolete indirect mapping count mismatch " - "for vdev %llu -- %llx total bytes mismatched\n", - (u_longlong_t)vd->vdev_id, - (u_longlong_t)total_leaked); - leaks |= B_TRUE; - } - - vdev_indirect_mapping_free_obsolete_counts(vim, - zcb->zcb_vd_obsolete_counts[vd->vdev_id]); - zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; - - return (leaks); -} - -static boolean_t -zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) -{ - if (dump_opt['L']) - return (B_FALSE); - - boolean_t leaks = B_FALSE; - - vdev_t *rvd = spa->spa_root_vdev; - for (unsigned c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; -#if DEBUG - metaslab_group_t *mg = 
vd->vdev_mg; -#endif - - if (zcb->zcb_vd_obsolete_counts[c] != NULL) { - leaks |= zdb_check_for_obsolete_leaks(vd, zcb); - } - - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(mg, ==, msp->ms_group); - - /* - * ms_allocatable has been overloaded - * to contain allocated segments. Now that - * we finished traversing all blocks, any - * block that remains in the ms_allocatable - * represents an allocated block that we - * did not claim during the traversal. - * Claimed blocks would have been removed - * from the ms_allocatable. For indirect - * vdevs, space remaining in the tree - * represents parts of the mapping that are - * not referenced, which is not a bug. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - range_tree_vacate(msp->ms_allocatable, - NULL, NULL); - } else { - range_tree_vacate(msp->ms_allocatable, - zdb_leak, vd); - } - - if (msp->ms_loaded) { - msp->ms_loaded = B_FALSE; - } - } - - } - - umem_free(zcb->zcb_vd_obsolete_counts, - rvd->vdev_children * sizeof (uint32_t *)); - zcb->zcb_vd_obsolete_counts = NULL; - - return (leaks); -} - -/* ARGSUSED */ -static int -count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - zdb_cb_t *zcb = arg; - - if (dump_opt['b'] >= 5) { - char blkbuf[BP_SPRINTF_LEN]; - snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); - (void) printf("[%s] %s\n", - "deferred free", blkbuf); - } - zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); - return (0); -} - -static int -dump_block_stats(spa_t *spa) -{ - zdb_cb_t zcb; - zdb_blkstats_t *zb, *tzb; - uint64_t norm_alloc, norm_space, total_alloc, total_found; - int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; - boolean_t leaks = B_FALSE; - int err; - - bzero(&zcb, sizeof (zcb)); - (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", - (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", - (dump_opt['c'] == 1) ? "metadata " : "", - dump_opt['c'] ? 
"checksums " : "", - (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", - !dump_opt['L'] ? "nothing leaked " : ""); - - /* - * When leak detection is enabled we load all space maps as SM_ALLOC - * maps, then traverse the pool claiming each block we discover. If - * the pool is perfectly consistent, the segment trees will be empty - * when we're done. Anything left over is a leak; any block we can't - * claim (because it's not part of any space map) is a double - * allocation, reference to a freed block, or an unclaimed log block. - * - * When leak detection is disabled (-L option) we still traverse the - * pool claiming each block we discover, but we skip opening any space - * maps. - */ - bzero(&zcb, sizeof (zdb_cb_t)); - zdb_leak_init(spa, &zcb); - - /* - * If there's a deferred-free bplist, process that first. - */ - (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, - count_block_cb, &zcb, NULL); - - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { - (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, - count_block_cb, &zcb, NULL); - } - - zdb_claim_removing(spa, &zcb); - - if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { - VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, - spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, - &zcb, NULL)); - } - - if (dump_opt['c'] > 1) - flags |= TRAVERSE_PREFETCH_DATA; - - zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); - zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); - zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); - zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); - err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); - - /* - * If we've traversed the data blocks then we need to wait for those - * I/Os to complete. We leverage "The Godfather" zio to wait on - * all async I/Os to complete. 
- */ - if (dump_opt['c']) { - for (int i = 0; i < max_ncpus; i++) { - (void) zio_wait(spa->spa_async_zio_root[i]); - spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - } - } - - /* - * Done after zio_wait() since zcb_haderrors is modified in - * zdb_blkptr_done() - */ - zcb.zcb_haderrors |= err; - - if (zcb.zcb_haderrors) { - (void) printf("\nError counts:\n\n"); - (void) printf("\t%5s %s\n", "errno", "count"); - for (int e = 0; e < 256; e++) { - if (zcb.zcb_errors[e] != 0) { - (void) printf("\t%5d %llu\n", - e, (u_longlong_t)zcb.zcb_errors[e]); - } - } - } - - /* - * Report any leaked segments. - */ - leaks |= zdb_leak_fini(spa, &zcb); - - tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; - - norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - norm_space = metaslab_class_get_space(spa_normal_class(spa)); - - total_alloc = norm_alloc + - metaslab_class_get_alloc(spa_log_class(spa)) + - metaslab_class_get_alloc(spa_special_class(spa)) + - metaslab_class_get_alloc(spa_dedup_class(spa)); - total_found = tzb->zb_asize - zcb.zcb_dedup_asize + - zcb.zcb_removing_size + zcb.zcb_checkpoint_size; - - if (total_found == total_alloc && !dump_opt['L']) { - (void) printf("\n\tNo leaks (block sum matches space" - " maps exactly)\n"); - } else if (!dump_opt['L']) { - (void) printf("block traversal size %llu != alloc %llu " - "(%s %lld)\n", - (u_longlong_t)total_found, - (u_longlong_t)total_alloc, - (dump_opt['L']) ? 
"unreachable" : "leaked", - (longlong_t)(total_alloc - total_found)); - leaks = B_TRUE; - } - - if (tzb->zb_count == 0) - return (2); - - (void) printf("\n"); - (void) printf("\t%-16s %14llu\n", "bp count:", - (u_longlong_t)tzb->zb_count); - (void) printf("\t%-16s %14llu\n", "ganged count:", - (longlong_t)tzb->zb_gangs); - (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", - (u_longlong_t)tzb->zb_lsize, - (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); - (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", - "bp physical:", (u_longlong_t)tzb->zb_psize, - (u_longlong_t)(tzb->zb_psize / tzb->zb_count), - (double)tzb->zb_lsize / tzb->zb_psize); - (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", - "bp allocated:", (u_longlong_t)tzb->zb_asize, - (u_longlong_t)(tzb->zb_asize / tzb->zb_count), - (double)tzb->zb_lsize / tzb->zb_asize); - (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", - "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, - (u_longlong_t)zcb.zcb_dedup_blocks, - (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); - (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", - (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); - - if (spa_special_class(spa)->mc_rotor != NULL) { - uint64_t alloc = metaslab_class_get_alloc( - spa_special_class(spa)); - uint64_t space = metaslab_class_get_space( - spa_special_class(spa)); - - (void) printf("\t%-16s %14llu used: %5.2f%%\n", - "Special class", (u_longlong_t)alloc, - 100.0 * alloc / space); - } - - if (spa_dedup_class(spa)->mc_rotor != NULL) { - uint64_t alloc = metaslab_class_get_alloc( - spa_dedup_class(spa)); - uint64_t space = metaslab_class_get_space( - spa_dedup_class(spa)); - - (void) printf("\t%-16s %14llu used: %5.2f%%\n", - "Dedup class", (u_longlong_t)alloc, - 100.0 * alloc / space); - } - - for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { - if (zcb.zcb_embedded_blocks[i] == 0) - continue; - (void) printf("\n"); - (void) 
printf("\tadditional, non-pointer bps of type %u: " - "%10llu\n", - i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); - - if (dump_opt['b'] >= 3) { - (void) printf("\t number of (compressed) bytes: " - "number of bps\n"); - dump_histogram(zcb.zcb_embedded_histogram[i], - sizeof (zcb.zcb_embedded_histogram[i]) / - sizeof (zcb.zcb_embedded_histogram[i][0]), 0); - } - } - - if (tzb->zb_ditto_samevdev != 0) { - (void) printf("\tDittoed blocks on same vdev: %llu\n", - (longlong_t)tzb->zb_ditto_samevdev); - } - if (tzb->zb_ditto_same_ms != 0) { - (void) printf("\tDittoed blocks in same metaslab: %llu\n", - (longlong_t)tzb->zb_ditto_same_ms); - } - - for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - if (vim == NULL) { - continue; - } - - char mem[32]; - zdb_nicenum(vdev_indirect_mapping_num_entries(vim), - mem, vdev_indirect_mapping_size(vim)); - - (void) printf("\tindirect vdev id %llu has %llu segments " - "(%s in memory)\n", - (longlong_t)vd->vdev_id, - (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); - } - - if (dump_opt['b'] >= 2) { - int l, t, level; - (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" - "\t avg\t comp\t%%Total\tType\n"); - - for (t = 0; t <= ZDB_OT_TOTAL; t++) { - char csize[32], lsize[32], psize[32], asize[32]; - char avg[32], gang[32]; - const char *typename; - - /* make sure nicenum has enough space */ - CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); - CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); - - if (t < DMU_OT_NUMTYPES) - typename = dmu_ot[t].ot_name; - else - typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; - - if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { - (void) printf("%6s\t%5s\t%5s\t%5s" - "\t%5s\t%5s\t%6s\t%s\n", - "-", - "-", - "-", - "-", - 
"-", - "-", - "-", - typename); - continue; - } - - for (l = ZB_TOTAL - 1; l >= -1; l--) { - level = (l == -1 ? ZB_TOTAL : l); - zb = &zcb.zcb_type[level][t]; - - if (zb->zb_asize == 0) - continue; - - if (dump_opt['b'] < 3 && level != ZB_TOTAL) - continue; - - if (level == 0 && zb->zb_asize == - zcb.zcb_type[ZB_TOTAL][t].zb_asize) - continue; - - zdb_nicenum(zb->zb_count, csize, - sizeof (csize)); - zdb_nicenum(zb->zb_lsize, lsize, - sizeof (lsize)); - zdb_nicenum(zb->zb_psize, psize, - sizeof (psize)); - zdb_nicenum(zb->zb_asize, asize, - sizeof (asize)); - zdb_nicenum(zb->zb_asize / zb->zb_count, avg, - sizeof (avg)); - zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); - - (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" - "\t%5.2f\t%6.2f\t", - csize, lsize, psize, asize, avg, - (double)zb->zb_lsize / zb->zb_psize, - 100.0 * zb->zb_asize / tzb->zb_asize); - - if (level == ZB_TOTAL) - (void) printf("%s\n", typename); - else - (void) printf(" L%d %s\n", - level, typename); - - if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { - (void) printf("\t number of ganged " - "blocks: %s\n", gang); - } - - if (dump_opt['b'] >= 4) { - (void) printf("psize " - "(in 512-byte sectors): " - "number of blocks\n"); - dump_histogram(zb->zb_psize_histogram, - PSIZE_HISTO_SIZE, 0); - } - } - } - } - - (void) printf("\n"); - - if (leaks) - return (2); - - if (zcb.zcb_haderrors) - return (3); - - return (0); -} - -typedef struct zdb_ddt_entry { - ddt_key_t zdde_key; - uint64_t zdde_ref_blocks; - uint64_t zdde_ref_lsize; - uint64_t zdde_ref_psize; - uint64_t zdde_ref_dsize; - avl_node_t zdde_node; -} zdb_ddt_entry_t; - -/* ARGSUSED */ -static int -zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - avl_tree_t *t = arg; - avl_index_t where; - zdb_ddt_entry_t *zdde, zdde_search; - - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return (0); - - if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { - (void) 
printf("traversing objset %llu, %llu objects, " - "%lu blocks so far\n", - (u_longlong_t)zb->zb_objset, - (u_longlong_t)BP_GET_FILL(bp), - avl_numnodes(t)); - } - - if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || - BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) - return (0); - - ddt_key_fill(&zdde_search.zdde_key, bp); - - zdde = avl_find(t, &zdde_search, &where); - - if (zdde == NULL) { - zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); - zdde->zdde_key = zdde_search.zdde_key; - avl_insert(t, zdde, where); - } - - zdde->zdde_ref_blocks += 1; - zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); - zdde->zdde_ref_psize += BP_GET_PSIZE(bp); - zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); - - return (0); -} - -static void -dump_simulated_ddt(spa_t *spa) -{ - avl_tree_t t; - void *cookie = NULL; - zdb_ddt_entry_t *zdde; - ddt_histogram_t ddh_total; - ddt_stat_t dds_total; - - bzero(&ddh_total, sizeof (ddh_total)); - bzero(&dds_total, sizeof (dds_total)); - avl_create(&t, ddt_entry_compare, - sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, - zdb_ddt_add_cb, &t); - - spa_config_exit(spa, SCL_CONFIG, FTAG); - - while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { - ddt_stat_t dds; - uint64_t refcnt = zdde->zdde_ref_blocks; - ASSERT(refcnt != 0); - - dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; - dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; - dds.dds_psize = zdde->zdde_ref_psize / refcnt; - dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; - - dds.dds_ref_blocks = zdde->zdde_ref_blocks; - dds.dds_ref_lsize = zdde->zdde_ref_lsize; - dds.dds_ref_psize = zdde->zdde_ref_psize; - dds.dds_ref_dsize = zdde->zdde_ref_dsize; - - ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], - &dds, 0); - - umem_free(zdde, sizeof (*zdde)); - } - - avl_destroy(&t); - - ddt_histogram_stat(&dds_total, 
&ddh_total); - - (void) printf("Simulated DDT histogram:\n"); - - zpool_dump_ddt(&dds_total, &ddh_total); - - dump_dedup_ratio(&dds_total); -} - -static int -verify_device_removal_feature_counts(spa_t *spa) -{ - uint64_t dr_feature_refcount = 0; - uint64_t oc_feature_refcount = 0; - uint64_t indirect_vdev_count = 0; - uint64_t precise_vdev_count = 0; - uint64_t obsolete_counts_object_count = 0; - uint64_t obsolete_sm_count = 0; - uint64_t obsolete_counts_count = 0; - uint64_t scip_count = 0; - uint64_t obsolete_bpobj_count = 0; - int ret = 0; - - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - if (scip->scip_next_mapping_object != 0) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; - ASSERT(scip->scip_prev_obsolete_sm_object != 0); - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - - (void) printf("Condensing indirect vdev %llu: new mapping " - "object %llu, prev obsolete sm %llu\n", - (u_longlong_t)scip->scip_vdev, - (u_longlong_t)scip->scip_next_mapping_object, - (u_longlong_t)scip->scip_prev_obsolete_sm_object); - if (scip->scip_prev_obsolete_sm_object != 0) { - space_map_t *prev_obsolete_sm = NULL; - VERIFY0(space_map_open(&prev_obsolete_sm, - spa->spa_meta_objset, - scip->scip_prev_obsolete_sm_object, - 0, vd->vdev_asize, 0)); - dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); - (void) printf("\n"); - space_map_close(prev_obsolete_sm); - } - - scip_count += 2; - } - - for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - if (vic->vic_mapping_object != 0) { - ASSERT(vd->vdev_ops == &vdev_indirect_ops || - vd->vdev_removing); - indirect_vdev_count++; - - if (vd->vdev_indirect_mapping->vim_havecounts) { - obsolete_counts_count++; - } - } - if (vdev_obsolete_counts_are_precise(vd)) { - ASSERT(vic->vic_mapping_object != 0); - precise_vdev_count++; - } - if (vdev_obsolete_sm_object(vd) != 
0) { - ASSERT(vic->vic_mapping_object != 0); - obsolete_sm_count++; - } - } - - (void) feature_get_refcount(spa, - &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], - &dr_feature_refcount); - (void) feature_get_refcount(spa, - &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], - &oc_feature_refcount); - - if (dr_feature_refcount != indirect_vdev_count) { - ret = 1; - (void) printf("Number of indirect vdevs (%llu) " \ - "does not match feature count (%llu)\n", - (u_longlong_t)indirect_vdev_count, - (u_longlong_t)dr_feature_refcount); - } else { - (void) printf("Verified device_removal feature refcount " \ - "of %llu is correct\n", - (u_longlong_t)dr_feature_refcount); - } - - if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ) == 0) { - obsolete_bpobj_count++; - } - - - obsolete_counts_object_count = precise_vdev_count; - obsolete_counts_object_count += obsolete_sm_count; - obsolete_counts_object_count += obsolete_counts_count; - obsolete_counts_object_count += scip_count; - obsolete_counts_object_count += obsolete_bpobj_count; - obsolete_counts_object_count += remap_deadlist_count; - - if (oc_feature_refcount != obsolete_counts_object_count) { - ret = 1; - (void) printf("Number of obsolete counts objects (%llu) " \ - "does not match feature count (%llu)\n", - (u_longlong_t)obsolete_counts_object_count, - (u_longlong_t)oc_feature_refcount); - (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " - "ob:%llu rd:%llu\n", - (u_longlong_t)precise_vdev_count, - (u_longlong_t)obsolete_sm_count, - (u_longlong_t)obsolete_counts_count, - (u_longlong_t)scip_count, - (u_longlong_t)obsolete_bpobj_count, - (u_longlong_t)remap_deadlist_count); - } else { - (void) printf("Verified indirect_refcount feature refcount " \ - "of %llu is correct\n", - (u_longlong_t)oc_feature_refcount); - } - return (ret); -} - -static void -zdb_set_skip_mmp(char *target) -{ - spa_t *spa; - - /* - * Disable the activity check to allow examination of - * active pools. 
- */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(target)) != NULL) { - spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; - } - mutex_exit(&spa_namespace_lock); -} - -#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" -/* - * Import the checkpointed state of the pool specified by the target - * parameter as readonly. The function also accepts a pool config - * as an optional parameter, else it attempts to infer the config by - * the name of the target pool. - * - * Note that the checkpointed state's pool name will be the name of - * the original pool with the above suffix appened to it. In addition, - * if the target is not a pool name (e.g. a path to a dataset) then - * the new_path parameter is populated with the updated path to - * reflect the fact that we are looking into the checkpointed state. - * - * The function returns a newly-allocated copy of the name of the - * pool containing the checkpointed state. When this copy is no - * longer needed it should be freed with free(3C). Same thing - * applies to the new_path parameter if allocated. 
- */ -static char * -import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) -{ - int error = 0; - char *poolname, *bogus_name; - - /* If the target is not a pool, the extract the pool name */ - char *path_start = strchr(target, '/'); - if (path_start != NULL) { - size_t poolname_len = path_start - target; - poolname = strndup(target, poolname_len); - } else { - poolname = target; - } - - if (cfg == NULL) { - zdb_set_skip_mmp(poolname); - error = spa_get_stats(poolname, &cfg, NULL, 0); - if (error != 0) { - fatal("Tried to read config of pool \"%s\" but " - "spa_get_stats() failed with error %d\n", - poolname, error); - } - } - - (void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX); - fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); - - error = spa_import(bogus_name, cfg, NULL, - ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | - ZFS_IMPORT_SKIP_MMP); - if (error != 0) { - fatal("Tried to import pool \"%s\" but spa_import() failed " - "with error %d\n", bogus_name, error); - } - - if (new_path != NULL && path_start != NULL) - (void) asprintf(new_path, "%s%s", bogus_name, path_start); - - if (target != poolname) - free(poolname); - - return (bogus_name); -} - -typedef struct verify_checkpoint_sm_entry_cb_arg { - vdev_t *vcsec_vd; - - /* the following fields are only used for printing progress */ - uint64_t vcsec_entryid; - uint64_t vcsec_num_entries; -} verify_checkpoint_sm_entry_cb_arg_t; - -#define ENTRIES_PER_PROGRESS_UPDATE 10000 - -static int -verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) -{ - verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; - vdev_t *vd = vcsec->vcsec_vd; - metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; - uint64_t end = sme->sme_offset + sme->sme_run; - - ASSERT(sme->sme_type == SM_FREE); - - if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { - (void) fprintf(stderr, - "\rverifying vdev %llu, space map entry %llu of %llu ...", - (longlong_t)vd->vdev_id, 
- (longlong_t)vcsec->vcsec_entryid, - (longlong_t)vcsec->vcsec_num_entries); - } - vcsec->vcsec_entryid++; - - /* - * See comment in checkpoint_sm_exclude_entry_cb() - */ - VERIFY3U(sme->sme_offset, >=, ms->ms_start); - VERIFY3U(end, <=, ms->ms_start + ms->ms_size); - - /* - * The entries in the vdev_checkpoint_sm should be marked as - * allocated in the checkpointed state of the pool, therefore - * their respective ms_allocateable trees should not contain them. - */ - mutex_enter(&ms->ms_lock); - range_tree_verify_not_present(ms->ms_allocatable, - sme->sme_offset, sme->sme_run); - mutex_exit(&ms->ms_lock); - - return (0); -} - -/* - * Verify that all segments in the vdev_checkpoint_sm are allocated - * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's - * ms_allocatable). - * - * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of - * each vdev in the current state of the pool to the metaslab space maps - * (ms_sm) of the checkpointed state of the pool. - * - * Note that the function changes the state of the ms_allocatable - * trees of the current spa_t. The entries of these ms_allocatable - * trees are cleared out and then repopulated from with the free - * entries of their respective ms_sm space maps. - */ -static void -verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) -{ - vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; - vdev_t *current_rvd = current->spa_root_vdev; - - load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); - - for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { - vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; - vdev_t *current_vd = current_rvd->vdev_child[c]; - - space_map_t *checkpoint_sm = NULL; - uint64_t checkpoint_sm_obj; - - if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { - /* - * Since we don't allow device removal in a pool - * that has a checkpoint, we expect that all removed - * vdevs were removed from the pool before the - * checkpoint. 
- */ - ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); - continue; - } - - /* - * If the checkpoint space map doesn't exist, then nothing - * here is checkpointed so there's nothing to verify. - */ - if (current_vd->vdev_top_zap == 0 || - zap_contains(spa_meta_objset(current), - current_vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) - continue; - - VERIFY0(zap_lookup(spa_meta_objset(current), - current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, - sizeof (uint64_t), 1, &checkpoint_sm_obj)); - - VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), - checkpoint_sm_obj, 0, current_vd->vdev_asize, - current_vd->vdev_ashift)); - - verify_checkpoint_sm_entry_cb_arg_t vcsec; - vcsec.vcsec_vd = ckpoint_vd; - vcsec.vcsec_entryid = 0; - vcsec.vcsec_num_entries = - space_map_length(checkpoint_sm) / sizeof (uint64_t); - VERIFY0(space_map_iterate(checkpoint_sm, - space_map_length(checkpoint_sm), - verify_checkpoint_sm_entry_cb, &vcsec)); - dump_spacemap(current->spa_meta_objset, checkpoint_sm); - space_map_close(checkpoint_sm); - } - - /* - * If we've added vdevs since we took the checkpoint, ensure - * that their checkpoint space maps are empty. - */ - if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { - for (uint64_t c = ckpoint_rvd->vdev_children; - c < current_rvd->vdev_children; c++) { - vdev_t *current_vd = current_rvd->vdev_child[c]; - ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); - } - } - - /* for cleaner progress output */ - (void) fprintf(stderr, "\n"); -} - -/* - * Verifies that all space that's allocated in the checkpoint is - * still allocated in the current version, by checking that everything - * in checkpoint's ms_allocatable (which is actually allocated, not - * allocatable/free) is not present in current's ms_allocatable. - * - * Note that the function changes the state of the ms_allocatable - * trees of both spas when called. 
The entries of all ms_allocatable - * trees are cleared out and then repopulated from their respective - * ms_sm space maps. In the checkpointed state we load the allocated - * entries, and in the current state we load the free entries. - */ -static void -verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) -{ - vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; - vdev_t *current_rvd = current->spa_root_vdev; - - load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); - load_concrete_ms_allocatable_trees(current, SM_FREE); - - for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { - vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; - vdev_t *current_vd = current_rvd->vdev_child[i]; - - if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { - /* - * See comment in verify_checkpoint_vdev_spacemaps() - */ - ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); - continue; - } - - for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { - metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; - metaslab_t *current_msp = current_vd->vdev_ms[m]; - - (void) fprintf(stderr, - "\rverifying vdev %llu of %llu, " - "metaslab %llu of %llu ...", - (longlong_t)current_vd->vdev_id, - (longlong_t)current_rvd->vdev_children, - (longlong_t)current_vd->vdev_ms[m]->ms_id, - (longlong_t)current_vd->vdev_ms_count); - - /* - * We walk through the ms_allocatable trees that - * are loaded with the allocated blocks from the - * ms_sm spacemaps of the checkpoint. For each - * one of these ranges we ensure that none of them - * exists in the ms_allocatable trees of the - * current state which are loaded with the ranges - * that are currently free. - * - * This way we ensure that none of the blocks that - * are part of the checkpoint were freed by mistake. 
- */ - range_tree_walk(ckpoint_msp->ms_allocatable, - (range_tree_func_t *)range_tree_verify_not_present, - current_msp->ms_allocatable); - } - } - - /* for cleaner progress output */ - (void) fprintf(stderr, "\n"); -} - -static void -verify_checkpoint_blocks(spa_t *spa) -{ - ASSERT(!dump_opt['L']); - - spa_t *checkpoint_spa; - char *checkpoint_pool; - nvlist_t *config = NULL; - int error = 0; - - /* - * We import the checkpointed state of the pool (under a different - * name) so we can do verification on it against the current state - * of the pool. - */ - checkpoint_pool = import_checkpointed_state(spa->spa_name, config, - NULL); - ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); - - error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); - if (error != 0) { - fatal("Tried to open pool \"%s\" but spa_open() failed with " - "error %d\n", checkpoint_pool, error); - } - - /* - * Ensure that ranges in the checkpoint space maps of each vdev - * are allocated according to the checkpointed state's metaslab - * space maps. - */ - verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); - - /* - * Ensure that allocated ranges in the checkpoint's metaslab - * space maps remain allocated in the metaslab space maps of - * the current state. - */ - verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); - - /* - * Once we are done, we get rid of the checkpointed state. 
- */ - spa_close(checkpoint_spa, FTAG); - free(checkpoint_pool); -} - -static void -dump_leftover_checkpoint_blocks(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *vd = rvd->vdev_child[i]; - - space_map_t *checkpoint_sm = NULL; - uint64_t checkpoint_sm_obj; - - if (vd->vdev_top_zap == 0) - continue; - - if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) - continue; - - VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, - sizeof (uint64_t), 1, &checkpoint_sm_obj)); - - VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), - checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - dump_spacemap(spa->spa_meta_objset, checkpoint_sm); - space_map_close(checkpoint_sm); - } -} - -static int -verify_checkpoint(spa_t *spa) -{ - uberblock_t checkpoint; - int error; - - if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (0); - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), - sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); - - if (error == ENOENT && !dump_opt['L']) { - /* - * If the feature is active but the uberblock is missing - * then we must be in the middle of discarding the - * checkpoint. 
- */ - (void) printf("\nPartially discarded checkpoint " - "state found:\n"); - dump_leftover_checkpoint_blocks(spa); - return (0); - } else if (error != 0) { - (void) printf("lookup error %d when looking for " - "checkpointed uberblock in MOS\n", error); - return (error); - } - dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); - - if (checkpoint.ub_checkpoint_txg == 0) { - (void) printf("\nub_checkpoint_txg not set in checkpointed " - "uberblock\n"); - error = 3; - } - - if (error == 0 && !dump_opt['L']) - verify_checkpoint_blocks(spa); - - return (error); -} - -/* ARGSUSED */ -static void -mos_leaks_cb(void *arg, uint64_t start, uint64_t size) -{ - for (uint64_t i = start; i < size; i++) { - (void) printf("MOS object %llu referenced but not allocated\n", - (u_longlong_t)i); - } -} - -static range_tree_t *mos_refd_objs; - -static void -mos_obj_refd(uint64_t obj) -{ - if (obj != 0 && mos_refd_objs != NULL) - range_tree_add(mos_refd_objs, obj, 1); -} - -static void -mos_leak_vdev(vdev_t *vd) -{ - mos_obj_refd(vd->vdev_dtl_object); - mos_obj_refd(vd->vdev_ms_array); - mos_obj_refd(vd->vdev_top_zap); - mos_obj_refd(vd->vdev_indirect_config.vic_births_object); - mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); - mos_obj_refd(vd->vdev_leaf_zap); - if (vd->vdev_checkpoint_sm != NULL) - mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); - if (vd->vdev_indirect_mapping != NULL) { - mos_obj_refd(vd->vdev_indirect_mapping-> - vim_phys->vimp_counts_object); - } - if (vd->vdev_obsolete_sm != NULL) - mos_obj_refd(vd->vdev_obsolete_sm->sm_object); - - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *ms = vd->vdev_ms[m]; - mos_obj_refd(space_map_object(ms->ms_sm)); - } - - for (uint64_t c = 0; c < vd->vdev_children; c++) { - mos_leak_vdev(vd->vdev_child[c]); - } -} - -static int -dump_mos_leaks(spa_t *spa) -{ - int rv = 0; - objset_t *mos = spa->spa_meta_objset; - dsl_pool_t *dp = spa->spa_dsl_pool; - - /* Visit and mark all 
referenced objects in the MOS */ - - mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); - mos_obj_refd(spa->spa_pool_props_object); - mos_obj_refd(spa->spa_config_object); - mos_obj_refd(spa->spa_ddt_stat_object); - mos_obj_refd(spa->spa_feat_desc_obj); - mos_obj_refd(spa->spa_feat_enabled_txg_obj); - mos_obj_refd(spa->spa_feat_for_read_obj); - mos_obj_refd(spa->spa_feat_for_write_obj); - mos_obj_refd(spa->spa_history); - mos_obj_refd(spa->spa_errlog_last); - mos_obj_refd(spa->spa_errlog_scrub); - mos_obj_refd(spa->spa_all_vdev_zaps); - mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); - mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); - mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); - bpobj_count_refd(&spa->spa_deferred_bpobj); - mos_obj_refd(dp->dp_empty_bpobj); - bpobj_count_refd(&dp->dp_obsolete_bpobj); - bpobj_count_refd(&dp->dp_free_bpobj); - mos_obj_refd(spa->spa_l2cache.sav_object); - mos_obj_refd(spa->spa_spares.sav_object); - - mos_obj_refd(spa->spa_condensing_indirect_phys. - scip_next_mapping_object); - mos_obj_refd(spa->spa_condensing_indirect_phys. 
- scip_prev_obsolete_sm_object); - if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { - vdev_indirect_mapping_t *vim = - vdev_indirect_mapping_open(mos, - spa->spa_condensing_indirect_phys.scip_next_mapping_object); - mos_obj_refd(vim->vim_phys->vimp_counts_object); - vdev_indirect_mapping_close(vim); - } - - if (dp->dp_origin_snap != NULL) { - dsl_dataset_t *ds; - - dsl_pool_config_enter(dp, FTAG); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, - FTAG, &ds)); - count_ds_mos_objects(ds); - dump_deadlist(&ds->ds_deadlist); - dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); - - count_ds_mos_objects(dp->dp_origin_snap); - dump_deadlist(&dp->dp_origin_snap->ds_deadlist); - } - count_dir_mos_objects(dp->dp_mos_dir); - if (dp->dp_free_dir != NULL) - count_dir_mos_objects(dp->dp_free_dir); - if (dp->dp_leak_dir != NULL) - count_dir_mos_objects(dp->dp_leak_dir); - - mos_leak_vdev(spa->spa_root_vdev); - - for (uint64_t class = 0; class < DDT_CLASSES; class++) { - for (uint64_t type = 0; type < DDT_TYPES; type++) { - for (uint64_t cksum = 0; - cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { - ddt_t *ddt = spa->spa_ddt[cksum]; - mos_obj_refd(ddt->ddt_object[type][class]); - } - } - } - - /* - * Visit all allocated objects and make sure they are referenced. 
- */ - uint64_t object = 0; - while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { - if (range_tree_contains(mos_refd_objs, object, 1)) { - range_tree_remove(mos_refd_objs, object, 1); - } else { - dmu_object_info_t doi; - const char *name; - dmu_object_info(mos, object, &doi); - if (doi.doi_type & DMU_OT_NEWTYPE) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(doi.doi_type); - name = dmu_ot_byteswap[bswap].ob_name; - } else { - name = dmu_ot[doi.doi_type].ot_name; - } - - (void) printf("MOS object %llu (%s) leaked\n", - (u_longlong_t)object, name); - rv = 2; - } - } - (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); - if (!range_tree_is_empty(mos_refd_objs)) - rv = 2; - range_tree_vacate(mos_refd_objs, NULL, NULL); - range_tree_destroy(mos_refd_objs); - return (rv); -} - -static void -dump_zpool(spa_t *spa) -{ - dsl_pool_t *dp = spa_get_dsl(spa); - int rc = 0; - - if (dump_opt['S']) { - dump_simulated_ddt(spa); - return; - } - - if (!dump_opt['e'] && dump_opt['C'] > 1) { - (void) printf("\nCached configuration:\n"); - dump_nvlist(spa->spa_config, 8); - } - - if (dump_opt['C']) - dump_config(spa); - - if (dump_opt['u']) - dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); - - if (dump_opt['D']) - dump_all_ddts(spa); - - if (dump_opt['d'] > 2 || dump_opt['m']) - dump_metaslabs(spa); - if (dump_opt['M']) - dump_metaslab_groups(spa); - - if (dump_opt['d'] || dump_opt['i']) { - mos_refd_objs = range_tree_create(NULL, NULL); - dump_dir(dp->dp_meta_objset); - - if (dump_opt['d'] >= 3) { - dsl_pool_t *dp = spa->spa_dsl_pool; - dump_full_bpobj(&spa->spa_deferred_bpobj, - "Deferred frees", 0); - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { - dump_full_bpobj(&dp->dp_free_bpobj, - "Pool snapshot frees", 0); - } - if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { - ASSERT(spa_feature_is_enabled(spa, - SPA_FEATURE_DEVICE_REMOVAL)); - dump_full_bpobj(&dp->dp_obsolete_bpobj, - "Pool obsolete blocks", 0); - } - - if (spa_feature_is_active(spa, - 
SPA_FEATURE_ASYNC_DESTROY)) { - dump_bptree(spa->spa_meta_objset, - dp->dp_bptree_obj, - "Pool dataset frees"); - } - dump_dtl(spa->spa_root_vdev, 0); - } - (void) dmu_objset_find(spa_name(spa), dump_one_dir, - NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - - if (rc == 0 && !dump_opt['L']) - rc = dump_mos_leaks(spa); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - uint64_t refcount; - - if (!(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET)) { - ASSERT0(dataset_feature_count[f]); - continue; - } - (void) feature_get_refcount(spa, - &spa_feature_table[f], &refcount); - if (dataset_feature_count[f] != refcount) { - (void) printf("%s feature refcount mismatch: " - "%lld datasets != %lld refcount\n", - spa_feature_table[f].fi_uname, - (longlong_t)dataset_feature_count[f], - (longlong_t)refcount); - rc = 2; - } else { - (void) printf("Verified %s feature refcount " - "of %llu is correct\n", - spa_feature_table[f].fi_uname, - (longlong_t)refcount); - } - } - - if (rc == 0) { - rc = verify_device_removal_feature_counts(spa); - } - } - - if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) - rc = dump_block_stats(spa); - - if (rc == 0) - rc = verify_spacemap_refcounts(spa); - - if (dump_opt['s']) - show_pool_stats(spa); - - if (dump_opt['h']) - dump_history(spa); - - if (rc == 0) - rc = verify_checkpoint(spa); - - if (rc != 0) { - dump_debug_buffer(); - exit(rc); - } -} - -#define ZDB_FLAG_CHECKSUM 0x0001 -#define ZDB_FLAG_DECOMPRESS 0x0002 -#define ZDB_FLAG_BSWAP 0x0004 -#define ZDB_FLAG_GBH 0x0008 -#define ZDB_FLAG_INDIRECT 0x0010 -#define ZDB_FLAG_PHYS 0x0020 -#define ZDB_FLAG_RAW 0x0040 -#define ZDB_FLAG_PRINT_BLKPTR 0x0080 - -static int flagbits[256]; - -static void -zdb_print_blkptr(blkptr_t *bp, int flags) -{ - char blkbuf[BP_SPRINTF_LEN]; - - if (flags & ZDB_FLAG_BSWAP) - byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); - - snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); - (void) printf("%s\n", blkbuf); -} - -static void 
-zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) -{ - int i; - - for (i = 0; i < nbps; i++) - zdb_print_blkptr(&bp[i], flags); -} - -static void -zdb_dump_gbh(void *buf, int flags) -{ - zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); -} - -static void -zdb_dump_block_raw(void *buf, uint64_t size, int flags) -{ - if (flags & ZDB_FLAG_BSWAP) - byteswap_uint64_array(buf, size); - (void) write(1, buf, size); -} - -static void -zdb_dump_block(char *label, void *buf, uint64_t size, int flags) -{ - uint64_t *d = (uint64_t *)buf; - unsigned nwords = size / sizeof (uint64_t); - int do_bswap = !!(flags & ZDB_FLAG_BSWAP); - unsigned i, j; - const char *hdr; - char *c; - - - if (do_bswap) - hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; - else - hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; - - (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); - - for (i = 0; i < nwords; i += 2) { - (void) printf("%06llx: %016llx %016llx ", - (u_longlong_t)(i * sizeof (uint64_t)), - (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), - (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); - - c = (char *)&d[i]; - for (j = 0; j < 2 * sizeof (uint64_t); j++) - (void) printf("%c", isprint(c[j]) ? c[j] : '.'); - (void) printf("\n"); - } -} - -/* - * There are two acceptable formats: - * leaf_name - For example: c1t0d0 or /tmp/ztest.0a - * child[.child]* - For example: 0.1.1 - * - * The second form can be used to specify arbitrary vdevs anywhere - * in the heirarchy. For example, in a pool with a mirror of - * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . - */ -static vdev_t * -zdb_vdev_lookup(vdev_t *vdev, const char *path) -{ - char *s, *p, *q; - unsigned i; - - if (vdev == NULL) - return (NULL); - - /* First, assume the x.x.x.x format */ - i = strtoul(path, &s, 10); - if (s == path || (s && *s != '.' 
&& *s != '\0')) - goto name; - if (i >= vdev->vdev_children) - return (NULL); - - vdev = vdev->vdev_child[i]; - if (*s == '\0') - return (vdev); - return (zdb_vdev_lookup(vdev, s+1)); - -name: - for (i = 0; i < vdev->vdev_children; i++) { - vdev_t *vc = vdev->vdev_child[i]; - - if (vc->vdev_path == NULL) { - vc = zdb_vdev_lookup(vc, path); - if (vc == NULL) - continue; - else - return (vc); - } - - p = strrchr(vc->vdev_path, '/'); - p = p ? p + 1 : vc->vdev_path; - q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; - - if (strcmp(vc->vdev_path, path) == 0) - return (vc); - if (strcmp(p, path) == 0) - return (vc); - if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) - return (vc); - } - - return (NULL); -} - -/* ARGSUSED */ -static int -random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) -{ - return (random_get_pseudo_bytes(buf, len)); -} - -/* - * Read a block from a pool and print it out. The syntax of the - * block descriptor is: - * - * pool:vdev_specifier:offset:size[:flags] - * - * pool - The name of the pool you wish to read from - * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) - * offset - offset, in hex, in bytes - * size - Amount of data to read, in hex, in bytes - * flags - A string of characters specifying options - * b: Decode a blkptr at given offset within block - * *c: Calculate and display checksums - * d: Decompress data before dumping - * e: Byteswap data before dumping - * g: Display data as a gang block header - * i: Display as an indirect block - * p: Do I/O to physical offset - * r: Dump raw data to stdout - * - * * = not yet implemented - */ -static void -zdb_read_block(char *thing, spa_t *spa) -{ - blkptr_t blk, *bp = &blk; - dva_t *dva = bp->blk_dva; - int flags = 0; - uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; - zio_t *zio; - vdev_t *vd; - abd_t *pabd; - void *lbuf, *buf; - const char *s, *vdev; - char *p, *dup, *flagstr; - int i, error; - - dup = strdup(thing); - s = strtok(dup, 
":"); - vdev = s ? s : ""; - s = strtok(NULL, ":"); - offset = strtoull(s ? s : "", NULL, 16); - s = strtok(NULL, ":"); - size = strtoull(s ? s : "", NULL, 16); - s = strtok(NULL, ":"); - if (s) - flagstr = strdup(s); - else - flagstr = strdup(""); - - s = NULL; - if (size == 0) - s = "size must not be zero"; - if (!IS_P2ALIGNED(size, DEV_BSIZE)) - s = "size must be a multiple of sector size"; - if (!IS_P2ALIGNED(offset, DEV_BSIZE)) - s = "offset must be a multiple of sector size"; - if (s) { - (void) printf("Invalid block specifier: %s - %s\n", thing, s); - free(flagstr); - free(dup); - return; - } - - for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { - for (i = 0; flagstr[i]; i++) { - int bit = flagbits[(uchar_t)flagstr[i]]; - - if (bit == 0) { - (void) printf("***Invalid flag: %c\n", - flagstr[i]); - continue; - } - flags |= bit; - - /* If it's not something with an argument, keep going */ - if ((bit & (ZDB_FLAG_CHECKSUM | - ZDB_FLAG_PRINT_BLKPTR)) == 0) - continue; - - p = &flagstr[i + 1]; - if (bit == ZDB_FLAG_PRINT_BLKPTR) - blkptr_offset = strtoull(p, &p, 16); - if (*p != ':' && *p != '\0') { - (void) printf("***Invalid flag arg: '%s'\n", s); - free(flagstr); - free(dup); - return; - } - i += p - &flagstr[i + 1]; /* skip over the number */ - } - } - free(flagstr); - - vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); - if (vd == NULL) { - (void) printf("***Invalid vdev: %s\n", vdev); - free(dup); - return; - } else { - if (vd->vdev_path) - (void) fprintf(stderr, "Found vdev: %s\n", - vd->vdev_path); - else - (void) fprintf(stderr, "Found vdev type: %s\n", - vd->vdev_ops->vdev_op_type); - } - - psize = size; - lsize = size; - - pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); - lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - - BP_ZERO(bp); - - DVA_SET_VDEV(&dva[0], vd->vdev_id); - DVA_SET_OFFSET(&dva[0], offset); - DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); - DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); - - BP_SET_BIRTH(bp, 
TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, lsize); - BP_SET_PSIZE(bp, psize); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - zio = zio_root(spa, NULL, NULL, 0); - - if (vd == vd->vdev_top) { - /* - * Treat this as a normal block read. - */ - zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); - } else { - /* - * Treat this as a vdev child I/O. - */ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, - psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | - ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, - NULL, NULL)); - } - - error = zio_wait(zio); - spa_config_exit(spa, SCL_STATE, FTAG); - - if (error) { - (void) printf("Read of %s failed, error: %d\n", thing, error); - goto out; - } - - if (flags & ZDB_FLAG_DECOMPRESS) { - /* - * We don't know how the data was compressed, so just try - * every decompress function at every inflated blocksize. 
- */ - enum zio_compress c; - void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - - abd_copy_to_buf(pbuf2, pabd, psize); - - VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, - random_get_pseudo_bytes_cb, NULL)); - - VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, - SPA_MAXBLOCKSIZE - psize)); - - for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; - lsize -= SPA_MINBLOCKSIZE) { - for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { - if (zio_decompress_data(c, pabd, - lbuf, psize, lsize) == 0 && - zio_decompress_data_buf(c, pbuf2, - lbuf2, psize, lsize) == 0 && - bcmp(lbuf, lbuf2, lsize) == 0) - break; - } - if (c != ZIO_COMPRESS_FUNCTIONS) - break; - lsize -= SPA_MINBLOCKSIZE; - } - - umem_free(pbuf2, SPA_MAXBLOCKSIZE); - umem_free(lbuf2, SPA_MAXBLOCKSIZE); - - if (lsize <= psize) { - (void) printf("Decompress of %s failed\n", thing); - goto out; - } - buf = lbuf; - size = lsize; - } else { - buf = abd_to_buf(pabd); - size = psize; - } - - if (flags & ZDB_FLAG_PRINT_BLKPTR) - zdb_print_blkptr((blkptr_t *)(void *) - ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); - else if (flags & ZDB_FLAG_RAW) - zdb_dump_block_raw(buf, size, flags); - else if (flags & ZDB_FLAG_INDIRECT) - zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), - flags); - else if (flags & ZDB_FLAG_GBH) - zdb_dump_gbh(buf, flags); - else - zdb_dump_block(thing, buf, size, flags); - -out: - abd_free(pabd); - umem_free(lbuf, SPA_MAXBLOCKSIZE); - free(dup); -} - -static void -zdb_embedded_block(char *thing) -{ - blkptr_t bp; - unsigned long long *words = (void *)&bp; - char *buf; - int err; - - bzero(&bp, sizeof (bp)); - err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" - "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", - words + 0, words + 1, words + 2, words + 3, - words + 4, words + 5, words + 6, words + 7, - words + 8, words + 9, words + 10, words + 11, - words + 12, words + 13, words + 14, words + 
15); - if (err != 16) { - (void) fprintf(stderr, "invalid input format\n"); - exit(1); - } - ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); - buf = malloc(SPA_MAXBLOCKSIZE); - if (buf == NULL) { - (void) fprintf(stderr, "out of memory\n"); - exit(1); - } - err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); - if (err != 0) { - (void) fprintf(stderr, "decode failed: %u\n", err); - free(buf); - exit(1); - } - zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); - free(buf); -} - -int -main(int argc, char **argv) -{ - int c; - struct rlimit rl = { 1024, 1024 }; - spa_t *spa = NULL; - objset_t *os = NULL; - int dump_all = 1; - int verbose = 0; - int error = 0; - char **searchdirs = NULL; - int nsearch = 0; - char *target, *target_pool; - nvlist_t *policy = NULL; - uint64_t max_txg = UINT64_MAX; - int flags = ZFS_IMPORT_MISSING_LOG; - int rewind = ZPOOL_NEVER_REWIND; - char *spa_config_path_env; - boolean_t target_is_spa = B_TRUE; - nvlist_t *cfg = NULL; - - (void) setrlimit(RLIMIT_NOFILE, &rl); - (void) enable_extended_FILE_stdio(-1, -1); - - dprintf_setup(&argc, argv); - - /* - * If there is an environment variable SPA_CONFIG_PATH it overrides - * default spa_config_path setting. If -U flag is specified it will - * override this environment variable settings once again. - */ - spa_config_path_env = getenv("SPA_CONFIG_PATH"); - if (spa_config_path_env != NULL) - spa_config_path = spa_config_path_env; - - while ((c = getopt(argc, argv, - "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { - switch (c) { - case 'b': - case 'c': - case 'C': - case 'd': - case 'D': - case 'E': - case 'G': - case 'h': - case 'i': - case 'l': - case 'm': - case 'M': - case 'O': - case 'R': - case 's': - case 'S': - case 'u': - dump_opt[c]++; - dump_all = 0; - break; - case 'A': - case 'e': - case 'F': - case 'k': - case 'L': - case 'P': - case 'q': - case 'X': - dump_opt[c]++; - break; - /* NB: Sort single match options below. 
*/ - case 'I': - max_inflight = strtoull(optarg, NULL, 0); - if (max_inflight == 0) { - (void) fprintf(stderr, "maximum number " - "of inflight I/Os must be greater " - "than 0\n"); - usage(); - } - break; - case 'o': - error = set_global_var(optarg); - if (error != 0) - usage(); - break; - case 'p': - if (searchdirs == NULL) { - searchdirs = umem_alloc(sizeof (char *), - UMEM_NOFAIL); - } else { - char **tmp = umem_alloc((nsearch + 1) * - sizeof (char *), UMEM_NOFAIL); - bcopy(searchdirs, tmp, nsearch * - sizeof (char *)); - umem_free(searchdirs, - nsearch * sizeof (char *)); - searchdirs = tmp; - } - searchdirs[nsearch++] = optarg; - break; - case 't': - max_txg = strtoull(optarg, NULL, 0); - if (max_txg < TXG_INITIAL) { - (void) fprintf(stderr, "incorrect txg " - "specified: %s\n", optarg); - usage(); - } - break; - case 'U': - spa_config_path = optarg; - if (spa_config_path[0] != '/') { - (void) fprintf(stderr, - "cachefile must be an absolute path " - "(i.e. start with a slash)\n"); - usage(); - } - break; - case 'v': - verbose++; - break; - case 'V': - flags = ZFS_IMPORT_VERBATIM; - break; - case 'x': - vn_dumpdir = optarg; - break; - default: - usage(); - break; - } - } - - if (!dump_opt['e'] && searchdirs != NULL) { - (void) fprintf(stderr, "-p option requires use of -e\n"); - usage(); - } - - /* - * ZDB does not typically re-read blocks; therefore limit the ARC - * to 256 MB, which can be used entirely for metadata. - */ - zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; - - /* - * "zdb -c" uses checksum-verifying scrub i/os which are async reads. - * "zdb -b" uses traversal prefetch which uses async reads. - * For good performance, let several of them be active at once. - */ - zfs_vdev_async_read_max_active = 10; - - /* - * Disable reference tracking for better performance. - */ - reference_tracking_enable = B_FALSE; - - /* - * Do not fail spa_load when spa_load_verify fails. This is needed - * to load non-idle pools. 
- */ - spa_load_verify_dryrun = B_TRUE; - - kernel_init(FREAD); - g_zfs = libzfs_init(); - if (g_zfs == NULL) - fatal("Fail to initialize zfs"); - - if (dump_all) - verbose = MAX(verbose, 1); - - for (c = 0; c < 256; c++) { - if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) - dump_opt[c] = 1; - if (dump_opt[c]) - dump_opt[c] += verbose; - } - - aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); - zfs_recover = (dump_opt['A'] > 1); - - argc -= optind; - argv += optind; - - if (argc < 2 && dump_opt['R']) - usage(); - - if (dump_opt['E']) { - if (argc != 1) - usage(); - zdb_embedded_block(argv[0]); - return (0); - } - - if (argc < 1) { - if (!dump_opt['e'] && dump_opt['C']) { - dump_cachefile(spa_config_path); - return (0); - } - usage(); - } - - if (dump_opt['l']) - return (dump_label(argv[0])); - - if (dump_opt['O']) { - if (argc != 2) - usage(); - dump_opt['v'] = verbose + 3; - return (dump_path(argv[0], argv[1])); - } - - if (dump_opt['X'] || dump_opt['F']) - rewind = ZPOOL_DO_REWIND | - (dump_opt['X'] ? 
ZPOOL_EXTREME_REWIND : 0); - - if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || - nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || - nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) - fatal("internal error: %s", strerror(ENOMEM)); - - error = 0; - target = argv[0]; - - if (strpbrk(target, "/@") != NULL) { - size_t targetlen; - - target_pool = strdup(target); - *strpbrk(target_pool, "/@") = '\0'; - - target_is_spa = B_FALSE; - targetlen = strlen(target); - if (targetlen && target[targetlen - 1] == '/') - target[targetlen - 1] = '\0'; - } else { - target_pool = target; - } - - if (dump_opt['e']) { - importargs_t args = { 0 }; - - args.paths = nsearch; - args.path = searchdirs; - args.can_be_active = B_TRUE; - - error = zpool_tryimport(g_zfs, target_pool, &cfg, &args); - - if (error == 0) { - - if (nvlist_add_nvlist(cfg, - ZPOOL_LOAD_POLICY, policy) != 0) { - fatal("can't open '%s': %s", - target, strerror(ENOMEM)); - } - - if (dump_opt['C'] > 1) { - (void) printf("\nConfiguration for import:\n"); - dump_nvlist(cfg, 8); - } - - /* - * Disable the activity check to allow examination of - * active pools. 
- */ - error = spa_import(target_pool, cfg, NULL, - flags | ZFS_IMPORT_SKIP_MMP); - } - } - - char *checkpoint_pool = NULL; - char *checkpoint_target = NULL; - if (dump_opt['k']) { - checkpoint_pool = import_checkpointed_state(target, cfg, - &checkpoint_target); - - if (checkpoint_target != NULL) - target = checkpoint_target; - - } - - if (error == 0) { - if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { - ASSERT(checkpoint_pool != NULL); - ASSERT(checkpoint_target == NULL); - - error = spa_open(checkpoint_pool, &spa, FTAG); - if (error != 0) { - fatal("Tried to open pool \"%s\" but " - "spa_open() failed with error %d\n", - checkpoint_pool, error); - } - - } else if (target_is_spa || dump_opt['R']) { - zdb_set_skip_mmp(target); - error = spa_open_rewind(target, &spa, FTAG, policy, - NULL); - if (error) { - /* - * If we're missing the log device then - * try opening the pool after clearing the - * log state. - */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(target)) != NULL && - spa->spa_log_state == SPA_LOG_MISSING) { - spa->spa_log_state = SPA_LOG_CLEAR; - error = 0; - } - mutex_exit(&spa_namespace_lock); - - if (!error) { - error = spa_open_rewind(target, &spa, - FTAG, policy, NULL); - } - } - } else { - zdb_set_skip_mmp(target); - error = open_objset(target, DMU_OST_ANY, FTAG, &os); - } - } - nvlist_free(policy); - - if (error) - fatal("can't open '%s': %s", target, strerror(error)); - - argv++; - argc--; - if (!dump_opt['R']) { - if (argc > 0) { - zopt_objects = argc; - zopt_object = calloc(zopt_objects, sizeof (uint64_t)); - for (unsigned i = 0; i < zopt_objects; i++) { - errno = 0; - zopt_object[i] = strtoull(argv[i], NULL, 0); - if (zopt_object[i] == 0 && errno != 0) - fatal("bad number %s: %s", - argv[i], strerror(errno)); - } - } - if (os != NULL) { - dump_dir(os); - } else if (zopt_objects > 0 && !dump_opt['m']) { - dump_dir(spa->spa_meta_objset); - } else { - dump_zpool(spa); - } - } else { - flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; 
- flagbits['c'] = ZDB_FLAG_CHECKSUM; - flagbits['d'] = ZDB_FLAG_DECOMPRESS; - flagbits['e'] = ZDB_FLAG_BSWAP; - flagbits['g'] = ZDB_FLAG_GBH; - flagbits['i'] = ZDB_FLAG_INDIRECT; - flagbits['p'] = ZDB_FLAG_PHYS; - flagbits['r'] = ZDB_FLAG_RAW; - - for (int i = 0; i < argc; i++) - zdb_read_block(argv[i], spa); - } - - if (dump_opt['k']) { - free(checkpoint_pool); - if (!target_is_spa) - free(checkpoint_target); - } - - if (os != NULL) - close_objset(os, FTAG); - else - spa_close(spa, FTAG); - - fuid_table_destroy(); - - dump_debug_buffer(); - - libzfs_fini(g_zfs); - kernel_fini(); - - return (error); -} diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.h b/cddl/contrib/opensolaris/cmd/zdb/zdb.h deleted file mode 100644 index 49579811efbb..000000000000 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2017 Spectra Logic Corp Inc. All rights reserved. - * Use is subject to license terms. 
- */ - - -#ifndef _ZDB_H -#define _ZDB_H - -void dump_intent_log(zilog_t *); -extern uint8_t dump_opt[256]; - -#endif /* _ZDB_H */ diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c deleted file mode 100644 index 9f3f23f82da1..000000000000 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -/* - * Print intent log header and statistics. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdb.h" - -extern uint8_t dump_opt[256]; - -static char tab_prefix[4] = "\t\t\t"; - -static void -print_log_bp(const blkptr_t *bp, const char *prefix) -{ - char blkbuf[BP_SPRINTF_LEN]; - - snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); - (void) printf("%s%s\n", prefix, blkbuf); -} - -/* ARGSUSED */ -static void -zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) -{ - lr_create_t *lr = arg; - time_t crtime = lr->lr_crtime[0]; - char *name, *link; - lr_attr_t *lrattr; - - name = (char *)(lr + 1); - - if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || - lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { - lrattr = (lr_attr_t *)(lr + 1); - name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - } - - if (txtype == TX_SYMLINK) { - link = name + strlen(name) + 1; - (void) printf("%s%s -> %s\n", tab_prefix, name, link); - } else if (txtype != TX_MKXATTR) { - (void) printf("%s%s\n", tab_prefix, name); - } - - (void) printf("%s%s", tab_prefix, ctime(&crtime)); - (void) printf("%sdoid %" PRIu64 ", foid %" PRIu64 ", slots %" PRIu64 - ", mode %" PRIo64 "\n", - tab_prefix, lr->lr_doid, - (uint64_t)LR_FOID_GET_OBJ(lr->lr_foid), - (uint64_t)LR_FOID_GET_SLOTS(lr->lr_foid), - lr->lr_mode); - (void) printf("%suid %" PRIu64 ", gid %" PRIu64 ", gen %" PRIu64 - ", rdev %#" PRIx64 "\n", - tab_prefix, lr->lr_uid, lr->lr_gid, lr->lr_gen, lr->lr_rdev); -} - -/* ARGSUSED */ -static void -zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg) -{ - lr_remove_t *lr = arg; - - (void) printf("%sdoid %llu, name %s\n", tab_prefix, - (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); -} - -/* ARGSUSED */ -static void -zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg) -{ - lr_link_t *lr = arg; - - (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix, - (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, - (char *)(lr + 1)); -} - -/* ARGSUSED */ 
-static void -zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg) -{ - lr_rename_t *lr = arg; - char *snm = (char *)(lr + 1); - char *tnm = snm + strlen(snm) + 1; - - (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix, - (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); - (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm); -} - -/* ARGSUSED */ -static int -zil_prt_rec_write_cb(void *data, size_t len, void *unused) -{ - char *cdata = data; - for (size_t i = 0; i < len; i++) { - if (isprint(*cdata)) - (void) printf("%c ", *cdata); - else - (void) printf("%2X", *cdata); - cdata++; - } - return (0); -} - -/* ARGSUSED */ -static void -zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg) -{ - lr_write_t *lr = arg; - abd_t *data; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_phys_t zb; - int verbose = MAX(dump_opt['d'], dump_opt['i']); - int error; - - (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix, - (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, - (u_longlong_t)lr->lr_length); - - if (txtype == TX_WRITE2 || verbose < 5) - return; - - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - (void) printf("%shas blkptr, %s\n", tab_prefix, - !BP_IS_HOLE(bp) && - bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? 
- "will claim" : "won't claim"); - print_log_bp(bp, tab_prefix); - - if (BP_IS_HOLE(bp)) { - (void) printf("\t\t\tLSIZE 0x%llx\n", - (u_longlong_t)BP_GET_LSIZE(bp)); - (void) printf("%s\n", tab_prefix); - return; - } - if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { - (void) printf("%s\n", - tab_prefix); - return; - } - - SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), - lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); - - data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); - error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, data, BP_GET_LSIZE(bp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); - if (error) - goto out; - } else { - /* data is stored after the end of the lr_write record */ - data = abd_alloc(lr->lr_length, B_FALSE); - abd_copy_from_buf(data, lr + 1, lr->lr_length); - } - - (void) printf("%s", tab_prefix); - (void) abd_iterate_func(data, - 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), - zil_prt_rec_write_cb, NULL); - (void) printf("\n"); - -out: - abd_free(data); -} - -/* ARGSUSED */ -static void -zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg) -{ - lr_truncate_t *lr = arg; - - (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix, - (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, - (u_longlong_t)lr->lr_length); -} - -/* ARGSUSED */ -static void -zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg) -{ - lr_setattr_t *lr = arg; - time_t atime = (time_t)lr->lr_atime[0]; - time_t mtime = (time_t)lr->lr_mtime[0]; - - (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix, - (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask); - - if (lr->lr_mask & AT_MODE) { - (void) printf("%sAT_MODE %llo\n", tab_prefix, - (longlong_t)lr->lr_mode); - } - - if (lr->lr_mask & AT_UID) { - (void) printf("%sAT_UID %llu\n", tab_prefix, - (u_longlong_t)lr->lr_uid); - } - - if (lr->lr_mask & AT_GID) { - (void) printf("%sAT_GID %llu\n", tab_prefix, - (u_longlong_t)lr->lr_gid); - } - - if 
(lr->lr_mask & AT_SIZE) { - (void) printf("%sAT_SIZE %llu\n", tab_prefix, - (u_longlong_t)lr->lr_size); - } - - if (lr->lr_mask & AT_ATIME) { - (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix, - (u_longlong_t)lr->lr_atime[0], - (u_longlong_t)lr->lr_atime[1], - ctime(&atime)); - } - - if (lr->lr_mask & AT_MTIME) { - (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix, - (u_longlong_t)lr->lr_mtime[0], - (u_longlong_t)lr->lr_mtime[1], - ctime(&mtime)); - } -} - -/* ARGSUSED */ -static void -zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg) -{ - lr_acl_t *lr = arg; - - (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix, - (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); -} - -typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *); -typedef struct zil_rec_info { - zil_prt_rec_func_t zri_print; - const char *zri_name; - uint64_t zri_count; -} zil_rec_info_t; - -static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { - {.zri_print = NULL, .zri_name = "Total "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "}, - {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "}, - {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, - {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, - {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, - {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, - {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, - {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "}, - {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "}, - {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "}, - {.zri_print = zil_prt_rec_create, .zri_name = 
"TX_CREATE_ACL_ATTR "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "}, - {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "}, - {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "}, -}; - -/* ARGSUSED */ -static int -print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) -{ - int txtype; - int verbose = MAX(dump_opt['d'], dump_opt['i']); - - /* reduce size of txtype to strip off TX_CI bit */ - txtype = lr->lrc_txtype; - - ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE); - ASSERT(lr->lrc_txg); - - (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n", - (lr->lrc_txtype & TX_CI) ? "CI-" : "", - zil_rec_info[txtype].zri_name, - (u_longlong_t)lr->lrc_reclen, - (u_longlong_t)lr->lrc_txg, - (u_longlong_t)lr->lrc_seq); - - if (txtype && verbose >= 3) - zil_rec_info[txtype].zri_print(zilog, txtype, lr); - - zil_rec_info[txtype].zri_count++; - zil_rec_info[0].zri_count++; - - return (0); -} - -/* ARGSUSED */ -static int -print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - char blkbuf[BP_SPRINTF_LEN + 10]; - int verbose = MAX(dump_opt['d'], dump_opt['i']); - const char *claim; - - if (verbose <= 3) - return (0); - - if (verbose >= 5) { - (void) strcpy(blkbuf, ", "); - snprintf_blkptr(blkbuf + strlen(blkbuf), - sizeof (blkbuf) - strlen(blkbuf), bp); - } else { - blkbuf[0] = '\0'; - } - - if (claim_txg != 0) - claim = "already claimed"; - else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) - claim = "will claim"; - else - claim = "won't claim"; - - (void) printf("\tBlock seqno %llu, %s%s\n", - (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); - - return (0); -} - -static void -print_log_stats(int verbose) -{ - unsigned i, w, p10; - - if (verbose > 3) - (void) printf("\n"); - - if (zil_rec_info[0].zri_count == 0) - return; - - for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10) - 
w++; - - for (i = 0; i < TX_MAX_TYPE; i++) - if (zil_rec_info[i].zri_count || verbose >= 3) - (void) printf("\t\t%s %*llu\n", - zil_rec_info[i].zri_name, w, - (u_longlong_t)zil_rec_info[i].zri_count); - (void) printf("\n"); -} - -/* ARGSUSED */ -void -dump_intent_log(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - int verbose = MAX(dump_opt['d'], dump_opt['i']); - int i; - - if (BP_IS_HOLE(&zh->zh_log) || verbose < 1) - return; - - (void) printf("\n ZIL header: claim_txg %llu, " - "claim_blk_seq %llu, claim_lr_seq %llu", - (u_longlong_t)zh->zh_claim_txg, - (u_longlong_t)zh->zh_claim_blk_seq, - (u_longlong_t)zh->zh_claim_lr_seq); - (void) printf(" replay_seq %llu, flags 0x%llx\n", - (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); - - for (i = 0; i < TX_MAX_TYPE; i++) - zil_rec_info[i].zri_count = 0; - - /* see comment in zil_claim() or zil_check_log_chain() */ - if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && - zh->zh_claim_txg == 0) - return; - - if (verbose >= 2) { - (void) printf("\n"); - (void) zil_parse(zilog, print_log_block, print_log_record, NULL, - zh->zh_claim_txg); - print_log_stats(verbose); - } -} diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs-program.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs-program.8 deleted file mode 100644 index 76bb97c2d96d..000000000000 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs-program.8 +++ /dev/null @@ -1,551 +0,0 @@ -.\" This file and its contents are supplied under the terms of the -.\" Common Development and Distribution License ("CDDL"), version 1.0. -.\" You may only use this file in accordance with the terms of version -.\" 1.0 of the CDDL. -.\" -.\" A full copy of the text of the CDDL should have accompanied this -.\" source. A copy of the CDDL is also available via the Internet at -.\" http://www.illumos.org/license/CDDL. -.\" -.\" -.\" Copyright (c) 2016, 2017 by Delphix. All rights reserved. -.\" Copyright (c) 2018 Datto Inc. 
-.\" -.Dd April 18, 2020 -.Dt ZFS-PROGRAM 8 -.Os -.Sh NAME -.Nm zfs program -.Nd executes ZFS channel programs -.Sh SYNOPSIS -.Cm zfs program -.Op Fl jn -.Op Fl t Ar instruction-limit -.Op Fl m Ar memory-limit -.Ar pool -.Ar script -.\".Op Ar optional arguments to channel program -.Sh DESCRIPTION -The ZFS channel program interface allows ZFS administrative operations to be -run programmatically as a Lua script. -The entire script is executed atomically, with no other administrative -operations taking effect concurrently. -A library of ZFS calls is made available to channel program scripts. -Channel programs may only be run with root privileges. -.Pp -A modified version of the Lua 5.2 interpreter is used to run channel program -scripts. -The Lua 5.2 manual can be found at: -.Bd -centered -offset indent -.Lk http://www.lua.org/manual/5.2/ -.Ed -.Pp -The channel program given by -.Ar script -will be run on -.Ar pool , -and any attempts to access or modify other pools will cause an error. -.Sh OPTIONS -.Bl -tag -width "-t" -.It Fl j -Display channel program output in JSON format. -When this flag is specified and standard output is empty - -channel program encountered an error. -The details of such an error will be printed to standard error in plain text. -.It Fl n -Executes a read-only channel program, which runs faster. -The program cannot change on-disk state by calling functions from the -zfs.sync submodule. -The program can be used to gather information such as properties and -determining if changes would succeed (zfs.check.*). -Without this flag, all pending changes must be synced to disk before a -channel program can complete. -.It Fl t Ar instruction-limit -Execution time limit, in number of Lua instructions to execute. -If a channel program executes more than the specified number of instructions, -it will be stopped and an error will be returned. -The default limit is 10 million instructions, and it can be set to a maximum of -100 million instructions. 
-.It Fl m Ar memory-limit -Memory limit, in bytes. -If a channel program attempts to allocate more memory than the given limit, it -will be stopped and an error returned. -The default memory limit is 10 MB, and can be set to a maximum of 100 MB. -.El -.Pp -All remaining argument strings will be passed directly to the Lua script as -described in the -.Sx LUA INTERFACE -section below. -.Sh LUA INTERFACE -A channel program can be invoked either from the command line, or via a library -call to -.Fn lzc_channel_program . -.Ss Arguments -Arguments passed to the channel program are converted to a Lua table. -If invoked from the command line, extra arguments to the Lua script will be -accessible as an array stored in the argument table with the key 'argv': -.Bd -literal -offset indent -args = ... -argv = args["argv"] --- argv == {1="arg1", 2="arg2", ...} -.Ed -.Pp -If invoked from the libZFS interface, an arbitrary argument list can be -passed to the channel program, which is accessible via the same -"..." syntax in Lua: -.Bd -literal -offset indent -args = ... --- args == {"foo"="bar", "baz"={...}, ...} -.Ed -.Pp -Note that because Lua arrays are 1-indexed, arrays passed to Lua from the -libZFS interface will have their indices incremented by 1. -That is, the element -in -.Va arr[0] -in a C array passed to a channel program will be stored in -.Va arr[1] -when accessed from Lua. -.Ss Return Values -Lua return statements take the form: -.Bd -literal -offset indent -return ret0, ret1, ret2, ... -.Ed -.Pp -Return statements returning multiple values are permitted internally in a -channel program script, but attempting to return more than one value from the -top level of the channel program is not permitted and will throw an error. -However, tables containing multiple values can still be returned. 
-If invoked from the command line, a return statement: -.Bd -literal -offset indent -a = {foo="bar", baz=2} -return a -.Ed -.Pp -Will be output formatted as: -.Bd -literal -offset indent -Channel program fully executed with return value: - return: - baz: 2 - foo: 'bar' -.Ed -.Ss Fatal Errors -If the channel program encounters a fatal error while running, a non-zero exit -status will be returned. -If more information about the error is available, a singleton list will be -returned detailing the error: -.Bd -literal -offset indent -error: "error string, including Lua stack trace" -.Ed -.Pp -If a fatal error is returned, the channel program may have not executed at all, -may have partially executed, or may have fully executed but failed to pass a -return value back to userland. -.Pp -If the channel program exhausts an instruction or memory limit, a fatal error -will be generated and the program will be stopped, leaving the program partially -executed. -No attempt is made to reverse or undo any operations already performed. -Note that because both the instruction count and amount of memory used by a -channel program are deterministic when run against the same inputs and -filesystem state, as long as a channel program has run successfully once, you -can guarantee that it will finish successfully against a similar size system. -.Pp -If a channel program attempts to return too large a value, the program will -fully execute but exit with a nonzero status code and no return value. -.Pp -.Em Note: -ZFS API functions do not generate Fatal Errors when correctly invoked, they -return an error code and the channel program continues executing. -See the -.Sx ZFS API -section below for function-specific details on error return codes. -.Ss Lua to C Value Conversion -When invoking a channel program via the libZFS interface, it is necessary to -translate arguments and return values from Lua values to their C equivalents, -and vice-versa. 
-.Pp -There is a correspondence between nvlist values in C and Lua tables. -A Lua table which is returned from the channel program will be recursively -converted to an nvlist, with table values converted to their natural -equivalents: -.Bd -literal -offset indent -string -> string -number -> int64 -boolean -> boolean_value -nil -> boolean (no value) -table -> nvlist -.Ed -.Pp -Likewise, table keys are replaced by string equivalents as follows: -.Bd -literal -offset indent -string -> no change -number -> signed decimal string ("%lld") -boolean -> "true" | "false" -.Ed -.Pp -Any collision of table key strings (for example, the string "true" and a -true boolean value) will cause a fatal error. -.Pp -Lua numbers are represented internally as signed 64-bit integers. -.Sh LUA STANDARD LIBRARY -The following Lua built-in base library functions are available: -.Bd -literal -offset indent -assert rawlen -collectgarbage rawget -error rawset -getmetatable select -ipairs setmetatable -next tonumber -pairs tostring -rawequal type -.Ed -.Pp -All functions in the -.Em coroutine , -.Em string , -and -.Em table -built-in submodules are also available. -A complete list and documentation of these modules is available in the Lua -manual. -.Pp -The following functions base library functions have been disabled and are -not available for use in channel programs: -.Bd -literal -offset indent -dofile -loadfile -load -pcall -print -xpcall -.Ed -.Sh ZFS API -.Ss Function Arguments -Each API function takes a fixed set of required positional arguments and -optional keyword arguments. -For example, the destroy function takes a single positional string argument -(the name of the dataset to destroy) and an optional "defer" keyword boolean -argument. 
-When using parentheses to specify the arguments to a Lua function, only -positional arguments can be used: -.Bd -literal -offset indent -zfs.sync.destroy("rpool@snap") -.Ed -.Pp -To use keyword arguments, functions must be called with a single argument that -is a Lua table containing entries mapping integers to positional arguments and -strings to keyword arguments: -.Bd -literal -offset indent -zfs.sync.destroy({1="rpool@snap", defer=true}) -.Ed -.Pp -The Lua language allows curly braces to be used in place of parenthesis as -syntactic sugar for this calling convention: -.Bd -literal -offset indent -zfs.sync.snapshot{"rpool@snap", defer=true} -.Ed -.Ss Function Return Values -If an API function succeeds, it returns 0. -If it fails, it returns an error code and the channel program continues -executing. -API functions do not generate Fatal Errors except in the case of an -unrecoverable internal file system error. -.Pp -In addition to returning an error code, some functions also return extra -details describing what caused the error. -This extra description is given as a second return value, and will always be a -Lua table, or Nil if no error details were returned. -Different keys will exist in the error details table depending on the function -and error case. 
-Any such function may be called expecting a single return value: -.Bd -literal -offset indent -errno = zfs.sync.promote(dataset) -.Ed -.Pp -Or, the error details can be retrieved: -.Bd -literal -offset indent -errno, details = zfs.sync.promote(dataset) -if (errno == EEXIST) then - assert(details ~= Nil) - list_of_conflicting_snapshots = details -end -.Ed -.Pp -The following global aliases for API function error return codes are defined -for use in channel programs: -.Bd -literal -offset indent -EPERM ECHILD ENODEV ENOSPC -ENOENT EAGAIN ENOTDIR ESPIPE -ESRCH ENOMEM EISDIR EROFS -EINTR EACCES EINVAL EMLINK -EIO EFAULT ENFILE EPIPE -ENXIO ENOTBLK EMFILE EDOM -E2BIG EBUSY ENOTTY ERANGE -ENOEXEC EEXIST ETXTBSY EDQUOT -EBADF EXDEV EFBIG -.Ed -.Ss API Functions -For detailed descriptions of the exact behavior of any zfs administrative -operations, see the main -.Xr zfs 8 -manual page. -.Bl -tag -width "xx" -.It Em zfs.debug(msg) -Record a debug message in the zfs_dbgmsg log. -A log of these messages can be printed via mdb's "::zfs_dbgmsg" command, or -can be monitored live by running: -.Bd -literal -offset indent - dtrace -n 'zfs-dbgmsg{trace(stringof(arg0))}' -.Ed -.Pp -msg (string) -.Bd -ragged -compact -offset "xxxx" -Debug message to be printed. -.Ed -.It Em zfs.exists(dataset) -Returns true if the given dataset exists, or false if it doesn't. -A fatal error will be thrown if the dataset is not in the target pool. -That is, in a channel program running on rpool, -zfs.exists("rpool/nonexistent_fs") returns false, but -zfs.exists("somepool/fs_that_may_exist") will error. -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Dataset to check for existence. -Must be in the target pool. -.Ed -.It Em zfs.get_prop(dataset, property) -Returns two values. -First, a string, number or table containing the property value for the given -dataset. -Second, a string containing the source of the property (i.e. 
the name of the -dataset in which it was set or nil if it is readonly). -Throws a Lua error if the dataset is invalid or the property doesn't exist. -Note that Lua only supports int64 number types whereas ZFS number properties -are uint64. -This means very large values (like guid) may wrap around and appear negative. -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Filesystem or snapshot path to retrieve properties from. -.Ed -.Pp -property (string) -.Bd -ragged -compact -offset "xxxx" -Name of property to retrieve. -All filesystem, snapshot and volume properties are supported except -for 'mounted' and 'iscsioptions.' -Also supports the 'written@snap' and 'written#bookmark' properties and -the '@id' properties, though the id must be in numeric -form. -.Ed -.El -.Bl -tag -width "xx" -.It Sy zfs.sync submodule -The sync submodule contains functions that modify the on-disk state. -They are executed in "syncing context". -.Pp -The available sync submodule functions are as follows: -.Bl -tag -width "xx" -.It Em zfs.sync.destroy(dataset, [defer=true|false]) -Destroy the given dataset. -Returns 0 on successful destroy, or a nonzero error code if the dataset could -not be destroyed (for example, if the dataset has any active children or -clones). -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Filesystem or snapshot to be destroyed. -.Ed -.Pp -[optional] defer (boolean) -.Bd -ragged -compact -offset "xxxx" -Valid only for destroying snapshots. -If set to true, and the snapshot has holds or clones, allows the snapshot to be -marked for deferred deletion rather than failing. -.Ed -.It Em zfs.sync.promote(dataset) -Promote the given clone to a filesystem. -Returns 0 on successful promotion, or a nonzero error code otherwise. -If EEXIST is returned, the second return value will be an array of the clone's -snapshots whose names collide with snapshots of the parent filesystem. 
-.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Clone to be promoted. -.Ed -.It Em zfs.sync.rollback(filesystem) -Rollback to the previous snapshot for a dataset. -Returns 0 on successful rollback, or a nonzero error code otherwise. -Rollbacks can be performed on filesystems or zvols, but not on snapshots -or mounted datasets. -EBUSY is returned in the case where the filesystem is mounted. -.Pp -filesystem (string) -.Bd -ragged -compact -offset "xxxx" -Filesystem to rollback. -.Ed -.It Em zfs.sync.snapshot(dataset) -Create a snapshot of a filesystem. -Returns 0 if the snapshot was successfully created, -and a nonzero error code otherwise. -.Pp -Note: Taking a snapshot will fail on any pool older than legacy version 27. -To enable taking snapshots from ZCP scripts, the pool must be upgraded. -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Name of snapshot to create. -.Ed -.El -.It Sy zfs.check submodule -For each function in the zfs.sync submodule, there is a corresponding zfs.check -function which performs a "dry run" of the same operation. -Each takes the same arguments as its zfs.sync counterpart and returns 0 if the -operation would succeed, or a non-zero error code if it would fail, along with -any other error details. -That is, each has the same behavior as the corresponding sync function except -for actually executing the requested change. -For example, -.Em zfs.check.destroy("fs") -returns 0 if -.Em zfs.sync.destroy("fs") -would successfully destroy the dataset. -.Pp -The available zfs.check functions are: -.Bl -tag -width "xx" -.It Em zfs.check.destroy(dataset, [defer=true|false]) -.It Em zfs.check.promote(dataset) -.It Em zfs.check.rollback(filesystem) -.It Em zfs.check.snapshot(dataset) -.El -.It Sy zfs.list submodule -The zfs.list submodule provides functions for iterating over datasets and -properties. 
-Rather than returning tables, these functions act as Lua iterators, and are -generally used as follows: -.Bd -literal -offset indent -for child in zfs.list.children("rpool") do - ... -end -.Ed -.Pp -The available zfs.list functions are: -.Bl -tag -width "xx" -.It Em zfs.list.clones(snapshot) -Iterate through all clones of the given snapshot. -.Pp -snapshot (string) -.Bd -ragged -compact -offset "xxxx" -Must be a valid snapshot path in the current pool. -.Ed -.It Em zfs.list.snapshots(dataset) -Iterate through all snapshots of the given dataset. -Each snapshot is returned as a string containing the full dataset name, e.g. -"pool/fs@snap". -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Must be a valid filesystem or volume. -.Ed -.It Em zfs.list.children(dataset) -Iterate through all direct children of the given dataset. -Each child is returned as a string containing the full dataset name, e.g. -"pool/fs/child". -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Must be a valid filesystem or volume. -.Ed -.It Em zfs.list.properties(dataset) -Iterate through all user properties for the given dataset. -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Must be a valid filesystem, snapshot, or volume. -.Ed -.It Em zfs.list.system_properties(dataset) -Returns an array of strings, the names of the valid system (non-user defined) -properties for the given dataset. -Throws a Lua error if the dataset is invalid. -.Pp -dataset (string) -.Bd -ragged -compact -offset "xxxx" -Must be a valid filesystem, snapshot or volume. -.Ed -.El -.El -.Sh EXAMPLES -.Ss Example 1 -The following channel program recursively destroys a filesystem and all its -snapshots and children in a naive manner. -Note that this does not involve any error handling or reporting. 
-.Bd -literal -offset indent -function destroy_recursive(root) - for child in zfs.list.children(root) do - destroy_recursive(child) - end - for snap in zfs.list.snapshots(root) do - zfs.sync.destroy(snap) - end - zfs.sync.destroy(root) -end -destroy_recursive("pool/somefs") -.Ed -.Ss Example 2 -A more verbose and robust version of the same channel program, which -properly detects and reports errors, and also takes the dataset to destroy -as a command line argument, would be as follows: -.Bd -literal -offset indent -succeeded = {} -failed = {} - -function destroy_recursive(root) - for child in zfs.list.children(root) do - destroy_recursive(child) - end - for snap in zfs.list.snapshots(root) do - err = zfs.sync.destroy(snap) - if (err ~= 0) then - failed[snap] = err - else - succeeded[snap] = err - end - end - err = zfs.sync.destroy(root) - if (err ~= 0) then - failed[root] = err - else - succeeded[root] = err - end -end - -args = ... -argv = args["argv"] - -destroy_recursive(argv[1]) - -results = {} -results["succeeded"] = succeeded -results["failed"] = failed -return results -.Ed -.Ss Example 3 -The following function performs a forced promote operation by attempting to -promote the given clone and destroying any conflicting snapshots. -.Bd -literal -offset indent -function force_promote(ds) - errno, details = zfs.check.promote(ds) - if (errno == EEXIST) then - assert(details ~= Nil) - for i, snap in ipairs(details) do - zfs.sync.destroy(ds .. "@" .. snap) - end - elseif (errno ~= 0) then - return errno - end - return zfs.sync.promote(ds) -end -.Ed diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 deleted file mode 100644 index 33e0ca4b3040..000000000000 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 +++ /dev/null @@ -1,3973 +0,0 @@ -'\" te -.\" Copyright (c) 2013, Martin Matuska . -.\" All Rights Reserved. 
-.\" -.\" The contents of this file are subject to the terms of the -.\" Common Development and Distribution License (the "License"). -.\" You may not use this file except in compliance with the License. -.\" -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -.\" or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions -.\" and limitations under the License. -.\" -.\" When distributing Covered Code, include this CDDL HEADER in each -.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. -.\" If applicable, add the following below this CDDL HEADER, with the -.\" fields enclosed by brackets "[]" replaced with your own identifying -.\" information: Portions Copyright [yyyy] [name of copyright owner] -.\" -.\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved. -.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved. -.\" Copyright (c) 2011, Pawel Jakub Dawidek -.\" Copyright (c) 2012, Glen Barber -.\" Copyright (c) 2012, Bryan Drewery -.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -.\" Copyright (c) 2013, Steven Hartland -.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. -.\" Copyright (c) 2014, Xin LI -.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. -.\" Copyright 2019 Joyent, Inc. -.\" Copyright (c) 2018 Datto Inc. -.\" -.\" $FreeBSD$ -.\" -.Dd February 16, 2020 -.Dt ZFS 8 -.Os -.Sh NAME -.Nm zfs -.Nd configures ZFS file systems -.Sh SYNOPSIS -.Nm -.Op Fl \&? -.Nm -.Cm create -.Op Fl pu -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... Ar filesystem -.Nm -.Cm create -.Op Fl ps -.Op Fl b Ar blocksize -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Fl V -.Ar size volume -.Nm -.Cm destroy -.Op Fl fnpRrv -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm destroy -.Op Fl dnpRrv -.Sm off -.Ar filesystem Ns | Ns volume -.Ns @snap -.Op % Ns Ar snap -.Op , Ns Ar snap Op % Ns Ar snap -.Op , Ns ... 
-.Sm on -.Nm -.Cm destroy -.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark -.Nm -.Cm snapshot Ns | Ns Cm snap -.Op Fl r -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem@snapname Ns | Ns Ar volume@snapname -.Ar filesystem@snapname Ns | Ns Ar volume@snapname Ns ... -.Nm -.Cm rollback -.Op Fl rRf -.Ar snapshot -.Nm -.Cm clone -.Op Fl p -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar snapshot filesystem Ns | Ns Ar volume -.Nm -.Cm promote -.Ar clone-filesystem -.Nm -.Cm rename -.Op Fl f -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm -.Cm rename -.Op Fl f -.Fl p -.Ar filesystem Ns | Ns Ar volume -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm rename -.Fl r -.Ar snapshot snapshot -.Nm -.Cm rename -.Ar bookmark bookmark -.Nm -.Cm rename -.Fl u -.Op Fl p -.Ar filesystem filesystem -.Nm -.Cm list -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Op Fl o Ar property Ns Oo , Ns property Ns Oc Ns ... -.Op Fl t Ar type Ns Oo , Ns type Ns Oc Ns ... -.Oo Fl s Ar property Oc Ns ... -.Oo Fl S Ar property Oc Ns ... -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot | Ns Ar bookmark Ns ... -.Nm -.Cm remap -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm set -.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Nm -.Cm get -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ... -.Op Fl t Ar type Ns Oo Ns , Ar type Oc Ns ... -.Op Fl s Ar source Ns Oo Ns , Ns Ar source Oc Ns ... -.Ar all | property Ns Oo Ns , Ns Ar property Oc Ns ... -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Nm -.Cm inherit -.Op Fl rS -.Ar property -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Nm -.Cm upgrade -.Op Fl v -.Nm -.Cm upgrade -.Op Fl r -.Op Fl V Ar version -.Fl a | Ar filesystem -.Nm -.Cm userspace -.Op Fl Hinp -.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... 
-.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ... -.Ar filesystem Ns | Ns Ar snapshot -.Nm -.Cm groupspace -.Op Fl Hinp -.Op Fl o Ar field Ns Oo , Ns field Oc Ns ... -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ... -.Ar filesystem Ns | Ns Ar snapshot -.Nm -.Cm mount -.Nm -.Cm mount -.Op Fl vO -.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... -.Fl a | Ar filesystem -.Nm -.Cm unmount Ns | Ns Cm umount -.Op Fl f -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Nm -.Cm share -.Fl a | Ar filesystem -.Nm -.Cm unshare -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Nm -.Cm bookmark -.Ar snapshot -.Ar bookmark -.Nm -.Cm send -.Op Fl DLPRVcenpv -.Op Fl i Ar snapshot | Fl I Ar snapshot -.Ar snapshot -.Nm -.Cm send -.Op Fl LPcenv -.Op Fl i Ar snapshot Ns | Ns Ar bookmark -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm -.Cm send -.Op Fl PVenv -.Fl t Ar receive_resume_token -.Nm -.Cm receive Ns | Ns Cm recv -.Op Fl vnsFMu -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm -.Cm receive Ns | Ns Cm recv -.Op Fl vnsFMu -.Op Fl d | e -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Ar filesystem -.Nm -.Cm receive Ns | Ns Cm recv -.Fl A -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Op Fl ldug -.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Ar perm Ns | Ns Ar @setname Ns -.Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Op Fl ld -.Fl e Ns | Ns Cm everyone -.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Fl c -.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... 
-.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm allow -.Fl s -.Ar @setname -.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl rldug -.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl rld -.Fl e Ns | Ns Cm everyone -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl r -.Fl c -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm unallow -.Op Fl r -.Fl s -.Ar @setname -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Nm -.Cm hold -.Op Fl r -.Ar tag snapshot Ns ... -.Nm -.Cm holds -.Op Fl Hp -.Op Fl r Ns | Ns Fl d Ar depth -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns -.Ns ... -.Nm -.Cm release -.Op Fl r -.Ar tag snapshot Ns ... -.Nm -.Cm diff -.Op Fl FHt -.Ar snapshot -.Op Ar snapshot Ns | Ns Ar filesystem -.Nm -.Cm program -.Op Fl jn -.Op Fl t Ar timeout -.Op Fl m Ar memory_limit -.Ar pool script -.Op Ar arg1 No ... -.Nm -.Cm jail -.Ar jailid Ns | Ns Ar jailname filesystem -.Nm -.Cm unjail -.Ar jailid Ns | Ns Ar jailname filesystem -.Sh DESCRIPTION -The -.Nm -command configures -.Tn ZFS -datasets within a -.Tn ZFS -storage pool, as described in -.Xr zpool 8 . -A dataset is identified by a unique path within the -.Tn ZFS -namespace. For example: -.Bd -ragged -offset 4n -.No pool/ Ns Brq filesystem,volume,snapshot -.Ed -.Pp -where the maximum length of a dataset name is -.Dv MAXNAMELEN -(256 bytes) -and the maximum amount of nesting allowed in a path is 50 levels deep. 
-.Pp -A dataset can be one of the following: -.Bl -hang -width 12n -.It Sy file system -A -.Tn ZFS -dataset of type -.Em filesystem -can be mounted within the standard system namespace and behaves like other file -systems. While -.Tn ZFS -file systems are designed to be -.Tn POSIX -compliant, known issues exist that prevent compliance in some cases. -Applications that depend on standards conformance might fail due to nonstandard -behavior when checking file system free space. -.It Sy volume -A logical volume exported as a raw or block device. This type of dataset should -only be used under special circumstances. File systems are typically used in -most environments. -.It Sy snapshot -A read-only version of a file system or volume at a given point in time. It is -specified as -.Em filesystem@name -or -.Em volume@name . -.El -.Ss ZFS File System Hierarchy -A -.Tn ZFS -storage pool is a logical collection of devices that provide space for -datasets. A storage pool is also the root of the -.Tn ZFS -file system hierarchy. -.Pp -The root of the pool can be accessed as a file system, such as mounting and -unmounting, taking snapshots, and setting properties. The physical storage -characteristics, however, are managed by the -.Xr zpool 8 -command. -.Pp -See -.Xr zpool 8 -for more information on creating and administering pools. -.Ss Snapshots -A snapshot is a read-only copy of a file system or volume. Snapshots can be -created extremely quickly, and initially consume no additional space within the -pool. As data within the active dataset changes, the snapshot consumes more -data than would otherwise be shared with the active dataset. -.Pp -Snapshots can have arbitrary names. Snapshots of volumes can be cloned or -rolled back, but cannot be accessed independently. -.Pp -File system snapshots can be accessed under the -.Pa \&.zfs/snapshot -directory in the root of the file system. Snapshots are automatically mounted -on demand and may be unmounted at regular intervals. 
The visibility of the -.Pa \&.zfs -directory can be controlled by the -.Sy snapdir -property. -.Ss Clones -A clone is a writable volume or file system whose initial contents are the same -as another dataset. As with snapshots, creating a clone is nearly -instantaneous, and initially consumes no additional space. -.Pp -Clones can only be created from a snapshot. When a snapshot is cloned, it -creates an implicit dependency between the parent and child. Even though the -clone is created somewhere else in the dataset hierarchy, the original snapshot -cannot be destroyed as long as a clone exists. The -.Sy origin -property exposes this dependency, and the -.Cm destroy -command lists any such dependencies, if they exist. -.Pp -The clone parent-child dependency relationship can be reversed by using the -.Cm promote -subcommand. This causes the "origin" file system to become a clone of the -specified file system, which makes it possible to destroy the file system that -the clone was created from. -.Ss Mount Points -Creating a -.Tn ZFS -file system is a simple operation, so the number of file systems per system is -likely to be numerous. To cope with this, -.Tn ZFS -automatically manages mounting and unmounting file systems without the need to -edit the -.Pa /etc/fstab -file. All automatically managed file systems are mounted by -.Tn ZFS -at boot time. -.Pp -By default, file systems are mounted under -.Pa /path , -where -.Ar path -is the name of the file system in the -.Tn ZFS -namespace. Directories are created and destroyed as needed. -.Pp -A file system can also have a mount point set in the -.Sy mountpoint -property. This directory is created as needed, and -.Tn ZFS -automatically mounts the file system when the -.Qq Nm Cm mount Fl a -command is invoked (without editing -.Pa /etc/fstab ) . 
-The -.Sy mountpoint -property can be inherited, so if -.Em pool/home -has a mount point of -.Pa /home , -then -.Em pool/home/user -automatically inherits a mount point of -.Pa /home/user . -.Pp -A file system -.Sy mountpoint -property of -.Cm none -prevents the file system from being mounted. -.Pp -If needed, -.Tn ZFS -file systems can also be managed with traditional tools -.Pq Xr mount 8 , Xr umount 8 , Xr fstab 5 . -If a file system's mount point is set to -.Cm legacy , -.Tn ZFS -makes no attempt to manage the file system, and the administrator is -responsible for mounting and unmounting the file system. -.Ss Jails -.No A Tn ZFS -dataset can be attached to a jail by using the -.Qq Nm Cm jail -subcommand. You cannot attach a dataset to one jail and the children of the -same dataset to another jail. You can also not attach the root file system -of the jail or any dataset which needs to be mounted before the zfs rc script -is run inside the jail, as it would be attached unmounted until it is -mounted from the rc script inside the jail. To allow management of the -dataset from within a jail, the -.Sy jailed -property has to be set and the jail needs access to the -.Pa /dev/zfs -device. The -.Sy quota -property cannot be changed from within a jail. See -.Xr jail 8 -for information on how to allow mounting -.Tn ZFS -datasets from within a jail. -.Pp -.No A Tn ZFS -dataset can be detached from a jail using the -.Qq Nm Cm unjail -subcommand. -.Pp -After a dataset is attached to a jail and the jailed property is set, a jailed -file system cannot be mounted outside the jail, since the jail administrator -might have set the mount point to an unacceptable value. -.Ss Deduplication -Deduplication is the process for removing redundant data at the block-level, -reducing the total amount of data stored. If a file system has the -.Cm dedup -property enabled, duplicate data blocks are removed synchronously. 
The result -is that only unique data is stored and common components are shared among -files. -.Ss Native Properties -Properties are divided into two types, native properties and user-defined (or -"user") properties. Native properties either export internal statistics or -control -.Tn ZFS -behavior. In addition, native properties are either editable or read-only. User -properties have no effect on -.Tn ZFS -behavior, but you can use them to annotate datasets in a way that is meaningful -in your environment. For more information about user properties, see the -.Qq Sx User Properties -section, below. -.Pp -Every dataset has a set of properties that export statistics about the dataset -as well as control various behaviors. Properties are inherited from the parent -unless overridden by the child. Some properties apply only to certain types of -datasets (file systems, volumes, or snapshots). -.Pp -The values of numeric properties can be specified using human-readable suffixes -(for example, -.Sy k , KB , M , Gb , -and so forth, up to -.Sy Z -for zettabyte). The following are all valid (and equal) specifications: -.Bd -ragged -offset 4n -1536M, 1.5g, 1.50GB -.Ed -.Pp -The values of non-numeric properties are case sensitive and must be lowercase, -except for -.Sy mountpoint , sharenfs , No and Sy sharesmb . -.Pp -The following native properties consist of read-only statistics about the -dataset. These properties can be neither set, nor inherited. Native properties -apply to all dataset types unless otherwise noted. -.Bl -tag -width 2n -.It Sy available -The amount of space available to the dataset and all its children, assuming -that there is no other activity in the pool. Because space is shared within a -pool, availability can be limited by any number of factors, including physical -pool size, quotas, reservations, or other datasets within the pool. -.Pp -This property can also be referred to by its shortened column name, -.Sy avail . 
-.It Sy compressratio -For non-snapshots, the compression ratio achieved for the -.Sy used -space of this dataset, expressed as a multiplier. The -.Sy used -property includes descendant datasets, and, for clones, does not include -the space shared with the origin snapshot. For snapshots, the -.Sy compressratio -is the same as the -.Sy refcompressratio -property. Compression can be turned on by running: -.Qq Nm Cm set compression=on Ar dataset -The default value is -.Cm off . -.It Sy createtxg -The transaction group (txg) in which the dataset was created. -Bookmarks have the same -.Sy createtxg -as the snapshot they are initially tied to. -This property is suitable for ordering a list of snapshots, -e.g. for incremental send and receive. -.It Sy creation -The time this dataset was created. -.It Sy clones -For snapshots, this property is a comma-separated list of filesystems or -volumes which are clones of this snapshot. The clones' -.Sy origin -property is this snapshot. If the -.Sy clones -property is not empty, then this snapshot can not be destroyed (even with the -.Fl r -or -.Fl f -options). -.It Sy defer_destroy -This property is -.Cm on -if the snapshot has been marked for deferred destroy by using the -.Qq Nm Cm destroy -d -command. Otherwise, the property is -.Cm off . -.It Sy filesystem_count -The total number of filesystems and volumes that exist under this location in the -dataset tree. -This value is only available when a -.Sy filesystem_limit -has -been set somewhere in the tree under which the dataset resides. -.It Sy guid -The 64 bit GUID of this dataset or bookmark which does not change over its -entire lifetime. -When a snapshot is sent to another pool, the received snapshot has the same -GUID. -Thus, the -.Sy guid -is suitable to identify a snapshot across pools. -.It Sy logicalreferenced -The amount of space that is -.Qq logically -accessible by this dataset. -See the -.Sy referenced -property. 
-The logical space ignores the effect of the -.Sy compression -and -.Sy copies -properties, giving a quantity closer to the amount of data that applications -see. -However, it does include space consumed by metadata. -.Pp -This property can also be referred to by its shortened column name, -.Sy lrefer . -.It Sy logicalused -The amount of space that is -.Qq logically -consumed by this dataset and all its descendents. -See the -.Sy used -property. -The logical space ignores the effect of the -.Sy compression -and -.Sy copies -properties, giving a quantity closer to the amount of data that applications -see. -.Pp -This property can also be referred to by its shortened column name, -.Sy lused . -.It Sy mounted -For file systems, indicates whether the file system is currently mounted. This -property can be either -.Cm yes -or -.Cm no . -.It Sy origin -For cloned file systems or volumes, the snapshot from which the clone was -created. See also the -.Sy clones -property. -.It Sy receive_resume_token -For filesystems or volumes which have saved partially-completed state from -.Sy zfs receive -s , -this opaque token can be provided to -.Sy zfs send -t -to resume and complete the -.Sy zfs receive . -.It Sy referenced -The amount of data that is accessible by this dataset, which may or may not be -shared with other datasets in the pool. When a snapshot or clone is created, it -initially references the same amount of space as the file system or snapshot it -was created from, since its contents are identical. -.Pp -This property can also be referred to by its shortened column name, -.Sy refer . -.It Sy refcompressratio -The compression ratio achieved for the -.Sy referenced -space of this dataset, expressed as a multiplier. See also the -.Sy compressratio -property. -.It Sy snapshot_count -The total number of snapshots that exist under this location in the dataset tree. 
-This value is only available when a -.Sy snapshot_limit -has been set somewhere -in the tree under which the dataset resides. -.It Sy type -The type of dataset: -.Sy filesystem , volume , No or Sy snapshot . -.It Sy used -The amount of space consumed by this dataset and all its descendents. This is -the value that is checked against this dataset's quota and reservation. The -space used does not include this dataset's reservation, but does take into -account the reservations of any descendent datasets. The amount of space that a -dataset consumes from its parent, as well as the amount of space that are freed -if this dataset is recursively destroyed, is the greater of its space used and -its reservation. -.Pp -When snapshots (see the -.Qq Sx Snapshots -section) are created, their space is -initially shared between the snapshot and the file system, and possibly with -previous snapshots. As the file system changes, space that was previously -shared becomes unique to the snapshot, and counted in the snapshot's space -used. Additionally, deleting snapshots can increase the amount of space unique -to (and used by) other snapshots. -.Pp -The amount of space used, available, or referenced does not take into account -pending changes. Pending changes are generally accounted for within a few -seconds. Committing a change to a disk using -.Xr fsync 2 -or -.Sy O_SYNC -does not necessarily guarantee that the space usage information is updated -immediately. -.It Sy usedby* -The -.Sy usedby* -properties decompose the -.Sy used -properties into the various reasons that space is used. Specifically, -.Sy used No = -.Sy usedbysnapshots + usedbydataset + usedbychildren + usedbyrefreservation . -These properties are only available for datasets created -with -.Tn ZFS -pool version 13 pools and higher. -.It Sy usedbysnapshots -The amount of space consumed by snapshots of this dataset. 
In particular, it is -the amount of space that would be freed if all of this dataset's snapshots were -destroyed. Note that this is not simply the sum of the snapshots' -.Sy used -properties because space can be shared by multiple snapshots. -.It Sy usedbydataset -The amount of space used by this dataset itself, which would be freed if the -dataset were destroyed (after first removing any -.Sy refreservation -and destroying any necessary snapshots or descendents). -.It Sy usedbychildren -The amount of space used by children of this dataset, which would be freed if -all the dataset's children were destroyed. -.It Sy usedbyrefreservation -The amount of space used by a -.Sy refreservation -set on this dataset, which would be freed if the -.Sy refreservation -was removed. -.It Sy userused@ Ns Ar user -The amount of space consumed by the specified user in this dataset. Space is -charged to the owner of each file, as displayed by -.Qq Nm ls Fl l . -The amount of space charged is displayed by -.Qq Nm du -and -.Qq Nm ls Fl s . -See the -.Qq Nm Cm userspace -subcommand for more information. -.Pp -Unprivileged users can access only their own space usage. The root user, or a -user who has been granted the -.Sy userused -privilege with -.Qq Nm Cm allow , -can access everyone's usage. -.Pp -The -.Sy userused@ Ns ... -properties are not displayed by -.Qq Nm Cm get all . -The user's name must be appended after the -.Sy @ -symbol, using one of the following forms: -.Bl -bullet -offset 2n -.It -POSIX name (for example, -.Em joe ) -.It -POSIX numeric ID (for example, -.Em 1001 ) -.El -.It Sy userrefs -This property is set to the number of user holds on this snapshot. User holds -are set by using the -.Qq Nm Cm hold -command. -.It Sy groupused@ Ns Ar group -The amount of space consumed by the specified group in this dataset. Space is -charged to the group of each file, as displayed by -.Nm ls Fl l . -See the -.Sy userused@ Ns Ar user -property for more information. 
-.Pp -Unprivileged users can only access their own groups' space usage. The root -user, or a user who has been granted the -.Sy groupused -privilege with -.Qq Nm Cm allow , -can access all groups' usage. -.It Sy volblocksize Ns = Ns Ar blocksize -For volumes, specifies the block size of the volume. The -.Ar blocksize -cannot be changed once the volume has been written, so it should be set at -volume creation time. The default -.Ar blocksize -for volumes is 8 Kbytes. Any -power of 2 from 512 bytes to 128 Kbytes is valid. -.Pp -This property can also be referred to by its shortened column name, -.Sy volblock . -.It Sy written -The amount of -.Sy referenced -space written to this dataset since the previous snapshot. -.It Sy written@ Ns Ar snapshot -The amount of -.Sy referenced -space written to this dataset since the specified snapshot. This is the space -that is referenced by this dataset but was not referenced by the specified -snapshot. -.Pp -The -.Ar snapshot -may be specified as a short snapshot name (just the part after the -.Sy @ ) , -in which case it will be interpreted as a snapshot in the same filesystem as -this dataset. The -.Ar snapshot -may be a full snapshot name -.Pq Em filesystem@snapshot , -which for clones may be a snapshot in the origin's filesystem (or the origin of -the origin's filesystem, etc). -.El -.Pp -The following native properties can be used to change the behavior of a -.Tn ZFS -dataset. -.Bl -tag -width 2n -.It Xo -.Sy aclinherit Ns = Ns Cm discard | -.Cm noallow | -.Cm restricted | -.Cm passthrough | -.Cm passthrough-x -.Xc -Controls how -.Tn ACL -entries are inherited when files and directories are created. A file system -with an -.Sy aclinherit -property of -.Cm discard -does not inherit any -.Tn ACL -entries. A file system with an -.Sy aclinherit -property value of -.Cm noallow -only inherits inheritable -.Tn ACL -entries that specify "deny" permissions. 
The property value -.Cm restricted -(the default) removes the -.Em write_acl -and -.Em write_owner -permissions when the -.Tn ACL -entry is inherited. A file system with an -.Sy aclinherit -property value of -.Cm passthrough -inherits all inheritable -.Tn ACL -entries without any modifications made to the -.Tn ACL -entries when they are inherited. A file system with an -.Sy aclinherit -property value of -.Cm passthrough-x -has the same meaning as -.Cm passthrough , -except that the -.Em owner@ , group@ , No and Em everyone@ Tn ACE Ns s -inherit the execute permission only if the file creation mode also requests the -execute bit. -.Pp -When the property value is set to -.Cm passthrough , -files are created with a mode determined by the inheritable -.Tn ACE Ns s. -If no inheritable -.Tn ACE Ns s -exist that affect the mode, then the mode is set in accordance to the requested -mode from the application. -.It Sy aclmode Ns = Ns Cm discard | groupmask | passthrough | restricted -Controls how an -.Tn ACL -is modified during -.Xr chmod 2 . -A file system with an -.Sy aclmode -property of -.Cm discard -(the default) deletes all -.Tn ACL -entries that do not represent the mode of the file. An -.Sy aclmode -property of -.Cm groupmask -reduces permissions granted in all -.Em ALLOW -entries found in the -.Tn ACL -such that they are no greater than the group permissions specified by -.Xr chmod 2 . -A file system with an -.Sy aclmode -property of -.Cm passthrough -indicates that no changes are made to the -.Tn ACL -other than creating or updating the necessary -.Tn ACL -entries to represent the new mode of the file or directory. -An -.Sy aclmode -property of -.Cm restricted -will cause the -.Xr chmod 2 -operation to return an error when used on any file or directory which has -a non-trivial -.Tn ACL -whose entries can not be represented by a mode. 
-.Xr chmod 2 -is required to change the set user ID, set group ID, or sticky bits on a file -or directory, as they do not have equivalent -.Tn ACL -entries. -In order to use -.Xr chmod 2 -on a file or directory with a non-trivial -.Tn ACL -when -.Sy aclmode -is set to -.Cm restricted , -you must first remove all -.Tn ACL -entries which do not represent the current mode. -.It Sy atime Ns = Ns Cm on | off -Controls whether the access time for files is updated when they are read. -Turning this property off avoids producing write traffic when reading files and -can result in significant performance gains, though it might confuse mailers -and other similar utilities. The default value is -.Cm on . -.It Sy canmount Ns = Ns Cm on | off | noauto -If this property is set to -.Cm off , -the file system cannot be mounted, and is ignored by -.Qq Nm Cm mount Fl a . -Setting this property to -.Cm off -is similar to setting the -.Sy mountpoint -property to -.Cm none , -except that the dataset still has a normal -.Sy mountpoint -property, which can be inherited. Setting this property to -.Cm off -allows datasets to be used solely as a mechanism to inherit properties. One -example of setting -.Sy canmount Ns = Ns Cm off -is to have two datasets with the same -.Sy mountpoint , -so that the children of both datasets appear in the same directory, but might -have different inherited characteristics. -.Pp -When the -.Cm noauto -value is set, a dataset can only be mounted and unmounted explicitly. The -dataset is not mounted automatically when the dataset is created or imported, -nor is it mounted by the -.Qq Nm Cm mount Fl a -command or unmounted by the -.Qq Nm Cm umount Fl a -command. -.Pp -This property is not inherited. -.It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256 | noparity | sha512 | skein -Controls the checksum used to verify data integrity. 
The default value is -.Cm on , -which automatically selects an appropriate algorithm (currently, -.Cm fletcher4 , -but this may change in future releases). The value -.Cm off -disables integrity checking on user data. -The value -.Cm noparity -not only -disables integrity but also disables maintaining parity for user data. This -setting is used internally by a dump device residing on a RAID-Z pool and should -not be used by any other dataset. -Disabling checksums is -.Em NOT -a recommended practice. -The -.Sy sha512 , -and -.Sy skein -checksum algorithms require enabling the appropriate features on the pool. -Please see -.Xr zpool-features 7 -for more information on these algorithms. -.Pp -Changing this property affects only newly-written data. -.Pp -The salted checksum algorithm -.Pq Cm edonr -is currently not supported on FreeBSD. -.It Sy compression Ns = Ns Cm on | off | lzjb | gzip | gzip- Ns Ar N | Cm zle | Cm lz4 -Controls the compression algorithm used for this dataset. -Setting compression to -.Cm on -indicates that the current default compression algorithm should be used. -The default balances compression and decompression speed, with compression -ratio and is expected to work well on a wide variety of workloads. -Unlike all other settings for this property, on does not select a fixed -compression type. -As new compression algorithms are added to ZFS and enabled on a pool, the -default compression algorithm may change. -The current default compression algorthm is either -.Cm lzjb -or, if the -.Sy lz4_compress -feature is enabled, -.Cm lz4 . -The -.Cm lzjb -compression algorithm is optimized for performance while providing decent data -compression. Setting compression to -.Cm on -uses the -.Cm lzjb -compression algorithm. The -.Cm gzip -compression algorithm uses the same compression as the -.Xr gzip 1 -command. 
You can specify the -.Cm gzip -level by using the value -.Cm gzip- Ns Ar N -where -.Ar N -is an integer from 1 (fastest) to 9 (best compression ratio). Currently, -.Cm gzip -is equivalent to -.Cm gzip-6 -(which is also the default for -.Xr gzip 1 ) . -The -.Cm zle -compression algorithm compresses runs of zeros. -.Pp -The -.Sy lz4 -compression algorithm is a high-performance replacement -for the -.Sy lzjb -algorithm. It features significantly faster -compression and decompression, as well as a moderately higher -compression ratio than -.Sy lzjb , -but can only be used on pools with -the -.Sy lz4_compress -feature set to -.Sy enabled . -See -.Xr zpool-features 7 -for details on ZFS feature flags and the -.Sy lz4_compress -feature. -.Pp -This property can also be referred to by its shortened column name -.Cm compress . -Changing this property affects only newly-written data. -.It Sy copies Ns = Ns Cm 1 | 2 | 3 -Controls the number of copies of data stored for this dataset. These copies are -in addition to any redundancy provided by the pool, for example, mirroring or -RAID-Z. The copies are stored on different disks, if possible. The space used -by multiple copies is charged to the associated file and dataset, changing the -.Sy used -property and counting against quotas and reservations. -.Pp -Changing this property only affects newly-written data. Therefore, set this -property at file system creation time by using the -.Fl o Cm copies= Ns Ar N -option. -.It Sy dedup Ns = Ns Cm on | off | verify | sha256 Ns Oo Cm ,verify Oc | Sy sha512 Ns Oo Cm ,verify Oc | Sy skein Ns Oo Cm ,verify Oc -Configures deduplication for a dataset. The default value is -.Cm off . -The default deduplication checksum is -.Cm sha256 -(this may change in the future). -When -.Sy dedup -is enabled, the checksum defined here overrides the -.Sy checksum -property. Setting the value to -.Cm verify -has the same effect as the setting -.Cm sha256,verify . 
-.Pp -If set to -.Cm verify , -.Tn ZFS -will do a byte-to-byte comparsion in case of two blocks having the same -signature to make sure the block contents are identical. -.It Sy devices Ns = Ns Cm on | off -The -.Sy devices -property is currently not supported on -.Fx . -.It Sy exec Ns = Ns Cm on | off -Controls whether processes can be executed from within this file system. The -default value is -.Cm on . -.It Sy mlslabel Ns = Ns Ar label | Cm none -The -.Sy mlslabel -property is currently not supported on -.Fx . -.It Sy filesystem_limit Ns = Ns Ar count | Cm none -Limits the number of filesystems and volumes that can exist under this point in -the dataset tree. -The limit is not enforced if the user is allowed to change -the limit. -Setting a -.Sy filesystem_limit -on a descendent of a filesystem that -already has a -.Sy filesystem_limit -does not override the ancestor's -.Sy filesystem_limit , -but rather imposes an additional limit. -This feature must be enabled to be used -.Po see -.Xr zpool-features 7 -.Pc . -.It Sy special_small_blocks Ns = Ns Ar size -This value represents the threshold block size for including small file -blocks into the special allocation class. -Blocks smaller than or equal to this value will be assigned to the special -allocation class while greater blocks will be assigned to the regular class. -Valid values are zero or a power of two from 512B up to 128K. -The default size is 0 which means no small file blocks will be allocated in -the special class. -.Pp -Before setting this property, a special class vdev must be added to the -pool. -See -.Xr zpool 8 -for more details on the special allocation class. -.It Sy mountpoint Ns = Ns Ar path | Cm none | legacy -Controls the mount point used for this file system. -See the -.Qq Sx Mount Points -section for more information on how this property is used. 
-.Pp -When the -.Sy mountpoint -property is changed for a file system, the file system and any children that -inherit the mount point are unmounted. If the new value is -.Cm legacy , -then they remain unmounted. Otherwise, they are automatically remounted in the -new location if the property was previously -.Cm legacy -or -.Cm none , -or if they were mounted before the property was changed. In addition, any -shared file systems are unshared and shared in the new location. -.It Sy nbmand Ns = Ns Cm on | off -The -.Sy nbmand -property is currently not supported on -.Fx . -.It Sy primarycache Ns = Ns Cm all | none | metadata -Controls what is cached in the primary cache (ARC). If this property is set to -.Cm all , -then both user data and metadata is cached. If this property is set to -.Cm none , -then neither user data nor metadata is cached. If this property is set to -.Cm metadata , -then only metadata is cached. The default value is -.Cm all . -.It Sy quota Ns = Ns Ar size | Cm none -Limits the amount of space a dataset and its descendents can consume. This -property enforces a hard limit on the amount of space used. This includes all -space consumed by descendents, including file systems and snapshots. Setting a -quota on a descendent of a dataset that already has a quota does not override -the ancestor's quota, but rather imposes an additional limit. -.Pp -Quotas cannot be set on volumes, as the -.Sy volsize -property acts as an implicit quota. -.It Sy snapshot_limit Ns = Ns Ar count | Cm none -Limits the number of snapshots that can be created on a dataset and its -descendents. -Setting a -.Sy snapshot_limit -on a descendent of a dataset that already -has a -.Sy snapshot_limit -does not override the ancestor's -.Sy snapshot_limit , -but -rather imposes an additional limit. -The limit is not enforced if the user is -allowed to change the limit. 
-For example, this means that recursive snapshots -taken from the global zone are counted against each delegated dataset within -a jail. -This feature must be enabled to be used -.Po see -.Xr zpool-features 7 -.Pc . -.It Sy userquota@ Ns Ar user Ns = Ns Ar size | Cm none -Limits the amount of space consumed by the specified user. -Similar to the -.Sy refquota -property, the -.Sy userquota -space calculation does not include space that is used by descendent datasets, -such as snapshots and clones. User space consumption is identified by the -.Sy userspace@ Ns Ar user -property. -.Pp -Enforcement of user quotas may be delayed by several seconds. This delay means -that a user might exceed their quota before the system notices that they are -over quota and begins to refuse additional writes with the -.Em EDQUOT -error message. See the -.Cm userspace -subcommand for more information. -.Pp -Unprivileged users can only access their own groups' space usage. The root -user, or a user who has been granted the -.Sy userquota -privilege with -.Qq Nm Cm allow , -can get and set everyone's quota. -.Pp -This property is not available on volumes, on file systems before version 4, or -on pools before version 15. The -.Sy userquota@ Ns ... -properties are not displayed by -.Qq Nm Cm get all . -The user's name must be appended after the -.Sy @ -symbol, using one of the following forms: -.Bl -bullet -offset 2n -.It -POSIX name (for example, -.Em joe ) -.It -POSIX numeric ID (for example, -.Em 1001 ) -.El -.It Sy groupquota@ Ns Ar group Ns = Ns Ar size | Cm none -Limits the amount of space consumed by the specified group. Group space -consumption is identified by the -.Sy userquota@ Ns Ar user -property. -.Pp -Unprivileged users can access only their own groups' space usage. The root -user, or a user who has been granted the -.Sy groupquota -privilege with -.Qq Nm Cm allow , -can get and set all groups' quotas. 
-.It Sy readonly Ns = Ns Cm on | off -Controls whether this dataset can be modified. The default value is -.Cm off . -.It Sy recordsize Ns = Ns Ar size -Specifies a suggested block size for files in the file system. This property is -designed solely for use with database workloads that access files in fixed-size -records. -.Tn ZFS -automatically tunes block sizes according to internal algorithms optimized for -typical access patterns. -.Pp -For databases that create very large files but access them in small random -chunks, these algorithms may be suboptimal. Specifying a -.Sy recordsize -greater than or equal to the record size of the database can result in -significant performance gains. Use of this property for general purpose file -systems is strongly discouraged, and may adversely affect performance. -.Pp -The size specified must be a power of two greater than or equal to 512 and less -than or equal to 128 Kbytes. -If the -.Sy large_blocks -feature is enabled on the pool, the size may be up to 1 Mbyte. -See -.Xr zpool-features 7 -for details on ZFS feature flags. -.Pp -Changing the file system's -.Sy recordsize -affects only files created afterward; existing files are unaffected. -.Pp -This property can also be referred to by its shortened column name, -.Sy recsize . -.It Sy redundant_metadata Ns = Ns Cm all | most -Controls what types of metadata are stored redundantly. -ZFS stores an extra copy of metadata, so that if a single block is corrupted, -the amount of user data lost is limited. -This extra copy is in addition to any redundancy provided at the pool level -.Pq e.g. by mirroring or RAID-Z , -and is in addition to an extra copy specified by the -.Sy copies -property -.Pq up to a total of 3 copies . -For example if the pool is mirrored, -.Cm copies Ns = Ns Ar 2 , -and -.Cm redundant_metadata Ns = Ns Ar most , -then ZFS -stores 6 copies of most metadata, and 4 copies of data and some -metadata. 
-.Pp -When set to -.Cm all , -ZFS stores an extra copy of all metadata. -If a -single on-disk block is corrupt, at worst a single block of user data -.Po which is -.Cm recordsize -bytes long -can be lost. -.Pc -.Pp -When set to -.Cm most , -ZFS stores an extra copy of most types of -metadata. -This can improve performance of random writes, because less -metadata must be written. -In practice, at worst about 100 blocks -.Po of -.Cm recordsize -bytes each -.Pc -of user data can be lost if a single -on-disk block is corrupt. -The exact behavior of which metadata blocks -are stored redundantly may change in future releases. -.Pp -The default value is -.Cm all . -.It Sy refquota Ns = Ns Ar size | Cm none -Limits the amount of space a dataset can consume. This property enforces a hard -limit on the amount of space used. This hard limit does not include space used -by descendents, including file systems and snapshots. -.It Sy refreservation Ns = Ns Ar size | Cm none | Cm auto -The minimum amount of space guaranteed to a dataset, not including its -descendents. When the amount of space used is below this value, the dataset is -treated as if it were taking up the amount of space specified by -.Sy refreservation . -The -.Sy refreservation -reservation is accounted for in the parent datasets' space used, and counts -against the parent datasets' quotas and reservations. -.Pp -If -.Sy refreservation -is set, a snapshot is only allowed if there is enough free pool space outside -of this reservation to accommodate the current number of "referenced" bytes in -the dataset. -.Pp -If -.Sy refreservation -is set to -.Sy auto , -a volume is thick provisioned or not sparse. -.Sy refreservation Ns = Cm auto -is only supported on volumes. -See -.Sy volsize -in the Native Properties -section for more information about sparse volumes. -.Pp -This property can also be referred to by its shortened column name, -.Sy refreserv . 
-.It Sy reservation Ns = Ns Ar size | Cm none -The minimum amount of space guaranteed to a dataset and its descendents. When -the amount of space used is below this value, the dataset is treated as if it -were taking up the amount of space specified by its reservation. Reservations -are accounted for in the parent datasets' space used, and count against the -parent datasets' quotas and reservations. -.Pp -This property can also be referred to by its shortened column name, -.Sy reserv . -.It Sy secondarycache Ns = Ns Cm all | none | metadata -Controls what is cached in the secondary cache (L2ARC). If this property is set -to -.Cm all , -then both user data and metadata is cached. If this property is set to -.Cm none , -then neither user data nor metadata is cached. If this property is set to -.Cm metadata , -then only metadata is cached. The default value is -.Cm all . -.It Sy setuid Ns = Ns Cm on | off -Controls whether the -.No set- Ns Tn UID -bit is respected for the file system. The default value is -.Cm on . -.It Sy sharesmb Ns = Ns Cm on | off | Ar opts -The -.Sy sharesmb -property currently has no effect on -.Fx . -.It Sy sharenfs Ns = Ns Cm on | off | Ar opts -Controls whether the file system is shared via -.Tn NFS , -and what options are used. A file system with a -.Sy sharenfs -property of -.Cm off -is managed the traditional way via -.Xr exports 5 . -Otherwise, the file system is automatically shared and unshared with the -.Qq Nm Cm share -and -.Qq Nm Cm unshare -commands. If the property is set to -.Cm on -no -.Tn NFS -export options are used. Otherwise, -.Tn NFS -export options are equivalent to the contents of this property. The export -options may be comma-separated. See -.Xr exports 5 -for a list of valid options. -.Pp -When the -.Sy sharenfs -property is changed for a dataset, the -.Xr mountd 8 -daemon is reloaded. -.It Sy logbias Ns = Ns Cm latency | throughput -Provide a hint to -.Tn ZFS -about handling of synchronous requests in this dataset. 
-If -.Sy logbias -is set to -.Cm latency -(the default), -.Tn ZFS -will use pool log devices (if configured) to handle the requests at low -latency. If -.Sy logbias -is set to -.Cm throughput , -.Tn ZFS -will not use configured pool log devices. -.Tn ZFS -will instead optimize synchronous operations for global pool throughput and -efficient use of resources. -.It Sy snapdir Ns = Ns Cm hidden | visible -Controls whether the -.Pa \&.zfs -directory is hidden or visible in the root of the file system as discussed in -the -.Qq Sx Snapshots -section. The default value is -.Cm hidden . -.It Sy sync Ns = Ns Cm standard | always | disabled -Controls the behavior of synchronous requests (e.g. -.Xr fsync 2 , -O_DSYNC). This property accepts the following values: -.Bl -tag -offset 4n -width 8n -.It Sy standard -This is the POSIX specified behavior of ensuring all synchronous requests are -written to stable storage and all devices are flushed to ensure data is not -cached by device controllers (this is the default). -.It Sy always -All file system transactions are written and flushed before their system calls -return. This has a large performance penalty. -.It Sy disabled -Disables synchronous requests. File system transactions are only committed to -stable storage periodically. This option will give the highest performance. -However, it is very dangerous as -.Tn ZFS -would be ignoring the synchronous transaction demands of applications such as -databases or -.Tn NFS . -Administrators should only use this option when the risks are understood. -.El -.It Sy volsize Ns = Ns Ar size -For volumes, specifies the logical size of the volume. By default, creating a -volume establishes a reservation of equal size. For storage pools with a -version number of 9 or higher, a -.Sy refreservation -is set instead. Any changes to -.Sy volsize -are reflected in an equivalent change to the reservation (or -.Sy refreservation ) . 
-The -.Sy volsize -can only be set to a multiple of -.Cm volblocksize , -and cannot be zero. -.Pp -The reservation is kept equal to the volume's logical size to prevent -unexpected behavior for consumers. Without the reservation, the volume could -run out of space, resulting in undefined behavior or data corruption, depending -on how the volume is used. These effects can also occur when the volume size is -changed while it is in use (particularly when shrinking the size). Extreme care -should be used when adjusting the volume size. -.Pp -Though not recommended, a "sparse volume" (also known as "thin provisioned") -can be created by specifying the -.Fl s -option to the -.Qq Nm Cm create Fl V -command, or by changing the value of the -.Sy refreservation -property, or -.Sy reservation -property on pool -.Po -version 8 or earlier -.Pc -after the volume has been created. -A "sparse volume" is a volume where the value of -.Sy refreservation -is less then the size of the volume plus the space required to store its -metadata. -Consequently, writes to a sparse volume can fail with -.Sy ENOSPC -when the pool is low on space. For a sparse volume, changes to -.Sy volsize -are not reflected in the -.Sy refreservation . -A volume that is not sparse is said to be "thick provisioned". -A sparse volume can become thick provisioned by setting -.Sy refreservation -to -.Sy auto . -.It Sy volmode Ns = Ns Cm default | geom | dev | none -This property specifies how volumes should be exposed to the OS. -Setting it to -.Sy geom -exposes volumes as -.Xr geom 4 -providers, providing maximal functionality. -Setting it to -.Sy dev -exposes volumes only as cdev device in devfs. -Such volumes can be accessed only as raw disk device files, i.e. they -can not be partitioned, mounted, participate in RAIDs, etc, but they -are faster, and in some use scenarios with untrusted consumer, such as -NAS or VM storage, can be more safe. 
-Volumes with property set to
-.Sy none
-are not exposed outside ZFS, but can be snapshotted, cloned, replicated, etc.,
-which can be suitable for backup purposes.
-Value
-.Sy default
-means that the exposure of volumes is controlled by the system-wide
-sysctl/tunable
-.Va vfs.zfs.vol.mode ,
-where
-.Sy geom ,
-.Sy dev
-and
-.Sy none
-are encoded as 1, 2 and 3 respectively.
-The default value is
-.Sy geom .
-.It Sy normalization Ns = Ns Cm none | formC | formD | formKC | formKD -Indicates whether the file system should perform a -.Sy unicode -normalization of file names whenever two file names are compared, and which -normalization algorithm should be used. File names are always stored -unmodified, names are normalized as part of any comparison process. If this -property is set to a legal value other than -.Cm none , -and the -.Sy utf8only -property was left unspecified, the -.Sy utf8only -property is automatically set to -.Cm on . -The default value of the -.Sy normalization -property is -.Cm none . -This property cannot be changed after the file system is created. -.It Sy utf8only Ns = Ns Cm on | off -Indicates whether the file system should reject file names that include -characters that are not present in the -.Sy UTF-8 -character code set. If this property is explicitly set to -.Cm off , -the normalization property must either not be explicitly set or be set to -.Cm none . -The default value for the -.Sy utf8only -property is -.Cm off . -This property cannot be changed after the file system is created. -.El -.Pp -The -.Sy casesensitivity , normalization , No and Sy utf8only -properties are also new permissions that can be assigned to non-privileged -users by using the -.Tn ZFS -delegated administration feature. -.Ss Temporary Mount Point Properties -When a file system is mounted, either through -.Xr mount 8 -for legacy mounts or the -.Qq Nm Cm mount -command for normal file systems, its mount options are set according to its -properties. The correlation between properties and mount options is as follows: -.Bl -column -offset 4n "PROPERTY" "MOUNT OPTION" -.It "PROPERTY MOUNT OPTION" -.It "atime atime/noatime" -.It "exec exec/noexec" -.It "readonly ro/rw" -.It "setuid suid/nosuid" -.El -.Pp -In addition, these options can be set on a per-mount basis using the -.Fl o -option, without affecting the property that is stored on disk. 
The values -specified on the command line override the values stored in the dataset. These -properties are reported as "temporary" by the -.Qq Nm Cm get -command. If the properties are changed while the dataset is mounted, the new -setting overrides any temporary settings. -.Ss User Properties -In addition to the standard native properties, -.Tn ZFS -supports arbitrary user properties. User properties have no effect on -.Tn ZFS -behavior, but applications or administrators can use them to annotate datasets -(file systems, volumes, and snapshots). -.Pp -User property names must contain a colon -.Pq Sy \&: -character to distinguish them from native properties. They may contain -lowercase letters, numbers, and the following punctuation characters: colon -.Pq Sy \&: , -dash -.Pq Sy \&- , -period -.Pq Sy \&. -and underscore -.Pq Sy \&_ . -The expected convention is that the property name is divided into two portions -such as -.Em module Ns Sy \&: Ns Em property , -but this namespace is not enforced by -.Tn ZFS . -User property names can be at most 256 characters, and cannot begin with a dash -.Pq Sy \&- . -.Pp -When making programmatic use of user properties, it is strongly suggested to -use a reversed -.Tn DNS -domain name for the -.Ar module -component of property names to reduce the chance that two -independently-developed packages use the same property name for different -purposes. Property names beginning with -.Em com.sun -are reserved for use by Sun Microsystems. -.Pp -The values of user properties are arbitrary strings, are always inherited, and -are never validated. All of the commands that operate on properties -.Po -.Qq Nm Cm list , -.Qq Nm Cm get , -.Qq Nm Cm set -and so forth -.Pc -can be used to manipulate both native properties and user properties. Use the -.Qq Nm Cm inherit -command to clear a user property. If the property is not defined in any parent -dataset, it is removed entirely. Property values are limited to 1024 -characters. 
-.Sh SUBCOMMANDS -All subcommands that modify state are logged persistently to the pool in their -original form. -.Bl -tag -width 2n -.It Xo -.Nm -.Op Fl \&? -.Xc -.Pp -Displays a help message. -.It Xo -.Nm -.Cm create -.Op Fl pu -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem -.Xc -.Pp -Creates a new -.Tn ZFS -file system. The file system is automatically mounted according to the -.Sy mountpoint -property inherited from the parent. -.Bl -tag -width indent -.It Fl p -Creates all the non-existing parent datasets. Datasets created in this manner -are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. Any property specified on the command -line using the -.Fl o -option is ignored. If the target filesystem already exists, the operation -completes successfully. -.It Fl u -Newly created file system is not mounted. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property as if the command -.Qq Nm Cm set Ar property Ns = Ns Ar value -was invoked at the same time the dataset was created. Any editable -.Tn ZFS -property can also be set at creation time. Multiple -.Fl o -options can be specified. An error results if the same property is specified in -multiple -.Fl o -options. -.El -.It Xo -.Nm -.Cm create -.Op Fl ps -.Op Fl b Ar blocksize -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Fl V -.Ar size volume -.Xc -.Pp -Creates a volume of the given size. The volume is exported as a block device in -.Pa /dev/zvol/path , -where -.Ar path -is the name of the volume in the -.Tn ZFS -namespace. The size represents the logical size as exported by the device. By -default, a reservation of equal size is created. -.Pp -.Ar size -is automatically rounded up to the nearest 128 Kbytes to ensure that -the volume has an integral number of blocks regardless of -.Ar blocksize . -.Bl -tag -width indent -.It Fl p -Creates all the non-existing parent datasets. 
Datasets created in this manner -are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. Any property specified on the command -line using the -.Fl o -option is ignored. If the target filesystem already exists, the operation -completes successfully. -.It Fl s -Creates a sparse volume with no reservation. See -.Sy volsize -in the -.Qq Sx Native Properties -section for more information about sparse volumes. -.It Fl b Ar blocksize -Equivalent to -.Fl o Cm volblocksize Ns = Ns Ar blocksize . -If this option is specified in conjunction with -.Fl o Cm volblocksize , -the resulting behavior is undefined. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property as if the -.Qq Nm Cm set Ar property Ns = Ns Ar value -command was invoked at the same time the dataset was created. Any editable -.Tn ZFS -property can also be set at creation time. Multiple -.Fl o -options can be specified. An error results if the same property is specified in -multiple -.Fl o -options. -.El -.It Xo -.Nm -.Cm destroy -.Op Fl fnpRrv -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Destroys the given dataset. By default, the command unshares any file systems -that are currently shared, unmounts any file systems that are currently -mounted, and refuses to destroy a dataset that has active dependents (children -or clones). -.Bl -tag -width indent -.It Fl r -Recursively destroy all children. -.It Fl R -Recursively destroy all dependents, including cloned file systems outside the -target hierarchy. -.It Fl f -Force an unmount of any file systems using the -.Qq Nm Cm unmount Fl f -command. This option has no effect on non-file systems or unmounted file -systems. -.It Fl n -Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in -conjunction with the -.Fl v -or -.Fl p -flags to determine what data would be deleted. -.It Fl p -Print machine-parsable verbose information about the deleted data. 
-.It Fl v -Print verbose information about the deleted data. -.El -.Pp -Extreme care should be taken when applying either the -.Fl r -or the -.Fl R -options, as they can destroy large portions of a pool and cause unexpected -behavior for mounted file systems in use. -.It Xo -.Nm -.Cm destroy -.Op Fl dnpRrv -.Sm off -.Ar snapshot -.Op % Ns Ar snapname -.Op , Ns ... -.Sm on -.Xc -.Pp -The given snapshots are destroyed immediately if and only if the -.Qq Nm Cm destroy -command without the -.Fl d -option would have destroyed it. Such immediate destruction would occur, for -example, if the snapshot had no clones and the user-initiated reference count -were zero. -.Pp -If a snapshot does not qualify for immediate destruction, it is marked for -deferred deletion. In this state, it exists as a usable, visible snapshot until -both of the preconditions listed above are met, at which point it is destroyed. -.Pp -An inclusive range of snapshots may be specified by separating the -first and last snapshots with a percent sign -.Pq Sy % . -The first and/or last snapshots may be left blank, in which case the -filesystem's oldest or newest snapshot will be implied. -.Pp -Multiple snapshots -(or ranges of snapshots) of the same filesystem or volume may be specified -in a comma-separated list of snapshots. -Only the snapshot's short name (the -part after the -.Sy @ ) -should be specified when using a range or comma-separated list to identify -multiple snapshots. -.Bl -tag -width indent -.It Fl r -Destroy (or mark for deferred deletion) all snapshots with this name in -descendent file systems. -.It Fl R -Recursively destroy all clones of these snapshots, including the clones, -snapshots, and children. -If this flag is specified, the -.Fl d -flag will have no effect. -.It Fl n -Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in -conjunction with the -.Fl v -or -.Fl p -flags to determine what data would be deleted. 
-.It Fl p -Print machine-parsable verbose information about the deleted data. -.It Fl v -Print verbose information about the deleted data. -.It Fl d -Defer snapshot deletion. -.El -.Pp -Extreme care should be taken when applying either the -.Fl r -or the -.Fl R -options, as they can destroy large portions of a pool and cause unexpected -behavior for mounted file systems in use. -.It Xo -.Nm -.Cm destroy -.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark -.Xc -.Pp -The given bookmark is destroyed. -.It Xo -.Nm -.Cm snapshot Ns | Ns Cm snap -.Op Fl r -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem@snapname Ns | Ns volume@snapname -.Ar filesystem@snapname Ns | Ns volume@snapname Ns ... -.Xc -.Pp -Creates snapshots with the given names. All previous modifications by -successful system calls to the file system are part of the snapshots. -Snapshots are taken atomically, so that all snapshots correspond to the same -moment in time. See the -.Qq Sx Snapshots -section for details. -.Bl -tag -width indent -.It Fl r -Recursively create snapshots of all descendent datasets -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property; see -.Qq Nm Cm create -for details. -.El -.It Xo -.Nm -.Cm rollback -.Op Fl rRf -.Ar snapshot -.Xc -.Pp -Roll back the given dataset to a previous snapshot. When a dataset is rolled -back, all data that has changed since the snapshot is discarded, and the -dataset reverts to the state at the time of the snapshot. By default, the -command refuses to roll back to a snapshot other than the most recent one. In -order to do so, all intermediate snapshots and bookmarks must be destroyed -by specifying the -.Fl r -option. -.Pp -The -.Fl rR -options do not recursively destroy the child snapshots of a -recursive snapshot. -Only direct snapshots of the specified filesystem -are destroyed by either of these options. -To completely roll back a -recursive snapshot, you must rollback the individual child snapshots. 
-.Bl -tag -width indent -.It Fl r -Destroy any snapshots and bookmarks more recent than the one specified. -.It Fl R -Destroy any more recent snapshots and bookmarks, as well as any clones of those -snapshots. -.It Fl f -Used with the -.Fl R -option to force an unmount of any clone file systems that are to be destroyed. -.El -.It Xo -.Nm -.Cm clone -.Op Fl p -.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... -.Ar snapshot filesystem Ns | Ns Ar volume -.Xc -.Pp -Creates a clone of the given snapshot. See the -.Qq Sx Clones -section for details. The target dataset can be located anywhere in the -.Tn ZFS -hierarchy, and is created as the same type as the original. -.Bl -tag -width indent -.It Fl p -Creates all the non-existing parent datasets. Datasets created in this manner -are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. If the target filesystem or volume -already exists, the operation completes successfully. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property; see -.Qq Nm Cm create -for details. -.El -.It Xo -.Nm -.Cm promote -.Ar clone-filesystem -.Xc -.Pp -Promotes a clone file system to no longer be dependent on its "origin" -snapshot. This makes it possible to destroy the file system that the clone was -created from. The clone parent-child dependency relationship is reversed, so -that the origin file system becomes a clone of the specified file system. -.Pp -The snapshot that was cloned, and any snapshots previous to this snapshot, are -now owned by the promoted clone. The space they use moves from the origin file -system to the promoted clone, so enough space must be available to accommodate -these snapshots. No new space is consumed by this operation, but the space -accounting is adjusted. The promoted clone must not have any conflicting -snapshot names of its own. The -.Cm rename -subcommand can be used to rename any conflicting snapshots. 
-.It Xo -.Nm -.Cm rename -.Op Fl f -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Xc -.It Xo -.Nm -.Cm rename -.Op Fl f -.Fl p -.Ar filesystem Ns | Ns Ar volume -.Ar filesystem Ns | Ns Ar volume -.Xc -.It Xo -.Nm -.Cm rename -.Fl u -.Op Fl p -.Ar filesystem filesystem -.Xc -.Pp -Renames the given dataset. The new target can be located anywhere in the -.Tn ZFS -hierarchy, with the exception of snapshots. Snapshots can only be renamed -within the parent file system or volume. When renaming a snapshot, the parent -file system of the snapshot does not need to be specified as part of the second -argument. Renamed file systems can inherit new mount points, in which case they -are unmounted and remounted at the new mount point. -.Bl -tag -width indent -.It Fl p -Creates all the nonexistent parent datasets. Datasets created in this manner -are automatically mounted according to the -.Sy mountpoint -property inherited from their parent. -.It Fl u -Do not remount file systems during rename. If a file system's -.Sy mountpoint -property is set to -.Cm legacy -or -.Cm none , -file system is not unmounted even if this option is not given. -.It Fl f -Force unmount any filesystems that need to be unmounted in the process. -This flag has no effect if used together with the -.Fl u -flag. -.El -.It Xo -.Nm -.Cm rename -.Fl r -.Ar snapshot snapshot -.Xc -.Pp -Recursively rename the snapshots of all descendent datasets. Snapshots are the -only dataset that can be renamed recursively. -.It Xo -.Nm -.Cm rename -.Ar bookmark bookmark -.Xc -.Pp -Renames the given bookmark. -Bookmarks can only be renamed within the parent file system or volume. -When renaming a bookmark, the parent file system or volume of the bookmark -does not need to be specified as part of the second argument. -.It Xo -.Nm -.Cm list -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... 
-.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -.Oo Fl s Ar property Oc Ns ... -.Oo Fl S Ar property Oc Ns ... -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Xc -.Pp -Lists the property information for the given datasets in tabular form. If -specified, you can list property information by the absolute pathname or the -relative pathname. By default, all file systems and volumes are displayed. -Snapshots are displayed if the -.Sy listsnaps -property is -.Cm on -(the default is -.Cm off ) . -The following fields are displayed, -.Sy name , used , available , referenced , mountpoint . -.Bl -tag -width indent -.It Fl r -Recursively display any children of the dataset on the command line. -.It Fl d Ar depth -Recursively display any children of the dataset, limiting the recursion to -.Ar depth . -A depth of -.Sy 1 -will display only the dataset and its direct children. -.It Fl H -Used for scripting mode. Do not print headers and separate fields by a single -tab instead of arbitrary white space. -.It Fl p -Display numbers in parsable (exact) values. -.It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... -A comma-separated list of properties to display. The property must be: -.Bl -bullet -offset 2n -.It -One of the properties described in the -.Qq Sx Native Properties -section -.It -A user property -.It -The value -.Cm name -to display the dataset name -.It -The value -.Cm space -to display space usage properties on file systems and volumes. This is a -shortcut for specifying -.Fl o -.Sy name,avail,used,usedsnap,usedds,usedrefreserv,usedchild -.Fl t -.Sy filesystem,volume -syntax. -.El -.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -A comma-separated list of types to display, where -.Ar type -is one of -.Sy filesystem , snapshot , snap , volume , bookmark , No or Sy all . -For example, specifying -.Fl t Cm snapshot -displays only snapshots. 
-.It Fl s Ar property -A property for sorting the output by column in ascending order based on the -value of the property. The property must be one of the properties described in -the -.Qq Sx Properties -section, or the special value -.Cm name -to sort by the dataset name. Multiple properties can be specified at one time -using multiple -.Fl s -property options. Multiple -.Fl s -options are evaluated from left to right in decreasing order of importance. -.Pp -The following is a list of sorting criteria: -.Bl -bullet -offset 2n -.It -Numeric types sort in numeric order. -.It -String types sort in alphabetical order. -.It -Types inappropriate for a row sort that row to the literal bottom, regardless -of the specified ordering. -.It -If no sorting options are specified the existing behavior of -.Qq Nm Cm list -is preserved. -.El -.It Fl S Ar property -Same as the -.Fl s -option, but sorts by property in descending order. -.El -.It Xo -.Nm -.Cm set -.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Xc -.Pp -Sets the property or list of properties to the given value(s) for each dataset. -Only some properties can be edited. See the "Properties" section for more -information on what properties can be set and acceptable values. Numeric values -can be specified as exact values, or in a human-readable form with a suffix of -.Sy B , K , M , G , T , P , E , Z -(for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or -zettabytes, respectively). User properties can be set on snapshots. For more -information, see the -.Qq Sx User Properties -section. -.It Xo -.Nm -.Cm get -.Op Fl r Ns | Ns Fl d Ar depth -.Op Fl Hp -.Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ... -.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -.Op Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... -.Ar all | property Ns Oo , Ns Ar property Oc Ns ... 
-.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Ns ... -.Xc -.Pp -Displays properties for the given datasets. If no datasets are specified, then -the command displays properties for all datasets on the system. For each -property, the following columns are displayed: -.Pp -.Bl -hang -width "property" -offset indent -compact -.It name -Dataset name -.It property -Property name -.It value -Property value -.It source -Property source. Can either be local, default, temporary, inherited, received, -or none -(\&-). -.El -.Pp -All columns except the -.Sy RECEIVED -column are displayed by default. The columns to display can be specified -by using the -.Fl o -option. This command takes a comma-separated list of properties as described in -the -.Qq Sx Native Properties -and -.Qq Sx User Properties -sections. -.Pp -The special value -.Cm all -can be used to display all properties that apply to the given dataset's type -(filesystem, volume, snapshot, or bookmark). -.Bl -tag -width indent -.It Fl r -Recursively display properties for any children. -.It Fl d Ar depth -Recursively display any children of the dataset, limiting the recursion to -.Ar depth . -A depth of -.Sy 1 -will display only the dataset and its direct children. -.It Fl H -Display output in a form more easily parsed by scripts. Any headers are -omitted, and fields are explicitly separated by a single tab instead of an -arbitrary amount of space. -.It Fl p -Display numbers in parsable (exact) values. -.It Fl o Cm all | Ar field Ns Oo , Ns Ar field Oc Ns ... -A comma-separated list of columns to display. Supported values are -.Sy name,property,value,received,source . -Default values are -.Sy name,property,value,source . -The keyword -.Cm all -specifies all columns. -.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -A comma-separated list of types to display, where -.Ar type -is one of -.Sy filesystem , snapshot , volume , No or Sy all . 
-For example, specifying -.Fl t Cm snapshot -displays only snapshots. -.It Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... -A comma-separated list of sources to display. Those properties coming from a -source other than those in this list are ignored. Each source must be one of -the following: -.Sy local,default,inherited,temporary,received,none . -The default value is all sources. -.El -.It Xo -.Nm -.Cm inherit -.Op Fl rS -.Ar property -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Xc -.Pp -Clears the specified property, causing it to be inherited from an ancestor, -restored to default if no ancestor has the property set, or with the -.Fl S -option reverted to the received value if one exists. -See the -.Qq Sx Properties -section for a listing of default values, and details on which properties can be -inherited. -.Bl -tag -width indent -.It Fl r -Recursively inherit the given property for all children. -.It Fl S -Revert the property to the received value if one exists; otherwise operate as -if the -.Fl S -option was not specified. -.El -.It Xo -.Nm -.Cm remap -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Remap the indirect blocks in the given filesystem or volume so that they no -longer reference blocks on previously removed vdevs and we can eventually -shrink the size of the indirect mapping objects for the previously removed -vdevs. Note that remapping all blocks might not be possible and that -references from snapshots will still exist and cannot be remapped. -.It Xo -.Nm -.Cm upgrade -.Op Fl v -.Xc -.Pp -Displays a list of file systems that are not the most recent version. -.Bl -tag -width indent -.It Fl v -Displays -.Tn ZFS -filesystem versions supported by the current software. The current -.Tn ZFS -filesystem version and all previous supported versions are displayed, along -with an explanation of the features provided with each version. 
-.El -.It Xo -.Nm -.Cm upgrade -.Op Fl r -.Op Fl V Ar version -.Fl a | Ar filesystem -.Xc -.Pp -Upgrades file systems to a new on-disk version. Once this is done, the file -systems will no longer be accessible on systems running older versions of the -software. -.Qq Nm Cm send -streams generated from new snapshots of these file systems cannot be accessed -on systems running older versions of the software. -.Pp -In general, the file system version is independent of the pool version. See -.Xr zpool 8 -for information on the -.Nm zpool Cm upgrade -command. -.Pp -In some cases, the file system version and the pool version are interrelated -and the pool version must be upgraded before the file system version can be -upgraded. -.Bl -tag -width indent -.It Fl r -Upgrade the specified file system and all descendent file systems. -.It Fl V Ar version -Upgrade to the specified -.Ar version . -If the -.Fl V -flag is not specified, this command upgrades to the most recent version. This -option can only be used to increase the version number, and only up to the most -recent version supported by this software. -.It Fl a -Upgrade all file systems on all imported pools. -.It Ar filesystem -Upgrade the specified file system. -.El -.It Xo -.Nm -.Cm userspace -.Op Fl Hinp -.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -.Ar filesystem Ns | Ns Ar snapshot -.Xc -.Pp -Displays space consumed by, and quotas on, each user in the specified -filesystem or snapshot. This corresponds to the -.Sy userused@ Ns Ar user -and -.Sy userquota@ Ns Ar user -properties. -.Bl -tag -width indent -.It Fl n -Print numeric ID instead of user/group name. -.It Fl H -Do not print headers, use tab-delimited output. -.It Fl p -Use exact (parsable) numeric output. -.It Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... -Display only the specified fields from the following set: -.Sy type,name,used,quota . 
-The default is to display all fields. -.It Fl s Ar field -Sort output by this field. The -.Fl s -and -.Fl S -flags may be specified multiple times to sort first by one field, then by -another. The default is -.Fl s Cm type Fl s Cm name . -.It Fl S Ar field -Sort by this field in reverse order. See -.Fl s . -.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -Print only the specified types from the following set: -.Sy all,posixuser,smbuser,posixgroup,smbgroup . -.Pp -The default is -.Fl t Cm posixuser,smbuser . -.Pp -The default can be changed to include group types. -.It Fl i -Translate SID to POSIX ID. This flag currently has no effect on -.Fx . -.El -.It Xo -.Nm -.Cm groupspace -.Op Fl Hinp -.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... -.Oo Fl s Ar field Oc Ns ... -.Oo Fl S Ar field Oc Ns ... -.Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... -.Ar filesystem Ns | Ns Ar snapshot -.Xc -.Pp -Displays space consumed by, and quotas on, each group in the specified -filesystem or snapshot. This subcommand is identical to -.Qq Nm Cm userspace , -except that the default types to display are -.Fl t Sy posixgroup,smbgroup . -.It Xo -.Nm -.Cm mount -.Xc -.Pp -Displays all -.Tn ZFS -file systems currently mounted. -.Bl -tag -width indent -.It Fl f -.El -.It Xo -.Nm -.Cm mount -.Op Fl vO -.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... -.Fl a | Ar filesystem -.Xc -.Pp -Mounts -.Tn ZFS -file systems. -.Bl -tag -width indent -.It Fl v -Report mount progress. -.It Fl O -Perform an overlay mount. Overlay mounts are not supported on -.Fx . -.It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... -An optional, comma-separated list of mount options to use temporarily for the -duration of the mount. See the -.Qq Sx Temporary Mount Point Properties -section for details. -.It Fl a -Mount all available -.Tn ZFS -file systems. -This command may be executed on -.Fx -system startup by -.Pa /etc/rc.d/zfs . -For more information, see variable -.Va zfs_enable -in -.Xr rc.conf 5 . 
-.It Ar filesystem -Mount the specified filesystem. -.El -.It Xo -.Nm -.Cm unmount Ns | Ns Cm umount -.Op Fl f -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Xc -.Pp -Unmounts currently mounted -.Tn ZFS -file systems. -.Bl -tag -width indent -.It Fl f -Forcefully unmount the file system, even if it is currently in use. -.It Fl a -Unmount all available -.Tn ZFS -file systems. -.It Ar filesystem | mountpoint -Unmount the specified filesystem. The command can also be given a path to a -.Tn ZFS -file system mount point on the system. -.El -.It Xo -.Nm -.Cm share -.Fl a | Ar filesystem -.Xc -.Pp -Shares -.Tn ZFS -file systems that have the -.Sy sharenfs -property set. -.Bl -tag -width indent -.It Fl a -Share all -.Tn ZFS -file systems that have the -.Sy sharenfs -property set. -This command may be executed on -.Fx -system startup by -.Pa /etc/rc.d/zfs . -For more information, see variable -.Va zfs_enable -in -.Xr rc.conf 5 . -.It Ar filesystem -Share the specified filesystem according to the -.Tn sharenfs -property. File systems are shared when the -.Tn sharenfs -property is set. -.El -.It Xo -.Nm -.Cm unshare -.Fl a | Ar filesystem Ns | Ns Ar mountpoint -.Xc -.Pp -Unshares -.Tn ZFS -file systems that have the -.Tn sharenfs -property set. -.Bl -tag -width indent -.It Fl a -Unshares -.Tn ZFS -file systems that have the -.Sy sharenfs -property set. -This command may be executed on -.Fx -system shutdown by -.Pa /etc/rc.d/zfs . -For more information, see variable -.Va zfs_enable -in -.Xr rc.conf 5 . -.It Ar filesystem | mountpoint -Unshare the specified filesystem. The command can also be given a path to a -.Tn ZFS -file system shared on the system. -.El -.It Xo -.Nm -.Cm bookmark -.Ar snapshot -.Ar bookmark -.Xc -.Pp -Creates a bookmark of the given snapshot. -Bookmarks mark the point in time -when the snapshot was created, and can be used as the incremental source for -a -.Qq Nm Cm send -command. -.Pp -This feature must be enabled to be used. 
-See -.Xr zpool-features 7 -for details on ZFS feature flags and the -.Sy bookmark -feature. -.It Xo -.Nm -.Cm send -.Op Fl DLPRVcenpv -.Op Fl i Ar snapshot | Fl I Ar snapshot -.Ar snapshot -.Xc -.Pp -Creates a stream representation of the last -.Ar snapshot -argument (not part of -.Fl i -or -.Fl I ) -which is written to standard output. The output can be redirected to -a file or to a different system (for example, using -.Xr ssh 1 ) . -By default, a full stream is generated. -.Bl -tag -width indent -.It Fl i Ar snapshot -Generate an incremental stream from the first -.Ar snapshot Pq the incremental source -to the second -.Ar snapshot Pq the incremental target . -The incremental source can be specified as the last component of the -snapshot name -.Pq the Em @ No character and following -and -it is assumed to be from the same file system as the incremental target. -.Pp -If the destination is a clone, the source may be the origin snapshot, which -must be fully specified (for example, -.Cm pool/fs@origin , -not just -.Cm @origin ) . -.It Fl I Ar snapshot -Generate a stream package that sends all intermediary snapshots from the first -.Ar snapshot -to the second -.Ar snapshot . -For example, -.Ic -I @a fs@d -is similar to -.Ic -i @a fs@b; -i @b fs@c; -i @c fs@d . -The incremental -source may be specified as with the -.Fl i -option. -.It Fl R, -replicate -Generate a replication stream package, which will replicate the specified -filesystem, and all descendent file systems, up to the named snapshot. When -received, all properties, snapshots, descendent file systems, and clones are -preserved. -.Pp -If the -.Fl i -or -.Fl I -flags are used in conjunction with the -.Fl R -flag, an incremental replication stream is generated. The current values of -properties, and current snapshot and file system names are set when the stream -is received. 
If the -.Fl F -flag is specified when this stream is received, snapshots and file systems that -do not exist on the sending side are destroyed. -.It Fl D, -dedup -Generate a deduplicated stream. Blocks which would have been sent multiple -times in the send stream will only be sent once. The receiving system must -also support this feature to receive a deduplicated stream. This flag can -be used regardless of the dataset's -.Sy dedup -property, but performance will be much better if the filesystem uses a -dedup-capable checksum (eg. -.Sy sha256 ) . -.It Fl L, -large-block -Generate a stream which may contain blocks larger than 128KB. -This flag -has no effect if the -.Sy large_blocks -pool feature is disabled, or if the -.Sy recordsize -property of this filesystem has never been set above 128KB. -The receiving system must have the -.Sy large_blocks -pool feature enabled as well. -See -.Xr zpool-features 7 -for details on ZFS feature flags and the -.Sy large_blocks -feature. -.It Fl e, -embed -Generate a more compact stream by using WRITE_EMBEDDED records for blocks -which are stored more compactly on disk by the -.Sy embedded_data -pool -feature. -This flag has no effect if the -.Sy embedded_data -feature is -disabled. -The receiving system must have the -.Sy embedded_data -feature -enabled. -If the -.Sy lz4_compress -feature is active on the sending system, -then the receiving system must have that feature enabled as well. -See -.Xr zpool-features 7 -for details on ZFS feature flags and the -.Sy embedded_data -feature. -.It Fl c, -compressed -Generate a more compact stream by using compressed WRITE records for blocks -which are compressed on disk and in memory (see the -.Sy compression -property for details). -If the -.Sy lz4_compress -feature is active on the sending system, then the receiving system must have that -feature enabled as well. 
If the -.Sy large_blocks -feature is enabled on the sending system but the -.Fl L -option is not supplied in conjunction with -.Fl c -then the data will be decompressed before sending so it can be split -into smaller block sizes. -.It Fl p, -props -Include the dataset's properties in the stream. This flag is implicit when -.Fl R -is specified. The receiving system must also support this feature. -.It Fl n, -dryrun -Do a dry-run ("No-op") send. Do not generate any actual send data. This is -useful in conjunction with the -.Fl v -or -.Fl P -flags to determine what data will be sent. -In this case, the verbose output will be written to -standard output (contrast with a non-dry-run, where the stream is written -to standard output and the verbose output goes to standard error). -.It Fl P, -parsable -Print machine-parsable verbose information about the stream package generated. -.It Fl v, -verbose -Print verbose information about the stream package generated. -This information includes a per-second report of how much data has been sent. -.It Fl V -Set the process title to a per-second report of how much data has been sent. -.El -.Pp -The format of the stream is committed. You will be able to receive your streams -on future versions of -.Tn ZFS . -.It Xo -.Nm -.Cm send -.Op Fl LPcenv -.Op Fl i Ar snapshot Ns | Ns Ar bookmark -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Xc -.Pp -Generate a send stream, which may be of a filesystem, and may be -incremental from a bookmark. -If the destination is a filesystem or volume, -the pool must be read-only, or the filesystem must not be mounted. -When the -stream generated from a filesystem or volume is received, the default snapshot -name will be -.Pq --head-- . -.Bl -tag -width indent -.It Fl i Ar snapshot Ns | Ns Ar bookmark -Generate an incremental send stream. -The incremental source must be an earlier -snapshot in the destination's history. 
-It will commonly be an earlier -snapshot in the destination's filesystem, in which case it can be -specified as the last component of the name -.Pq the Em # No or Em @ No character and following . -.Pp -If the incremental target is a clone, the incremental source can -be the origin snapshot, or an earlier snapshot in the origin's filesystem, -or the origin's origin, etc. -.It Fl n, -dryrun -Do a dry-run -.Pq Qq No-op -send. -Do not generate any actual send data. -This is useful in conjunction with the -.Fl v -or -.Fl P -flags to determine what data will be sent. -In this case, the verbose output will be written to standard output -.Po contrast with a non-dry-run, where the stream is written to standard output -and the verbose output goes to standard error -.Pc . -.It Fl v, -verbose -Print verbose information about the stream package generated. -This information includes a per-second report of how much data has been sent. -.It Fl L, -large-block -Generate a stream which may contain blocks larger than 128KB. -This flag -has no effect if the -.Sy large_blocks -pool feature is disabled, or if the -.Sy recordsize -property of this filesystem has never been set above 128KB. -The receiving system must have the -.Sy large_blocks -pool feature enabled as well. -See -.Xr zpool-features 7 -for details on ZFS feature flags and the -.Sy large_blocks -feature. -.It Fl P, -parsable -Print machine-parsable verbose information about the stream package generated. -.It Fl c, -compressed -Generate a more compact stream by using compressed WRITE records for blocks -which are compressed on disk and in memory (see the -.Sy compression -property for details). If the -.Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. 
If the -.Sy large_blocks -feature is enabled on the sending system but the -.Fl L -option is not supplied in conjunction with -.Fl c -then the data will be decompressed before sending so it can be split -into smaller block sizes. -.It Fl e, -embed -Generate a more compact stream by using WRITE_EMBEDDED records for blocks -which are stored more compactly on disk by the -.Sy embedded_data -pool -feature. -This flag has no effect if the -.Sy embedded_data -feature is -disabled. -The receiving system must have the -.Sy embedded_data -feature -enabled. -If the -.Sy lz4_compress -feature is active on the sending system, -then the receiving system must have that feature enabled as well. -See -.Xr zpool-features 7 -for details on ZFS feature flags and the -.Sy embedded_data -feature. -.El -.It Xo -.Nm -.Cm send -.Op Fl Penv -.Fl t -.Ar receive_resume_token -.Xc -Creates a send stream which resumes an interrupted receive. The -.Ar receive_resume_token -is the value of this property on the filesystem -or volume that was being received into. See the documentation for -.Sy zfs receive -s -for more details. -.It Xo -.Nm -.Cm receive Ns | Ns Cm recv -.Op Fl vnsFMu -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Xc -.It Xo -.Nm -.Cm receive Ns | Ns Cm recv -.Op Fl vnsFMu -.Op Fl d | e -.Op Fl o Sy origin Ns = Ns Ar snapshot -.Ar filesystem -.Xc -.Pp -Creates a snapshot whose contents are as specified in the stream provided on -standard input. If a full stream is received, then a new file system is created -as well. Streams are created using the -.Qq Nm Cm send -subcommand, which by default creates a full stream. -.Qq Nm Cm recv -can be used as an alias for -.Qq Nm Cm receive . -.Pp -If an incremental stream is received, then the destination file system must -already exist, and its most recent snapshot must match the incremental stream's -source. 
For -.Sy zvol Ns s, -the destination device link is destroyed and recreated, which means the -.Sy zvol -cannot be accessed during the -.Sy receive -operation. -.Pp -When a snapshot replication package stream that is generated by using the -.Qq Nm Cm send Fl R -command is received, any snapshots that do not exist on the sending location -are destroyed by using the -.Qq Nm Cm destroy Fl d -command. -.Pp -The name of the snapshot (and file system, if a full stream is received) that -this subcommand creates depends on the argument type and the -.Fl d -or -.Fl e -option. -.Pp -If the argument is a snapshot name, the specified -.Ar snapshot -is created. If the argument is a file system or volume name, a snapshot with -the same name as the sent snapshot is created within the specified -.Ar filesystem -or -.Ar volume . -If the -.Fl d -or -.Fl e -option is specified, the snapshot name is determined by appending the sent -snapshot's name to the specified -.Ar filesystem . -If the -.Fl d -option is specified, all but the pool name of the sent snapshot path is -appended (for example, -.Sy b/c@1 -appended from sent snapshot -.Sy a/b/c@1 ) , -and if the -.Fl e -option is specified, only the tail of the sent snapshot path is appended (for -example, -.Sy c@1 -appended from sent snapshot -.Sy a/b/c@1 ) . -In the case of -.Fl d , -any file systems needed to replicate the path of the sent snapshot are created -within the specified file system. -.Bl -tag -width indent -.It Fl d -Use the full sent snapshot path without the first element (without pool name) -to determine the name of the new snapshot as described in the paragraph above. -.It Fl e -Use only the last element of the sent snapshot path to determine the name of -the new snapshot as described in the paragraph above. -.It Fl u -File system that is associated with the received stream is not mounted. -.It Fl v -Print verbose information about the stream and the time required to perform the -receive operation. 
-.It Fl n -Do not actually receive the stream. This can be useful in conjunction with the -.Fl v -option to verify the name the receive operation would use. -.It Fl o Sy origin Ns = Ns Ar snapshot -Forces the stream to be received as a clone of the given snapshot. -If the stream is a full send stream, this will create the filesystem -described by the stream as a clone of the specified snapshot. Which -snapshot was specified will not affect the success or failure of the -receive, as long as the snapshot does exist. If the stream is an -incremental send stream, all the normal verification will be performed. -.It Fl F -Force a rollback of the file system to the most recent snapshot before -performing the receive operation. If receiving an incremental replication -stream (for example, one generated by -.Qq Nm Cm send Fl R Bro Fl i | Fl I Brc ) , -destroy snapshots and file systems that do not exist on the sending side. -.It Fl M -Force an unmount of the file system while receiving a snapshot. -This option is not supported on Linux. -.It Fl s -If the receive is interrupted, save the partially received state, rather -than deleting it. Interruption may be due to premature termination of -the stream -.Po e.g. due to network failure or failure of the remote system -if the stream is being read over a network connection -.Pc , -a checksum error in the stream, termination of the -.Nm zfs Cm receive -process, or unclean shutdown of the system. -.Pp -The receive can be resumed with a stream generated by -.Nm zfs Cm send Fl t Ar token , -where the -.Ar token -is the value of the -.Sy receive_resume_token -property of the filesystem or volume which is received into. -.Pp -To use this flag, the storage pool must have the -.Sy extensible_dataset -feature enabled. See -.Xr zpool-features 7 -for details on ZFS feature flags. 
-.El -.It Xo -.Nm -.Cm receive Ns | Ns Cm recv -.Fl A -.Ar filesystem Ns | Ns Ar volume -.Xc -Abort an interrupted -.Nm zfs Cm receive Fl s , -deleting its saved partially received state. -.It Xo -.Nm -.Cm allow -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Displays permissions that have been delegated on the specified filesystem or -volume. See the other forms of -.Qq Nm Cm allow -for more information. -.It Xo -.Nm -.Cm allow -.Op Fl ldug -.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Ar perm Ns | Ns Ar @setname Ns -.Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -.It Xo -.Nm -.Cm allow -.Op Fl ld -.Fl e Ns | Ns Cm everyone -.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Delegates -.Tn ZFS -administration permission for the file systems to non-privileged users. -.Bl -tag -width indent -.It Xo -.Op Fl ug -.Ar user Ns | Ns Ar group Ns Oo , Ar user Ns | Ns Ar group Oc Ns ... -.Xc -Specifies to whom the permissions are delegated. Multiple entities can be -specified as a comma-separated list. If neither of the -.Fl ug -options are specified, then the argument is interpreted preferentially as the -keyword -.Cm everyone , -then as a user name, and lastly as a group name. To specify -a user or group named -.Qq everyone , -use the -.Fl u -or -.Fl g -options. To specify a group with the same name as a user, use the -.Fl g -option. -.It Op Fl e Ns | Ns Cm everyone -Specifies that the permissions be delegated to -.Qq everyone . -.It Xo -.Ar perm Ns | Ns Ar @setname Ns Oo , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... -.Xc -The permissions to delegate. Multiple permissions -may be specified as a comma-separated list. Permission names are the same as -.Tn ZFS -subcommand and property names. See the property list below. Property set names, -which begin with an at sign -.Pq Sy @ , -may be specified. 
See the -.Fl s -form below for details. -.It Xo -.Op Fl ld -.Ar filesystem Ns | Ns Ar volume -.Xc -Specifies where the permissions are delegated. If neither of the -.Fl ld -options are specified, or both are, then the permissions are allowed for the -file system or volume, and all of its descendents. If only the -.Fl l -option is used, then is allowed "locally" only for the specified file system. -If only the -.Fl d -option is used, then is allowed only for the descendent file systems. -.El -.Pp -Permissions are generally the ability to use a -.Tn ZFS -subcommand or change a -.Tn ZFS -property. The following permissions are available: -.Bl -column -offset 4n "secondarycache" "subcommand" -.It NAME Ta TYPE Ta NOTES -.It allow Ta subcommand Ta Must Xo -also have the permission that is being allowed -.Xc -.It clone Ta subcommand Ta Must Xo -also have the 'create' ability and 'mount' ability in the origin file system -.Xc -.It create Ta subcommand Ta Must also have the 'mount' ability -.It destroy Ta subcommand Ta Must also have the 'mount' ability -.It diff Ta subcommand Ta Allows lookup of paths within a dataset given an -object number, and the ability to create snapshots necessary to 'zfs diff' -.It hold Ta subcommand Ta Allows adding a user hold to a snapshot -.It mount Ta subcommand Ta Allows mount/umount of Tn ZFS No datasets -.It promote Ta subcommand Ta Must Xo -also have the 'mount' and 'promote' ability in the origin file system -.Xc -.It receive Ta subcommand Ta Must also have the 'mount' and 'create' ability -.It release Ta subcommand Ta Allows Xo -releasing a user hold which might destroy the snapshot -.Xc -.It rename Ta subcommand Ta Must Xo -also have the 'mount' and 'create' ability in the new parent -.Xc -.It rollback Ta subcommand Ta Must also have the 'mount' ability -.It send Ta subcommand -.It share Ta subcommand Ta Allows Xo -sharing file systems over the -.Tn NFS -protocol -.Xc -.It snapshot Ta subcommand Ta Must also have the 'mount' ability 
-.It groupquota Ta other Ta Allows accessing any groupquota@... property -.It groupused Ta other Ta Allows reading any groupused@... property -.It userprop Ta other Ta Allows changing any user property -.It userquota Ta other Ta Allows accessing any userquota@... property -.It userused Ta other Ta Allows reading any userused@... property -.It aclinherit Ta property -.It aclmode Ta property -.It atime Ta property -.It canmount Ta property -.It casesensitivity Ta property -.It checksum Ta property -.It compression Ta property -.It copies Ta property -.It dedup Ta property -.It devices Ta property -.It exec Ta property -.It filesystem_limit Ta property -.It logbias Ta property -.It jailed Ta property -.It mlslabel Ta property -.It mountpoint Ta property -.It nbmand Ta property -.It normalization Ta property -.It primarycache Ta property -.It quota Ta property -.It readonly Ta property -.It recordsize Ta property -.It refquota Ta property -.It refreservation Ta property -.It reservation Ta property -.It secondarycache Ta property -.It setuid Ta property -.It sharenfs Ta property -.It sharesmb Ta property -.It snapdir Ta property -.It snapshot_limit Ta property -.It sync Ta property -.It utf8only Ta property -.It version Ta property -.It volblocksize Ta property -.It volsize Ta property -.It vscan Ta property -.It xattr Ta property -.El -.It Xo -.Nm -.Cm allow -.Fl c -.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Sets "create time" permissions. These permissions are granted (locally) to the -creator of any newly-created descendent file system. -.It Xo -.Nm -.Cm allow -.Fl s -.Ar @setname -.Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Defines or adds permissions to a permission set. The set can be used by other -.Qq Nm Cm allow -commands for the specified file system and its descendents. 
Sets are evaluated -dynamically, so changes to a set are immediately reflected. Permission sets -follow the same naming restrictions as ZFS file systems, but the name must -begin with an "at sign" -.Pq Sy @ , -and can be no more than 64 characters long. -.It Xo -.Nm -.Cm unallow -.Op Fl rldug -.Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -.It Xo -.Nm -.Cm unallow -.Op Fl rld -.Fl e Ns | Ns Cm everyone -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -.It Xo -.Nm -.Cm unallow -.Op Fl r -.Fl c -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Removes permissions that were granted with the -.Qq Nm Cm allow -command. No permissions are explicitly denied, so other permissions granted are -still in effect. For example, if the permission is granted by an ancestor. If -no permissions are specified, then all permissions for the specified -.Ar user , group , No or everyone -are removed. Specifying -.Cm everyone -.Po or using the Fl e -option -.Pc only removes the permissions that were granted to everyone , -not all permissions for every user and group. See the -.Qq Nm Cm allow -command for a description of the -.Fl ldugec -options. -.Bl -tag -width indent -.It Fl r -Recursively remove the permissions from this file system and all descendents. -.El -.It Xo -.Nm -.Cm unallow -.Op Fl r -.Fl s -.Ar @setname -.Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns -.Ns ... Oc -.Ar filesystem Ns | Ns Ar volume -.Xc -.Pp -Removes permissions from a permission set. If no permissions are specified, -then all permissions are removed, thus removing the set entirely. -.It Xo -.Nm -.Cm hold -.Op Fl r -.Ar tag snapshot Ns ... 
-.Xc -.Pp -Adds a single reference, named with the -.Ar tag -argument, to the specified snapshot or snapshots. Each snapshot has its own tag -namespace, and tags must be unique within that space. -.Pp -If a hold exists on a snapshot, attempts to destroy that snapshot by using the -.Qq Nm Cm destroy -command returns -.Em EBUSY . -.Bl -tag -width indent -.It Fl r -Specifies that a hold with the given tag is applied recursively to the -snapshots of all descendent file systems. -.El -.It Xo -.Nm -.Cm holds -.Op Fl Hp -.Op Fl r Ns | Ns Fl d Ar depth -.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns -.Ns ... -.Xc -.Pp -Lists all existing user references for the given dataset or datasets. -.Bl -tag -width indent -.It Fl H -Used for scripting mode. Do not print headers and separate fields by a single -tab instead of arbitrary white space. -.It Fl p -Display numbers in parsable (exact) values. -.It Fl r -Lists the holds that are set on the descendent snapshots of the named datasets -or snapshots, in addition to listing the holds on the named snapshots, if any. -.It Fl d Ar depth -Recursively display any holds on the named snapshots, or descendent snapshots of -the named datasets or snapshots, limiting the recursion to -.Ar depth . -.El -.It Xo -.Nm -.Cm release -.Op Fl r -.Ar tag snapshot Ns ... -.Xc -.Pp -Removes a single reference, named with the -.Ar tag -argument, from the specified snapshot or snapshots. The tag must already exist -for each snapshot. -.Bl -tag -width indent -.It Fl r -Recursively releases a hold with the given tag on the snapshots of all -descendent file systems. -.El -.It Xo -.Nm -.Cm diff -.Op Fl FHt -.Ar snapshot -.Op Ar snapshot Ns | Ns Ar filesystem -.Xc -.Pp -Display the difference between a snapshot of a given filesystem and another -snapshot of that filesystem from a later time or the current contents of the -filesystem. 
The first column is a character indicating the type of change, -the other columns indicate pathname, new pathname -.Pq in case of rename , -change in link count, and optionally file type and/or change time. -.Pp -The types of change are: -.Bl -column -offset 2n indent -.It \&- Ta path was removed -.It \&+ Ta path was added -.It \&M Ta path was modified -.It \&R Ta path was renamed -.El -.Bl -tag -width indent -.It Fl F -Display an indication of the type of file, in a manner similar to the -.Fl F -option of -.Xr ls 1 . -.Bl -column -offset 2n indent -.It \&B Ta block device -.It \&C Ta character device -.It \&F Ta regular file -.It \&/ Ta directory -.It \&@ Ta symbolic link -.It \&= Ta socket -.It \&> Ta door (not supported on Fx ) -.It \&| Ta named pipe (not supported on Fx ) -.It \&P Ta event port (not supported on Fx ) -.El -.It Fl H -Give more parsable tab-separated output, without header lines and without -arrows. -.It Fl t -Display the path's inode change time as the first column of output. -.El -.It Xo -.Nm -.Cm program -.Op Fl jn -.Op Fl t Ar timeout -.Op Fl m Ar memory_limit -.Ar pool script -.Op Ar arg1 No ... -.Xc -.Pp -Executes -.Ar script -as a ZFS channel program on -.Ar pool . -The ZFS channel -program interface allows ZFS administrative operations to be run -programmatically via a Lua script. -The entire script is executed atomically, with no other administrative -operations taking effect concurrently. -A library of ZFS calls is made available to channel program scripts. -Channel programs may only be run with root privileges. -.Pp -For full documentation of the ZFS channel program interface, see the manual -page for -.Xr zfs-program 8 . -.Bl -tag -width indent -.It Fl j -Display channel program output in JSON format. -When this flag is specified and standard output is empty - -channel program encountered an error. -The details of such an error will be printed to standard error in plain text. 
-.It Fl n -Executes a read-only channel program, which runs faster. -The program cannot change on-disk state by calling functions from -the zfs.sync submodule. -The program can be used to gather information such as properties and -determining if changes would succeed (zfs.check.*). -Without this flag, all pending changes must be synced to disk before -a channel program can complete. -.It Fl t Ar timeout -Execution time limit, in milliseconds. -If a channel program executes for longer than the provided timeout, it will -be stopped and an error will be returned. -The default timeout is 1000 ms, and can be set to a maximum of 10000 ms. -.It Fl m Ar memory-limit -Memory limit, in bytes. -If a channel program attempts to allocate more memory than the given limit, -it will be stopped and an error returned. -The default memory limit is 10 MB, and can be set to a maximum of 100 MB. -.Pp -All remaining argument strings are passed directly to the channel program as -arguments. -See -.Xr zfs-program 8 -for more information. -.El -.It Xo -.Nm -.Cm jail -.Ar jailid filesystem -.Xc -.Pp -Attaches the specified -.Ar filesystem -to the jail identified by JID -.Ar jailid . -From now on this file system tree can be managed from within a jail if the -.Sy jailed -property has been set. To use this functionality, the jail needs the -.Va allow.mount -and -.Va allow.mount.zfs -parameters set to 1 and the -.Va enforce_statfs -parameter set to a value lower than 2. -.Pp -See -.Xr jail 8 -for more information on managing jails and configuring the parameters above. -.It Xo -.Nm -.Cm unjail -.Ar jailid filesystem -.Xc -.Pp -Detaches the specified -.Ar filesystem -from the jail identified by JID -.Ar jailid . -.El -.Sh EXIT STATUS -The following exit values are returned: -.Bl -tag -offset 2n -width 2n -.It 0 -Successful completion. -.It 1 -An error occurred. -.It 2 -Invalid command line options were specified. 
-.El -.Sh EXAMPLES -.Bl -tag -width 0n -.It Sy Example 1 No Creating a Tn ZFS No File System Hierarchy -.Pp -The following commands create a file system named -.Em pool/home -and a file system named -.Em pool/home/bob . -The mount point -.Pa /home -is set for the parent file system, and is automatically inherited by the child -file system. -.Bd -literal -offset 2n -.Li # Ic zfs create pool/home -.Li # Ic zfs set mountpoint=/home pool/home -.Li # Ic zfs create pool/home/bob -.Ed -.It Sy Example 2 No Creating a Tn ZFS No Snapshot -.Pp -The following command creates a snapshot named -.Sy yesterday . -This snapshot is mounted on demand in the -.Pa \&.zfs/snapshot -directory at the root of the -.Em pool/home/bob -file system. -.Bd -literal -offset 2n -.Li # Ic zfs snapshot pool/home/bob@yesterday -.Ed -.It Sy Example 3 No Creating and Destroying Multiple Snapshots -.Pp -The following command creates snapshots named -.Em yesterday -of -.Em pool/home -and all of its descendent file systems. Each snapshot is mounted on demand in -the -.Pa \&.zfs/snapshot -directory at the root of its file system. The second command destroys the newly -created snapshots. -.Bd -literal -offset 2n -.Li # Ic zfs snapshot -r pool/home@yesterday -.Li # Ic zfs destroy -r pool/home@yesterday -.Ed -.It Sy Example 4 No Disabling and Enabling File System Compression -.Pp -The following command disables the -.Sy compression -property for all file systems under -.Em pool/home . -The next command explicitly enables -.Sy compression -for -.Em pool/home/anne . -.Bd -literal -offset 2n -.Li # Ic zfs set compression=off pool/home -.Li # Ic zfs set compression=on pool/home/anne -.Ed -.It Sy Example 5 No Listing Tn ZFS No Datasets -.Pp -The following command lists all active file systems and volumes in the system. -Snapshots are displayed if the -.Sy listsnaps -property is -.Cm on . -The default is -.Cm off . -See -.Xr zpool 8 -for more information on pool properties. 
-.Bd -literal -offset 2n -.Li # Ic zfs list - NAME USED AVAIL REFER MOUNTPOINT - pool 450K 457G 18K /pool - pool/home 315K 457G 21K /home - pool/home/anne 18K 457G 18K /home/anne - pool/home/bob 276K 457G 276K /home/bob -.Ed -.It Sy Example 6 No Setting a Quota on a Tn ZFS No File System -.Pp -The following command sets a quota of 50 Gbytes for -.Em pool/home/bob . -.Bd -literal -offset 2n -.Li # Ic zfs set quota=50G pool/home/bob -.Ed -.It Sy Example 7 No Listing Tn ZFS No Properties -.Pp -The following command lists all properties for -.Em pool/home/bob . -.Bd -literal -offset 2n -.Li # Ic zfs get all pool/home/bob -NAME PROPERTY VALUE SOURCE -pool/home/bob type filesystem - -pool/home/bob creation Tue Jul 21 15:53 2009 - -pool/home/bob used 21K - -pool/home/bob available 20.0G - -pool/home/bob referenced 21K - -pool/home/bob compressratio 1.00x - -pool/home/bob mounted yes - -pool/home/bob quota 20G local -pool/home/bob reservation none default -pool/home/bob recordsize 128K default -pool/home/bob mountpoint /home/bob default -pool/home/bob sharenfs off default -pool/home/bob checksum on default -pool/home/bob compression on local -pool/home/bob atime on default -pool/home/bob devices on default -pool/home/bob exec on default -pool/home/bob filesystem_limit none default -pool/home/bob setuid on default -pool/home/bob readonly off default -pool/home/bob jailed off default -pool/home/bob snapdir hidden default -pool/home/bob snapshot_limit none default -pool/home/bob aclmode discard default -pool/home/bob aclinherit restricted default -pool/home/bob canmount on default -pool/home/bob xattr on default -pool/home/bob copies 1 default -pool/home/bob version 5 - -pool/home/bob utf8only off - -pool/home/bob normalization none - -pool/home/bob casesensitivity sensitive - -pool/home/bob vscan off default -pool/home/bob nbmand off default -pool/home/bob sharesmb off default -pool/home/bob refquota none default -pool/home/bob refreservation none default -pool/home/bob 
primarycache all default -pool/home/bob secondarycache all default -pool/home/bob usedbysnapshots 0 - -pool/home/bob usedbydataset 21K - -pool/home/bob usedbychildren 0 - -pool/home/bob usedbyrefreservation 0 - -pool/home/bob logbias latency default -pool/home/bob dedup off default -pool/home/bob mlslabel - -pool/home/bob sync standard default -pool/home/bob refcompressratio 1.00x - -.Ed -.Pp -The following command gets a single property value. -.Bd -literal -offset 2n -.Li # Ic zfs get -H -o value compression pool/home/bob -on -.Ed -.Pp -The following command lists all properties with local settings for -.Em pool/home/bob . -.Bd -literal -offset 2n -.Li # Ic zfs get -s local -o name,property,value all pool/home/bob -NAME PROPERTY VALUE -pool/home/bob quota 20G -pool/home/bob compression on -.Ed -.It Sy Example 8 No Rolling Back a Tn ZFS No File System -.Pp -The following command reverts the contents of -.Em pool/home/anne -to the snapshot named -.Em yesterday , -deleting all intermediate snapshots. -.Bd -literal -offset 2n -.Li # Ic zfs rollback -r pool/home/anne@yesterday -.Ed -.It Sy Example 9 No Creating a Tn ZFS No Clone -.Pp -The following command creates a writable file system whose initial contents are -the same as -.Em pool/home/bob@yesterday . 
-.Bd -literal -offset 2n -.Li # Ic zfs clone pool/home/bob@yesterday pool/clone -.Ed -.It Sy Example 10 No Promoting a Tn ZFS No Clone -.Pp -The following commands illustrate how to test out changes to a file system, and -then replace the original file system with the changed one, using clones, clone -promotion, and renaming: -.Bd -literal -offset 2n -.Li # Ic zfs create pool/project/production -.Ed -.Pp -Populate -.Pa /pool/project/production -with data and continue with the following commands: -.Bd -literal -offset 2n -.Li # Ic zfs snapshot pool/project/production@today -.Li # Ic zfs clone pool/project/production@today pool/project/beta -.Ed -.Pp -Now make changes to -.Pa /pool/project/beta -and continue with the following commands: -.Bd -literal -offset 2n -.Li # Ic zfs promote pool/project/beta -.Li # Ic zfs rename pool/project/production pool/project/legacy -.Li # Ic zfs rename pool/project/beta pool/project/production -.Ed -.Pp -Once the legacy version is no longer needed, it can be destroyed. -.Bd -literal -offset 2n -.Li # Ic zfs destroy pool/project/legacy -.Ed -.It Sy Example 11 No Inheriting Tn ZFS No Properties -.Pp -The following command causes -.Em pool/home/bob -and -.Em pool/home/anne -to inherit the -.Sy checksum -property from their parent. -.Bd -literal -offset 2n -.Li # Ic zfs inherit checksum pool/home/bob pool/home/anne -.Ed -.It Sy Example 12 No Remotely Replicating Tn ZFS No Data -.Pp -The following commands send a full stream and then an incremental stream to a -remote machine, restoring them into -.Sy poolB/received/fs@a -and -.Sy poolB/received/fs@b , -respectively. -.Sy poolB -must contain the file system -.Sy poolB/received , -and must not initially contain -.Sy poolB/received/fs . 
-.Bd -literal -offset 2n -.Li # Ic zfs send pool/fs@a | ssh host zfs receive poolB/received/fs@a -.Li # Ic zfs send -i a pool/fs@b | ssh host zfs receive poolB/received/fs -.Ed -.It Xo -.Sy Example 13 -Using the -.Qq zfs receive -d -Option -.Xc -.Pp -The following command sends a full stream of -.Sy poolA/fsA/fsB@snap -to a remote machine, receiving it into -.Sy poolB/received/fsA/fsB@snap . -The -.Sy fsA/fsB@snap -portion of the received snapshot's name is determined from the name of the sent -snapshot. -.Sy poolB -must contain the file system -.Sy poolB/received . -If -.Sy poolB/received/fsA -does not exist, it is created as an empty file system. -.Bd -literal -offset 2n -.Li # Ic zfs send poolA/fsA/fsB@snap | ssh host zfs receive -d poolB/received -.Ed -.It Sy Example 14 No Setting User Properties -.Pp -The following example sets the user-defined -.Sy com.example:department -property for a dataset. -.Bd -literal -offset 2n -.Li # Ic zfs set com.example:department=12345 tank/accounting -.Ed -.It Sy Example 15 No Performing a Rolling Snapshot -.Pp -The following example shows how to maintain a history of snapshots with a -consistent naming scheme. 
To keep a week's worth of snapshots, the user -destroys the oldest snapshot, renames the remaining snapshots, and then creates -a new snapshot, as follows: -.Bd -literal -offset 2n -.Li # Ic zfs destroy -r pool/users@7daysago -.Li # Ic zfs rename -r pool/users@6daysago @7daysago -.Li # Ic zfs rename -r pool/users@5daysago @6daysago -.Li # Ic zfs rename -r pool/users@4daysago @5daysago -.Li # Ic zfs rename -r pool/users@3daysago @4daysago -.Li # Ic zfs rename -r pool/users@2daysago @3daysago -.Li # Ic zfs rename -r pool/users@yesterday @2daysago -.Li # Ic zfs rename -r pool/users@today @yesterday -.Li # Ic zfs snapshot -r pool/users@today -.Ed -.It Xo -.Sy Example 16 -Setting -.Qq sharenfs -Property Options on a ZFS File System -.Xc -.Pp -The following command shows how to set -.Sy sharenfs -property options to enable root access for a specific network on the -.Em tank/home -file system. The contents of the -.Sy sharenfs -property are valid -.Xr exports 5 -options. -.Bd -literal -offset 2n -.Li # Ic zfs set sharenfs="maproot=root,network 192.168.0.0/24" tank/home -.Ed -.Pp -Another way to write this command with the same result is: -.Bd -literal -offset 2n -.Li # Ic set zfs sharenfs="-maproot=root -network 192.168.0.0/24" tank/home -.Ed -.It Xo -.Sy Example 17 -Delegating -.Tn ZFS -Administration Permissions on a -.Tn ZFS -Dataset -.Xc -.Pp -The following example shows how to set permissions so that user -.Em cindys -can create, destroy, mount, and take snapshots on -.Em tank/cindys . -The permissions on -.Em tank/cindys -are also displayed. 
-.Bd -literal -offset 2n -.Li # Ic zfs allow cindys create,destroy,mount,snapshot tank/cindys -.Li # Ic zfs allow tank/cindys ----- Permissions on tank/cindys -------------------------------------- -Local+Descendent permissions: - user cindys create,destroy,mount,snapshot -.Ed -.It Sy Example 18 No Delegating Create Time Permissions on a Tn ZFS No Dataset -.Pp -The following example shows how to grant anyone in the group -.Em staff -to create file systems in -.Em tank/users . -This syntax also allows staff members to destroy their own file systems, but -not destroy anyone else's file system. The permissions on -.Em tank/users -are also displayed. -.Bd -literal -offset 2n -.Li # Ic zfs allow staff create,mount tank/users -.Li # Ic zfs allow -c destroy tank/users -.Li # Ic zfs allow tank/users ----- Permissions on tank/users --------------------------------------- -Permission sets: - destroy -Local+Descendent permissions: - group staff create,mount -.Ed -.It Xo -.Sy Example 19 -Defining and Granting a Permission Set on a -.Tn ZFS -Dataset -.Xc -.Pp -The following example shows how to define and grant a permission set on the -.Em tank/users -file system. The permissions on -.Em tank/users -are also displayed. -.Bd -literal -offset 2n -.Li # Ic zfs allow -s @pset create,destroy,snapshot,mount tank/users -.Li # Ic zfs allow staff @pset tank/users -.Li # Ic zfs allow tank/users ----- Permissions on tank/users --------------------------------------- -Permission sets: - @pset create,destroy,mount,snapshot -Local+Descendent permissions: - group staff @pset -.Ed -.It Sy Example 20 No Delegating Property Permissions on a Tn ZFS No Dataset -.Pp -The following example shows to grant the ability to set quotas and reservations -on the -.Sy users/home -file system. The permissions on -.Sy users/home -are also displayed. 
-.Bd -literal -offset 2n -.Li # Ic zfs allow cindys quota,reservation users/home -.Li # Ic zfs allow users/home ----- Permissions on users/home --------------------------------------- -Local+Descendent permissions: - user cindys quota,reservation -.Li # Ic su - cindys -.Li cindys% Ic zfs set quota=10G users/home/marks -.Li cindys% Ic zfs get quota users/home/marks -NAME PROPERTY VALUE SOURCE -users/home/marks quota 10G local -.Ed -.It Sy Example 21 No Removing ZFS Delegated Permissions on a Tn ZFS No Dataset -.Pp -The following example shows how to remove the snapshot permission from the -.Em staff -group on the -.Em tank/users -file system. The permissions on -.Em tank/users -are also displayed. -.Bd -literal -offset 2n -.Li # Ic zfs unallow staff snapshot tank/users -.Li # Ic zfs allow tank/users ----- Permissions on tank/users --------------------------------------- -Permission sets: - @pset create,destroy,mount,snapshot -Local+Descendent permissions: - group staff @pset -.Ed -.It Sy Example 22 Showing the differences between a snapshot and a ZFS Dataset -.Pp -The following example shows how to see what has changed between a prior -snapshot of a ZFS Dataset and its current state. The -.Fl F -option is used to indicate type information for the files affected. -.Bd -literal -offset 2n -.Li # Ic zfs diff tank/test@before tank/test -M / /tank/test/ -M F /tank/test/linked (+1) -R F /tank/test/oldname -> /tank/test/newname -- F /tank/test/deleted -+ F /tank/test/created -M F /tank/test/modified -.Ed -.El -.Sh SEE ALSO -.Xr chmod 2 , -.Xr fsync 2 , -.Xr exports 5 , -.Xr fstab 5 , -.Xr rc.conf 5 , -.Xr jail 8 , -.Xr mount 8 , -.Xr umount 8 , -.Xr zfs-program 8 , -.Xr zpool 8 -.Sh HISTORY -The -.Nm -utility first appeared in -.Fx 7.0 . -.Sh AUTHORS -This manual page is a -.Xr mdoc 7 -reimplementation of the -.Tn OpenSolaris -manual page -.Em zfs(1M) , -modified and customized for -.Fx -and licensed under the -Common Development and Distribution License -.Pq Tn CDDL . 
-.Pp -The -.Xr mdoc 7 -implementation of this manual page was initially written by -.An Martin Matuska Aq mm@FreeBSD.org . diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c deleted file mode 100644 index a291db083568..000000000000 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.c +++ /dev/null @@ -1,497 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include "zfs_util.h" -#include "zfs_iter.h" - -/* - * This is a private interface used to gather up all the datasets specified on - * the command line so that we can iterate over them in order. - * - * First, we iterate over all filesystems, gathering them together into an - * AVL tree. We report errors for any explicitly specified datasets - * that we couldn't open. - * - * When finished, we have an AVL tree of ZFS handles. 
We go through and execute - * the provided callback for each one, passing whatever data the user supplied. - */ - -typedef struct zfs_node { - zfs_handle_t *zn_handle; - uu_avl_node_t zn_avlnode; -} zfs_node_t; - -typedef struct callback_data { - uu_avl_t *cb_avl; - int cb_flags; - zfs_type_t cb_types; - zfs_sort_column_t *cb_sortcol; - zprop_list_t **cb_proplist; - int cb_depth_limit; - int cb_depth; - uint8_t cb_props_table[ZFS_NUM_PROPS]; -} callback_data_t; - -uu_avl_pool_t *avl_pool; - -/* - * Include snaps if they were requested or if this a zfs list where types - * were not specified and the "listsnapshots" property is set on this pool. - */ -static boolean_t -zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb) -{ - zpool_handle_t *zph; - - if ((cb->cb_flags & ZFS_ITER_PROP_LISTSNAPS) == 0) - return (cb->cb_types & ZFS_TYPE_SNAPSHOT); - - zph = zfs_get_pool_handle(zhp); - return (zpool_get_prop_int(zph, ZPOOL_PROP_LISTSNAPS, NULL)); -} - -/* - * Called for each dataset. If the object is of an appropriate type, - * add it to the avl tree and recurse over any children as necessary. 
- */ -static int -zfs_callback(zfs_handle_t *zhp, void *data) -{ - callback_data_t *cb = data; - boolean_t should_close = B_TRUE; - boolean_t include_snaps = zfs_include_snapshots(zhp, cb); - boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK); - - if ((zfs_get_type(zhp) & cb->cb_types) || - ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) { - uu_avl_index_t idx; - zfs_node_t *node = safe_malloc(sizeof (zfs_node_t)); - - node->zn_handle = zhp; - uu_avl_node_init(node, &node->zn_avlnode, avl_pool); - if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol, - &idx) == NULL) { - if (cb->cb_proplist) { - if ((*cb->cb_proplist) && - !(*cb->cb_proplist)->pl_all) - zfs_prune_proplist(zhp, - cb->cb_props_table); - - if (zfs_expand_proplist(zhp, cb->cb_proplist, - (cb->cb_flags & ZFS_ITER_RECVD_PROPS), - (cb->cb_flags & ZFS_ITER_LITERAL_PROPS)) - != 0) { - free(node); - return (-1); - } - } - uu_avl_insert(cb->cb_avl, node, idx); - should_close = B_FALSE; - } else { - free(node); - } - } - - /* - * Recurse if necessary. 
- */ - if (cb->cb_flags & ZFS_ITER_RECURSE && - ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || - cb->cb_depth < cb->cb_depth_limit)) { - cb->cb_depth++; - if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) - (void) zfs_iter_filesystems(zhp, zfs_callback, data); - if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | - ZFS_TYPE_BOOKMARK)) == 0) && include_snaps) - (void) zfs_iter_snapshots(zhp, - (cb->cb_flags & ZFS_ITER_SIMPLE) != 0, zfs_callback, - data, 0, 0); - if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | - ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks) - (void) zfs_iter_bookmarks(zhp, zfs_callback, data); - cb->cb_depth--; - } - - if (should_close) - zfs_close(zhp); - - return (0); -} - -int -zfs_add_sort_column(zfs_sort_column_t **sc, const char *name, - boolean_t reverse) -{ - zfs_sort_column_t *col; - zfs_prop_t prop; - - if ((prop = zfs_name_to_prop(name)) == ZPROP_INVAL && - !zfs_prop_user(name)) - return (-1); - - col = safe_malloc(sizeof (zfs_sort_column_t)); - - col->sc_prop = prop; - col->sc_reverse = reverse; - if (prop == ZPROP_INVAL) { - col->sc_user_prop = safe_malloc(strlen(name) + 1); - (void) strcpy(col->sc_user_prop, name); - } - - if (*sc == NULL) { - col->sc_last = col; - *sc = col; - } else { - (*sc)->sc_last->sc_next = col; - (*sc)->sc_last = col; - } - - return (0); -} - -void -zfs_free_sort_columns(zfs_sort_column_t *sc) -{ - zfs_sort_column_t *col; - - while (sc != NULL) { - col = sc->sc_next; - free(sc->sc_user_prop); - free(sc); - sc = col; - } -} - -boolean_t -zfs_sort_only_by_name(const zfs_sort_column_t *sc) -{ - - return (sc != NULL && sc->sc_next == NULL && - sc->sc_prop == ZFS_PROP_NAME); -} - -/* ARGSUSED */ -static int -zfs_compare(const void *larg, const void *rarg, void *unused) -{ - zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; - zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; - const char *lname = zfs_get_name(l); - const char *rname = zfs_get_name(r); - char *lat, *rat; - uint64_t lcreate, rcreate; - int ret; - - lat = (char 
*)strchr(lname, '@'); - rat = (char *)strchr(rname, '@'); - - if (lat != NULL) - *lat = '\0'; - if (rat != NULL) - *rat = '\0'; - - ret = strcmp(lname, rname); - if (ret == 0 && (lat != NULL || rat != NULL)) { - /* - * If we're comparing a dataset to one of its snapshots, we - * always make the full dataset first. - */ - if (lat == NULL) { - ret = -1; - } else if (rat == NULL) { - ret = 1; - } else { - /* - * If we have two snapshots from the same dataset, then - * we want to sort them according to creation time. We - * use the hidden CREATETXG property to get an absolute - * ordering of snapshots. - */ - lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); - rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - - /* - * Both lcreate and rcreate being 0 means we don't have - * properties and we should compare full name. - */ - if (lcreate == 0 && rcreate == 0) - ret = strcmp(lat + 1, rat + 1); - else if (lcreate < rcreate) - ret = -1; - else if (lcreate > rcreate) - ret = 1; - } - } - - if (lat != NULL) - *lat = '@'; - if (rat != NULL) - *rat = '@'; - - return (ret); -} - -/* - * Sort datasets by specified columns. - * - * o Numeric types sort in ascending order. - * o String types sort in alphabetical order. - * o Types inappropriate for a row sort that row to the literal - * bottom, regardless of the specified ordering. - * - * If no sort columns are specified, or two datasets compare equally - * across all specified columns, they are sorted alphabetically by name - * with snapshots grouped under their parents. 
- */ -static int -zfs_sort(const void *larg, const void *rarg, void *data) -{ - zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; - zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; - zfs_sort_column_t *sc = (zfs_sort_column_t *)data; - zfs_sort_column_t *psc; - - for (psc = sc; psc != NULL; psc = psc->sc_next) { - char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN]; - char *lstr, *rstr; - uint64_t lnum, rnum; - boolean_t lvalid, rvalid; - int ret = 0; - - /* - * We group the checks below the generic code. If 'lstr' and - * 'rstr' are non-NULL, then we do a string based comparison. - * Otherwise, we compare 'lnum' and 'rnum'. - */ - lstr = rstr = NULL; - if (psc->sc_prop == ZPROP_INVAL) { - nvlist_t *luser, *ruser; - nvlist_t *lval, *rval; - - luser = zfs_get_user_props(l); - ruser = zfs_get_user_props(r); - - lvalid = (nvlist_lookup_nvlist(luser, - psc->sc_user_prop, &lval) == 0); - rvalid = (nvlist_lookup_nvlist(ruser, - psc->sc_user_prop, &rval) == 0); - - if (lvalid) - verify(nvlist_lookup_string(lval, - ZPROP_VALUE, &lstr) == 0); - if (rvalid) - verify(nvlist_lookup_string(rval, - ZPROP_VALUE, &rstr) == 0); - } else if (psc->sc_prop == ZFS_PROP_NAME) { - lvalid = rvalid = B_TRUE; - - (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf)); - (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf)); - - lstr = lbuf; - rstr = rbuf; - } else if (zfs_prop_is_string(psc->sc_prop)) { - lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf, - sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0); - rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf, - sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0); - - lstr = lbuf; - rstr = rbuf; - } else { - lvalid = zfs_prop_valid_for_type(psc->sc_prop, - zfs_get_type(l)); - rvalid = zfs_prop_valid_for_type(psc->sc_prop, - zfs_get_type(r)); - - if (lvalid) - (void) zfs_prop_get_numeric(l, psc->sc_prop, - &lnum, NULL, NULL, 0); - if (rvalid) - (void) zfs_prop_get_numeric(r, psc->sc_prop, - &rnum, NULL, NULL, 0); - } - - if (!lvalid && !rvalid) - continue; - else if 
(!lvalid) - return (1); - else if (!rvalid) - return (-1); - - if (lstr) - ret = strcmp(lstr, rstr); - else if (lnum < rnum) - ret = -1; - else if (lnum > rnum) - ret = 1; - - if (ret != 0) { - if (psc->sc_reverse == B_TRUE) - ret = (ret < 0) ? 1 : -1; - return (ret); - } - } - - return (zfs_compare(larg, rarg, NULL)); -} - -int -zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, - zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit, - zfs_iter_f callback, void *data) -{ - callback_data_t cb = {0}; - int ret = 0; - zfs_node_t *node; - uu_avl_walk_t *walk; - - avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t), - offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT); - - if (avl_pool == NULL) - nomem(); - - cb.cb_sortcol = sortcol; - cb.cb_flags = flags; - cb.cb_proplist = proplist; - cb.cb_types = types; - cb.cb_depth_limit = limit; - /* - * If cb_proplist is provided then in the zfs_handles created we - * retain only those properties listed in cb_proplist and sortcol. - * The rest are pruned. So, the caller should make sure that no other - * properties other than those listed in cb_proplist/sortcol are - * accessed. - * - * If cb_proplist is NULL then we retain all the properties. We - * always retain the zoned property, which some other properties - * need (userquota & friends), and the createtxg property, which - * we need to sort snapshots. 
- */ - if (cb.cb_proplist && *cb.cb_proplist) { - zprop_list_t *p = *cb.cb_proplist; - - while (p) { - if (p->pl_prop >= ZFS_PROP_TYPE && - p->pl_prop < ZFS_NUM_PROPS) { - cb.cb_props_table[p->pl_prop] = B_TRUE; - } - p = p->pl_next; - } - - while (sortcol) { - if (sortcol->sc_prop >= ZFS_PROP_TYPE && - sortcol->sc_prop < ZFS_NUM_PROPS) { - cb.cb_props_table[sortcol->sc_prop] = B_TRUE; - } - sortcol = sortcol->sc_next; - } - - cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE; - cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE; - } else { - (void) memset(cb.cb_props_table, B_TRUE, - sizeof (cb.cb_props_table)); - } - - if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) - nomem(); - - if (argc == 0) { - /* - * If given no arguments, iterate over all datasets. - */ - cb.cb_flags |= ZFS_ITER_RECURSE; - ret = zfs_iter_root(g_zfs, zfs_callback, &cb); - } else { - int i; - zfs_handle_t *zhp; - zfs_type_t argtype; - - /* - * If we're recursive, then we always allow filesystems as - * arguments. If we also are interested in snapshots or - * bookmarks, then we can take volumes as well. - */ - argtype = types; - if (flags & ZFS_ITER_RECURSE) { - argtype |= ZFS_TYPE_FILESYSTEM; - if (types & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) - argtype |= ZFS_TYPE_VOLUME; - } - - for (i = 0; i < argc; i++) { - if (flags & ZFS_ITER_ARGS_CAN_BE_PATHS) { - zhp = zfs_path_to_zhandle(g_zfs, argv[i], - argtype); - } else { - zhp = zfs_open(g_zfs, argv[i], argtype); - } - if (zhp != NULL) - ret |= zfs_callback(zhp, &cb); - else - ret = 1; - } - } - - /* - * At this point we've got our AVL tree full of zfs handles, so iterate - * over each one and execute the real user callback. - */ - for (node = uu_avl_first(cb.cb_avl); node != NULL; - node = uu_avl_next(cb.cb_avl, node)) - ret |= callback(node->zn_handle, data); - - /* - * Finally, clean up the AVL tree. 
- */ - if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) - nomem(); - - while ((node = uu_avl_walk_next(walk)) != NULL) { - uu_avl_remove(cb.cb_avl, node); - zfs_close(node->zn_handle); - free(node); - } - - uu_avl_walk_end(walk); - uu_avl_destroy(cb.cb_avl); - uu_avl_pool_destroy(avl_pool); - - return (ret); -} diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h b/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h deleted file mode 100644 index b89b466ce6fe..000000000000 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_iter.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
- */ - -#ifndef ZFS_ITER_H -#define ZFS_ITER_H - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct zfs_sort_column { - struct zfs_sort_column *sc_next; - struct zfs_sort_column *sc_last; - zfs_prop_t sc_prop; - char *sc_user_prop; - boolean_t sc_reverse; -} zfs_sort_column_t; - -#define ZFS_ITER_RECURSE (1 << 0) -#define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) -#define ZFS_ITER_PROP_LISTSNAPS (1 << 2) -#define ZFS_ITER_DEPTH_LIMIT (1 << 3) -#define ZFS_ITER_RECVD_PROPS (1 << 4) -#define ZFS_ITER_SIMPLE (1 << 5) -#define ZFS_ITER_LITERAL_PROPS (1 << 6) - -int zfs_for_each(int, char **, int options, zfs_type_t, - zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); -int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); -void zfs_free_sort_columns(zfs_sort_column_t *); -boolean_t zfs_sort_only_by_name(const zfs_sort_column_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* ZFS_ITER_H */ diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c deleted file mode 100644 index d453ba030488..000000000000 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c +++ /dev/null @@ -1,7592 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - * Copyright 2012 Milan Jurik. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2012 Martin Matuska . All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Igor Kozhukhov . - * Copyright 2016 Nexenta Systems, Inc. - * Copyright (c) 2019 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#ifdef illumos -#include -#include -#include -#include -#endif - -#include "zfs_iter.h" -#include "zfs_util.h" -#include "zfs_comutil.h" - -libzfs_handle_t *g_zfs; - -static FILE *mnttab_file; -static char history_str[HIS_MAX_RECORD_LEN]; -static boolean_t log_history = B_TRUE; - -static int zfs_do_clone(int argc, char **argv); -static int zfs_do_create(int argc, char **argv); -static int zfs_do_destroy(int argc, char **argv); -static int zfs_do_get(int argc, char **argv); -static int zfs_do_inherit(int argc, char **argv); -static int zfs_do_list(int argc, char **argv); -static int zfs_do_mount(int argc, char **argv); -static int zfs_do_rename(int argc, char **argv); -static int zfs_do_rollback(int argc, char **argv); -static int zfs_do_set(int argc, char **argv); -static int 
zfs_do_upgrade(int argc, char **argv); -static int zfs_do_snapshot(int argc, char **argv); -static int zfs_do_unmount(int argc, char **argv); -static int zfs_do_share(int argc, char **argv); -static int zfs_do_unshare(int argc, char **argv); -static int zfs_do_send(int argc, char **argv); -static int zfs_do_receive(int argc, char **argv); -static int zfs_do_promote(int argc, char **argv); -static int zfs_do_userspace(int argc, char **argv); -static int zfs_do_allow(int argc, char **argv); -static int zfs_do_unallow(int argc, char **argv); -static int zfs_do_hold(int argc, char **argv); -static int zfs_do_holds(int argc, char **argv); -static int zfs_do_release(int argc, char **argv); -static int zfs_do_diff(int argc, char **argv); -static int zfs_do_jail(int argc, char **argv); -static int zfs_do_unjail(int argc, char **argv); -static int zfs_do_bookmark(int argc, char **argv); -static int zfs_do_remap(int argc, char **argv); -static int zfs_do_channel_program(int argc, char **argv); - -/* - * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. - */ - -#ifdef DEBUG -const char * -_umem_debug_init(void) -{ - return ("default,verbose"); /* $UMEM_DEBUG setting */ -} - -const char * -_umem_logging_init(void) -{ - return ("fail,contents"); /* $UMEM_LOGGING setting */ -} -#endif - -typedef enum { - HELP_CLONE, - HELP_CREATE, - HELP_DESTROY, - HELP_GET, - HELP_INHERIT, - HELP_UPGRADE, - HELP_JAIL, - HELP_UNJAIL, - HELP_LIST, - HELP_MOUNT, - HELP_PROMOTE, - HELP_RECEIVE, - HELP_RENAME, - HELP_ROLLBACK, - HELP_SEND, - HELP_SET, - HELP_SHARE, - HELP_SNAPSHOT, - HELP_UNMOUNT, - HELP_UNSHARE, - HELP_ALLOW, - HELP_UNALLOW, - HELP_USERSPACE, - HELP_GROUPSPACE, - HELP_HOLD, - HELP_HOLDS, - HELP_RELEASE, - HELP_DIFF, - HELP_REMAP, - HELP_BOOKMARK, - HELP_CHANNEL_PROGRAM, -} zfs_help_t; - -typedef struct zfs_command { - const char *name; - int (*func)(int argc, char **argv); - zfs_help_t usage; -} zfs_command_t; - -/* - * Master command table. 
Each ZFS command has a name, associated function, and - * usage message. The usage messages need to be internationalized, so we have - * to have a function to return the usage message based on a command index. - * - * These commands are organized according to how they are displayed in the usage - * message. An empty command (one with a NULL name) indicates an empty line in - * the generic usage message. - */ -static zfs_command_t command_table[] = { - { "create", zfs_do_create, HELP_CREATE }, - { "destroy", zfs_do_destroy, HELP_DESTROY }, - { NULL }, - { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, - { "rollback", zfs_do_rollback, HELP_ROLLBACK }, - { "clone", zfs_do_clone, HELP_CLONE }, - { "promote", zfs_do_promote, HELP_PROMOTE }, - { "rename", zfs_do_rename, HELP_RENAME }, - { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, - { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, - { NULL }, - { "list", zfs_do_list, HELP_LIST }, - { NULL }, - { "set", zfs_do_set, HELP_SET }, - { "get", zfs_do_get, HELP_GET }, - { "inherit", zfs_do_inherit, HELP_INHERIT }, - { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, - { "userspace", zfs_do_userspace, HELP_USERSPACE }, - { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, - { NULL }, - { "mount", zfs_do_mount, HELP_MOUNT }, - { "unmount", zfs_do_unmount, HELP_UNMOUNT }, - { "share", zfs_do_share, HELP_SHARE }, - { "unshare", zfs_do_unshare, HELP_UNSHARE }, - { NULL }, - { "send", zfs_do_send, HELP_SEND }, - { "receive", zfs_do_receive, HELP_RECEIVE }, - { NULL }, - { "allow", zfs_do_allow, HELP_ALLOW }, - { NULL }, - { "unallow", zfs_do_unallow, HELP_UNALLOW }, - { NULL }, - { "hold", zfs_do_hold, HELP_HOLD }, - { "holds", zfs_do_holds, HELP_HOLDS }, - { "release", zfs_do_release, HELP_RELEASE }, - { "diff", zfs_do_diff, HELP_DIFF }, - { NULL }, - { "jail", zfs_do_jail, HELP_JAIL }, - { "unjail", zfs_do_unjail, HELP_UNJAIL }, - { "remap", zfs_do_remap, HELP_REMAP }, -}; - -#define NCOMMAND (sizeof (command_table) / 
sizeof (command_table[0])) - -zfs_command_t *current_command; - -static const char * -get_usage(zfs_help_t idx) -{ - switch (idx) { - case HELP_CLONE: - return (gettext("\tclone [-p] [-o property=value] ... " - " \n")); - case HELP_CREATE: - return (gettext("\tcreate [-pu] [-o property=value] ... " - "\n" - "\tcreate [-ps] [-b blocksize] [-o property=value] ... " - "-V \n")); - case HELP_DESTROY: - return (gettext("\tdestroy [-fnpRrv] \n" - "\tdestroy [-dnpRrv] " - "@[%][,...]\n" - "\tdestroy #\n")); - case HELP_GET: - return (gettext("\tget [-rHp] [-d max] " - "[-o \"all\" | field[,...]]\n" - "\t [-t type[,...]] [-s source[,...]]\n" - "\t <\"all\" | property[,...]> " - "[filesystem|volume|snapshot|bookmark] ...\n")); - case HELP_INHERIT: - return (gettext("\tinherit [-rS] " - " ...\n")); - case HELP_UPGRADE: - return (gettext("\tupgrade [-v]\n" - "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); - case HELP_JAIL: - return (gettext("\tjail \n")); - case HELP_UNJAIL: - return (gettext("\tunjail \n")); - case HELP_LIST: - return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " - "[-s property]...\n\t [-S property]... [-t type[,...]] " - "[filesystem|volume|snapshot] ...\n")); - case HELP_MOUNT: - return (gettext("\tmount\n" - "\tmount [-vO] [-o opts] <-a | filesystem>\n")); - case HELP_PROMOTE: - return (gettext("\tpromote \n")); - case HELP_RECEIVE: - return (gettext("\treceive|recv [-vnsFMu] \n" - "\treceive|recv [-vnsFMu] [-o origin=] [-d | -e] " - "\n" - "\treceive|recv -A \n")); - case HELP_RENAME: - return (gettext("\trename [-f] " - "\n" - "\trename [-f] -p \n" - "\trename -r \n" - "\trename \n" - "\trename -u [-p] ")); - case HELP_ROLLBACK: - return (gettext("\trollback [-rRf] \n")); - case HELP_SEND: - return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] " - "\n" - "\tsend [-LPcenv] [-i snapshot|bookmark] " - "\n" - "\tsend [-nvPe] -t \n")); - case HELP_SET: - return (gettext("\tset ... 
" - " ...\n")); - case HELP_SHARE: - return (gettext("\tshare <-a | filesystem>\n")); - case HELP_SNAPSHOT: - return (gettext("\tsnapshot|snap [-r] [-o property=value] ... " - "@ ...\n")); - case HELP_UNMOUNT: - return (gettext("\tunmount|umount [-f] " - "<-a | filesystem|mountpoint>\n")); - case HELP_UNSHARE: - return (gettext("\tunshare " - "<-a | filesystem|mountpoint>\n")); - case HELP_ALLOW: - return (gettext("\tallow \n" - "\tallow [-ldug] " - "<\"everyone\"|user|group>[,...] [,...]\n" - "\t \n" - "\tallow [-ld] -e [,...] " - "\n" - "\tallow -c [,...] \n" - "\tallow -s @setname [,...] " - "\n")); - case HELP_UNALLOW: - return (gettext("\tunallow [-rldug] " - "<\"everyone\"|user|group>[,...]\n" - "\t [[,...]] \n" - "\tunallow [-rld] -e [[,...]] " - "\n" - "\tunallow [-r] -c [[,...]] " - "\n" - "\tunallow [-r] -s @setname [[,...]] " - "\n")); - case HELP_USERSPACE: - return (gettext("\tuserspace [-Hinp] [-o field[,...]] " - "[-s field] ...\n" - "\t [-S field] ... [-t type[,...]] " - "\n")); - case HELP_GROUPSPACE: - return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " - "[-s field] ...\n" - "\t [-S field] ... [-t type[,...]] " - "\n")); - case HELP_HOLD: - return (gettext("\thold [-r] ...\n")); - case HELP_HOLDS: - return (gettext("\tholds [-Hp] [-r|-d depth] " - " ...\n")); - case HELP_RELEASE: - return (gettext("\trelease [-r] ...\n")); - case HELP_DIFF: - return (gettext("\tdiff [-FHt] " - "[snapshot|filesystem]\n")); - case HELP_REMAP: - return (gettext("\tremap \n")); - case HELP_BOOKMARK: - return (gettext("\tbookmark \n")); - case HELP_CHANNEL_PROGRAM: - return (gettext("\tprogram [-jn] [-t ] " - "[-m ] " - "[lua args...]\n")); - } - - abort(); - /* NOTREACHED */ -} - -void -nomem(void) -{ - (void) fprintf(stderr, gettext("internal error: out of memory\n")); - exit(1); -} - -/* - * Utility function to guarantee malloc() success. 
- */ - -void * -safe_malloc(size_t size) -{ - void *data; - - if ((data = calloc(1, size)) == NULL) - nomem(); - - return (data); -} - -void * -safe_realloc(void *data, size_t size) -{ - void *newp; - if ((newp = realloc(data, size)) == NULL) { - free(data); - nomem(); - } - - return (newp); -} - -static char * -safe_strdup(char *str) -{ - char *dupstr = strdup(str); - - if (dupstr == NULL) - nomem(); - - return (dupstr); -} - -/* - * Callback routine that will print out information for each of - * the properties. - */ -static int -usage_prop_cb(int prop, void *cb) -{ - FILE *fp = cb; - - (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); - - if (zfs_prop_readonly(prop)) - (void) fprintf(fp, " NO "); - else - (void) fprintf(fp, "YES "); - - if (zfs_prop_inheritable(prop)) - (void) fprintf(fp, " YES "); - else - (void) fprintf(fp, " NO "); - - if (zfs_prop_values(prop) == NULL) - (void) fprintf(fp, "-\n"); - else - (void) fprintf(fp, "%s\n", zfs_prop_values(prop)); - - return (ZPROP_CONT); -} - -/* - * Display usage message. If we're inside a command, display only the usage for - * that command. Otherwise, iterate over the entire command table and display - * a complete usage message. - */ -static void -usage(boolean_t requested) -{ - int i; - boolean_t show_properties = B_FALSE; - FILE *fp = requested ? 
stdout : stderr; - - if (current_command == NULL) { - - (void) fprintf(fp, gettext("usage: zfs command args ...\n")); - (void) fprintf(fp, - gettext("where 'command' is one of the following:\n\n")); - - for (i = 0; i < NCOMMAND; i++) { - if (command_table[i].name == NULL) - (void) fprintf(fp, "\n"); - else - (void) fprintf(fp, "%s", - get_usage(command_table[i].usage)); - } - - (void) fprintf(fp, gettext("\nEach dataset is of the form: " - "pool/[dataset/]*dataset[@name]\n")); - } else { - (void) fprintf(fp, gettext("usage:\n")); - (void) fprintf(fp, "%s", get_usage(current_command->usage)); - } - - if (current_command != NULL && - (strcmp(current_command->name, "set") == 0 || - strcmp(current_command->name, "get") == 0 || - strcmp(current_command->name, "inherit") == 0 || - strcmp(current_command->name, "list") == 0)) - show_properties = B_TRUE; - - if (show_properties) { - (void) fprintf(fp, - gettext("\nThe following properties are supported:\n")); - - (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", - "PROPERTY", "EDIT", "INHERIT", "VALUES"); - - /* Iterate over all properties */ - (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, - ZFS_TYPE_DATASET); - - (void) fprintf(fp, "\t%-15s ", "userused@..."); - (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "groupused@..."); - (void) fprintf(fp, " NO NO \n"); - (void) fprintf(fp, "\t%-15s ", "userquota@..."); - (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "groupquota@..."); - (void) fprintf(fp, "YES NO | none\n"); - (void) fprintf(fp, "\t%-15s ", "written@"); - (void) fprintf(fp, " NO NO \n"); - - (void) fprintf(fp, gettext("\nSizes are specified in bytes " - "with standard units such as K, M, G, etc.\n")); - (void) fprintf(fp, gettext("\nUser-defined properties can " - "be specified by using a name containing a colon (:).\n")); - (void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ " - "properties must be appended with\n" - "a user or group specifier of one of 
these forms:\n" - " POSIX name (eg: \"matt\")\n" - " POSIX id (eg: \"126829\")\n" - " SMB name@domain (eg: \"matt@sun\")\n" - " SMB SID (eg: \"S-1-234-567-89\")\n")); - } else { - (void) fprintf(fp, - gettext("\nFor the property list, run: %s\n"), - "zfs set|get"); - (void) fprintf(fp, - gettext("\nFor the delegated permission list, run: %s\n"), - "zfs allow|unallow"); - } - - /* - * See comments at end of main(). - */ - if (getenv("ZFS_ABORT") != NULL) { - (void) printf("dumping core by request\n"); - abort(); - } - - exit(requested ? 0 : 2); -} - -/* - * Take a property=value argument string and add it to the given nvlist. - * Modifies the argument inplace. - */ -static int -parseprop(nvlist_t *props, char *propname) -{ - char *propval, *strval; - - if ((propval = strchr(propname, '=')) == NULL) { - (void) fprintf(stderr, gettext("missing " - "'=' for property=value argument\n")); - return (-1); - } - *propval = '\0'; - propval++; - if (nvlist_lookup_string(props, propname, &strval) == 0) { - (void) fprintf(stderr, gettext("property '%s' " - "specified multiple times\n"), propname); - return (-1); - } - if (nvlist_add_string(props, propname, propval) != 0) - nomem(); - return (0); -} - -static int -parse_depth(char *opt, int *flags) -{ - char *tmp; - int depth; - - depth = (int)strtol(opt, &tmp, 0); - if (*tmp) { - (void) fprintf(stderr, - gettext("%s is not an integer\n"), opt); - usage(B_FALSE); - } - if (depth < 0) { - (void) fprintf(stderr, - gettext("Depth can not be negative.\n")); - usage(B_FALSE); - } - *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); - return (depth); -} - -#define PROGRESS_DELAY 2 /* seconds */ - -static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; -static time_t pt_begin; -static char *pt_header = NULL; -static boolean_t pt_shown; - -static void -start_progress_timer(void) -{ - pt_begin = time(NULL) + PROGRESS_DELAY; - pt_shown = B_FALSE; -} - -static void -set_progress_header(char *header) -{ - 
assert(pt_header == NULL); - pt_header = safe_strdup(header); - if (pt_shown) { - (void) printf("%s: ", header); - (void) fflush(stdout); - } -} - -static void -update_progress(char *update) -{ - if (!pt_shown && time(NULL) > pt_begin) { - int len = strlen(update); - - (void) printf("%s: %s%*.*s", pt_header, update, len, len, - pt_reverse); - (void) fflush(stdout); - pt_shown = B_TRUE; - } else if (pt_shown) { - int len = strlen(update); - - (void) printf("%s%*.*s", update, len, len, pt_reverse); - (void) fflush(stdout); - } -} - -static void -finish_progress(char *done) -{ - if (pt_shown) { - (void) printf("%s\n", done); - (void) fflush(stdout); - } - free(pt_header); - pt_header = NULL; -} - -/* - * Check if the dataset is mountable and should be automatically mounted. - */ -static boolean_t -should_auto_mount(zfs_handle_t *zhp) -{ - if (!zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp))) - return (B_FALSE); - return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON); -} - -/* - * zfs clone [-p] [-o prop=value] ... - * - * Given an existing dataset, create a writable copy whose initial contents - * are the same as the source. The newly created dataset maintains a - * dependency on the original; the original cannot be destroyed so long as - * the clone exists. - * - * The '-p' flag creates all the non-existing ancestors of the target first. 
- */ -static int -zfs_do_clone(int argc, char **argv) -{ - zfs_handle_t *zhp = NULL; - boolean_t parents = B_FALSE; - nvlist_t *props; - int ret = 0; - int c; - - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) - nomem(); - - /* check options */ - while ((c = getopt(argc, argv, "o:p")) != -1) { - switch (c) { - case 'o': - if (parseprop(props, optarg) != 0) - return (1); - break; - case 'p': - parents = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - goto usage; - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing source dataset " - "argument\n")); - goto usage; - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing target dataset " - "argument\n")); - goto usage; - } - if (argc > 2) { - (void) fprintf(stderr, gettext("too many arguments\n")); - goto usage; - } - - /* open the source dataset */ - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) - return (1); - - if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME)) { - /* - * Now create the ancestors of the target dataset. If the - * target already exists and '-p' option was used we should not - * complain. - */ - if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME)) - return (0); - if (zfs_create_ancestors(g_zfs, argv[1]) != 0) - return (1); - } - - /* pass to libzfs */ - ret = zfs_clone(zhp, argv[1], props); - - /* create the mountpoint if necessary */ - if (ret == 0) { - zfs_handle_t *clone; - - clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET); - if (clone != NULL) { - /* - * If the user doesn't want the dataset - * automatically mounted, then skip the mount/share - * step. 
- */ - if (should_auto_mount(clone)) { - if ((ret = zfs_mount(clone, NULL, 0)) != 0) { - (void) fprintf(stderr, gettext("clone " - "successfully created, " - "but not mounted\n")); - } else if ((ret = zfs_share(clone)) != 0) { - (void) fprintf(stderr, gettext("clone " - "successfully created, " - "but not shared\n")); - } - } - zfs_close(clone); - } - } - - zfs_close(zhp); - nvlist_free(props); - - return (!!ret); - -usage: - if (zhp) - zfs_close(zhp); - nvlist_free(props); - usage(B_FALSE); - return (-1); -} - -/* - * zfs create [-pu] [-o prop=value] ... fs - * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size - * - * Create a new dataset. This command can be used to create filesystems - * and volumes. Snapshot creation is handled by 'zfs snapshot'. - * For volumes, the user must specify a size to be used. - * - * The '-s' flag applies only to volumes, and indicates that we should not try - * to set the reservation for this volume. By default we set a reservation - * equal to the size for any volume. For pools with SPA_VERSION >= - * SPA_VERSION_REFRESERVATION, we set a refreservation instead. - * - * The '-p' flag creates all the non-existing ancestors of the target first. - * - * The '-u' flag prevents mounting of newly created file system. 
- */ -static int -zfs_do_create(int argc, char **argv) -{ - zfs_type_t type = ZFS_TYPE_FILESYSTEM; - zfs_handle_t *zhp = NULL; - uint64_t volsize = 0; - int c; - boolean_t noreserve = B_FALSE; - boolean_t bflag = B_FALSE; - boolean_t parents = B_FALSE; - boolean_t nomount = B_FALSE; - int ret = 1; - nvlist_t *props; - uint64_t intval; - - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) - nomem(); - - /* check options */ - while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) { - switch (c) { - case 'V': - type = ZFS_TYPE_VOLUME; - if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { - (void) fprintf(stderr, gettext("bad volume " - "size '%s': %s\n"), optarg, - libzfs_error_description(g_zfs)); - goto error; - } - - if (nvlist_add_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) - nomem(); - volsize = intval; - break; - case 'p': - parents = B_TRUE; - break; - case 'b': - bflag = B_TRUE; - if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { - (void) fprintf(stderr, gettext("bad volume " - "block size '%s': %s\n"), optarg, - libzfs_error_description(g_zfs)); - goto error; - } - - if (nvlist_add_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - intval) != 0) - nomem(); - break; - case 'o': - if (parseprop(props, optarg) != 0) - goto error; - break; - case 's': - noreserve = B_TRUE; - break; - case 'u': - nomount = B_TRUE; - break; - case ':': - (void) fprintf(stderr, gettext("missing size " - "argument\n")); - goto badusage; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - goto badusage; - } - } - - if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { - (void) fprintf(stderr, gettext("'-s' and '-b' can only be " - "used when creating a volume\n")); - goto badusage; - } - if (nomount && type != ZFS_TYPE_FILESYSTEM) { - (void) fprintf(stderr, gettext("'-u' can only be " - "used when creating a file system\n")); - goto badusage; - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - 
if (argc == 0) { - (void) fprintf(stderr, gettext("missing %s argument\n"), - zfs_type_to_name(type)); - goto badusage; - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - goto badusage; - } - - if (type == ZFS_TYPE_VOLUME && !noreserve) { - zpool_handle_t *zpool_handle; - nvlist_t *real_props = NULL; - uint64_t spa_version; - char *p; - zfs_prop_t resv_prop; - char *strval; - char msg[1024]; - - if ((p = strchr(argv[0], '/')) != NULL) - *p = '\0'; - zpool_handle = zpool_open(g_zfs, argv[0]); - if (p != NULL) - *p = '/'; - if (zpool_handle == NULL) - goto error; - spa_version = zpool_get_prop_int(zpool_handle, - ZPOOL_PROP_VERSION, NULL); - if (spa_version >= SPA_VERSION_REFRESERVATION) - resv_prop = ZFS_PROP_REFRESERVATION; - else - resv_prop = ZFS_PROP_RESERVATION; - - (void) snprintf(msg, sizeof (msg), - gettext("cannot create '%s'"), argv[0]); - if (props && (real_props = zfs_valid_proplist(g_zfs, type, - props, 0, NULL, zpool_handle, msg)) == NULL) { - zpool_close(zpool_handle); - goto error; - } - zpool_close(zpool_handle); - - volsize = zvol_volsize_to_reservation(volsize, real_props); - nvlist_free(real_props); - - if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), - &strval) != 0) { - if (nvlist_add_uint64(props, - zfs_prop_to_name(resv_prop), volsize) != 0) { - nvlist_free(props); - nomem(); - } - } - } - - if (parents && zfs_name_valid(argv[0], type)) { - /* - * Now create the ancestors of target dataset. If the target - * already exists and '-p' option was used we should not - * complain. - */ - if (zfs_dataset_exists(g_zfs, argv[0], type)) { - ret = 0; - goto error; - } - if (zfs_create_ancestors(g_zfs, argv[0]) != 0) - goto error; - } - - /* pass to libzfs */ - if (zfs_create(g_zfs, argv[0], type, props) != 0) - goto error; - - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) - goto error; - - ret = 0; - - /* - * Mount and/or share the new filesystem as appropriate. 
We provide a - * verbose error message to let the user know that their filesystem was - * in fact created, even if we failed to mount or share it. - * If the user doesn't want the dataset automatically mounted, - * then skip the mount/share step altogether. - */ - if (!nomount && should_auto_mount(zhp)) { - if (zfs_mount(zhp, NULL, 0) != 0) { - (void) fprintf(stderr, gettext("filesystem " - "successfully created, but not mounted\n")); - ret = 1; - } else if (zfs_share(zhp) != 0) { - (void) fprintf(stderr, gettext("filesystem " - "successfully created, but not shared\n")); - ret = 1; - } - } - -error: - if (zhp) - zfs_close(zhp); - nvlist_free(props); - return (ret); -badusage: - nvlist_free(props); - usage(B_FALSE); - return (2); -} - -/* - * zfs destroy [-rRf] - * zfs destroy [-rRd] - * - * -r Recursively destroy all children - * -R Recursively destroy all dependents, including clones - * -f Force unmounting of any dependents - * -d If we can't destroy now, mark for deferred destruction - * - * Destroys the given dataset. By default, it will unmount any filesystems, - * and refuse to destroy a dataset that has any dependents. A dependent can - * either be a child, or a clone of a child. - */ -typedef struct destroy_cbdata { - boolean_t cb_first; - boolean_t cb_force; - boolean_t cb_recurse; - boolean_t cb_error; - boolean_t cb_doclones; - zfs_handle_t *cb_target; - boolean_t cb_defer_destroy; - boolean_t cb_verbose; - boolean_t cb_parsable; - boolean_t cb_dryrun; - nvlist_t *cb_nvl; - nvlist_t *cb_batchedsnaps; - - /* first snap in contiguous run */ - char *cb_firstsnap; - /* previous snap in contiguous run */ - char *cb_prevsnap; - int64_t cb_snapused; - char *cb_snapspec; - char *cb_bookmark; -} destroy_cbdata_t; - -/* - * Check for any dependents based on the '-r' or '-R' flags. 
- */ -static int -destroy_check_dependent(zfs_handle_t *zhp, void *data) -{ - destroy_cbdata_t *cbp = data; - const char *tname = zfs_get_name(cbp->cb_target); - const char *name = zfs_get_name(zhp); - - if (strncmp(tname, name, strlen(tname)) == 0 && - (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { - /* - * This is a direct descendant, not a clone somewhere else in - * the hierarchy. - */ - if (cbp->cb_recurse) - goto out; - - if (cbp->cb_first) { - (void) fprintf(stderr, gettext("cannot destroy '%s': " - "%s has children\n"), - zfs_get_name(cbp->cb_target), - zfs_type_to_name(zfs_get_type(cbp->cb_target))); - (void) fprintf(stderr, gettext("use '-r' to destroy " - "the following datasets:\n")); - cbp->cb_first = B_FALSE; - cbp->cb_error = B_TRUE; - } - - (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); - } else { - /* - * This is a clone. We only want to report this if the '-r' - * wasn't specified, or the target is a snapshot. - */ - if (!cbp->cb_recurse && - zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) - goto out; - - if (cbp->cb_first) { - (void) fprintf(stderr, gettext("cannot destroy '%s': " - "%s has dependent clones\n"), - zfs_get_name(cbp->cb_target), - zfs_type_to_name(zfs_get_type(cbp->cb_target))); - (void) fprintf(stderr, gettext("use '-R' to destroy " - "the following datasets:\n")); - cbp->cb_first = B_FALSE; - cbp->cb_error = B_TRUE; - cbp->cb_dryrun = B_TRUE; - } - - (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); - } - -out: - zfs_close(zhp); - return (0); -} - -static int -destroy_callback(zfs_handle_t *zhp, void *data) -{ - destroy_cbdata_t *cb = data; - const char *name = zfs_get_name(zhp); - - if (cb->cb_verbose) { - if (cb->cb_parsable) { - (void) printf("destroy\t%s\n", name); - } else if (cb->cb_dryrun) { - (void) printf(gettext("would destroy %s\n"), - name); - } else { - (void) printf(gettext("will destroy %s\n"), - name); - } - } - - /* - * Ignore pools (which we've already flagged as an error before getting - 
* here). - */ - if (strchr(zfs_get_name(zhp), '/') == NULL && - zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { - zfs_close(zhp); - return (0); - } - if (cb->cb_dryrun) { - zfs_close(zhp); - return (0); - } - - /* - * We batch up all contiguous snapshots (even of different - * filesystems) and destroy them with one ioctl. We can't - * simply do all snap deletions and then all fs deletions, - * because we must delete a clone before its origin. - */ - if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { - fnvlist_add_boolean(cb->cb_batchedsnaps, name); - } else { - int error = zfs_destroy_snaps_nvl(g_zfs, - cb->cb_batchedsnaps, B_FALSE); - fnvlist_free(cb->cb_batchedsnaps); - cb->cb_batchedsnaps = fnvlist_alloc(); - - if (error != 0 || - zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || - zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { - zfs_close(zhp); - return (-1); - } - } - - zfs_close(zhp); - return (0); -} - -static int -destroy_print_cb(zfs_handle_t *zhp, void *arg) -{ - destroy_cbdata_t *cb = arg; - const char *name = zfs_get_name(zhp); - int err = 0; - - if (nvlist_exists(cb->cb_nvl, name)) { - if (cb->cb_firstsnap == NULL) - cb->cb_firstsnap = strdup(name); - if (cb->cb_prevsnap != NULL) - free(cb->cb_prevsnap); - /* this snap continues the current range */ - cb->cb_prevsnap = strdup(name); - if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) - nomem(); - if (cb->cb_verbose) { - if (cb->cb_parsable) { - (void) printf("destroy\t%s\n", name); - } else if (cb->cb_dryrun) { - (void) printf(gettext("would destroy %s\n"), - name); - } else { - (void) printf(gettext("will destroy %s\n"), - name); - } - } - } else if (cb->cb_firstsnap != NULL) { - /* end of this range */ - uint64_t used = 0; - err = lzc_snaprange_space(cb->cb_firstsnap, - cb->cb_prevsnap, &used); - cb->cb_snapused += used; - free(cb->cb_firstsnap); - cb->cb_firstsnap = NULL; - free(cb->cb_prevsnap); - cb->cb_prevsnap = NULL; - } - zfs_close(zhp); - return (err); -} - -static int 
-destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) -{ - int err = 0; - assert(cb->cb_firstsnap == NULL); - assert(cb->cb_prevsnap == NULL); - err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0); - if (cb->cb_firstsnap != NULL) { - uint64_t used = 0; - if (err == 0) { - err = lzc_snaprange_space(cb->cb_firstsnap, - cb->cb_prevsnap, &used); - } - cb->cb_snapused += used; - free(cb->cb_firstsnap); - cb->cb_firstsnap = NULL; - free(cb->cb_prevsnap); - cb->cb_prevsnap = NULL; - } - return (err); -} - -static int -snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) -{ - destroy_cbdata_t *cb = arg; - int err = 0; - - /* Check for clones. */ - if (!cb->cb_doclones && !cb->cb_defer_destroy) { - cb->cb_target = zhp; - cb->cb_first = B_TRUE; - err = zfs_iter_dependents(zhp, B_TRUE, - destroy_check_dependent, cb); - } - - if (err == 0) { - if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) - nomem(); - } - zfs_close(zhp); - return (err); -} - -static int -gather_snapshots(zfs_handle_t *zhp, void *arg) -{ - destroy_cbdata_t *cb = arg; - int err = 0; - - err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); - if (err == ENOENT) - err = 0; - if (err != 0) - goto out; - - if (cb->cb_verbose) { - err = destroy_print_snapshots(zhp, cb); - if (err != 0) - goto out; - } - - if (cb->cb_recurse) - err = zfs_iter_filesystems(zhp, gather_snapshots, cb); - -out: - zfs_close(zhp); - return (err); -} - -static int -destroy_clones(destroy_cbdata_t *cb) -{ - nvpair_t *pair; - for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); - pair != NULL; - pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { - zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), - ZFS_TYPE_SNAPSHOT); - if (zhp != NULL) { - boolean_t defer = cb->cb_defer_destroy; - int err = 0; - - /* - * We can't defer destroy non-snapshots, so set it to - * false while destroying the clones. 
- */ - cb->cb_defer_destroy = B_FALSE; - err = zfs_iter_dependents(zhp, B_FALSE, - destroy_callback, cb); - cb->cb_defer_destroy = defer; - zfs_close(zhp); - if (err != 0) - return (err); - } - } - return (0); -} - -static int -zfs_do_destroy(int argc, char **argv) -{ - destroy_cbdata_t cb = { 0 }; - int rv = 0; - int err = 0; - int c; - zfs_handle_t *zhp = NULL; - char *at, *pound; - zfs_type_t type = ZFS_TYPE_DATASET; - - /* check options */ - while ((c = getopt(argc, argv, "vpndfrR")) != -1) { - switch (c) { - case 'v': - cb.cb_verbose = B_TRUE; - break; - case 'p': - cb.cb_verbose = B_TRUE; - cb.cb_parsable = B_TRUE; - break; - case 'n': - cb.cb_dryrun = B_TRUE; - break; - case 'd': - cb.cb_defer_destroy = B_TRUE; - type = ZFS_TYPE_SNAPSHOT; - break; - case 'f': - cb.cb_force = B_TRUE; - break; - case 'r': - cb.cb_recurse = B_TRUE; - break; - case 'R': - cb.cb_recurse = B_TRUE; - cb.cb_doclones = B_TRUE; - break; - case '?': - default: - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc == 0) { - (void) fprintf(stderr, gettext("missing dataset argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - at = strchr(argv[0], '@'); - pound = strchr(argv[0], '#'); - if (at != NULL) { - - /* Build the list of snaps to destroy in cb_nvl. 
*/ - cb.cb_nvl = fnvlist_alloc(); - - *at = '\0'; - zhp = zfs_open(g_zfs, argv[0], - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - return (1); - - cb.cb_snapspec = at + 1; - if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || - cb.cb_error) { - rv = 1; - goto out; - } - - if (nvlist_empty(cb.cb_nvl)) { - (void) fprintf(stderr, gettext("could not find any " - "snapshots to destroy; check snapshot names.\n")); - rv = 1; - goto out; - } - - if (cb.cb_verbose) { - char buf[16]; - zfs_nicenum(cb.cb_snapused, buf, sizeof (buf)); - if (cb.cb_parsable) { - (void) printf("reclaim\t%llu\n", - cb.cb_snapused); - } else if (cb.cb_dryrun) { - (void) printf(gettext("would reclaim %s\n"), - buf); - } else { - (void) printf(gettext("will reclaim %s\n"), - buf); - } - } - - if (!cb.cb_dryrun) { - if (cb.cb_doclones) { - cb.cb_batchedsnaps = fnvlist_alloc(); - err = destroy_clones(&cb); - if (err == 0) { - err = zfs_destroy_snaps_nvl(g_zfs, - cb.cb_batchedsnaps, B_FALSE); - } - if (err != 0) { - rv = 1; - goto out; - } - } - if (err == 0) { - err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, - cb.cb_defer_destroy); - } - } - - if (err != 0) - rv = 1; - } else if (pound != NULL) { - int err; - nvlist_t *nvl; - - if (cb.cb_dryrun) { - (void) fprintf(stderr, - "dryrun is not supported with bookmark\n"); - return (-1); - } - - if (cb.cb_defer_destroy) { - (void) fprintf(stderr, - "defer destroy is not supported with bookmark\n"); - return (-1); - } - - if (cb.cb_recurse) { - (void) fprintf(stderr, - "recursive is not supported with bookmark\n"); - return (-1); - } - - if (!zfs_bookmark_exists(argv[0])) { - (void) fprintf(stderr, gettext("bookmark '%s' " - "does not exist.\n"), argv[0]); - return (1); - } - - nvl = fnvlist_alloc(); - fnvlist_add_boolean(nvl, argv[0]); - - err = lzc_destroy_bookmarks(nvl, NULL); - if (err != 0) { - (void) zfs_standard_error(g_zfs, err, - "cannot destroy bookmark"); - } - - nvlist_free(cb.cb_nvl); - - return (err); - } else { - /* Open the 
given dataset */ - if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) - return (1); - - cb.cb_target = zhp; - - /* - * Perform an explicit check for pools before going any further. - */ - if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && - zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { - (void) fprintf(stderr, gettext("cannot destroy '%s': " - "operation does not apply to pools\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use 'zfs destroy -r " - "%s' to destroy all datasets in the pool\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use 'zpool destroy %s' " - "to destroy the pool itself\n"), zfs_get_name(zhp)); - rv = 1; - goto out; - } - - /* - * Check for any dependents and/or clones. - */ - cb.cb_first = B_TRUE; - if (!cb.cb_doclones && - zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, - &cb) != 0) { - rv = 1; - goto out; - } - - if (cb.cb_error) { - rv = 1; - goto out; - } - - cb.cb_batchedsnaps = fnvlist_alloc(); - if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, - &cb) != 0) { - rv = 1; - goto out; - } - - /* - * Do the real thing. The callback will close the - * handle regardless of whether it succeeds or not. - */ - err = destroy_callback(zhp, &cb); - zhp = NULL; - if (err == 0) { - err = zfs_destroy_snaps_nvl(g_zfs, - cb.cb_batchedsnaps, cb.cb_defer_destroy); - } - if (err != 0) - rv = 1; - } - -out: - fnvlist_free(cb.cb_batchedsnaps); - fnvlist_free(cb.cb_nvl); - if (zhp != NULL) - zfs_close(zhp); - return (rv); -} - -static boolean_t -is_recvd_column(zprop_get_cbdata_t *cbp) -{ - int i; - zfs_get_column_t col; - - for (i = 0; i < ZFS_GET_NCOLS && - (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) - if (col == GET_COL_RECVD) - return (B_TRUE); - return (B_FALSE); -} - -/* - * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] - * < all | property[,property]... > < fs | snap | vol > ... - * - * -r recurse over any child datasets - * -H scripted mode. 
Headers are stripped, and fields are separated - * by tabs instead of spaces. - * -o Set of fields to display. One of "name,property,value, - * received,source". Default is "name,property,value,source". - * "all" is an alias for all five. - * -s Set of sources to allow. One of - * "local,default,inherited,received,temporary,none". Default is - * all six. - * -p Display values in parsable (literal) format. - * - * Prints properties for the given datasets. The user can control which - * columns to display as well as which property types to allow. - */ - -/* - * Invoked to display the properties for a single dataset. - */ -static int -get_callback(zfs_handle_t *zhp, void *data) -{ - char buf[ZFS_MAXPROPLEN]; - char rbuf[ZFS_MAXPROPLEN]; - zprop_source_t sourcetype; - char source[ZFS_MAX_DATASET_NAME_LEN]; - zprop_get_cbdata_t *cbp = data; - nvlist_t *user_props = zfs_get_user_props(zhp); - zprop_list_t *pl = cbp->cb_proplist; - nvlist_t *propval; - char *strval; - char *sourceval; - boolean_t received = is_recvd_column(cbp); - - for (; pl != NULL; pl = pl->pl_next) { - char *recvdval = NULL; - /* - * Skip the special fake placeholder. This will also skip over - * the name property when 'all' is specified. 
- */ - if (pl->pl_prop == ZFS_PROP_NAME && - pl == cbp->cb_proplist) - continue; - - if (pl->pl_prop != ZPROP_INVAL) { - if (zfs_prop_get(zhp, pl->pl_prop, buf, - sizeof (buf), &sourcetype, source, - sizeof (source), - cbp->cb_literal) != 0) { - if (pl->pl_all) - continue; - if (!zfs_prop_valid_for_type(pl->pl_prop, - ZFS_TYPE_DATASET)) { - (void) fprintf(stderr, - gettext("No such property '%s'\n"), - zfs_prop_to_name(pl->pl_prop)); - continue; - } - sourcetype = ZPROP_SRC_NONE; - (void) strlcpy(buf, "-", sizeof (buf)); - } - - if (received && (zfs_prop_get_recvd(zhp, - zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), - cbp->cb_literal) == 0)) - recvdval = rbuf; - - zprop_print_one_property(zfs_get_name(zhp), cbp, - zfs_prop_to_name(pl->pl_prop), - buf, sourcetype, source, recvdval); - } else if (zfs_prop_userquota(pl->pl_user_prop)) { - sourcetype = ZPROP_SRC_LOCAL; - - if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, - buf, sizeof (buf), cbp->cb_literal) != 0) { - sourcetype = ZPROP_SRC_NONE; - (void) strlcpy(buf, "-", sizeof (buf)); - } - - zprop_print_one_property(zfs_get_name(zhp), cbp, - pl->pl_user_prop, buf, sourcetype, source, NULL); - } else if (zfs_prop_written(pl->pl_user_prop)) { - sourcetype = ZPROP_SRC_LOCAL; - - if (zfs_prop_get_written(zhp, pl->pl_user_prop, - buf, sizeof (buf), cbp->cb_literal) != 0) { - sourcetype = ZPROP_SRC_NONE; - (void) strlcpy(buf, "-", sizeof (buf)); - } - - zprop_print_one_property(zfs_get_name(zhp), cbp, - pl->pl_user_prop, buf, sourcetype, source, NULL); - } else { - if (nvlist_lookup_nvlist(user_props, - pl->pl_user_prop, &propval) != 0) { - if (pl->pl_all) - continue; - sourcetype = ZPROP_SRC_NONE; - strval = "-"; - } else { - verify(nvlist_lookup_string(propval, - ZPROP_VALUE, &strval) == 0); - verify(nvlist_lookup_string(propval, - ZPROP_SOURCE, &sourceval) == 0); - - if (strcmp(sourceval, - zfs_get_name(zhp)) == 0) { - sourcetype = ZPROP_SRC_LOCAL; - } else if (strcmp(sourceval, - ZPROP_SOURCE_VAL_RECVD) == 0) 
{ - sourcetype = ZPROP_SRC_RECEIVED; - } else { - sourcetype = ZPROP_SRC_INHERITED; - (void) strlcpy(source, - sourceval, sizeof (source)); - } - } - - if (received && (zfs_prop_get_recvd(zhp, - pl->pl_user_prop, rbuf, sizeof (rbuf), - cbp->cb_literal) == 0)) - recvdval = rbuf; - - zprop_print_one_property(zfs_get_name(zhp), cbp, - pl->pl_user_prop, strval, sourcetype, - source, recvdval); - } - } - - return (0); -} - -static int -zfs_do_get(int argc, char **argv) -{ - zprop_get_cbdata_t cb = { 0 }; - int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; - int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; - char *value, *fields; - int ret = 0; - int limit = 0; - zprop_list_t fake_name = { 0 }; - - /* - * Set up default columns and sources. - */ - cb.cb_sources = ZPROP_SRC_ALL; - cb.cb_columns[0] = GET_COL_NAME; - cb.cb_columns[1] = GET_COL_PROPERTY; - cb.cb_columns[2] = GET_COL_VALUE; - cb.cb_columns[3] = GET_COL_SOURCE; - cb.cb_type = ZFS_TYPE_DATASET; - - /* check options */ - while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { - switch (c) { - case 'p': - cb.cb_literal = B_TRUE; - break; - case 'd': - limit = parse_depth(optarg, &flags); - break; - case 'r': - flags |= ZFS_ITER_RECURSE; - break; - case 'H': - cb.cb_scripted = B_TRUE; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case 'o': - /* - * Process the set of columns to display. We zero out - * the structure to give us a blank slate. 
- */ - bzero(&cb.cb_columns, sizeof (cb.cb_columns)); - i = 0; - while (*optarg != '\0') { - static char *col_subopts[] = - { "name", "property", "value", "received", - "source", "all", NULL }; - - if (i == ZFS_GET_NCOLS) { - (void) fprintf(stderr, gettext("too " - "many fields given to -o " - "option\n")); - usage(B_FALSE); - } - - switch (getsubopt(&optarg, col_subopts, - &value)) { - case 0: - cb.cb_columns[i++] = GET_COL_NAME; - break; - case 1: - cb.cb_columns[i++] = GET_COL_PROPERTY; - break; - case 2: - cb.cb_columns[i++] = GET_COL_VALUE; - break; - case 3: - cb.cb_columns[i++] = GET_COL_RECVD; - flags |= ZFS_ITER_RECVD_PROPS; - break; - case 4: - cb.cb_columns[i++] = GET_COL_SOURCE; - break; - case 5: - if (i > 0) { - (void) fprintf(stderr, - gettext("\"all\" conflicts " - "with specific fields " - "given to -o option\n")); - usage(B_FALSE); - } - cb.cb_columns[0] = GET_COL_NAME; - cb.cb_columns[1] = GET_COL_PROPERTY; - cb.cb_columns[2] = GET_COL_VALUE; - cb.cb_columns[3] = GET_COL_RECVD; - cb.cb_columns[4] = GET_COL_SOURCE; - flags |= ZFS_ITER_RECVD_PROPS; - i = ZFS_GET_NCOLS; - break; - default: - (void) fprintf(stderr, - gettext("invalid column name " - "'%s'\n"), suboptarg); - usage(B_FALSE); - } - } - break; - - case 's': - cb.cb_sources = 0; - while (*optarg != '\0') { - static char *source_subopts[] = { - "local", "default", "inherited", - "received", "temporary", "none", - NULL }; - - switch (getsubopt(&optarg, source_subopts, - &value)) { - case 0: - cb.cb_sources |= ZPROP_SRC_LOCAL; - break; - case 1: - cb.cb_sources |= ZPROP_SRC_DEFAULT; - break; - case 2: - cb.cb_sources |= ZPROP_SRC_INHERITED; - break; - case 3: - cb.cb_sources |= ZPROP_SRC_RECEIVED; - break; - case 4: - cb.cb_sources |= ZPROP_SRC_TEMPORARY; - break; - case 5: - cb.cb_sources |= ZPROP_SRC_NONE; - break; - default: - (void) fprintf(stderr, - gettext("invalid source " - "'%s'\n"), suboptarg); - usage(B_FALSE); - } - } - break; - - case 't': - types = 0; - flags &= 
~ZFS_ITER_PROP_LISTSNAPS; - while (*optarg != '\0') { - static char *type_subopts[] = { "filesystem", - "volume", "snapshot", "bookmark", - "all", NULL }; - - switch (getsubopt(&optarg, type_subopts, - &value)) { - case 0: - types |= ZFS_TYPE_FILESYSTEM; - break; - case 1: - types |= ZFS_TYPE_VOLUME; - break; - case 2: - types |= ZFS_TYPE_SNAPSHOT; - break; - case 3: - types |= ZFS_TYPE_BOOKMARK; - break; - case 4: - types = ZFS_TYPE_DATASET | - ZFS_TYPE_BOOKMARK; - break; - - default: - (void) fprintf(stderr, - gettext("invalid type '%s'\n"), - suboptarg); - usage(B_FALSE); - } - } - break; - - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing property " - "argument\n")); - usage(B_FALSE); - } - - fields = argv[0]; - - if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) - != 0) - usage(B_FALSE); - - argc--; - argv++; - - /* - * As part of zfs_expand_proplist(), we keep track of the maximum column - * width for each property. For the 'NAME' (and 'SOURCE') columns, we - * need to know the maximum name length. However, the user likely did - * not specify 'name' as one of the properties to fetch, so we need to - * make sure we always include at least this property for - * print_get_headers() to work properly. - */ - if (cb.cb_proplist != NULL) { - fake_name.pl_prop = ZFS_PROP_NAME; - fake_name.pl_width = strlen(gettext("NAME")); - fake_name.pl_next = cb.cb_proplist; - cb.cb_proplist = &fake_name; - } - - cb.cb_first = B_TRUE; - - /* run for each object */ - ret = zfs_for_each(argc, argv, flags, types, NULL, - &cb.cb_proplist, limit, get_callback, &cb); - - if (cb.cb_proplist == &fake_name) - zprop_free_list(fake_name.pl_next); - else - zprop_free_list(cb.cb_proplist); - - return (ret); -} - -/* - * inherit [-rS] ... 
- * - * -r Recurse over all children - * -S Revert to received value, if any - * - * For each dataset specified on the command line, inherit the given property - * from its parent. Inheriting a property at the pool level will cause it to - * use the default value. The '-r' flag will recurse over all children, and is - * useful for setting a property on a hierarchy-wide basis, regardless of any - * local modifications for each dataset. - */ - -typedef struct inherit_cbdata { - const char *cb_propname; - boolean_t cb_received; -} inherit_cbdata_t; - -static int -inherit_recurse_cb(zfs_handle_t *zhp, void *data) -{ - inherit_cbdata_t *cb = data; - zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); - - /* - * If we're doing it recursively, then ignore properties that - * are not valid for this type of dataset. - */ - if (prop != ZPROP_INVAL && - !zfs_prop_valid_for_type(prop, zfs_get_type(zhp))) - return (0); - - return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); -} - -static int -inherit_cb(zfs_handle_t *zhp, void *data) -{ - inherit_cbdata_t *cb = data; - - return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); -} - -static int -zfs_do_inherit(int argc, char **argv) -{ - int c; - zfs_prop_t prop; - inherit_cbdata_t cb = { 0 }; - char *propname; - int ret = 0; - int flags = 0; - boolean_t received = B_FALSE; - - /* check options */ - while ((c = getopt(argc, argv, "rS")) != -1) { - switch (c) { - case 'r': - flags |= ZFS_ITER_RECURSE; - break; - case 'S': - received = B_TRUE; - break; - case '?': - default: - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing property argument\n")); - usage(B_FALSE); - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing dataset argument\n")); - usage(B_FALSE); - } - - propname = argv[0]; - argc--; - argv++; - - if 
((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { - if (zfs_prop_readonly(prop)) { - (void) fprintf(stderr, gettext( - "%s property is read-only\n"), - propname); - return (1); - } - if (!zfs_prop_inheritable(prop) && !received) { - (void) fprintf(stderr, gettext("'%s' property cannot " - "be inherited\n"), propname); - if (prop == ZFS_PROP_QUOTA || - prop == ZFS_PROP_RESERVATION || - prop == ZFS_PROP_REFQUOTA || - prop == ZFS_PROP_REFRESERVATION) { - (void) fprintf(stderr, gettext("use 'zfs set " - "%s=none' to clear\n"), propname); - (void) fprintf(stderr, gettext("use 'zfs " - "inherit -S %s' to revert to received " - "value\n"), propname); - } - return (1); - } - if (received && (prop == ZFS_PROP_VOLSIZE || - prop == ZFS_PROP_VERSION)) { - (void) fprintf(stderr, gettext("'%s' property cannot " - "be reverted to a received value\n"), propname); - return (1); - } - } else if (!zfs_prop_user(propname)) { - (void) fprintf(stderr, gettext("invalid property '%s'\n"), - propname); - usage(B_FALSE); - } - - cb.cb_propname = propname; - cb.cb_received = received; - - if (flags & ZFS_ITER_RECURSE) { - ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, 0, inherit_recurse_cb, &cb); - } else { - ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, 0, inherit_cb, &cb); - } - - return (ret); -} - -typedef struct upgrade_cbdata { - uint64_t cb_numupgraded; - uint64_t cb_numsamegraded; - uint64_t cb_numfailed; - uint64_t cb_version; - boolean_t cb_newer; - boolean_t cb_foundone; - char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; -} upgrade_cbdata_t; - -static int -same_pool(zfs_handle_t *zhp, const char *name) -{ - int len1 = strcspn(name, "/@"); - const char *zhname = zfs_get_name(zhp); - int len2 = strcspn(zhname, "/@"); - - if (len1 != len2) - return (B_FALSE); - return (strncmp(name, zhname, len1) == 0); -} - -static int -upgrade_list_callback(zfs_handle_t *zhp, void *data) -{ - upgrade_cbdata_t *cb = data; - int version = 
zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - - /* list if it's old/new */ - if ((!cb->cb_newer && version < ZPL_VERSION) || - (cb->cb_newer && version > ZPL_VERSION)) { - char *str; - if (cb->cb_newer) { - str = gettext("The following filesystems are " - "formatted using a newer software version and\n" - "cannot be accessed on the current system.\n\n"); - } else { - str = gettext("The following filesystems are " - "out of date, and can be upgraded. After being\n" - "upgraded, these filesystems (and any 'zfs send' " - "streams generated from\n" - "subsequent snapshots) will no longer be " - "accessible by older software versions.\n\n"); - } - - if (!cb->cb_foundone) { - (void) puts(str); - (void) printf(gettext("VER FILESYSTEM\n")); - (void) printf(gettext("--- ------------\n")); - cb->cb_foundone = B_TRUE; - } - - (void) printf("%2u %s\n", version, zfs_get_name(zhp)); - } - - return (0); -} - -static int -upgrade_set_callback(zfs_handle_t *zhp, void *data) -{ - upgrade_cbdata_t *cb = data; - int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - int needed_spa_version; - int spa_version; - - if (zfs_spa_version(zhp, &spa_version) < 0) - return (-1); - - needed_spa_version = zfs_spa_version_map(cb->cb_version); - - if (needed_spa_version < 0) - return (-1); - - if (spa_version < needed_spa_version) { - /* can't upgrade */ - (void) printf(gettext("%s: can not be " - "upgraded; the pool version needs to first " - "be upgraded\nto version %d\n\n"), - zfs_get_name(zhp), needed_spa_version); - cb->cb_numfailed++; - return (0); - } - - /* upgrade */ - if (version < cb->cb_version) { - char verstr[16]; - (void) snprintf(verstr, sizeof (verstr), - "%llu", cb->cb_version); - if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { - /* - * If they did "zfs upgrade -a", then we could - * be doing ioctls to different pools. We need - * to log this history once to each pool, and bypass - * the normal history logging that happens in main(). 
- */ - (void) zpool_log_history(g_zfs, history_str); - log_history = B_FALSE; - } - if (zfs_prop_set(zhp, "version", verstr) == 0) - cb->cb_numupgraded++; - else - cb->cb_numfailed++; - (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp)); - } else if (version > cb->cb_version) { - /* can't downgrade */ - (void) printf(gettext("%s: can not be downgraded; " - "it is already at version %u\n"), - zfs_get_name(zhp), version); - cb->cb_numfailed++; - } else { - cb->cb_numsamegraded++; - } - return (0); -} - -/* - * zfs upgrade - * zfs upgrade -v - * zfs upgrade [-r] [-V ] <-a | filesystem> - */ -static int -zfs_do_upgrade(int argc, char **argv) -{ - boolean_t all = B_FALSE; - boolean_t showversions = B_FALSE; - int ret = 0; - upgrade_cbdata_t cb = { 0 }; - int c; - int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; - - /* check options */ - while ((c = getopt(argc, argv, "rvV:a")) != -1) { - switch (c) { - case 'r': - flags |= ZFS_ITER_RECURSE; - break; - case 'v': - showversions = B_TRUE; - break; - case 'V': - if (zfs_prop_string_to_index(ZFS_PROP_VERSION, - optarg, &cb.cb_version) != 0) { - (void) fprintf(stderr, - gettext("invalid version %s\n"), optarg); - usage(B_FALSE); - } - break; - case 'a': - all = B_TRUE; - break; - case '?': - default: - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) - usage(B_FALSE); - if (showversions && (flags & ZFS_ITER_RECURSE || all || - cb.cb_version || argc)) - usage(B_FALSE); - if ((all || argc) && (showversions)) - usage(B_FALSE); - if (all && argc) - usage(B_FALSE); - - if (showversions) { - /* Show info on available versions. 
*/ - (void) printf(gettext("The following filesystem versions are " - "supported:\n\n")); - (void) printf(gettext("VER DESCRIPTION\n")); - (void) printf("--- -----------------------------------------" - "---------------\n"); - (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); - (void) printf(gettext(" 2 Enhanced directory entries\n")); - (void) printf(gettext(" 3 Case insensitive and filesystem " - "user identifier (FUID)\n")); - (void) printf(gettext(" 4 userquota, groupquota " - "properties\n")); - (void) printf(gettext(" 5 System attributes\n")); - (void) printf(gettext("\nFor more information on a particular " - "version, including supported releases,\n")); - (void) printf("see the ZFS Administration Guide.\n\n"); - ret = 0; - } else if (argc || all) { - /* Upgrade filesystems */ - if (cb.cb_version == 0) - cb.cb_version = ZPL_VERSION; - ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, 0, upgrade_set_callback, &cb); - (void) printf(gettext("%llu filesystems upgraded\n"), - cb.cb_numupgraded); - if (cb.cb_numsamegraded) { - (void) printf(gettext("%llu filesystems already at " - "this version\n"), - cb.cb_numsamegraded); - } - if (cb.cb_numfailed != 0) - ret = 1; - } else { - /* List old-version filesystems */ - boolean_t found; - (void) printf(gettext("This system is currently running " - "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); - - flags |= ZFS_ITER_RECURSE; - ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, 0, upgrade_list_callback, &cb); - - found = cb.cb_foundone; - cb.cb_foundone = B_FALSE; - cb.cb_newer = B_TRUE; - - ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, 0, upgrade_list_callback, &cb); - - if (!cb.cb_foundone && !found) { - (void) printf(gettext("All filesystems are " - "formatted with the current version.\n")); - } - } - - return (ret); -} - -/* - * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] - * [-S field [-S field]...] 
[-t type[,...]] filesystem | snapshot - * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] - * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot - * - * -H Scripted mode; elide headers and separate columns by tabs. - * -i Translate SID to POSIX ID. - * -n Print numeric ID instead of user/group name. - * -o Control which fields to display. - * -p Use exact (parsable) numeric output. - * -s Specify sort columns, descending order. - * -S Specify sort columns, ascending order. - * -t Control which object types to display. - * - * Displays space consumed by, and quotas on, each user in the specified - * filesystem or snapshot. - */ - -/* us_field_types, us_field_hdr and us_field_names should be kept in sync */ -enum us_field_types { - USFIELD_TYPE, - USFIELD_NAME, - USFIELD_USED, - USFIELD_QUOTA -}; -static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" }; -static char *us_field_names[] = { "type", "name", "used", "quota" }; -#define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) - -#define USTYPE_PSX_GRP (1 << 0) -#define USTYPE_PSX_USR (1 << 1) -#define USTYPE_SMB_GRP (1 << 2) -#define USTYPE_SMB_USR (1 << 3) -#define USTYPE_ALL \ - (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR) - -static int us_type_bits[] = { - USTYPE_PSX_GRP, - USTYPE_PSX_USR, - USTYPE_SMB_GRP, - USTYPE_SMB_USR, - USTYPE_ALL -}; -static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", - "smbuser", "all" }; - -typedef struct us_node { - nvlist_t *usn_nvl; - uu_avl_node_t usn_avlnode; - uu_list_node_t usn_listnode; -} us_node_t; - -typedef struct us_cbdata { - nvlist_t **cb_nvlp; - uu_avl_pool_t *cb_avl_pool; - uu_avl_t *cb_avl; - boolean_t cb_numname; - boolean_t cb_nicenum; - boolean_t cb_sid2posix; - zfs_userquota_prop_t cb_prop; - zfs_sort_column_t *cb_sortcol; - size_t cb_width[USFIELD_LAST]; -} us_cbdata_t; - -static boolean_t us_populated = B_FALSE; - -typedef struct { - zfs_sort_column_t *si_sortcol; - 
boolean_t si_numname; -} us_sort_info_t; - -static int -us_field_index(char *field) -{ - int i; - - for (i = 0; i < USFIELD_LAST; i++) { - if (strcmp(field, us_field_names[i]) == 0) - return (i); - } - - return (-1); -} - -static int -us_compare(const void *larg, const void *rarg, void *unused) -{ - const us_node_t *l = larg; - const us_node_t *r = rarg; - us_sort_info_t *si = (us_sort_info_t *)unused; - zfs_sort_column_t *sortcol = si->si_sortcol; - boolean_t numname = si->si_numname; - nvlist_t *lnvl = l->usn_nvl; - nvlist_t *rnvl = r->usn_nvl; - int rc = 0; - boolean_t lvb, rvb; - - for (; sortcol != NULL; sortcol = sortcol->sc_next) { - char *lvstr = ""; - char *rvstr = ""; - uint32_t lv32 = 0; - uint32_t rv32 = 0; - uint64_t lv64 = 0; - uint64_t rv64 = 0; - zfs_prop_t prop = sortcol->sc_prop; - const char *propname = NULL; - boolean_t reverse = sortcol->sc_reverse; - - switch (prop) { - case ZFS_PROP_TYPE: - propname = "type"; - (void) nvlist_lookup_uint32(lnvl, propname, &lv32); - (void) nvlist_lookup_uint32(rnvl, propname, &rv32); - if (rv32 != lv32) - rc = (rv32 < lv32) ? 1 : -1; - break; - case ZFS_PROP_NAME: - propname = "name"; - if (numname) { -compare_nums: - (void) nvlist_lookup_uint64(lnvl, propname, - &lv64); - (void) nvlist_lookup_uint64(rnvl, propname, - &rv64); - if (rv64 != lv64) - rc = (rv64 < lv64) ? 1 : -1; - } else { - if ((nvlist_lookup_string(lnvl, propname, - &lvstr) == ENOENT) || - (nvlist_lookup_string(rnvl, propname, - &rvstr) == ENOENT)) { - goto compare_nums; - } - rc = strcmp(lvstr, rvstr); - } - break; - case ZFS_PROP_USED: - case ZFS_PROP_QUOTA: - if (!us_populated) - break; - if (prop == ZFS_PROP_USED) - propname = "used"; - else - propname = "quota"; - (void) nvlist_lookup_uint64(lnvl, propname, &lv64); - (void) nvlist_lookup_uint64(rnvl, propname, &rv64); - if (rv64 != lv64) - rc = (rv64 < lv64) ? 1 : -1; - break; - - default: - break; - } - - if (rc != 0) { - if (rc < 0) - return (reverse ? 1 : -1); - else - return (reverse ? 
-1 : 1); - } - } - - /* - * If entries still seem to be the same, check if they are of the same - * type (smbentity is added only if we are doing SID to POSIX ID - * translation where we can have duplicate type/name combinations). - */ - if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && - nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && - lvb != rvb) - return (lvb < rvb ? -1 : 1); - - return (0); -} - -static inline const char * -us_type2str(unsigned field_type) -{ - switch (field_type) { - case USTYPE_PSX_USR: - return ("POSIX User"); - case USTYPE_PSX_GRP: - return ("POSIX Group"); - case USTYPE_SMB_USR: - return ("SMB User"); - case USTYPE_SMB_GRP: - return ("SMB Group"); - default: - return ("Undefined"); - } -} - -static int -userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) -{ - us_cbdata_t *cb = (us_cbdata_t *)arg; - zfs_userquota_prop_t prop = cb->cb_prop; - char *name = NULL; - char *propname; - char sizebuf[32]; - us_node_t *node; - uu_avl_pool_t *avl_pool = cb->cb_avl_pool; - uu_avl_t *avl = cb->cb_avl; - uu_avl_index_t idx; - nvlist_t *props; - us_node_t *n; - zfs_sort_column_t *sortcol = cb->cb_sortcol; - unsigned type = 0; - const char *typestr; - size_t namelen; - size_t typelen; - size_t sizelen; - int typeidx, nameidx, sizeidx; - us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; - boolean_t smbentity = B_FALSE; - - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) - nomem(); - node = safe_malloc(sizeof (us_node_t)); - uu_avl_node_init(node, &node->usn_avlnode, avl_pool); - node->usn_nvl = props; - - if (domain != NULL && domain[0] != '\0') { - /* SMB */ - char sid[MAXNAMELEN + 32]; - uid_t id; -#ifdef illumos - int err; - int flag = IDMAP_REQ_FLG_USE_CACHE; -#endif - - smbentity = B_TRUE; - - (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); - - if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { - type = USTYPE_SMB_GRP; -#ifdef illumos - err = sid_to_id(sid, B_FALSE, &id); 
-#endif - } else { - type = USTYPE_SMB_USR; -#ifdef illumos - err = sid_to_id(sid, B_TRUE, &id); -#endif - } - -#ifdef illumos - if (err == 0) { - rid = id; - if (!cb->cb_sid2posix) { - if (type == USTYPE_SMB_USR) { - (void) idmap_getwinnamebyuid(rid, flag, - &name, NULL); - } else { - (void) idmap_getwinnamebygid(rid, flag, - &name, NULL); - } - if (name == NULL) - name = sid; - } - } -#endif - } - - if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { - /* POSIX or -i */ - if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { - type = USTYPE_PSX_GRP; - if (!cb->cb_numname) { - struct group *g; - - if ((g = getgrgid(rid)) != NULL) - name = g->gr_name; - } - } else { - type = USTYPE_PSX_USR; - if (!cb->cb_numname) { - struct passwd *p; - - if ((p = getpwuid(rid)) != NULL) - name = p->pw_name; - } - } - } - - /* - * Make sure that the type/name combination is unique when doing - * SID to POSIX ID translation (hence changing the type from SMB to - * POSIX). - */ - if (cb->cb_sid2posix && - nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) - nomem(); - - /* Calculate/update width of TYPE field */ - typestr = us_type2str(type); - typelen = strlen(gettext(typestr)); - typeidx = us_field_index("type"); - if (typelen > cb->cb_width[typeidx]) - cb->cb_width[typeidx] = typelen; - if (nvlist_add_uint32(props, "type", type) != 0) - nomem(); - - /* Calculate/update width of NAME field */ - if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { - if (nvlist_add_uint64(props, "name", rid) != 0) - nomem(); - namelen = snprintf(NULL, 0, "%u", rid); - } else { - if (nvlist_add_string(props, "name", name) != 0) - nomem(); - namelen = strlen(name); - } - nameidx = us_field_index("name"); - if (namelen > cb->cb_width[nameidx]) - cb->cb_width[nameidx] = namelen; - - /* - * Check if this type/name combination is in the list and update it; - * otherwise add new node to the list. 
- */ - if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { - uu_avl_insert(avl, node, idx); - } else { - nvlist_free(props); - free(node); - node = n; - props = node->usn_nvl; - } - - /* Calculate/update width of USED/QUOTA fields */ - if (cb->cb_nicenum) - zfs_nicenum(space, sizebuf, sizeof (sizebuf)); - else - (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", space); - sizelen = strlen(sizebuf); - if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) { - propname = "used"; - if (!nvlist_exists(props, "quota")) - (void) nvlist_add_uint64(props, "quota", 0); - } else { - propname = "quota"; - if (!nvlist_exists(props, "used")) - (void) nvlist_add_uint64(props, "used", 0); - } - sizeidx = us_field_index(propname); - if (sizelen > cb->cb_width[sizeidx]) - cb->cb_width[sizeidx] = sizelen; - - if (nvlist_add_uint64(props, propname, space) != 0) - nomem(); - - return (0); -} - -static void -print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, - size_t *width, us_node_t *node) -{ - nvlist_t *nvl = node->usn_nvl; - char valstr[MAXNAMELEN]; - boolean_t first = B_TRUE; - int cfield = 0; - int field; - uint32_t ustype; - - /* Check type */ - (void) nvlist_lookup_uint32(nvl, "type", &ustype); - if (!(ustype & types)) - return; - - while ((field = fields[cfield]) != USFIELD_LAST) { - nvpair_t *nvp = NULL; - data_type_t type; - uint32_t val32; - uint64_t val64; - char *strval = NULL; - - while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { - if (strcmp(nvpair_name(nvp), - us_field_names[field]) == 0) - break; - } - - type = nvpair_type(nvp); - switch (type) { - case DATA_TYPE_UINT32: - (void) nvpair_value_uint32(nvp, &val32); - break; - case DATA_TYPE_UINT64: - (void) nvpair_value_uint64(nvp, &val64); - break; - case DATA_TYPE_STRING: - (void) nvpair_value_string(nvp, &strval); - break; - default: - (void) fprintf(stderr, "invalid data type\n"); - } - - switch (field) { - case USFIELD_TYPE: - strval = (char *)us_type2str(val32); - 
break; - case USFIELD_NAME: - if (type == DATA_TYPE_UINT64) { - (void) sprintf(valstr, "%llu", val64); - strval = valstr; - } - break; - case USFIELD_USED: - case USFIELD_QUOTA: - if (type == DATA_TYPE_UINT64) { - if (parsable) { - (void) sprintf(valstr, "%llu", val64); - } else { - zfs_nicenum(val64, valstr, - sizeof (valstr)); - } - if (field == USFIELD_QUOTA && - strcmp(valstr, "0") == 0) - strval = "none"; - else - strval = valstr; - } - break; - } - - if (!first) { - if (scripted) - (void) printf("\t"); - else - (void) printf(" "); - } - if (scripted) - (void) printf("%s", strval); - else if (field == USFIELD_TYPE || field == USFIELD_NAME) - (void) printf("%-*s", width[field], strval); - else - (void) printf("%*s", width[field], strval); - - first = B_FALSE; - cfield++; - } - - (void) printf("\n"); -} - -static void -print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, - size_t *width, boolean_t rmnode, uu_avl_t *avl) -{ - us_node_t *node; - const char *col; - int cfield = 0; - int field; - - if (!scripted) { - boolean_t first = B_TRUE; - - while ((field = fields[cfield]) != USFIELD_LAST) { - col = gettext(us_field_hdr[field]); - if (field == USFIELD_TYPE || field == USFIELD_NAME) { - (void) printf(first ? "%-*s" : " %-*s", - width[field], col); - } else { - (void) printf(first ? 
"%*s" : " %*s", - width[field], col); - } - first = B_FALSE; - cfield++; - } - (void) printf("\n"); - } - - for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { - print_us_node(scripted, parsable, fields, types, width, node); - if (rmnode) - nvlist_free(node->usn_nvl); - } -} - -static int -zfs_do_userspace(int argc, char **argv) -{ - zfs_handle_t *zhp; - zfs_userquota_prop_t p; - - uu_avl_pool_t *avl_pool; - uu_avl_t *avl_tree; - uu_avl_walk_t *walk; - char *delim; - char deffields[] = "type,name,used,quota"; - char *ofield = NULL; - char *tfield = NULL; - int cfield = 0; - int fields[256]; - int i; - boolean_t scripted = B_FALSE; - boolean_t prtnum = B_FALSE; - boolean_t parsable = B_FALSE; - boolean_t sid2posix = B_FALSE; - int ret = 0; - int c; - zfs_sort_column_t *sortcol = NULL; - int types = USTYPE_PSX_USR | USTYPE_SMB_USR; - us_cbdata_t cb; - us_node_t *node; - us_node_t *rmnode; - uu_list_pool_t *listpool; - uu_list_t *list; - uu_avl_index_t idx = 0; - uu_list_index_t idx2 = 0; - - if (argc < 2) - usage(B_FALSE); - - if (strcmp(argv[0], "groupspace") == 0) - /* Toggle default group types */ - types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; - - while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { - switch (c) { - case 'n': - prtnum = B_TRUE; - break; - case 'H': - scripted = B_TRUE; - break; - case 'p': - parsable = B_TRUE; - break; - case 'o': - ofield = optarg; - break; - case 's': - case 'S': - if (zfs_add_sort_column(&sortcol, optarg, - c == 's' ? 
B_FALSE : B_TRUE) != 0) { - (void) fprintf(stderr, - gettext("invalid field '%s'\n"), optarg); - usage(B_FALSE); - } - break; - case 't': - tfield = optarg; - break; - case 'i': - sid2posix = B_TRUE; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing dataset name\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - /* Use default output fields if not specified using -o */ - if (ofield == NULL) - ofield = deffields; - do { - if ((delim = strchr(ofield, ',')) != NULL) - *delim = '\0'; - if ((fields[cfield++] = us_field_index(ofield)) == -1) { - (void) fprintf(stderr, gettext("invalid type '%s' " - "for -o option\n"), ofield); - return (-1); - } - if (delim != NULL) - ofield = delim + 1; - } while (delim != NULL); - fields[cfield] = USFIELD_LAST; - - /* Override output types (-t option) */ - if (tfield != NULL) { - types = 0; - - do { - boolean_t found = B_FALSE; - - if ((delim = strchr(tfield, ',')) != NULL) - *delim = '\0'; - for (i = 0; i < sizeof (us_type_bits) / sizeof (int); - i++) { - if (strcmp(tfield, us_type_names[i]) == 0) { - found = B_TRUE; - types |= us_type_bits[i]; - break; - } - } - if (!found) { - (void) fprintf(stderr, gettext("invalid type " - "'%s' for -t option\n"), tfield); - return (-1); - } - if (delim != NULL) - tfield = delim + 1; - } while (delim != NULL); - } - - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) - return (1); - - if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), - offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) - nomem(); - if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) - nomem(); - - /* 
Always add default sorting columns */ - (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); - (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); - - cb.cb_sortcol = sortcol; - cb.cb_numname = prtnum; - cb.cb_nicenum = !parsable; - cb.cb_avl_pool = avl_pool; - cb.cb_avl = avl_tree; - cb.cb_sid2posix = sid2posix; - - for (i = 0; i < USFIELD_LAST; i++) - cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); - - for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { - if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) && - !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || - ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) && - !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP)))) - continue; - cb.cb_prop = p; - if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) - return (ret); - } - - /* Sort the list */ - if ((node = uu_avl_first(avl_tree)) == NULL) - return (0); - - us_populated = B_TRUE; - - listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), - offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); - list = uu_list_create(listpool, NULL, UU_DEFAULT); - uu_list_node_init(node, &node->usn_listnode, listpool); - - while (node != NULL) { - rmnode = node; - node = uu_avl_next(avl_tree, node); - uu_avl_remove(avl_tree, rmnode); - if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) - uu_list_insert(list, rmnode, idx2); - } - - for (node = uu_list_first(list); node != NULL; - node = uu_list_next(list, node)) { - us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; - - if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) - uu_avl_insert(avl_tree, node, idx); - } - - uu_list_destroy(list); - uu_list_pool_destroy(listpool); - - /* Print and free node nvlist memory */ - print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, - cb.cb_avl); - - zfs_free_sort_columns(sortcol); - - /* Clean up the AVL tree */ - if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) - nomem(); - - while ((node = uu_avl_walk_next(walk)) != NULL) { - 
uu_avl_remove(cb.cb_avl, node); - free(node); - } - - uu_avl_walk_end(walk); - uu_avl_destroy(avl_tree); - uu_avl_pool_destroy(avl_pool); - - return (ret); -} - -/* - * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ... - * [-t type[,...]] [filesystem|volume|snapshot] ... - * - * -H Scripted mode; elide headers and separate columns by tabs. - * -p Display values in parsable (literal) format. - * -r Recurse over all children. - * -d Limit recursion by depth. - * -o Control which fields to display. - * -s Specify sort columns, descending order. - * -S Specify sort columns, ascending order. - * -t Control which object types to display. - * - * When given no arguments, list all filesystems in the system. - * Otherwise, list the specified datasets, optionally recursing down them if - * '-r' is specified. - */ -typedef struct list_cbdata { - boolean_t cb_first; - boolean_t cb_literal; - boolean_t cb_scripted; - zprop_list_t *cb_proplist; -} list_cbdata_t; - -/* - * Given a list of columns to display, output appropriate headers for each one. 
- */ -static void -print_header(list_cbdata_t *cb) -{ - zprop_list_t *pl = cb->cb_proplist; - char headerbuf[ZFS_MAXPROPLEN]; - const char *header; - int i; - boolean_t first = B_TRUE; - boolean_t right_justify; - - for (; pl != NULL; pl = pl->pl_next) { - if (!first) { - (void) printf(" "); - } else { - first = B_FALSE; - } - - right_justify = B_FALSE; - if (pl->pl_prop != ZPROP_INVAL) { - header = zfs_prop_column_name(pl->pl_prop); - right_justify = zfs_prop_align_right(pl->pl_prop); - } else { - for (i = 0; pl->pl_user_prop[i] != '\0'; i++) - headerbuf[i] = toupper(pl->pl_user_prop[i]); - headerbuf[i] = '\0'; - header = headerbuf; - } - - if (pl->pl_next == NULL && !right_justify) - (void) printf("%s", header); - else if (right_justify) - (void) printf("%*s", pl->pl_width, header); - else - (void) printf("%-*s", pl->pl_width, header); - } - - (void) printf("\n"); -} - -/* - * Given a dataset and a list of fields, print out all the properties according - * to the described layout. - */ -static void -print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) -{ - zprop_list_t *pl = cb->cb_proplist; - boolean_t first = B_TRUE; - char property[ZFS_MAXPROPLEN]; - nvlist_t *userprops = zfs_get_user_props(zhp); - nvlist_t *propval; - char *propstr; - boolean_t right_justify; - - for (; pl != NULL; pl = pl->pl_next) { - if (!first) { - if (cb->cb_scripted) - (void) printf("\t"); - else - (void) printf(" "); - } else { - first = B_FALSE; - } - - if (pl->pl_prop == ZFS_PROP_NAME) { - (void) strlcpy(property, zfs_get_name(zhp), - sizeof (property)); - propstr = property; - right_justify = zfs_prop_align_right(pl->pl_prop); - } else if (pl->pl_prop != ZPROP_INVAL) { - if (zfs_prop_get(zhp, pl->pl_prop, property, - sizeof (property), NULL, NULL, 0, - cb->cb_literal) != 0) - propstr = "-"; - else - propstr = property; - right_justify = zfs_prop_align_right(pl->pl_prop); - } else if (zfs_prop_userquota(pl->pl_user_prop)) { - if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, - 
property, sizeof (property), cb->cb_literal) != 0) - propstr = "-"; - else - propstr = property; - right_justify = B_TRUE; - } else if (zfs_prop_written(pl->pl_user_prop)) { - if (zfs_prop_get_written(zhp, pl->pl_user_prop, - property, sizeof (property), cb->cb_literal) != 0) - propstr = "-"; - else - propstr = property; - right_justify = B_TRUE; - } else { - if (nvlist_lookup_nvlist(userprops, - pl->pl_user_prop, &propval) != 0) - propstr = "-"; - else - verify(nvlist_lookup_string(propval, - ZPROP_VALUE, &propstr) == 0); - right_justify = B_FALSE; - } - - /* - * If this is being called in scripted mode, or if this is the - * last column and it is left-justified, don't include a width - * format specifier. - */ - if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) - (void) printf("%s", propstr); - else if (right_justify) - (void) printf("%*s", pl->pl_width, propstr); - else - (void) printf("%-*s", pl->pl_width, propstr); - } - - (void) printf("\n"); -} - -/* - * Generic callback function to list a dataset or snapshot. 
- */ -static int -list_callback(zfs_handle_t *zhp, void *data) -{ - list_cbdata_t *cbp = data; - - if (cbp->cb_first) { - if (!cbp->cb_scripted) - print_header(cbp); - cbp->cb_first = B_FALSE; - } - - print_dataset(zhp, cbp); - - return (0); -} - -static int -zfs_do_list(int argc, char **argv) -{ - int c; - static char default_fields[] = - "name,used,available,referenced,mountpoint"; - int types = ZFS_TYPE_DATASET; - boolean_t types_specified = B_FALSE; - char *fields = NULL; - list_cbdata_t cb = { 0 }; - char *value; - int limit = 0; - int ret = 0; - zfs_sort_column_t *sortcol = NULL; - int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; - - /* check options */ - while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { - switch (c) { - case 'o': - fields = optarg; - break; - case 'p': - cb.cb_literal = B_TRUE; - flags |= ZFS_ITER_LITERAL_PROPS; - break; - case 'd': - limit = parse_depth(optarg, &flags); - break; - case 'r': - flags |= ZFS_ITER_RECURSE; - break; - case 'H': - cb.cb_scripted = B_TRUE; - break; - case 's': - if (zfs_add_sort_column(&sortcol, optarg, - B_FALSE) != 0) { - (void) fprintf(stderr, - gettext("invalid property '%s'\n"), optarg); - usage(B_FALSE); - } - break; - case 'S': - if (zfs_add_sort_column(&sortcol, optarg, - B_TRUE) != 0) { - (void) fprintf(stderr, - gettext("invalid property '%s'\n"), optarg); - usage(B_FALSE); - } - break; - case 't': - types = 0; - types_specified = B_TRUE; - flags &= ~ZFS_ITER_PROP_LISTSNAPS; - while (*optarg != '\0') { - static char *type_subopts[] = { "filesystem", - "volume", "snapshot", "snap", "bookmark", - "all", NULL }; - - switch (getsubopt(&optarg, type_subopts, - &value)) { - case 0: - types |= ZFS_TYPE_FILESYSTEM; - break; - case 1: - types |= ZFS_TYPE_VOLUME; - break; - case 2: - case 3: - types |= ZFS_TYPE_SNAPSHOT; - break; - case 4: - types |= ZFS_TYPE_BOOKMARK; - break; - case 5: - types = ZFS_TYPE_DATASET | - ZFS_TYPE_BOOKMARK; - break; - default: - (void) fprintf(stderr, - 
gettext("invalid type '%s'\n"), - suboptarg); - usage(B_FALSE); - } - } - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (fields == NULL) - fields = default_fields; - - /* - * If we are only going to list snapshot names and sort by name, - * then we can use faster version. - */ - if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol)) - flags |= ZFS_ITER_SIMPLE; - - /* - * If "-o space" and no types were specified, don't display snapshots. - */ - if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) - types &= ~ZFS_TYPE_SNAPSHOT; - - /* - * If the user specifies '-o all', the zprop_get_list() doesn't - * normally include the name of the dataset. For 'zfs list', we always - * want this property to be first. - */ - if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) - != 0) - usage(B_FALSE); - - cb.cb_first = B_TRUE; - - ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, - limit, list_callback, &cb); - - zprop_free_list(cb.cb_proplist); - zfs_free_sort_columns(sortcol); - - if (ret == 0 && cb.cb_first && !cb.cb_scripted) - (void) printf(gettext("no datasets available\n")); - - return (ret); -} - -/* - * zfs rename [-f] - * zfs rename [-f] -p - * zfs rename -r - * zfs rename - * zfs rename -u [-p] - * - * Renames the given dataset to another of the same type. - * - * The '-p' flag creates all the non-existing ancestors of the target first. 
- */ -/* ARGSUSED */ -static int -zfs_do_rename(int argc, char **argv) -{ - zfs_handle_t *zhp; - renameflags_t flags = { 0 }; - int c; - int ret = 0; - int types; - boolean_t parents = B_FALSE; - boolean_t bookmarks = B_FALSE; - char *snapshot = NULL; - - /* check options */ - while ((c = getopt(argc, argv, "fpru")) != -1) { - switch (c) { - case 'p': - parents = B_TRUE; - break; - case 'r': - flags.recurse = B_TRUE; - break; - case 'u': - flags.nounmount = B_TRUE; - break; - case 'f': - flags.forceunmount = B_TRUE; - break; - case '?': - default: - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing source dataset " - "argument\n")); - usage(B_FALSE); - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing target dataset " - "argument\n")); - usage(B_FALSE); - } - if (argc > 2) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - if (flags.recurse && parents) { - (void) fprintf(stderr, gettext("-p and -r options are mutually " - "exclusive\n")); - usage(B_FALSE); - } - - if (flags.recurse && strchr(argv[0], '@') == NULL) { - (void) fprintf(stderr, gettext("source dataset for recursive " - "rename must be a snapshot\n")); - usage(B_FALSE); - } - - if (flags.nounmount && parents) { - (void) fprintf(stderr, gettext("-u and -p options are mutually " - "exclusive\n")); - usage(B_FALSE); - } - - if (strchr(argv[0], '#') != NULL) - bookmarks = B_TRUE; - - if (bookmarks && (flags.nounmount || flags.recurse || - flags.forceunmount || parents)) { - (void) fprintf(stderr, gettext("options are not supported " - "for renaming bookmarks\n")); - usage(B_FALSE); - } - - if (flags.nounmount) - types = ZFS_TYPE_FILESYSTEM; - else if (parents) - types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; - else if (bookmarks) - types = ZFS_TYPE_BOOKMARK; - else - types = 
ZFS_TYPE_DATASET; - - if (flags.recurse) { - /* - * When we do recursive rename we are fine when the given - * snapshot for the given dataset doesn't exist - it can - * still exists below. - */ - - snapshot = strchr(argv[0], '@'); - assert(snapshot != NULL); - *snapshot = '\0'; - snapshot++; - } - - if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) - return (1); - - /* If we were asked and the name looks good, try to create ancestors. */ - if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) && - zfs_create_ancestors(g_zfs, argv[1]) != 0) { - zfs_close(zhp); - return (1); - } - - ret = (zfs_rename(zhp, snapshot, argv[1], flags) != 0); - - zfs_close(zhp); - return (ret); -} - -/* - * zfs promote - * - * Promotes the given clone fs to be the parent - */ -/* ARGSUSED */ -static int -zfs_do_promote(int argc, char **argv) -{ - zfs_handle_t *zhp; - int ret = 0; - - /* check options */ - if (argc > 1 && argv[1][0] == '-') { - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - argv[1][1]); - usage(B_FALSE); - } - - /* check number of arguments */ - if (argc < 2) { - (void) fprintf(stderr, gettext("missing clone filesystem" - " argument\n")); - usage(B_FALSE); - } - if (argc > 2) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - return (1); - - ret = (zfs_promote(zhp) != 0); - - - zfs_close(zhp); - return (ret); -} - -/* - * zfs rollback [-rRf] - * - * -r Delete any intervening snapshots before doing rollback - * -R Delete any snapshots and their clones - * -f ignored for backwards compatability - * - * Given a filesystem, rollback to a specific snapshot, discarding any changes - * since then and making it the active dataset. If more recent snapshots exist, - * the command will complain unless the '-r' flag is given. 
- */ -typedef struct rollback_cbdata { - uint64_t cb_create; - uint8_t cb_younger_ds_printed; - boolean_t cb_first; - int cb_doclones; - char *cb_target; - int cb_error; - boolean_t cb_recurse; -} rollback_cbdata_t; - -static int -rollback_check_dependent(zfs_handle_t *zhp, void *data) -{ - rollback_cbdata_t *cbp = data; - - if (cbp->cb_first && cbp->cb_recurse) { - (void) fprintf(stderr, gettext("cannot rollback to " - "'%s': clones of previous snapshots exist\n"), - cbp->cb_target); - (void) fprintf(stderr, gettext("use '-R' to " - "force deletion of the following clones and " - "dependents:\n")); - cbp->cb_first = 0; - cbp->cb_error = 1; - } - - (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); - - zfs_close(zhp); - return (0); -} - -/* - * Report some snapshots/bookmarks more recent than the one specified. - * Used when '-r' is not specified. We reuse this same callback for the - * snapshot dependents - if 'cb_dependent' is set, then this is a - * dependent and we should report it without checking the transaction group. - */ -static int -rollback_check(zfs_handle_t *zhp, void *data) -{ - rollback_cbdata_t *cbp = data; - /* - * Max number of younger snapshots and/or bookmarks to display before - * we stop the iteration. 
- */ - const uint8_t max_younger = 32; - - if (cbp->cb_doclones) { - zfs_close(zhp); - return (0); - } - - if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { - if (cbp->cb_first && !cbp->cb_recurse) { - (void) fprintf(stderr, gettext("cannot " - "rollback to '%s': more recent snapshots " - "or bookmarks exist\n"), - cbp->cb_target); - (void) fprintf(stderr, gettext("use '-r' to " - "force deletion of the following " - "snapshots and bookmarks:\n")); - cbp->cb_first = 0; - cbp->cb_error = 1; - } - - if (cbp->cb_recurse) { - if (zfs_iter_dependents(zhp, B_TRUE, - rollback_check_dependent, cbp) != 0) { - zfs_close(zhp); - return (-1); - } - } else { - (void) fprintf(stderr, "%s\n", - zfs_get_name(zhp)); - cbp->cb_younger_ds_printed++; - } - } - zfs_close(zhp); - - if (cbp->cb_younger_ds_printed == max_younger) { - /* - * This non-recursive rollback is going to fail due to the - * presence of snapshots and/or bookmarks that are younger than - * the rollback target. - * We printed some of the offending objects, now we stop - * zfs_iter_snapshot/bookmark iteration so we can fail fast and - * avoid iterating over the rest of the younger objects - */ - (void) fprintf(stderr, gettext("Output limited to %d " - "snapshots/bookmarks\n"), max_younger); - return (-1); - } - return (0); -} - -static int -zfs_do_rollback(int argc, char **argv) -{ - int ret = 0; - int c; - boolean_t force = B_FALSE; - rollback_cbdata_t cb = { 0 }; - zfs_handle_t *zhp, *snap; - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - char *delim; - uint64_t min_txg = 0; - - /* check options */ - while ((c = getopt(argc, argv, "rRf")) != -1) { - switch (c) { - case 'r': - cb.cb_recurse = 1; - break; - case 'R': - cb.cb_recurse = 1; - cb.cb_doclones = 1; - break; - case 'f': - force = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 1) { 
- (void) fprintf(stderr, gettext("missing dataset argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - /* open the snapshot */ - if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) - return (1); - - /* open the parent dataset */ - (void) strlcpy(parentname, argv[0], sizeof (parentname)); - verify((delim = strrchr(parentname, '@')) != NULL); - *delim = '\0'; - if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { - zfs_close(snap); - return (1); - } - - /* - * Check for more recent snapshots and/or clones based on the presence - * of '-r' and '-R'. - */ - cb.cb_target = argv[0]; - cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); - cb.cb_first = B_TRUE; - cb.cb_error = 0; - - if (cb.cb_create > 0) - min_txg = cb.cb_create; - - if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb, - min_txg, 0)) != 0) - goto out; - if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) - goto out; - - if ((ret = cb.cb_error) != 0) - goto out; - - /* - * Rollback parent to the given snapshot. - */ - ret = zfs_rollback(zhp, snap, force); - -out: - zfs_close(snap); - zfs_close(zhp); - - if (ret == 0) - return (0); - else - return (1); -} - -/* - * zfs set property=value ... { fs | snap | vol } ... - * - * Sets the given properties for all datasets specified on the command line. 
- */ - -static int -set_callback(zfs_handle_t *zhp, void *data) -{ - nvlist_t *props = data; - - if (zfs_prop_set_list(zhp, props) != 0) { - switch (libzfs_errno(g_zfs)) { - case EZFS_MOUNTFAILED: - (void) fprintf(stderr, gettext("property may be set " - "but unable to remount filesystem\n")); - break; - case EZFS_SHARENFSFAILED: - (void) fprintf(stderr, gettext("property may be set " - "but unable to reshare filesystem\n")); - break; - } - return (1); - } - return (0); -} - -static int -zfs_do_set(int argc, char **argv) -{ - nvlist_t *props = NULL; - int ds_start = -1; /* argv idx of first dataset arg */ - int ret = 0; - - /* check for options */ - if (argc > 1 && argv[1][0] == '-') { - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - argv[1][1]); - usage(B_FALSE); - } - - /* check number of arguments */ - if (argc < 2) { - (void) fprintf(stderr, gettext("missing arguments\n")); - usage(B_FALSE); - } - if (argc < 3) { - if (strchr(argv[1], '=') == NULL) { - (void) fprintf(stderr, gettext("missing property=value " - "argument(s)\n")); - } else { - (void) fprintf(stderr, gettext("missing dataset " - "name(s)\n")); - } - usage(B_FALSE); - } - - /* validate argument order: prop=val args followed by dataset args */ - for (int i = 1; i < argc; i++) { - if (strchr(argv[i], '=') != NULL) { - if (ds_start > 0) { - /* out-of-order prop=val argument */ - (void) fprintf(stderr, gettext("invalid " - "argument order\n"), i); - usage(B_FALSE); - } - } else if (ds_start < 0) { - ds_start = i; - } - } - if (ds_start < 0) { - (void) fprintf(stderr, gettext("missing dataset name(s)\n")); - usage(B_FALSE); - } - - /* Populate a list of property settings */ - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) - nomem(); - for (int i = 1; i < ds_start; i++) { - if ((ret = parseprop(props, argv[i])) != 0) - goto error; - } - - ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, - ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); - -error: - nvlist_free(props); - return 
(ret); -} - -typedef struct snap_cbdata { - nvlist_t *sd_nvl; - boolean_t sd_recursive; - const char *sd_snapname; -} snap_cbdata_t; - -static int -zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) -{ - snap_cbdata_t *sd = arg; - char *name; - int rv = 0; - int error; - - if (sd->sd_recursive && - zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { - zfs_close(zhp); - return (0); - } - - error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); - if (error == -1) - nomem(); - fnvlist_add_boolean(sd->sd_nvl, name); - free(name); - - if (sd->sd_recursive) - rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); - zfs_close(zhp); - return (rv); -} - -/* - * zfs snapshot [-r] [-o prop=value] ... - * - * Creates a snapshot with the given name. While functionally equivalent to - * 'zfs create', it is a separate command to differentiate intent. - */ -static int -zfs_do_snapshot(int argc, char **argv) -{ - int ret = 0; - int c; - nvlist_t *props; - snap_cbdata_t sd = { 0 }; - boolean_t multiple_snaps = B_FALSE; - - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) - nomem(); - if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) - nomem(); - - /* check options */ - while ((c = getopt(argc, argv, "ro:")) != -1) { - switch (c) { - case 'o': - if (parseprop(props, optarg) != 0) - return (1); - break; - case 'r': - sd.sd_recursive = B_TRUE; - multiple_snaps = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - goto usage; - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); - goto usage; - } - - if (argc > 1) - multiple_snaps = B_TRUE; - for (; argc > 0; argc--, argv++) { - char *atp; - zfs_handle_t *zhp; - - atp = strchr(argv[0], '@'); - if (atp == NULL) - goto usage; - *atp = '\0'; - sd.sd_snapname = atp + 1; - zhp = zfs_open(g_zfs, argv[0], - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - goto 
usage; - if (zfs_snapshot_cb(zhp, &sd) != 0) - goto usage; - } - - ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); - nvlist_free(sd.sd_nvl); - nvlist_free(props); - if (ret != 0 && multiple_snaps) - (void) fprintf(stderr, gettext("no snapshots were created\n")); - return (ret != 0); - -usage: - nvlist_free(sd.sd_nvl); - nvlist_free(props); - usage(B_FALSE); - return (-1); -} - -/* - * Send a backup stream to stdout. - */ -static int -zfs_do_send(int argc, char **argv) -{ - char *fromname = NULL; - char *toname = NULL; - char *resume_token = NULL; - char *cp; - zfs_handle_t *zhp; - sendflags_t flags = { 0 }; - int c, err; - nvlist_t *dbgnv = NULL; - boolean_t extraverbose = B_FALSE; - - struct option long_options[] = { - {"replicate", no_argument, NULL, 'R'}, - {"props", no_argument, NULL, 'p'}, - {"parsable", no_argument, NULL, 'P'}, - {"dedup", no_argument, NULL, 'D'}, - {"verbose", no_argument, NULL, 'v'}, - {"dryrun", no_argument, NULL, 'n'}, - {"large-block", no_argument, NULL, 'L'}, - {"embed", no_argument, NULL, 'e'}, - {"resume", required_argument, NULL, 't'}, - {"compressed", no_argument, NULL, 'c'}, - {0, 0, 0, 0} - }; - - /* check options */ - while ((c = getopt_long(argc, argv, ":i:I:RbDpVvnPLet:c", long_options, - NULL)) != -1) { - switch (c) { - case 'i': - if (fromname) - usage(B_FALSE); - fromname = optarg; - break; - case 'I': - if (fromname) - usage(B_FALSE); - fromname = optarg; - flags.doall = B_TRUE; - break; - case 'R': - flags.replicate = B_TRUE; - break; - case 'p': - flags.props = B_TRUE; - break; - case 'P': - flags.parsable = B_TRUE; - flags.verbose = B_TRUE; - break; - case 'V': - flags.progress = B_TRUE; - flags.progressastitle = B_TRUE; - break; - case 'v': - if (flags.verbose) - extraverbose = B_TRUE; - flags.verbose = B_TRUE; - flags.progress = B_TRUE; - break; - case 'D': - flags.dedup = B_TRUE; - break; - case 'n': - flags.dryrun = B_TRUE; - break; - case 'L': - flags.largeblock = B_TRUE; - break; - case 'e': - flags.embed_data = 
B_TRUE; - break; - case 't': - resume_token = optarg; - break; - case 'c': - flags.compress = B_TRUE; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - /*FALLTHROUGH*/ - default: - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (resume_token != NULL) { - if (fromname != NULL || flags.replicate || flags.props || - flags.dedup) { - (void) fprintf(stderr, - gettext("invalid flags combined with -t\n")); - usage(B_FALSE); - } - if (argc != 0) { - (void) fprintf(stderr, gettext("no additional " - "arguments are permitted with -t\n")); - usage(B_FALSE); - } - } else { - if (argc < 1) { - (void) fprintf(stderr, - gettext("missing snapshot argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - } - - if (!flags.dryrun && isatty(STDOUT_FILENO)) { - (void) fprintf(stderr, - gettext("Error: Stream can not be written to a terminal.\n" - "You must redirect standard output.\n")); - return (1); - } - - if (resume_token != NULL) { - return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, - resume_token)); - } - - /* - * Special case sending a filesystem, or from a bookmark. - */ - if (strchr(argv[0], '@') == NULL || - (fromname && strchr(fromname, '#') != NULL)) { - char frombuf[ZFS_MAX_DATASET_NAME_LEN]; - - if (flags.replicate || flags.doall || flags.props || - flags.dedup || (strchr(argv[0], '@') == NULL && - (flags.dryrun || flags.verbose || flags.progress))) { - (void) fprintf(stderr, gettext("Error: " - "Unsupported flag with filesystem or bookmark.\n")); - return (1); - } - - zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); - if (zhp == NULL) - return (1); - - if (fromname != NULL && - (fromname[0] == '#' || fromname[0] == '@')) { - /* - * Incremental source name begins with # or @. 
- * Default to same fs as target. - */ - (void) strncpy(frombuf, argv[0], sizeof (frombuf)); - cp = strchr(frombuf, '@'); - if (cp != NULL) - *cp = '\0'; - (void) strlcat(frombuf, fromname, sizeof (frombuf)); - fromname = frombuf; - } - err = zfs_send_one(zhp, fromname, STDOUT_FILENO, flags); - zfs_close(zhp); - return (err != 0); - } - - cp = strchr(argv[0], '@'); - *cp = '\0'; - toname = cp + 1; - zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - return (1); - - /* - * If they specified the full path to the snapshot, chop off - * everything except the short name of the snapshot, but special - * case if they specify the origin. - */ - if (fromname && (cp = strchr(fromname, '@')) != NULL) { - char origin[ZFS_MAX_DATASET_NAME_LEN]; - zprop_source_t src; - - (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, - origin, sizeof (origin), &src, NULL, 0, B_FALSE); - - if (strcmp(origin, fromname) == 0) { - fromname = NULL; - flags.fromorigin = B_TRUE; - } else { - *cp = '\0'; - if (cp != fromname && strcmp(argv[0], fromname)) { - (void) fprintf(stderr, - gettext("incremental source must be " - "in same filesystem\n")); - usage(B_FALSE); - } - fromname = cp + 1; - if (strchr(fromname, '@') || strchr(fromname, '/')) { - (void) fprintf(stderr, - gettext("invalid incremental source\n")); - usage(B_FALSE); - } - } - } - - if (flags.replicate && fromname == NULL) - flags.doall = B_TRUE; - - err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, - extraverbose ? &dbgnv : NULL); - - if (extraverbose && dbgnv != NULL) { - /* - * dump_nvlist prints to stdout, but that's been - * redirected to a file. Make it print to stderr - * instead. - */ - (void) dup2(STDERR_FILENO, STDOUT_FILENO); - dump_nvlist(dbgnv, 0); - nvlist_free(dbgnv); - } - zfs_close(zhp); - - return (err != 0); -} - -/* - * Restore a backup stream from stdin. 
- */ -static int -zfs_do_receive(int argc, char **argv) -{ - int c, err = 0; - recvflags_t flags = { 0 }; - boolean_t abort_resumable = B_FALSE; - - nvlist_t *props; - nvpair_t *nvp = NULL; - - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) - nomem(); - - /* check options */ - while ((c = getopt(argc, argv, ":o:denuvMFsA")) != -1) { - switch (c) { - case 'o': - if (parseprop(props, optarg) != 0) - return (1); - break; - case 'd': - flags.isprefix = B_TRUE; - break; - case 'e': - flags.isprefix = B_TRUE; - flags.istail = B_TRUE; - break; - case 'n': - flags.dryrun = B_TRUE; - break; - case 'u': - flags.nomount = B_TRUE; - break; - case 'v': - flags.verbose = B_TRUE; - break; - case 's': - flags.resumable = B_TRUE; - break; - case 'F': - flags.force = B_TRUE; - break; - case 'M': - flags.forceunmount = B_TRUE; - break; - case 'A': - abort_resumable = B_TRUE; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - while ((nvp = nvlist_next_nvpair(props, nvp))) { - if (strcmp(nvpair_name(nvp), "origin") != 0) { - (void) fprintf(stderr, gettext("invalid option")); - usage(B_FALSE); - } - } - - if (abort_resumable) { - if (flags.isprefix || flags.istail || flags.dryrun || - flags.resumable || flags.nomount) { - (void) fprintf(stderr, gettext("invalid option")); - usage(B_FALSE); - } - - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - (void) snprintf(namebuf, sizeof (namebuf), - "%s/%%recv", argv[0]); - - if (zfs_dataset_exists(g_zfs, namebuf, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { - zfs_handle_t *zhp = 
zfs_open(g_zfs, - namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - return (1); - err = zfs_destroy(zhp, B_FALSE); - } else { - zfs_handle_t *zhp = zfs_open(g_zfs, - argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - usage(B_FALSE); - if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || - zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, - NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { - (void) fprintf(stderr, - gettext("'%s' does not have any " - "resumable receive state to abort\n"), - argv[0]); - return (1); - } - err = zfs_destroy(zhp, B_FALSE); - } - - return (err != 0); - } - - if (isatty(STDIN_FILENO)) { - (void) fprintf(stderr, - gettext("Error: Backup stream can not be read " - "from a terminal.\n" - "You must redirect standard input.\n")); - return (1); - } - err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); - - return (err != 0); -} - -/* - * allow/unallow stuff - */ -/* copied from zfs/sys/dsl_deleg.h */ -#define ZFS_DELEG_PERM_CREATE "create" -#define ZFS_DELEG_PERM_DESTROY "destroy" -#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" -#define ZFS_DELEG_PERM_ROLLBACK "rollback" -#define ZFS_DELEG_PERM_CLONE "clone" -#define ZFS_DELEG_PERM_PROMOTE "promote" -#define ZFS_DELEG_PERM_RENAME "rename" -#define ZFS_DELEG_PERM_MOUNT "mount" -#define ZFS_DELEG_PERM_SHARE "share" -#define ZFS_DELEG_PERM_SEND "send" -#define ZFS_DELEG_PERM_RECEIVE "receive" -#define ZFS_DELEG_PERM_ALLOW "allow" -#define ZFS_DELEG_PERM_USERPROP "userprop" -#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? 
*/ -#define ZFS_DELEG_PERM_USERQUOTA "userquota" -#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" -#define ZFS_DELEG_PERM_USERUSED "userused" -#define ZFS_DELEG_PERM_GROUPUSED "groupused" -#define ZFS_DELEG_PERM_HOLD "hold" -#define ZFS_DELEG_PERM_RELEASE "release" -#define ZFS_DELEG_PERM_DIFF "diff" -#define ZFS_DELEG_PERM_BOOKMARK "bookmark" -#define ZFS_DELEG_PERM_REMAP "remap" - -#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE - -static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { - { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, - { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, - { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, - { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, - { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, - { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, - { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, - { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, - { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, - { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, - { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, - { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, - { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, - { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, - { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, - { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, - { ZFS_DELEG_PERM_REMAP, ZFS_DELEG_NOTE_REMAP }, - - { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, - { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, - { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, - { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, - { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, - { NULL, ZFS_DELEG_NOTE_NONE } -}; - -/* permission structure */ -typedef struct deleg_perm { - zfs_deleg_who_type_t dp_who_type; - const char *dp_name; - boolean_t dp_local; - boolean_t dp_descend; -} deleg_perm_t; - -/* */ -typedef struct deleg_perm_node { - deleg_perm_t dpn_perm; - - uu_avl_node_t dpn_avl_node; -} deleg_perm_node_t; 
- -typedef struct fs_perm fs_perm_t; - -/* permissions set */ -typedef struct who_perm { - zfs_deleg_who_type_t who_type; - const char *who_name; /* id */ - char who_ug_name[256]; /* user/group name */ - fs_perm_t *who_fsperm; /* uplink */ - - uu_avl_t *who_deleg_perm_avl; /* permissions */ -} who_perm_t; - -/* */ -typedef struct who_perm_node { - who_perm_t who_perm; - uu_avl_node_t who_avl_node; -} who_perm_node_t; - -typedef struct fs_perm_set fs_perm_set_t; -/* fs permissions */ -struct fs_perm { - const char *fsp_name; - - uu_avl_t *fsp_sc_avl; /* sets,create */ - uu_avl_t *fsp_uge_avl; /* user,group,everyone */ - - fs_perm_set_t *fsp_set; /* uplink */ -}; - -/* */ -typedef struct fs_perm_node { - fs_perm_t fspn_fsperm; - uu_avl_t *fspn_avl; - - uu_list_node_t fspn_list_node; -} fs_perm_node_t; - -/* top level structure */ -struct fs_perm_set { - uu_list_pool_t *fsps_list_pool; - uu_list_t *fsps_list; /* list of fs_perms */ - - uu_avl_pool_t *fsps_named_set_avl_pool; - uu_avl_pool_t *fsps_who_perm_avl_pool; - uu_avl_pool_t *fsps_deleg_perm_avl_pool; -}; - -static inline const char * -deleg_perm_type(zfs_deleg_note_t note) -{ - /* subcommands */ - switch (note) { - /* SUBCOMMANDS */ - /* OTHER */ - case ZFS_DELEG_NOTE_GROUPQUOTA: - case ZFS_DELEG_NOTE_GROUPUSED: - case ZFS_DELEG_NOTE_USERPROP: - case ZFS_DELEG_NOTE_USERQUOTA: - case ZFS_DELEG_NOTE_USERUSED: - /* other */ - return (gettext("other")); - default: - return (gettext("subcommand")); - } -} - -static int -who_type2weight(zfs_deleg_who_type_t who_type) -{ - int res; - switch (who_type) { - case ZFS_DELEG_NAMED_SET_SETS: - case ZFS_DELEG_NAMED_SET: - res = 0; - break; - case ZFS_DELEG_CREATE_SETS: - case ZFS_DELEG_CREATE: - res = 1; - break; - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_USER: - res = 2; - break; - case ZFS_DELEG_GROUP_SETS: - case ZFS_DELEG_GROUP: - res = 3; - break; - case ZFS_DELEG_EVERYONE_SETS: - case ZFS_DELEG_EVERYONE: - res = 4; - break; - default: - res = -1; - } - - return 
(res); -} - -/* ARGSUSED */ -static int -who_perm_compare(const void *larg, const void *rarg, void *unused) -{ - const who_perm_node_t *l = larg; - const who_perm_node_t *r = rarg; - zfs_deleg_who_type_t ltype = l->who_perm.who_type; - zfs_deleg_who_type_t rtype = r->who_perm.who_type; - int lweight = who_type2weight(ltype); - int rweight = who_type2weight(rtype); - int res = lweight - rweight; - if (res == 0) - res = strncmp(l->who_perm.who_name, r->who_perm.who_name, - ZFS_MAX_DELEG_NAME-1); - - if (res == 0) - return (0); - if (res > 0) - return (1); - else - return (-1); -} - -/* ARGSUSED */ -static int -deleg_perm_compare(const void *larg, const void *rarg, void *unused) -{ - const deleg_perm_node_t *l = larg; - const deleg_perm_node_t *r = rarg; - int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, - ZFS_MAX_DELEG_NAME-1); - - if (res == 0) - return (0); - - if (res > 0) - return (1); - else - return (-1); -} - -static inline void -fs_perm_set_init(fs_perm_set_t *fspset) -{ - bzero(fspset, sizeof (fs_perm_set_t)); - - if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", - sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), - NULL, UU_DEFAULT)) == NULL) - nomem(); - if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, - UU_DEFAULT)) == NULL) - nomem(); - - if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( - "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( - who_perm_node_t, who_avl_node), who_perm_compare, - UU_DEFAULT)) == NULL) - nomem(); - - if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( - "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( - who_perm_node_t, who_avl_node), who_perm_compare, - UU_DEFAULT)) == NULL) - nomem(); - - if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( - "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( - deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) - == NULL) - nomem(); -} - -static inline void 
fs_perm_fini(fs_perm_t *); -static inline void who_perm_fini(who_perm_t *); - -static inline void -fs_perm_set_fini(fs_perm_set_t *fspset) -{ - fs_perm_node_t *node = uu_list_first(fspset->fsps_list); - - while (node != NULL) { - fs_perm_node_t *next_node = - uu_list_next(fspset->fsps_list, node); - fs_perm_t *fsperm = &node->fspn_fsperm; - fs_perm_fini(fsperm); - uu_list_remove(fspset->fsps_list, node); - free(node); - node = next_node; - } - - uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); - uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); - uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); -} - -static inline void -deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, - const char *name) -{ - deleg_perm->dp_who_type = type; - deleg_perm->dp_name = name; -} - -static inline void -who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, - zfs_deleg_who_type_t type, const char *name) -{ - uu_avl_pool_t *pool; - pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; - - bzero(who_perm, sizeof (who_perm_t)); - - if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, - UU_DEFAULT)) == NULL) - nomem(); - - who_perm->who_type = type; - who_perm->who_name = name; - who_perm->who_fsperm = fsperm; -} - -static inline void -who_perm_fini(who_perm_t *who_perm) -{ - deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); - - while (node != NULL) { - deleg_perm_node_t *next_node = - uu_avl_next(who_perm->who_deleg_perm_avl, node); - - uu_avl_remove(who_perm->who_deleg_perm_avl, node); - free(node); - node = next_node; - } - - uu_avl_destroy(who_perm->who_deleg_perm_avl); -} - -static inline void -fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) -{ - uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; - uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; - - bzero(fsperm, sizeof (fs_perm_t)); - - if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) - == NULL) - nomem(); - - if 
((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) - == NULL) - nomem(); - - fsperm->fsp_set = fspset; - fsperm->fsp_name = fsname; -} - -static inline void -fs_perm_fini(fs_perm_t *fsperm) -{ - who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); - while (node != NULL) { - who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, - node); - who_perm_t *who_perm = &node->who_perm; - who_perm_fini(who_perm); - uu_avl_remove(fsperm->fsp_sc_avl, node); - free(node); - node = next_node; - } - - node = uu_avl_first(fsperm->fsp_uge_avl); - while (node != NULL) { - who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, - node); - who_perm_t *who_perm = &node->who_perm; - who_perm_fini(who_perm); - uu_avl_remove(fsperm->fsp_uge_avl, node); - free(node); - node = next_node; - } - - uu_avl_destroy(fsperm->fsp_sc_avl); - uu_avl_destroy(fsperm->fsp_uge_avl); -} - -static void -set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, - zfs_deleg_who_type_t who_type, const char *name, char locality) -{ - uu_avl_index_t idx = 0; - - deleg_perm_node_t *found_node = NULL; - deleg_perm_t *deleg_perm = &node->dpn_perm; - - deleg_perm_init(deleg_perm, who_type, name); - - if ((found_node = uu_avl_find(avl, node, NULL, &idx)) - == NULL) - uu_avl_insert(avl, node, idx); - else { - node = found_node; - deleg_perm = &node->dpn_perm; - } - - - switch (locality) { - case ZFS_DELEG_LOCAL: - deleg_perm->dp_local = B_TRUE; - break; - case ZFS_DELEG_DESCENDENT: - deleg_perm->dp_descend = B_TRUE; - break; - case ZFS_DELEG_NA: - break; - default: - assert(B_FALSE); /* invalid locality */ - } -} - -static inline int -parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) -{ - nvpair_t *nvp = NULL; - fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; - uu_avl_t *avl = who_perm->who_deleg_perm_avl; - zfs_deleg_who_type_t who_type = who_perm->who_type; - - while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { - const char *name = nvpair_name(nvp); - 
data_type_t type = nvpair_type(nvp); - uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; - deleg_perm_node_t *node = - safe_malloc(sizeof (deleg_perm_node_t)); - - assert(type == DATA_TYPE_BOOLEAN); - - uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); - set_deleg_perm_node(avl, node, who_type, name, locality); - } - - return (0); -} - -static inline int -parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) -{ - nvpair_t *nvp = NULL; - fs_perm_set_t *fspset = fsperm->fsp_set; - - while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { - nvlist_t *nvl2 = NULL; - const char *name = nvpair_name(nvp); - uu_avl_t *avl = NULL; - uu_avl_pool_t *avl_pool = NULL; - zfs_deleg_who_type_t perm_type = name[0]; - char perm_locality = name[1]; - const char *perm_name = name + 3; - boolean_t is_set = B_TRUE; - who_perm_t *who_perm = NULL; - - assert('$' == name[2]); - - if (nvpair_value_nvlist(nvp, &nvl2) != 0) - return (-1); - - switch (perm_type) { - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - case ZFS_DELEG_NAMED_SET: - case ZFS_DELEG_NAMED_SET_SETS: - avl_pool = fspset->fsps_named_set_avl_pool; - avl = fsperm->fsp_sc_avl; - break; - case ZFS_DELEG_USER: - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_GROUP: - case ZFS_DELEG_GROUP_SETS: - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - avl_pool = fspset->fsps_who_perm_avl_pool; - avl = fsperm->fsp_uge_avl; - break; - - default: - assert(!"unhandled zfs_deleg_who_type_t"); - } - - if (is_set) { - who_perm_node_t *found_node = NULL; - who_perm_node_t *node = safe_malloc( - sizeof (who_perm_node_t)); - who_perm = &node->who_perm; - uu_avl_index_t idx = 0; - - uu_avl_node_init(node, &node->who_avl_node, avl_pool); - who_perm_init(who_perm, fsperm, perm_type, perm_name); - - if ((found_node = uu_avl_find(avl, node, NULL, &idx)) - == NULL) { - if (avl == fsperm->fsp_uge_avl) { - uid_t rid = 0; - struct passwd *p = NULL; - struct group *g = NULL; - const char *nice_name = NULL; - - switch (perm_type) { - 
case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_USER: - rid = atoi(perm_name); - p = getpwuid(rid); - if (p) - nice_name = p->pw_name; - break; - case ZFS_DELEG_GROUP_SETS: - case ZFS_DELEG_GROUP: - rid = atoi(perm_name); - g = getgrgid(rid); - if (g) - nice_name = g->gr_name; - break; - - default: - break; - } - - if (nice_name != NULL) - (void) strlcpy( - node->who_perm.who_ug_name, - nice_name, 256); - else { - /* User or group unknown */ - (void) snprintf( - node->who_perm.who_ug_name, - sizeof ( - node->who_perm.who_ug_name), - "(unknown: %d)", rid); - } - } - - uu_avl_insert(avl, node, idx); - } else { - node = found_node; - who_perm = &node->who_perm; - } - } - - (void) parse_who_perm(who_perm, nvl2, perm_locality); - } - - return (0); -} - -static inline int -parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) -{ - nvpair_t *nvp = NULL; - uu_avl_index_t idx = 0; - - while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { - nvlist_t *nvl2 = NULL; - const char *fsname = nvpair_name(nvp); - data_type_t type = nvpair_type(nvp); - fs_perm_t *fsperm = NULL; - fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); - if (node == NULL) - nomem(); - - fsperm = &node->fspn_fsperm; - - assert(DATA_TYPE_NVLIST == type); - - uu_list_node_init(node, &node->fspn_list_node, - fspset->fsps_list_pool); - - idx = uu_list_numnodes(fspset->fsps_list); - fs_perm_init(fsperm, fspset, fsname); - - if (nvpair_value_nvlist(nvp, &nvl2) != 0) - return (-1); - - (void) parse_fs_perm(fsperm, nvl2); - - uu_list_insert(fspset->fsps_list, node, idx); - } - - return (0); -} - -static inline const char * -deleg_perm_comment(zfs_deleg_note_t note) -{ - const char *str = ""; - - /* subcommands */ - switch (note) { - /* SUBCOMMANDS */ - case ZFS_DELEG_NOTE_ALLOW: - str = gettext("Must also have the permission that is being" - "\n\t\t\t\tallowed"); - break; - case ZFS_DELEG_NOTE_CLONE: - str = gettext("Must also have the 'create' ability and 'mount'" - "\n\t\t\t\tability in the origin file 
system"); - break; - case ZFS_DELEG_NOTE_CREATE: - str = gettext("Must also have the 'mount' ability"); - break; - case ZFS_DELEG_NOTE_DESTROY: - str = gettext("Must also have the 'mount' ability"); - break; - case ZFS_DELEG_NOTE_DIFF: - str = gettext("Allows lookup of paths within a dataset;" - "\n\t\t\t\tgiven an object number. Ordinary users need this" - "\n\t\t\t\tin order to use zfs diff"); - break; - case ZFS_DELEG_NOTE_HOLD: - str = gettext("Allows adding a user hold to a snapshot"); - break; - case ZFS_DELEG_NOTE_MOUNT: - str = gettext("Allows mount/umount of ZFS datasets"); - break; - case ZFS_DELEG_NOTE_PROMOTE: - str = gettext("Must also have the 'mount'\n\t\t\t\tand" - " 'promote' ability in the origin file system"); - break; - case ZFS_DELEG_NOTE_RECEIVE: - str = gettext("Must also have the 'mount' and 'create'" - " ability"); - break; - case ZFS_DELEG_NOTE_RELEASE: - str = gettext("Allows releasing a user hold which\n\t\t\t\t" - "might destroy the snapshot"); - break; - case ZFS_DELEG_NOTE_RENAME: - str = gettext("Must also have the 'mount' and 'create'" - "\n\t\t\t\tability in the new parent"); - break; - case ZFS_DELEG_NOTE_ROLLBACK: - str = gettext(""); - break; - case ZFS_DELEG_NOTE_SEND: - str = gettext(""); - break; - case ZFS_DELEG_NOTE_SHARE: - str = gettext("Allows sharing file systems over NFS or SMB" - "\n\t\t\t\tprotocols"); - break; - case ZFS_DELEG_NOTE_SNAPSHOT: - str = gettext(""); - break; -/* - * case ZFS_DELEG_NOTE_VSCAN: - * str = gettext(""); - * break; - */ - /* OTHER */ - case ZFS_DELEG_NOTE_GROUPQUOTA: - str = gettext("Allows accessing any groupquota@... property"); - break; - case ZFS_DELEG_NOTE_GROUPUSED: - str = gettext("Allows reading any groupused@... property"); - break; - case ZFS_DELEG_NOTE_USERPROP: - str = gettext("Allows changing any user property"); - break; - case ZFS_DELEG_NOTE_USERQUOTA: - str = gettext("Allows accessing any userquota@... 
property"); - break; - case ZFS_DELEG_NOTE_USERUSED: - str = gettext("Allows reading any userused@... property"); - break; - /* other */ - default: - str = ""; - } - - return (str); -} - -struct allow_opts { - boolean_t local; - boolean_t descend; - boolean_t user; - boolean_t group; - boolean_t everyone; - boolean_t create; - boolean_t set; - boolean_t recursive; /* unallow only */ - boolean_t prt_usage; - - boolean_t prt_perms; - char *who; - char *perms; - const char *dataset; -}; - -static inline int -prop_cmp(const void *a, const void *b) -{ - const char *str1 = *(const char **)a; - const char *str2 = *(const char **)b; - return (strcmp(str1, str2)); -} - -static void -allow_usage(boolean_t un, boolean_t requested, const char *msg) -{ - const char *opt_desc[] = { - "-h", gettext("show this help message and exit"), - "-l", gettext("set permission locally"), - "-d", gettext("set permission for descents"), - "-u", gettext("set permission for user"), - "-g", gettext("set permission for group"), - "-e", gettext("set permission for everyone"), - "-c", gettext("set create time permission"), - "-s", gettext("define permission set"), - /* unallow only */ - "-r", gettext("remove permissions recursively"), - }; - size_t unallow_size = sizeof (opt_desc) / sizeof (char *); - size_t allow_size = unallow_size - 2; - const char *props[ZFS_NUM_PROPS]; - int i; - size_t count = 0; - FILE *fp = requested ? stdout : stderr; - zprop_desc_t *pdtbl = zfs_prop_get_table(); - const char *fmt = gettext("%-16s %-14s\t%s\n"); - - (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : - HELP_ALLOW)); - (void) fprintf(fp, gettext("Options:\n")); - for (i = 0; i < (un ? 
unallow_size : allow_size); i++) { - const char *opt = opt_desc[i++]; - const char *optdsc = opt_desc[i]; - (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); - } - - (void) fprintf(fp, gettext("\nThe following permissions are " - "supported:\n\n")); - (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), - gettext("NOTES")); - for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { - const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; - zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; - const char *perm_type = deleg_perm_type(perm_note); - const char *perm_comment = deleg_perm_comment(perm_note); - (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); - } - - for (i = 0; i < ZFS_NUM_PROPS; i++) { - zprop_desc_t *pd = &pdtbl[i]; - if (pd->pd_visible != B_TRUE) - continue; - - if (pd->pd_attr == PROP_READONLY) - continue; - - props[count++] = pd->pd_name; - } - props[count] = NULL; - - qsort(props, count, sizeof (char *), prop_cmp); - - for (i = 0; i < count; i++) - (void) fprintf(fp, fmt, props[i], gettext("property"), ""); - - if (msg != NULL) - (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); - - exit(requested ? 0 : 2); -} - -static inline const char * -munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, - char **permsp) -{ - if (un && argc == expected_argc - 1) - *permsp = NULL; - else if (argc == expected_argc) - *permsp = argv[argc - 2]; - else - allow_usage(un, B_FALSE, - gettext("wrong number of parameters\n")); - - return (argv[argc - 1]); -} - -static void -parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) -{ - int uge_sum = opts->user + opts->group + opts->everyone; - int csuge_sum = opts->create + opts->set + uge_sum; - int ldcsuge_sum = csuge_sum + opts->local + opts->descend; - int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; - - if (uge_sum > 1) - allow_usage(un, B_FALSE, - gettext("-u, -g, and -e are mutually exclusive\n")); - - if (opts->prt_usage) { - if (argc == 0 && all_sum == 0) - allow_usage(un, B_TRUE, NULL); - else - usage(B_FALSE); - } - - if (opts->set) { - if (csuge_sum > 1) - allow_usage(un, B_FALSE, - gettext("invalid options combined with -s\n")); - - opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); - if (argv[0][0] != '@') - allow_usage(un, B_FALSE, - gettext("invalid set name: missing '@' prefix\n")); - opts->who = argv[0]; - } else if (opts->create) { - if (ldcsuge_sum > 1) - allow_usage(un, B_FALSE, - gettext("invalid options combined with -c\n")); - opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); - } else if (opts->everyone) { - if (csuge_sum > 1) - allow_usage(un, B_FALSE, - gettext("invalid options combined with -e\n")); - opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); - } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") - == 0) { - opts->everyone = B_TRUE; - argc--; - argv++; - opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); - } else if (argc == 1 && !un) { - opts->prt_perms = B_TRUE; - opts->dataset = argv[argc-1]; - } else { - opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); - opts->who = argv[0]; - } - - if (!opts->local && !opts->descend) { - opts->local = B_TRUE; - opts->descend = B_TRUE; - } -} - -static void -store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, - const char *who, char *perms, nvlist_t *top_nvl) -{ - int i; - char ld[2] = { '\0', '\0' }; - char who_buf[MAXNAMELEN + 32]; - char base_type = '\0'; - char set_type = '\0'; - nvlist_t *base_nvl = NULL; - nvlist_t *set_nvl = NULL; - nvlist_t *nvl; - - if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) - nomem(); - if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) - nomem(); - - switch (type) { - case ZFS_DELEG_NAMED_SET_SETS: - case 
ZFS_DELEG_NAMED_SET: - set_type = ZFS_DELEG_NAMED_SET_SETS; - base_type = ZFS_DELEG_NAMED_SET; - ld[0] = ZFS_DELEG_NA; - break; - case ZFS_DELEG_CREATE_SETS: - case ZFS_DELEG_CREATE: - set_type = ZFS_DELEG_CREATE_SETS; - base_type = ZFS_DELEG_CREATE; - ld[0] = ZFS_DELEG_NA; - break; - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_USER: - set_type = ZFS_DELEG_USER_SETS; - base_type = ZFS_DELEG_USER; - if (local) - ld[0] = ZFS_DELEG_LOCAL; - if (descend) - ld[1] = ZFS_DELEG_DESCENDENT; - break; - case ZFS_DELEG_GROUP_SETS: - case ZFS_DELEG_GROUP: - set_type = ZFS_DELEG_GROUP_SETS; - base_type = ZFS_DELEG_GROUP; - if (local) - ld[0] = ZFS_DELEG_LOCAL; - if (descend) - ld[1] = ZFS_DELEG_DESCENDENT; - break; - case ZFS_DELEG_EVERYONE_SETS: - case ZFS_DELEG_EVERYONE: - set_type = ZFS_DELEG_EVERYONE_SETS; - base_type = ZFS_DELEG_EVERYONE; - if (local) - ld[0] = ZFS_DELEG_LOCAL; - if (descend) - ld[1] = ZFS_DELEG_DESCENDENT; - break; - - default: - assert(set_type != '\0' && base_type != '\0'); - } - - if (perms != NULL) { - char *curr = perms; - char *end = curr + strlen(perms); - - while (curr < end) { - char *delim = strchr(curr, ','); - if (delim == NULL) - delim = end; - else - *delim = '\0'; - - if (curr[0] == '@') - nvl = set_nvl; - else - nvl = base_nvl; - - (void) nvlist_add_boolean(nvl, curr); - if (delim != end) - *delim = ','; - curr = delim + 1; - } - - for (i = 0; i < 2; i++) { - char locality = ld[i]; - if (locality == 0) - continue; - - if (!nvlist_empty(base_nvl)) { - if (who != NULL) - (void) snprintf(who_buf, - sizeof (who_buf), "%c%c$%s", - base_type, locality, who); - else - (void) snprintf(who_buf, - sizeof (who_buf), "%c%c$", - base_type, locality); - - (void) nvlist_add_nvlist(top_nvl, who_buf, - base_nvl); - } - - - if (!nvlist_empty(set_nvl)) { - if (who != NULL) - (void) snprintf(who_buf, - sizeof (who_buf), "%c%c$%s", - set_type, locality, who); - else - (void) snprintf(who_buf, - sizeof (who_buf), "%c%c$", - set_type, locality); - - (void) 
nvlist_add_nvlist(top_nvl, who_buf, - set_nvl); - } - } - } else { - for (i = 0; i < 2; i++) { - char locality = ld[i]; - if (locality == 0) - continue; - - if (who != NULL) - (void) snprintf(who_buf, sizeof (who_buf), - "%c%c$%s", base_type, locality, who); - else - (void) snprintf(who_buf, sizeof (who_buf), - "%c%c$", base_type, locality); - (void) nvlist_add_boolean(top_nvl, who_buf); - - if (who != NULL) - (void) snprintf(who_buf, sizeof (who_buf), - "%c%c$%s", set_type, locality, who); - else - (void) snprintf(who_buf, sizeof (who_buf), - "%c%c$", set_type, locality); - (void) nvlist_add_boolean(top_nvl, who_buf); - } - } -} - -static int -construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) -{ - if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) - nomem(); - - if (opts->set) { - store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, - opts->descend, opts->who, opts->perms, *nvlp); - } else if (opts->create) { - store_allow_perm(ZFS_DELEG_CREATE, opts->local, - opts->descend, NULL, opts->perms, *nvlp); - } else if (opts->everyone) { - store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, - opts->descend, NULL, opts->perms, *nvlp); - } else { - char *curr = opts->who; - char *end = curr + strlen(curr); - - while (curr < end) { - const char *who; - zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; - char *endch; - char *delim = strchr(curr, ','); - char errbuf[256]; - char id[64]; - struct passwd *p = NULL; - struct group *g = NULL; - - uid_t rid; - if (delim == NULL) - delim = end; - else - *delim = '\0'; - - rid = (uid_t)strtol(curr, &endch, 0); - if (opts->user) { - who_type = ZFS_DELEG_USER; - if (*endch != '\0') - p = getpwnam(curr); - else - p = getpwuid(rid); - - if (p != NULL) - rid = p->pw_uid; - else if (*endch != '\0') { - (void) snprintf(errbuf, 256, gettext( - "invalid user %s\n"), curr); - allow_usage(un, B_TRUE, errbuf); - } - } else if (opts->group) { - who_type = ZFS_DELEG_GROUP; - if (*endch != '\0') - g = getgrnam(curr); - 
else - g = getgrgid(rid); - - if (g != NULL) - rid = g->gr_gid; - else if (*endch != '\0') { - (void) snprintf(errbuf, 256, gettext( - "invalid group %s\n"), curr); - allow_usage(un, B_TRUE, errbuf); - } - } else { - if (*endch != '\0') { - p = getpwnam(curr); - } else { - p = getpwuid(rid); - } - - if (p == NULL) { - if (*endch != '\0') { - g = getgrnam(curr); - } else { - g = getgrgid(rid); - } - } - - if (p != NULL) { - who_type = ZFS_DELEG_USER; - rid = p->pw_uid; - } else if (g != NULL) { - who_type = ZFS_DELEG_GROUP; - rid = g->gr_gid; - } else { - (void) snprintf(errbuf, 256, gettext( - "invalid user/group %s\n"), curr); - allow_usage(un, B_TRUE, errbuf); - } - } - - (void) sprintf(id, "%u", rid); - who = id; - - store_allow_perm(who_type, opts->local, - opts->descend, who, opts->perms, *nvlp); - curr = delim + 1; - } - } - - return (0); -} - -static void -print_set_creat_perms(uu_avl_t *who_avl) -{ - const char *sc_title[] = { - gettext("Permission sets:\n"), - gettext("Create time permissions:\n"), - NULL - }; - const char **title_ptr = sc_title; - who_perm_node_t *who_node = NULL; - int prev_weight = -1; - - for (who_node = uu_avl_first(who_avl); who_node != NULL; - who_node = uu_avl_next(who_avl, who_node)) { - uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; - zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; - const char *who_name = who_node->who_perm.who_name; - int weight = who_type2weight(who_type); - boolean_t first = B_TRUE; - deleg_perm_node_t *deleg_node; - - if (prev_weight != weight) { - (void) printf(*title_ptr++); - prev_weight = weight; - } - - if (who_name == NULL || strnlen(who_name, 1) == 0) - (void) printf("\t"); - else - (void) printf("\t%s ", who_name); - - for (deleg_node = uu_avl_first(avl); deleg_node != NULL; - deleg_node = uu_avl_next(avl, deleg_node)) { - if (first) { - (void) printf("%s", - deleg_node->dpn_perm.dp_name); - first = B_FALSE; - } else - (void) printf(",%s", - deleg_node->dpn_perm.dp_name); - } - - 
(void) printf("\n"); - } -} - -static void -print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, - const char *title) -{ - who_perm_node_t *who_node = NULL; - boolean_t prt_title = B_TRUE; - uu_avl_walk_t *walk; - - if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) - nomem(); - - while ((who_node = uu_avl_walk_next(walk)) != NULL) { - const char *who_name = who_node->who_perm.who_name; - const char *nice_who_name = who_node->who_perm.who_ug_name; - uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; - zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; - char delim = ' '; - deleg_perm_node_t *deleg_node; - boolean_t prt_who = B_TRUE; - - for (deleg_node = uu_avl_first(avl); - deleg_node != NULL; - deleg_node = uu_avl_next(avl, deleg_node)) { - if (local != deleg_node->dpn_perm.dp_local || - descend != deleg_node->dpn_perm.dp_descend) - continue; - - if (prt_who) { - const char *who = NULL; - if (prt_title) { - prt_title = B_FALSE; - (void) printf(title); - } - - switch (who_type) { - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_USER: - who = gettext("user"); - if (nice_who_name) - who_name = nice_who_name; - break; - case ZFS_DELEG_GROUP_SETS: - case ZFS_DELEG_GROUP: - who = gettext("group"); - if (nice_who_name) - who_name = nice_who_name; - break; - case ZFS_DELEG_EVERYONE_SETS: - case ZFS_DELEG_EVERYONE: - who = gettext("everyone"); - who_name = NULL; - break; - - default: - assert(who != NULL); - } - - prt_who = B_FALSE; - if (who_name == NULL) - (void) printf("\t%s", who); - else - (void) printf("\t%s %s", who, who_name); - } - - (void) printf("%c%s", delim, - deleg_node->dpn_perm.dp_name); - delim = ','; - } - - if (!prt_who) - (void) printf("\n"); - } - - uu_avl_walk_end(walk); -} - -static void -print_fs_perms(fs_perm_set_t *fspset) -{ - fs_perm_node_t *node = NULL; - char buf[MAXNAMELEN + 32]; - const char *dsname = buf; - - for (node = uu_list_first(fspset->fsps_list); node != NULL; - node = 
uu_list_next(fspset->fsps_list, node)) { - uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; - uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; - int left = 0; - - (void) snprintf(buf, sizeof (buf), - gettext("---- Permissions on %s "), - node->fspn_fsperm.fsp_name); - (void) printf(dsname); - left = 70 - strlen(buf); - while (left-- > 0) - (void) printf("-"); - (void) printf("\n"); - - print_set_creat_perms(sc_avl); - print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, - gettext("Local permissions:\n")); - print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, - gettext("Descendent permissions:\n")); - print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, - gettext("Local+Descendent permissions:\n")); - } -} - -static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; - -struct deleg_perms { - boolean_t un; - nvlist_t *nvl; -}; - -static int -set_deleg_perms(zfs_handle_t *zhp, void *data) -{ - struct deleg_perms *perms = (struct deleg_perms *)data; - zfs_type_t zfs_type = zfs_get_type(zhp); - - if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) - return (0); - - return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); -} - -static int -zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) -{ - zfs_handle_t *zhp; - nvlist_t *perm_nvl = NULL; - nvlist_t *update_perm_nvl = NULL; - int error = 1; - int c; - struct allow_opts opts = { 0 }; - - const char *optstr = un ? 
"ldugecsrh" : "ldugecsh"; - - /* check opts */ - while ((c = getopt(argc, argv, optstr)) != -1) { - switch (c) { - case 'l': - opts.local = B_TRUE; - break; - case 'd': - opts.descend = B_TRUE; - break; - case 'u': - opts.user = B_TRUE; - break; - case 'g': - opts.group = B_TRUE; - break; - case 'e': - opts.everyone = B_TRUE; - break; - case 's': - opts.set = B_TRUE; - break; - case 'c': - opts.create = B_TRUE; - break; - case 'r': - opts.recursive = B_TRUE; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case 'h': - opts.prt_usage = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check arguments */ - parse_allow_args(argc, argv, un, &opts); - - /* try to open the dataset */ - if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME)) == NULL) { - (void) fprintf(stderr, "Failed to open dataset: %s\n", - opts.dataset); - return (-1); - } - - if (zfs_get_fsacl(zhp, &perm_nvl) != 0) - goto cleanup2; - - fs_perm_set_init(&fs_perm_set); - if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { - (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); - goto cleanup1; - } - - if (opts.prt_perms) - print_fs_perms(&fs_perm_set); - else { - (void) construct_fsacl_list(un, &opts, &update_perm_nvl); - if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) - goto cleanup0; - - if (un && opts.recursive) { - struct deleg_perms data = { un, update_perm_nvl }; - if (zfs_iter_filesystems(zhp, set_deleg_perms, - &data) != 0) - goto cleanup0; - } - } - - error = 0; - -cleanup0: - nvlist_free(perm_nvl); - nvlist_free(update_perm_nvl); -cleanup1: - fs_perm_set_fini(&fs_perm_set); -cleanup2: - zfs_close(zhp); - - return (error); -} - -static int -zfs_do_allow(int argc, char **argv) -{ - return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); -} - -static 
int -zfs_do_unallow(int argc, char **argv) -{ - return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); -} - -static int -zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) -{ - int errors = 0; - int i; - const char *tag; - boolean_t recursive = B_FALSE; - const char *opts = holding ? "rt" : "r"; - int c; - - /* check options */ - while ((c = getopt(argc, argv, opts)) != -1) { - switch (c) { - case 'r': - recursive = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 2) - usage(B_FALSE); - - tag = argv[0]; - --argc; - ++argv; - - if (holding && tag[0] == '.') { - /* tags starting with '.' are reserved for libzfs */ - (void) fprintf(stderr, gettext("tag may not start with '.'\n")); - usage(B_FALSE); - } - - for (i = 0; i < argc; ++i) { - zfs_handle_t *zhp; - char parent[ZFS_MAX_DATASET_NAME_LEN]; - const char *delim; - char *path = argv[i]; - - delim = strchr(path, '@'); - if (delim == NULL) { - (void) fprintf(stderr, - gettext("'%s' is not a snapshot\n"), path); - ++errors; - continue; - } - (void) strncpy(parent, path, delim - path); - parent[delim - path] = '\0'; - - zhp = zfs_open(g_zfs, parent, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) { - ++errors; - continue; - } - if (holding) { - if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) - ++errors; - } else { - if (zfs_release(zhp, delim+1, tag, recursive) != 0) - ++errors; - } - zfs_close(zhp); - } - - return (errors != 0); -} - -/* - * zfs hold [-r] [-t] ... - * - * -r Recursively hold - * - * Apply a user-hold with the given tag to the list of snapshots. - */ -static int -zfs_do_hold(int argc, char **argv) -{ - return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); -} - -/* - * zfs release [-r] ... - * - * -r Recursively release - * - * Release a user-hold with the given tag from the list of snapshots. 
- */ -static int -zfs_do_release(int argc, char **argv) -{ - return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); -} - -typedef struct holds_cbdata { - boolean_t cb_recursive; - const char *cb_snapname; - nvlist_t **cb_nvlp; - size_t cb_max_namelen; - size_t cb_max_taglen; -} holds_cbdata_t; - -#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y" -#define DATETIME_BUF_LEN (32) -/* - * - */ -static void -print_holds(boolean_t scripted, boolean_t literal, size_t nwidth, - size_t tagwidth, nvlist_t *nvl) -{ - int i; - nvpair_t *nvp = NULL; - char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; - const char *col; - - if (!scripted) { - for (i = 0; i < 3; i++) { - col = gettext(hdr_cols[i]); - if (i < 2) - (void) printf("%-*s ", i ? tagwidth : nwidth, - col); - else - (void) printf("%s\n", col); - } - } - - while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { - char *zname = nvpair_name(nvp); - nvlist_t *nvl2; - nvpair_t *nvp2 = NULL; - (void) nvpair_value_nvlist(nvp, &nvl2); - while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { - char tsbuf[DATETIME_BUF_LEN]; - char *tagname = nvpair_name(nvp2); - uint64_t val = 0; - time_t time; - struct tm t; - - (void) nvpair_value_uint64(nvp2, &val); - if (literal) - snprintf(tsbuf, DATETIME_BUF_LEN, "%llu", val); - else { - time = (time_t)val; - (void) localtime_r(&time, &t); - (void) strftime(tsbuf, DATETIME_BUF_LEN, - gettext(STRFTIME_FMT_STR), &t); - } - - if (scripted) { - (void) printf("%s\t%s\t%s\n", zname, - tagname, tsbuf); - } else { - (void) printf("%-*s %-*s %s\n", nwidth, - zname, tagwidth, tagname, tsbuf); - } - } - } -} - -/* - * Generic callback function to list a dataset or snapshot. 
- */ -static int -holds_callback(zfs_handle_t *zhp, void *data) -{ - holds_cbdata_t *cbp = data; - nvlist_t *top_nvl = *cbp->cb_nvlp; - nvlist_t *nvl = NULL; - nvpair_t *nvp = NULL; - const char *zname = zfs_get_name(zhp); - size_t znamelen = strlen(zname); - - if (cbp->cb_recursive && cbp->cb_snapname != NULL) { - const char *snapname; - char *delim = strchr(zname, '@'); - if (delim == NULL) - return (0); - - snapname = delim + 1; - if (strcmp(cbp->cb_snapname, snapname)) - return (0); - } - - if (zfs_get_holds(zhp, &nvl) != 0) - return (-1); - - if (znamelen > cbp->cb_max_namelen) - cbp->cb_max_namelen = znamelen; - - while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { - const char *tag = nvpair_name(nvp); - size_t taglen = strlen(tag); - if (taglen > cbp->cb_max_taglen) - cbp->cb_max_taglen = taglen; - } - - return (nvlist_add_nvlist(top_nvl, zname, nvl)); -} - -/* - * zfs holds [-Hp] [-r | -d max] ... - * - * -H Suppress header output - * -p Output literal values - * -r Recursively search for holds - * -d max Limit depth of recursive search - */ -static int -zfs_do_holds(int argc, char **argv) -{ - int errors = 0; - int c; - int i; - boolean_t scripted = B_FALSE; - boolean_t literal = B_FALSE; - boolean_t recursive = B_FALSE; - const char *opts = "d:rHp"; - nvlist_t *nvl; - - int types = ZFS_TYPE_SNAPSHOT; - holds_cbdata_t cb = { 0 }; - - int limit = 0; - int ret = 0; - int flags = 0; - - /* check options */ - while ((c = getopt(argc, argv, opts)) != -1) { - switch (c) { - case 'd': - limit = parse_depth(optarg, &flags); - recursive = B_TRUE; - break; - case 'r': - recursive = B_TRUE; - break; - case 'H': - scripted = B_TRUE; - break; - case 'p': - literal = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - if (recursive) { - types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; - flags |= ZFS_ITER_RECURSE; - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - 
if (argc < 1) - usage(B_FALSE); - - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) - nomem(); - - for (i = 0; i < argc; ++i) { - char *snapshot = argv[i]; - const char *delim; - const char *snapname = NULL; - - delim = strchr(snapshot, '@'); - if (delim != NULL) { - snapname = delim + 1; - if (recursive) - snapshot[delim - snapshot] = '\0'; - } - - cb.cb_recursive = recursive; - cb.cb_snapname = snapname; - cb.cb_nvlp = &nvl; - - /* - * 1. collect holds data, set format options - */ - ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, - holds_callback, &cb); - if (ret != 0) - ++errors; - } - - /* - * 2. print holds data - */ - print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen, - nvl); - - if (nvlist_empty(nvl)) - (void) printf(gettext("no datasets available\n")); - - nvlist_free(nvl); - - return (0 != errors); -} - -#define CHECK_SPINNER 30 -#define SPINNER_TIME 3 /* seconds */ -#define MOUNT_TIME 1 /* seconds */ - -typedef struct get_all_state { - boolean_t ga_verbose; - get_all_cb_t *ga_cbp; -} get_all_state_t; - -static int -get_one_dataset(zfs_handle_t *zhp, void *data) -{ - static char *spin[] = { "-", "\\", "|", "/" }; - static int spinval = 0; - static int spincheck = 0; - static time_t last_spin_time = (time_t)0; - get_all_state_t *state = data; - zfs_type_t type = zfs_get_type(zhp); - - if (state->ga_verbose) { - if (--spincheck < 0) { - time_t now = time(NULL); - if (last_spin_time + SPINNER_TIME < now) { - update_progress(spin[spinval++ % 4]); - last_spin_time = now; - } - spincheck = CHECK_SPINNER; - } - } - - /* - * Interate over any nested datasets. - */ - if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { - zfs_close(zhp); - return (1); - } - - /* - * Skip any datasets whose type does not match. 
- */ - if ((type & ZFS_TYPE_FILESYSTEM) == 0) { - zfs_close(zhp); - return (0); - } - libzfs_add_handle(state->ga_cbp, zhp); - assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); - - return (0); -} - -static void -get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) -{ - get_all_state_t state = { - .ga_verbose = verbose, - .ga_cbp = cbp - }; - - if (verbose) - set_progress_header(gettext("Reading ZFS config")); - (void) zfs_iter_root(g_zfs, get_one_dataset, &state); - - if (verbose) - finish_progress(gettext("done.")); -} - -/* - * Generic callback for sharing or mounting filesystems. Because the code is so - * similar, we have a common function with an extra parameter to determine which - * mode we are using. - */ -typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; - -typedef struct share_mount_state { - share_mount_op_t sm_op; - boolean_t sm_verbose; - int sm_flags; - char *sm_options; - char *sm_proto; /* only valid for OP_SHARE */ - pthread_mutex_t sm_lock; /* protects the remaining fields */ - uint_t sm_total; /* number of filesystems to process */ - uint_t sm_done; /* number of filesystems processed */ - int sm_status; /* -1 if any of the share/mount operations failed */ -} share_mount_state_t; - -/* - * Share or mount a dataset. - */ -static int -share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, - boolean_t explicit, const char *options) -{ - char mountpoint[ZFS_MAXPROPLEN]; - char shareopts[ZFS_MAXPROPLEN]; - char smbshareopts[ZFS_MAXPROPLEN]; - const char *cmdname = op == OP_SHARE ? "share" : "mount"; - struct mnttab mnt; - uint64_t zoned, canmount; - boolean_t shared_nfs, shared_smb; - - assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); - - /* - * Check to make sure we can mount/share this dataset. If we - * are in the global zone and the filesystem is exported to a - * local zone, or if we are in a local zone and the - * filesystem is not exported, then it is an error. 
- */ - zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); - - if (zoned && getzoneid() == GLOBAL_ZONEID) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': " - "dataset is exported to a local zone\n"), cmdname, - zfs_get_name(zhp)); - return (1); - - } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': " - "permission denied\n"), cmdname, - zfs_get_name(zhp)); - return (1); - } - - /* - * Ignore any filesystems which don't apply to us. This - * includes those with a legacy mountpoint, or those with - * legacy share options. - */ - verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, - sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, - sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, - sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); - - if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && - strcmp(smbshareopts, "off") == 0) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot share '%s': " - "legacy share\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("to " - "share this filesystem set " - "sharenfs property on\n")); - return (1); - } - - /* - * We cannot share or mount legacy filesystems. If the - * shareopts is non-legacy but the mountpoint is legacy, we - * treat it as a legacy share. 
- */ - if (strcmp(mountpoint, "legacy") == 0) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': " - "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use %s(8) to " - "%s this filesystem\n"), cmdname, cmdname); - return (1); - } - - if (strcmp(mountpoint, "none") == 0) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': no " - "mountpoint set\n"), cmdname, zfs_get_name(zhp)); - return (1); - } - - /* - * canmount explicit outcome - * on no pass through - * on yes pass through - * off no return 0 - * off yes display error, return 1 - * noauto no return 0 - * noauto yes pass through - */ - canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); - if (canmount == ZFS_CANMOUNT_OFF) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': " - "'canmount' property is set to 'off'\n"), cmdname, - zfs_get_name(zhp)); - return (1); - } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { - return (0); - } - - /* - * If this filesystem is inconsistent and has a receive resume - * token, we can not mount it. - */ - if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && - zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, - NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': " - "Contains partially-completed state from " - "\"zfs receive -r\", which can be resumed with " - "\"zfs send -t\"\n"), - cmdname, zfs_get_name(zhp)); - return (1); - } - - /* - * At this point, we have verified that the mountpoint and/or - * shareopts are appropriate for auto management. If the - * filesystem is already mounted or shared, return (failing - * for explicit requests); otherwise mount or share the - * filesystem. 
- */ - switch (op) { - case OP_SHARE: - - shared_nfs = zfs_is_shared_nfs(zhp, NULL); - shared_smb = zfs_is_shared_smb(zhp, NULL); - - if ((shared_nfs && shared_smb) || - (shared_nfs && strcmp(shareopts, "on") == 0 && - strcmp(smbshareopts, "off") == 0) || - (shared_smb && strcmp(smbshareopts, "on") == 0 && - strcmp(shareopts, "off") == 0)) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot share " - "'%s': filesystem already shared\n"), - zfs_get_name(zhp)); - return (1); - } - - if (!zfs_is_mounted(zhp, NULL) && - zfs_mount(zhp, NULL, 0) != 0) - return (1); - - if (protocol == NULL) { - if (zfs_shareall(zhp) != 0) - return (1); - } else if (strcmp(protocol, "nfs") == 0) { - if (zfs_share_nfs(zhp)) - return (1); - } else if (strcmp(protocol, "smb") == 0) { - if (zfs_share_smb(zhp)) - return (1); - } else { - (void) fprintf(stderr, gettext("cannot share " - "'%s': invalid share type '%s' " - "specified\n"), - zfs_get_name(zhp), protocol); - return (1); - } - - break; - - case OP_MOUNT: - if (options == NULL) - mnt.mnt_mntopts = ""; - else - mnt.mnt_mntopts = (char *)options; - - if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && - zfs_is_mounted(zhp, NULL)) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot mount " - "'%s': filesystem already mounted\n"), - zfs_get_name(zhp)); - return (1); - } - - if (zfs_mount(zhp, options, flags) != 0) - return (1); - break; - } - - return (0); -} - -/* - * Reports progress in the form "(current/total)". Not thread-safe. 
- */ -static void -report_mount_progress(int current, int total) -{ - static time_t last_progress_time = 0; - time_t now = time(NULL); - char info[32]; - - /* display header if we're here for the first time */ - if (current == 1) { - set_progress_header(gettext("Mounting ZFS filesystems")); - } else if (current != total && last_progress_time + MOUNT_TIME >= now) { - /* too soon to report again */ - return; - } - - last_progress_time = now; - - (void) sprintf(info, "(%d/%d)", current, total); - - if (current == total) - finish_progress(info); - else - update_progress(info); -} - -/* - * zfs_foreach_mountpoint() callback that mounts or shares on filesystem and - * updates the progress meter - */ -static int -share_mount_one_cb(zfs_handle_t *zhp, void *arg) -{ - share_mount_state_t *sms = arg; - int ret; - - ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, - B_FALSE, sms->sm_options); - - pthread_mutex_lock(&sms->sm_lock); - if (ret != 0) - sms->sm_status = ret; - sms->sm_done++; - if (sms->sm_verbose) - report_mount_progress(sms->sm_done, sms->sm_total); - pthread_mutex_unlock(&sms->sm_lock); - return (ret); -} - -static void -append_options(char *mntopts, char *newopts) -{ - int len = strlen(mntopts); - - /* original length plus new string to append plus 1 for the comma */ - if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { - (void) fprintf(stderr, gettext("the opts argument for " - "'%c' option is too long (more than %d chars)\n"), - "-o", MNT_LINE_MAX); - usage(B_FALSE); - } - - if (*mntopts) - mntopts[len++] = ','; - - (void) strcpy(&mntopts[len], newopts); -} - -static int -share_mount(int op, int argc, char **argv) -{ - int do_all = 0; - boolean_t verbose = B_FALSE; - int c, ret = 0; - char *options = NULL; - int flags = 0; - - /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":avo:O" : "a")) - != -1) { - switch (c) { - case 'a': - do_all = 1; - break; - case 'v': - verbose = B_TRUE; - break; - case 'o': - if (*optarg == '\0') { - (void) fprintf(stderr, gettext("empty mount " - "options (-o) specified\n")); - usage(B_FALSE); - } - - if (options == NULL) - options = safe_malloc(MNT_LINE_MAX + 1); - - /* option validation is done later */ - append_options(options, optarg); - break; - - case 'O': - warnx("no overlay mounts support on FreeBSD, ignoring"); - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (do_all) { - char *protocol = NULL; - - if (op == OP_SHARE && argc > 0) { - if (strcmp(argv[0], "nfs") != 0 && - strcmp(argv[0], "smb") != 0) { - (void) fprintf(stderr, gettext("share type " - "must be 'nfs' or 'smb'\n")); - usage(B_FALSE); - } - protocol = argv[0]; - argc--; - argv++; - } - - if (argc != 0) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - start_progress_timer(); - get_all_cb_t cb = { 0 }; - get_all_datasets(&cb, verbose); - - if (cb.cb_used == 0) { - if (options != NULL) - free(options); - return (0); - } - -#ifdef illumos - if (op == OP_SHARE) { - sa_init_selective_arg_t sharearg; - sharearg.zhandle_arr = cb.cb_handles; - sharearg.zhandle_len = cb.cb_used; - if ((ret = zfs_init_libshare_arg(g_zfs, - SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) { - (void) fprintf(stderr, gettext( - "Could not initialize libshare, %d"), ret); - return (ret); - } - } -#endif - share_mount_state_t share_mount_state = { 0 }; - share_mount_state.sm_op = op; - share_mount_state.sm_verbose = verbose; - share_mount_state.sm_flags = flags; - share_mount_state.sm_options = options; - share_mount_state.sm_proto = protocol; - 
share_mount_state.sm_total = cb.cb_used; - pthread_mutex_init(&share_mount_state.sm_lock, NULL); - - /* - * libshare isn't mt-safe, so only do the operation in parallel - * if we're mounting. - */ - zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, - share_mount_one_cb, &share_mount_state, op == OP_MOUNT); - ret = share_mount_state.sm_status; - - for (int i = 0; i < cb.cb_used; i++) - zfs_close(cb.cb_handles[i]); - free(cb.cb_handles); - } else if (argc == 0) { - struct mnttab entry; - - if ((op == OP_SHARE) || (options != NULL)) { - (void) fprintf(stderr, gettext("missing filesystem " - "argument (specify -a for all)\n")); - usage(B_FALSE); - } - - /* - * When mount is given no arguments, go through /etc/mnttab and - * display any active ZFS mounts. We hide any snapshots, since - * they are controlled automatically. - */ - rewind(mnttab_file); - while (getmntent(mnttab_file, &entry) == 0) { - if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || - strchr(entry.mnt_special, '@') != NULL) - continue; - - (void) printf("%-30s %s\n", entry.mnt_special, - entry.mnt_mountp); - } - - } else { - zfs_handle_t *zhp; - - if (argc > 1) { - (void) fprintf(stderr, - gettext("too many arguments\n")); - usage(B_FALSE); - } - - if ((zhp = zfs_open(g_zfs, argv[0], - ZFS_TYPE_FILESYSTEM)) == NULL) { - ret = 1; - } else { - ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, - options); - zfs_close(zhp); - } - } - - return (ret); -} - -/* - * zfs mount -a [nfs] - * zfs mount filesystem - * - * Mount all filesystems, or mount the given filesystem. - */ -static int -zfs_do_mount(int argc, char **argv) -{ - return (share_mount(OP_MOUNT, argc, argv)); -} - -/* - * zfs share -a [nfs | smb] - * zfs share filesystem - * - * Share all filesystems, or share the given filesystem. 
- */ -static int -zfs_do_share(int argc, char **argv) -{ - return (share_mount(OP_SHARE, argc, argv)); -} - -typedef struct unshare_unmount_node { - zfs_handle_t *un_zhp; - char *un_mountp; - uu_avl_node_t un_avlnode; -} unshare_unmount_node_t; - -/* ARGSUSED */ -static int -unshare_unmount_compare(const void *larg, const void *rarg, void *unused) -{ - const unshare_unmount_node_t *l = larg; - const unshare_unmount_node_t *r = rarg; - - return (strcmp(l->un_mountp, r->un_mountp)); -} - -/* - * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an - * absolute path, find the entry /etc/mnttab, verify that its a ZFS filesystem, - * and unmount it appropriately. - */ -static int -unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) -{ - zfs_handle_t *zhp; - int ret = 0; - struct stat64 statbuf; - struct extmnttab entry; - const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; - ino_t path_inode; - - /* - * Search for the path in /etc/mnttab. Rather than looking for the - * specific path, which can be fooled by non-standard paths (i.e. ".." - * or "//"), we stat() the path and search for the corresponding - * (major,minor) device pair. - */ - if (stat64(path, &statbuf) != 0) { - (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), - cmdname, path, strerror(errno)); - return (1); - } - path_inode = statbuf.st_ino; - - /* - * Search for the given (major,minor) pair in the mount table. 
- */ -#ifdef illumos - rewind(mnttab_file); - while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) { - if (entry.mnt_major == major(statbuf.st_dev) && - entry.mnt_minor == minor(statbuf.st_dev)) - break; - } -#else - { - struct statfs sfs; - - if (statfs(path, &sfs) != 0) { - (void) fprintf(stderr, "%s: %s\n", path, - strerror(errno)); - ret = -1; - } - statfs2mnttab(&sfs, &entry); - } -#endif - if (ret != 0) { - if (op == OP_SHARE) { - (void) fprintf(stderr, gettext("cannot %s '%s': not " - "currently mounted\n"), cmdname, path); - return (1); - } - (void) fprintf(stderr, gettext("warning: %s not in mnttab\n"), - path); - if ((ret = umount2(path, flags)) != 0) - (void) fprintf(stderr, gettext("%s: %s\n"), path, - strerror(errno)); - return (ret != 0); - } - - if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { - (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " - "filesystem\n"), cmdname, path); - return (1); - } - - if ((zhp = zfs_open(g_zfs, entry.mnt_special, - ZFS_TYPE_FILESYSTEM)) == NULL) - return (1); - - ret = 1; - if (stat64(entry.mnt_mountp, &statbuf) != 0) { - (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), - cmdname, path, strerror(errno)); - goto out; - } else if (statbuf.st_ino != path_inode) { - (void) fprintf(stderr, gettext("cannot " - "%s '%s': not a mountpoint\n"), cmdname, path); - goto out; - } - - if (op == OP_SHARE) { - char nfs_mnt_prop[ZFS_MAXPROPLEN]; - char smbshare_prop[ZFS_MAXPROPLEN]; - - verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, - sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, - sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); - - if (strcmp(nfs_mnt_prop, "off") == 0 && - strcmp(smbshare_prop, "off") == 0) { - (void) fprintf(stderr, gettext("cannot unshare " - "'%s': legacy share\n"), path); -#ifdef illumos - (void) fprintf(stderr, gettext("use " - "unshare(1M) to unshare this filesystem\n")); -#endif - } else if 
(!zfs_is_shared(zhp)) { - (void) fprintf(stderr, gettext("cannot unshare '%s': " - "not currently shared\n"), path); - } else { - ret = zfs_unshareall_bypath(zhp, path); - } - } else { - char mtpt_prop[ZFS_MAXPROPLEN]; - - verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, - sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); - - if (is_manual) { - ret = zfs_unmount(zhp, NULL, flags); - } else if (strcmp(mtpt_prop, "legacy") == 0) { - (void) fprintf(stderr, gettext("cannot unmount " - "'%s': legacy mountpoint\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use umount(8) " - "to unmount this filesystem\n")); - } else { - ret = zfs_unmountall(zhp, flags); - } - } - -out: - zfs_close(zhp); - - return (ret != 0); -} - -/* - * Generic callback for unsharing or unmounting a filesystem. - */ -static int -unshare_unmount(int op, int argc, char **argv) -{ - int do_all = 0; - int flags = 0; - int ret = 0; - int c; - zfs_handle_t *zhp; - char nfs_mnt_prop[ZFS_MAXPROPLEN]; - char sharesmb[ZFS_MAXPROPLEN]; - - /* check options */ - while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) { - switch (c) { - case 'a': - do_all = 1; - break; - case 'f': - flags = MS_FORCE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (do_all) { - /* - * We could make use of zfs_for_each() to walk all datasets in - * the system, but this would be very inefficient, especially - * since we would have to linearly search /etc/mnttab for each - * one. Instead, do one pass through /etc/mnttab looking for - * zfs entries and call zfs_unmount() for each one. - * - * Things get a little tricky if the administrator has created - * mountpoints beneath other ZFS filesystems. In this case, we - * have to unmount the deepest filesystems first. 
To accomplish - * this, we place all the mountpoints in an AVL tree sorted by - * the special type (dataset name), and walk the result in - * reverse to make sure to get any snapshots first. - */ - struct mnttab entry; - uu_avl_pool_t *pool; - uu_avl_t *tree = NULL; - unshare_unmount_node_t *node; - uu_avl_index_t idx; - uu_avl_walk_t *walk; - - if (argc != 0) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - if (((pool = uu_avl_pool_create("unmount_pool", - sizeof (unshare_unmount_node_t), - offsetof(unshare_unmount_node_t, un_avlnode), - unshare_unmount_compare, UU_DEFAULT)) == NULL) || - ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) - nomem(); - - rewind(mnttab_file); - while (getmntent(mnttab_file, &entry) == 0) { - - /* ignore non-ZFS entries */ - if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) - continue; - - /* ignore snapshots */ - if (strchr(entry.mnt_special, '@') != NULL) - continue; - - if ((zhp = zfs_open(g_zfs, entry.mnt_special, - ZFS_TYPE_FILESYSTEM)) == NULL) { - ret = 1; - continue; - } - - /* - * Ignore datasets that are excluded/restricted by - * parent pool name. 
- */ - if (zpool_skip_pool(zfs_get_pool_name(zhp))) { - zfs_close(zhp); - continue; - } - - switch (op) { - case OP_SHARE: - verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, - nfs_mnt_prop, - sizeof (nfs_mnt_prop), - NULL, NULL, 0, B_FALSE) == 0); - if (strcmp(nfs_mnt_prop, "off") != 0) - break; - verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, - nfs_mnt_prop, - sizeof (nfs_mnt_prop), - NULL, NULL, 0, B_FALSE) == 0); - if (strcmp(nfs_mnt_prop, "off") == 0) - continue; - break; - case OP_MOUNT: - /* Ignore legacy mounts */ - verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, - nfs_mnt_prop, - sizeof (nfs_mnt_prop), - NULL, NULL, 0, B_FALSE) == 0); - if (strcmp(nfs_mnt_prop, "legacy") == 0) - continue; - /* Ignore canmount=noauto mounts */ - if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == - ZFS_CANMOUNT_NOAUTO) - continue; - default: - break; - } - - node = safe_malloc(sizeof (unshare_unmount_node_t)); - node->un_zhp = zhp; - node->un_mountp = safe_strdup(entry.mnt_mountp); - - uu_avl_node_init(node, &node->un_avlnode, pool); - - if (uu_avl_find(tree, node, NULL, &idx) == NULL) { - uu_avl_insert(tree, node, idx); - } else { - zfs_close(node->un_zhp); - free(node->un_mountp); - free(node); - } - } - - /* - * Walk the AVL tree in reverse, unmounting each filesystem and - * removing it from the AVL tree in the process. 
- */ - if ((walk = uu_avl_walk_start(tree, - UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) - nomem(); - - while ((node = uu_avl_walk_next(walk)) != NULL) { - uu_avl_remove(tree, node); - - switch (op) { - case OP_SHARE: - if (zfs_unshareall_bypath(node->un_zhp, - node->un_mountp) != 0) - ret = 1; - break; - - case OP_MOUNT: - if (zfs_unmount(node->un_zhp, - node->un_mountp, flags) != 0) - ret = 1; - break; - } - - zfs_close(node->un_zhp); - free(node->un_mountp); - free(node); - } - - uu_avl_walk_end(walk); - uu_avl_destroy(tree); - uu_avl_pool_destroy(pool); - - } else { - if (argc != 1) { - if (argc == 0) - (void) fprintf(stderr, - gettext("missing filesystem argument\n")); - else - (void) fprintf(stderr, - gettext("too many arguments\n")); - usage(B_FALSE); - } - - /* - * We have an argument, but it may be a full path or a ZFS - * filesystem. Pass full paths off to unmount_path() (shared by - * manual_unmount), otherwise open the filesystem and pass to - * zfs_unmount(). - */ - if (argv[0][0] == '/') - return (unshare_unmount_path(op, argv[0], - flags, B_FALSE)); - - if ((zhp = zfs_open(g_zfs, argv[0], - ZFS_TYPE_FILESYSTEM)) == NULL) - return (1); - - verify(zfs_prop_get(zhp, op == OP_SHARE ? 
- ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, - nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, - NULL, 0, B_FALSE) == 0); - - switch (op) { - case OP_SHARE: - verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, - nfs_mnt_prop, - sizeof (nfs_mnt_prop), - NULL, NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, - sharesmb, sizeof (sharesmb), NULL, NULL, - 0, B_FALSE) == 0); - - if (strcmp(nfs_mnt_prop, "off") == 0 && - strcmp(sharesmb, "off") == 0) { - (void) fprintf(stderr, gettext("cannot " - "unshare '%s': legacy share\n"), - zfs_get_name(zhp)); -#ifdef illumos - (void) fprintf(stderr, gettext("use " - "unshare(1M) to unshare this " - "filesystem\n")); -#endif - ret = 1; - } else if (!zfs_is_shared(zhp)) { - (void) fprintf(stderr, gettext("cannot " - "unshare '%s': not currently " - "shared\n"), zfs_get_name(zhp)); - ret = 1; - } else if (zfs_unshareall(zhp) != 0) { - ret = 1; - } - break; - - case OP_MOUNT: - if (strcmp(nfs_mnt_prop, "legacy") == 0) { - (void) fprintf(stderr, gettext("cannot " - "unmount '%s': legacy " - "mountpoint\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use " - "umount(8) to unmount this " - "filesystem\n")); - ret = 1; - } else if (!zfs_is_mounted(zhp, NULL)) { - (void) fprintf(stderr, gettext("cannot " - "unmount '%s': not currently " - "mounted\n"), - zfs_get_name(zhp)); - ret = 1; - } else if (zfs_unmountall(zhp, flags) != 0) { - ret = 1; - } - break; - } - - zfs_close(zhp); - } - - return (ret); -} - -/* - * zfs unmount -a - * zfs unmount filesystem - * - * Unmount all filesystems, or a specific ZFS filesystem. - */ -static int -zfs_do_unmount(int argc, char **argv) -{ - return (unshare_unmount(OP_MOUNT, argc, argv)); -} - -/* - * zfs unshare -a - * zfs unshare filesystem - * - * Unshare all filesystems, or a specific ZFS filesystem. 
- */ -static int -zfs_do_unshare(int argc, char **argv) -{ - return (unshare_unmount(OP_SHARE, argc, argv)); -} - -/* - * Attach/detach the given dataset to/from the given jail - */ -/* ARGSUSED */ -static int -do_jail(int argc, char **argv, int attach) -{ - zfs_handle_t *zhp; - int jailid, ret; - - /* check number of arguments */ - if (argc < 3) { - (void) fprintf(stderr, gettext("missing argument(s)\n")); - usage(B_FALSE); - } - if (argc > 3) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - jailid = jail_getid(argv[1]); - if (jailid < 0) { - (void) fprintf(stderr, gettext("invalid jail id or name\n")); - usage(B_FALSE); - } - - zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); - if (zhp == NULL) - return (1); - - ret = (zfs_jail(zhp, jailid, attach) != 0); - - zfs_close(zhp); - return (ret); -} - -/* - * zfs jail jailid filesystem - * - * Attach the given dataset to the given jail - */ -/* ARGSUSED */ -static int -zfs_do_jail(int argc, char **argv) -{ - - return (do_jail(argc, argv, 1)); -} - -/* - * zfs unjail jailid filesystem - * - * Detach the given dataset from the given jail - */ -/* ARGSUSED */ -static int -zfs_do_unjail(int argc, char **argv) -{ - - return (do_jail(argc, argv, 0)); -} - -/* - * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is - * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. 
- */ -static int -manual_mount(int argc, char **argv) -{ - zfs_handle_t *zhp; - char mountpoint[ZFS_MAXPROPLEN]; - char mntopts[MNT_LINE_MAX] = { '\0' }; - int ret = 0; - int c; - int flags = 0; - char *dataset, *path; - - /* check options */ - while ((c = getopt(argc, argv, ":mo:O")) != -1) { - switch (c) { - case 'o': - (void) strlcpy(mntopts, optarg, sizeof (mntopts)); - break; - case 'O': - flags |= MS_OVERLAY; - break; - case 'm': - flags |= MS_NOMNTTAB; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - (void) fprintf(stderr, gettext("usage: mount [-o opts] " - "\n")); - return (2); - } - } - - argc -= optind; - argv += optind; - - /* check that we only have two arguments */ - if (argc != 2) { - if (argc == 0) - (void) fprintf(stderr, gettext("missing dataset " - "argument\n")); - else if (argc == 1) - (void) fprintf(stderr, - gettext("missing mountpoint argument\n")); - else - (void) fprintf(stderr, gettext("too many arguments\n")); - (void) fprintf(stderr, "usage: mount \n"); - return (2); - } - - dataset = argv[0]; - path = argv[1]; - - /* try to open the dataset */ - if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL) - return (1); - - (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, - sizeof (mountpoint), NULL, NULL, 0, B_FALSE); - - /* check for legacy mountpoint and complain appropriately */ - ret = 0; - if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) { - if (zmount(dataset, path, flags, MNTTYPE_ZFS, - NULL, 0, mntopts, sizeof (mntopts)) != 0) { - (void) fprintf(stderr, gettext("mount failed: %s\n"), - strerror(errno)); - ret = 1; - } - } else { - (void) fprintf(stderr, gettext("filesystem '%s' cannot be " - "mounted using 'mount -t zfs'\n"), dataset); - (void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' " - "instead.\n"), path); - (void) 
fprintf(stderr, gettext("If you must use 'mount -t zfs' " - "or /etc/fstab, use 'zfs set mountpoint=legacy'.\n")); - (void) fprintf(stderr, gettext("See zfs(8) for more " - "information.\n")); - ret = 1; - } - - return (ret); -} - -/* - * Called when invoked as /etc/fs/zfs/umount. Unlike a manual mount, we allow - * unmounts of non-legacy filesystems, as this is the dominant administrative - * interface. - */ -static int -manual_unmount(int argc, char **argv) -{ - int flags = 0; - int c; - - /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { - switch (c) { - case 'f': - flags = MS_FORCE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - (void) fprintf(stderr, gettext("usage: unmount [-f] " - "\n")); - return (2); - } - } - - argc -= optind; - argv += optind; - - /* check arguments */ - if (argc != 1) { - if (argc == 0) - (void) fprintf(stderr, gettext("missing path " - "argument\n")); - else - (void) fprintf(stderr, gettext("too many arguments\n")); - (void) fprintf(stderr, gettext("usage: unmount [-f] \n")); - return (2); - } - - return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE)); -} - -static int -find_command_idx(char *command, int *idx) -{ - int i; - - for (i = 0; i < NCOMMAND; i++) { - if (command_table[i].name == NULL) - continue; - - if (strcmp(command, command_table[i].name) == 0) { - *idx = i; - return (0); - } - } - return (1); -} - -static int -zfs_do_diff(int argc, char **argv) -{ - zfs_handle_t *zhp; - int flags = 0; - char *tosnap = NULL; - char *fromsnap = NULL; - char *atp, *copy; - int err = 0; - int c; - - while ((c = getopt(argc, argv, "FHt")) != -1) { - switch (c) { - case 'F': - flags |= ZFS_DIFF_CLASSIFY; - break; - case 'H': - flags |= ZFS_DIFF_PARSEABLE; - break; - case 't': - flags |= ZFS_DIFF_TIMESTAMP; - break; - default: - (void) fprintf(stderr, - gettext("invalid option '%c'\n"), optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (argc 
< 1) { - (void) fprintf(stderr, - gettext("must provide at least one snapshot name\n")); - usage(B_FALSE); - } - - if (argc > 2) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - fromsnap = argv[0]; - tosnap = (argc == 2) ? argv[1] : NULL; - - copy = NULL; - if (*fromsnap != '@') - copy = strdup(fromsnap); - else if (tosnap) - copy = strdup(tosnap); - if (copy == NULL) - usage(B_FALSE); - - if ((atp = strchr(copy, '@')) != NULL) - *atp = '\0'; - - if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) - return (1); - - free(copy); - - /* - * Ignore SIGPIPE so that the library can give us - * information on any failure - */ - (void) sigignore(SIGPIPE); - - err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); - - zfs_close(zhp); - - return (err != 0); -} - -/* - * zfs remap - * - * Remap the indirect blocks in the given fileystem or volume. - */ -static int -zfs_do_remap(int argc, char **argv) -{ - const char *fsname; - int err = 0; - int c; - - /* check options */ - while ((c = getopt(argc, argv, "")) != -1) { - switch (c) { - case '?': - (void) fprintf(stderr, - gettext("invalid option '%c'\n"), optopt); - usage(B_FALSE); - } - } - - if (argc != 2) { - (void) fprintf(stderr, gettext("wrong number of arguments\n")); - usage(B_FALSE); - } - - fsname = argv[1]; - err = zfs_remap_indirects(g_zfs, fsname); - - return (err); -} - -/* - * zfs bookmark - * - * Creates a bookmark with the given name from the given snapshot. 
- */ -static int -zfs_do_bookmark(int argc, char **argv) -{ - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - zfs_handle_t *zhp; - nvlist_t *nvl; - int ret = 0; - int c; - - /* check options */ - while ((c = getopt(argc, argv, "")) != -1) { - switch (c) { - case '?': - (void) fprintf(stderr, - gettext("invalid option '%c'\n"), optopt); - goto usage; - } - } - - argc -= optind; - argv += optind; - - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); - goto usage; - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing bookmark argument\n")); - goto usage; - } - - if (strchr(argv[1], '#') == NULL) { - (void) fprintf(stderr, - gettext("invalid bookmark name '%s' -- " - "must contain a '#'\n"), argv[1]); - goto usage; - } - - if (argv[0][0] == '@') { - /* - * Snapshot name begins with @. - * Default to same fs as bookmark. - */ - (void) strncpy(snapname, argv[1], sizeof (snapname)); - *strchr(snapname, '#') = '\0'; - (void) strlcat(snapname, argv[0], sizeof (snapname)); - } else { - (void) strncpy(snapname, argv[0], sizeof (snapname)); - } - zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT); - if (zhp == NULL) - goto usage; - zfs_close(zhp); - - - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, argv[1], snapname); - ret = lzc_bookmark(nvl, NULL); - fnvlist_free(nvl); - - if (ret != 0) { - const char *err_msg = NULL; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot create bookmark '%s'"), argv[1]); - - switch (ret) { - case EXDEV: - err_msg = "bookmark is in a different pool"; - break; - case EEXIST: - err_msg = "bookmark exists"; - break; - case EINVAL: - err_msg = "invalid argument"; - break; - case ENOTSUP: - err_msg = "bookmark feature not enabled"; - break; - case ENOSPC: - err_msg = "out of space"; - break; - default: - (void) zfs_standard_error(g_zfs, ret, errbuf); - break; - } - if (err_msg != NULL) { - (void) fprintf(stderr, "%s: %s\n", 
errbuf, - dgettext(TEXT_DOMAIN, err_msg)); - } - } - - return (ret != 0); - -usage: - usage(B_FALSE); - return (-1); -} - -static int -zfs_do_channel_program(int argc, char **argv) -{ - int ret, fd; - char c; - char *progbuf, *filename, *poolname; - size_t progsize, progread; - nvlist_t *outnvl = NULL; - uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; - uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; - boolean_t sync_flag = B_TRUE, json_output = B_FALSE; - zpool_handle_t *zhp; - - /* check options */ - while (-1 != - (c = getopt(argc, argv, "jnt:(instr-limit)m:(memory-limit)"))) { - switch (c) { - case 't': - case 'm': { - uint64_t arg; - char *endp; - - errno = 0; - arg = strtoull(optarg, &endp, 0); - if (errno != 0 || *endp != '\0') { - (void) fprintf(stderr, gettext( - "invalid argument " - "'%s': expected integer\n"), optarg); - goto usage; - } - - if (c == 't') { - if (arg > ZCP_MAX_INSTRLIMIT || arg == 0) { - (void) fprintf(stderr, gettext( - "Invalid instruction limit: " - "%s\n"), optarg); - return (1); - } else { - instrlimit = arg; - } - } else { - ASSERT3U(c, ==, 'm'); - if (arg > ZCP_MAX_MEMLIMIT || arg == 0) { - (void) fprintf(stderr, gettext( - "Invalid memory limit: " - "%s\n"), optarg); - return (1); - } else { - memlimit = arg; - } - } - break; - } - case 'n': { - sync_flag = B_FALSE; - break; - } - case 'j': { - json_output = B_TRUE; - break; - } - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - goto usage; - } - } - - argc -= optind; - argv += optind; - - if (argc < 2) { - (void) fprintf(stderr, - gettext("invalid number of arguments\n")); - goto usage; - } - - poolname = argv[0]; - filename = argv[1]; - if (strcmp(filename, "-") == 0) { - fd = 0; - filename = "standard input"; - } else if ((fd = open(filename, O_RDONLY)) < 0) { - (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), - filename, strerror(errno)); - return (1); - } - - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { - (void) fprintf(stderr, 
gettext("cannot open pool '%s'"), - poolname); - return (1); - } - zpool_close(zhp); - - /* - * Read in the channel program, expanding the program buffer as - * necessary. - */ - progread = 0; - progsize = 1024; - progbuf = safe_malloc(progsize); - do { - ret = read(fd, progbuf + progread, progsize - progread); - progread += ret; - if (progread == progsize && ret > 0) { - progsize *= 2; - progbuf = safe_realloc(progbuf, progsize); - } - } while (ret > 0); - - if (fd != 0) - (void) close(fd); - if (ret < 0) { - free(progbuf); - (void) fprintf(stderr, - gettext("cannot read '%s': %s\n"), - filename, strerror(errno)); - return (1); - } - progbuf[progread] = '\0'; - - /* - * Any remaining arguments are passed as arguments to the lua script as - * a string array: - * { - * "argv" -> [ "arg 1", ... "arg n" ], - * } - */ - nvlist_t *argnvl = fnvlist_alloc(); - fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2); - - if (sync_flag) { - ret = lzc_channel_program(poolname, progbuf, - instrlimit, memlimit, argnvl, &outnvl); - } else { - ret = lzc_channel_program_nosync(poolname, progbuf, - instrlimit, memlimit, argnvl, &outnvl); - } - - if (ret != 0) { - /* - * On error, report the error message handed back by lua if one - * exists. Otherwise, generate an appropriate error message, - * falling back on strerror() for an unexpected return code. 
- */ - char *errstring = NULL; - const char *msg = gettext("Channel program execution failed"); - if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { - (void) nvlist_lookup_string(outnvl, - ZCP_RET_ERROR, &errstring); - if (errstring == NULL) - errstring = strerror(ret); - } else { - switch (ret) { - case EINVAL: - errstring = - "Invalid instruction or memory limit."; - break; - case ENOMEM: - errstring = "Return value too large."; - break; - case ENOSPC: - errstring = "Memory limit exhausted."; - break; -#ifdef illumos - case ETIME: -#else - case ETIMEDOUT: -#endif - errstring = "Timed out."; - break; - case EPERM: - errstring = "Permission denied. Channel " - "programs must be run as root."; - break; - default: - (void) zfs_standard_error(g_zfs, ret, msg); - } - } - if (errstring != NULL) - (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); - } else { - if (json_output) { - (void) nvlist_print_json(stdout, outnvl); - } else if (nvlist_empty(outnvl)) { - (void) fprintf(stdout, gettext("Channel program fully " - "executed and did not produce output.\n")); - } else { - (void) fprintf(stdout, gettext("Channel program fully " - "executed and produced output:\n")); - dump_nvlist(outnvl, 4); - } - } - - free(progbuf); - fnvlist_free(outnvl); - fnvlist_free(argnvl); - return (ret != 0); - -usage: - usage(B_FALSE); - return (-1); -} - -int -main(int argc, char **argv) -{ - int ret = 0; - int i; - char *progname; - char *cmdname; - - (void) setlocale(LC_ALL, ""); - (void) textdomain(TEXT_DOMAIN); - - opterr = 0; - - if ((g_zfs = libzfs_init()) == NULL) { - (void) fprintf(stderr, gettext("internal error: failed to " - "initialize ZFS library\n")); - return (1); - } - - zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); - - libzfs_print_on_error(g_zfs, B_TRUE); - - if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) { - (void) fprintf(stderr, gettext("internal error: unable to " - "open %s\n"), MNTTAB); - return (1); - } - - /* - * This command also 
doubles as the /etc/fs mount and unmount program. - * Determine if we should take this behavior based on argv[0]. - */ - progname = basename(argv[0]); - if (strcmp(progname, "mount") == 0) { - ret = manual_mount(argc, argv); - } else if (strcmp(progname, "umount") == 0) { - ret = manual_unmount(argc, argv); - } else { - /* - * Make sure the user has specified some command. - */ - if (argc < 2) { - (void) fprintf(stderr, gettext("missing command\n")); - usage(B_FALSE); - } - - cmdname = argv[1]; - - /* - * The 'umount' command is an alias for 'unmount' - */ - if (strcmp(cmdname, "umount") == 0) - cmdname = "unmount"; - - /* - * The 'recv' command is an alias for 'receive' - */ - if (strcmp(cmdname, "recv") == 0) - cmdname = "receive"; - - /* - * The 'snap' command is an alias for 'snapshot' - */ - if (strcmp(cmdname, "snap") == 0) - cmdname = "snapshot"; - - /* - * Special case '-?' - */ - if (strcmp(cmdname, "-?") == 0) - usage(B_TRUE); - - /* - * Run the appropriate command. - */ - libzfs_mnttab_cache(g_zfs, B_TRUE); - if (find_command_idx(cmdname, &i) == 0) { - current_command = &command_table[i]; - ret = command_table[i].func(argc - 1, argv + 1); - } else if (strchr(cmdname, '=') != NULL) { - verify(find_command_idx("set", &i) == 0); - current_command = &command_table[i]; - ret = command_table[i].func(argc, argv); - } else { - (void) fprintf(stderr, gettext("unrecognized " - "command '%s'\n"), cmdname); - usage(B_FALSE); - } - libzfs_mnttab_cache(g_zfs, B_FALSE); - } - - (void) fclose(mnttab_file); - - if (ret == 0 && log_history) - (void) zpool_log_history(g_zfs, history_str); - - libzfs_fini(g_zfs); - - /* - * The 'ZFS_ABORT' environment variable causes us to dump core on exit - * for the purposes of running ::findleaks. 
- */ - if (getenv("ZFS_ABORT") != NULL) { - (void) printf("dumping core by request\n"); - abort(); - } - - return (ret); -} diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h b/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h deleted file mode 100644 index a56af59adb15..000000000000 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_util.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _ZFS_UTIL_H -#define _ZFS_UTIL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -void * safe_malloc(size_t size); -void nomem(void); -extern libzfs_handle_t *g_zfs; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_UTIL_H */ diff --git a/cddl/contrib/opensolaris/cmd/zhack/zhack.c b/cddl/contrib/opensolaris/cmd/zhack/zhack.c deleted file mode 100644 index 20a0c60e6a18..000000000000 --- a/cddl/contrib/opensolaris/cmd/zhack/zhack.c +++ /dev/null @@ -1,535 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ - -/* - * zhack is a debugging tool that can write changes to ZFS pool using libzpool - * for testing purposes. Altering pools with zhack is unsupported and may - * result in corrupted pools. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#undef verify -#include - -extern boolean_t zfeature_checks_disable; - -const char cmdname[] = "zhack"; -libzfs_handle_t *g_zfs; -static importargs_t g_importargs; -static char *g_pool; -static boolean_t g_readonly; - -static void -usage(void) -{ - (void) fprintf(stderr, - "Usage: %s [-c cachefile] [-d dir] ...\n" - "where is one of the following:\n" - "\n", cmdname); - - (void) fprintf(stderr, - " feature stat \n" - " print information about enabled features\n" - " feature enable [-d desc] \n" - " add a new enabled feature to the pool\n" - " -d sets the feature's description\n" - " feature ref [-md] \n" - " change the refcount on the given feature\n" - " -d decrease instead of increase the refcount\n" - " -m add the feature to the label if increasing refcount\n" - "\n" - " : should be a feature guid\n"); - exit(1); -} - - -static void -fatal(spa_t *spa, void *tag, const char *fmt, ...) -{ - va_list ap; - - if (spa != NULL) { - spa_close(spa, tag); - (void) spa_export(g_pool, NULL, B_TRUE, B_FALSE); - } - - va_start(ap, fmt); - (void) fprintf(stderr, "%s: ", cmdname); - (void) vfprintf(stderr, fmt, ap); - va_end(ap); - (void) fprintf(stderr, "\n"); - - exit(1); -} - -/* ARGSUSED */ -static int -space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp) -{ - /* - * Is it a valid type of object to track? - */ - if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) - return (ENOENT); - (void) fprintf(stderr, "modifying object that needs user accounting"); - abort(); - /* NOTREACHED */ -} - -/* - * Target is the dataset whose pool we want to open. - */ -static void -zhack_import(char *target, boolean_t readonly) -{ - nvlist_t *config; - nvlist_t *props; - int error; - - kernel_init(readonly ? 
FREAD : (FREAD | FWRITE)); - g_zfs = libzfs_init(); - ASSERT(g_zfs != NULL); - - dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); - - g_readonly = readonly; - g_importargs.unique = B_TRUE; - g_importargs.can_be_active = readonly; - g_pool = strdup(target); - - error = zpool_tryimport(g_zfs, target, &config, &g_importargs); - if (error) - fatal(NULL, FTAG, "cannot import '%s': %s", target, - libzfs_error_description(g_zfs)); - - props = NULL; - if (readonly) { - VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0); - } - - zfeature_checks_disable = B_TRUE; - error = spa_import(target, config, props, - (readonly ? ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL)); - zfeature_checks_disable = B_FALSE; - if (error == EEXIST) - error = 0; - - if (error) - fatal(NULL, FTAG, "can't import '%s': %s", target, - strerror(error)); -} - -static void -zhack_spa_open(char *target, boolean_t readonly, void *tag, spa_t **spa) -{ - int err; - - zhack_import(target, readonly); - - zfeature_checks_disable = B_TRUE; - err = spa_open(target, spa, tag); - zfeature_checks_disable = B_FALSE; - - if (err != 0) - fatal(*spa, FTAG, "cannot open '%s': %s", target, - strerror(err)); - if (spa_version(*spa) < SPA_VERSION_FEATURES) { - fatal(*spa, FTAG, "'%s' has version %d, features not enabled", - target, (int)spa_version(*spa)); - } -} - -static void -dump_obj(objset_t *os, uint64_t obj, const char *name) -{ - zap_cursor_t zc; - zap_attribute_t za; - - (void) printf("%s_obj:\n", name); - - for (zap_cursor_init(&zc, os, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - if (za.za_integer_length == 8) { - ASSERT(za.za_num_integers == 1); - (void) printf("\t%s = %llu\n", - za.za_name, (u_longlong_t)za.za_first_integer); - } else { - ASSERT(za.za_integer_length == 1); - char val[1024]; - VERIFY(zap_lookup(os, obj, za.za_name, - 1, sizeof (val), val) == 0); - (void) printf("\t%s = %s\n", 
za.za_name, val); - } - } - zap_cursor_fini(&zc); -} - -static void -dump_mos(spa_t *spa) -{ - nvlist_t *nv = spa->spa_label_features; - - (void) printf("label config:\n"); - for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); - pair != NULL; - pair = nvlist_next_nvpair(nv, pair)) { - (void) printf("\t%s\n", nvpair_name(pair)); - } -} - -static void -zhack_do_feature_stat(int argc, char **argv) -{ - spa_t *spa; - objset_t *os; - char *target; - - argc--; - argv++; - - if (argc < 1) { - (void) fprintf(stderr, "error: missing pool name\n"); - usage(); - } - target = argv[0]; - - zhack_spa_open(target, B_TRUE, FTAG, &spa); - os = spa->spa_meta_objset; - - dump_obj(os, spa->spa_feat_for_read_obj, "for_read"); - dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); - dump_obj(os, spa->spa_feat_desc_obj, "descriptions"); - if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { - dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg"); - } - dump_mos(spa); - - spa_close(spa, FTAG); -} - -static void -zhack_feature_enable_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - zfeature_info_t *feature = arg; - - feature_enable_sync(spa, feature, tx); - - spa_history_log_internal(spa, "zhack enable feature", tx, - "guid=%s flags=%x", - feature->fi_guid, feature->fi_flags); -} - -static void -zhack_do_feature_enable(int argc, char **argv) -{ - char c; - char *desc, *target; - spa_t *spa; - objset_t *mos; - zfeature_info_t feature; - spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; - - /* - * Features are not added to the pool's label until their refcounts - * are incremented, so fi_mos can just be left as false for now. 
- */ - desc = NULL; - feature.fi_uname = "zhack"; - feature.fi_flags = 0; - feature.fi_depends = nodeps; - feature.fi_feature = SPA_FEATURE_NONE; - - optind = 1; - while ((c = getopt(argc, argv, "rmd:")) != -1) { - switch (c) { - case 'r': - feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; - break; - case 'd': - desc = strdup(optarg); - break; - default: - usage(); - break; - } - } - - if (desc == NULL) - desc = strdup("zhack injected"); - feature.fi_desc = desc; - - argc -= optind; - argv += optind; - - if (argc < 2) { - (void) fprintf(stderr, "error: missing feature or pool name\n"); - usage(); - } - target = argv[0]; - feature.fi_guid = argv[1]; - - if (!zfeature_is_valid_guid(feature.fi_guid)) - fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); - - zhack_spa_open(target, B_FALSE, FTAG, &spa); - mos = spa->spa_meta_objset; - - if (zfeature_is_supported(feature.fi_guid)) - fatal(spa, FTAG, "'%s' is a real feature, will not enable"); - if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid)) - fatal(spa, FTAG, "feature already enabled: %s", - feature.fi_guid); - - VERIFY0(dsl_sync_task(spa_name(spa), NULL, - zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL)); - - spa_close(spa, FTAG); - - free(desc); -} - -static void -feature_incr_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - zfeature_info_t *feature = arg; - uint64_t refcount; - - VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); - feature_sync(spa, feature, refcount + 1, tx); - spa_history_log_internal(spa, "zhack feature incr", tx, - "name=%s", feature->fi_guid); -} - -static void -feature_decr_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - zfeature_info_t *feature = arg; - uint64_t refcount; - - VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); - feature_sync(spa, feature, refcount - 1, tx); - spa_history_log_internal(spa, "zhack feature decr", tx, - "name=%s", feature->fi_guid); 
-} - -static void -zhack_do_feature_ref(int argc, char **argv) -{ - char c; - char *target; - boolean_t decr = B_FALSE; - spa_t *spa; - objset_t *mos; - zfeature_info_t feature; - spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; - - /* - * fi_desc does not matter here because it was written to disk - * when the feature was enabled, but we need to properly set the - * feature for read or write based on the information we read off - * disk later. - */ - feature.fi_uname = "zhack"; - feature.fi_flags = 0; - feature.fi_desc = NULL; - feature.fi_depends = nodeps; - feature.fi_feature = SPA_FEATURE_NONE; - - optind = 1; - while ((c = getopt(argc, argv, "md")) != -1) { - switch (c) { - case 'm': - feature.fi_flags |= ZFEATURE_FLAG_MOS; - break; - case 'd': - decr = B_TRUE; - break; - default: - usage(); - break; - } - } - argc -= optind; - argv += optind; - - if (argc < 2) { - (void) fprintf(stderr, "error: missing feature or pool name\n"); - usage(); - } - target = argv[0]; - feature.fi_guid = argv[1]; - - if (!zfeature_is_valid_guid(feature.fi_guid)) - fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); - - zhack_spa_open(target, B_FALSE, FTAG, &spa); - mos = spa->spa_meta_objset; - - if (zfeature_is_supported(feature.fi_guid)) { - fatal(spa, FTAG, - "'%s' is a real feature, will not change refcount"); - } - - if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, - feature.fi_guid)) { - feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT; - } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj, - feature.fi_guid)) { - feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; - } else { - fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid); - } - - if (decr) { - uint64_t count; - if (feature_get_refcount_from_disk(spa, &feature, - &count) == 0 && count != 0) { - fatal(spa, FTAG, "feature refcount already 0: %s", - feature.fi_guid); - } - } - - VERIFY0(dsl_sync_task(spa_name(spa), NULL, - decr ? 
feature_decr_sync : feature_incr_sync, &feature, - 5, ZFS_SPACE_CHECK_NORMAL)); - - spa_close(spa, FTAG); -} - -static int -zhack_do_feature(int argc, char **argv) -{ - char *subcommand; - - argc--; - argv++; - if (argc == 0) { - (void) fprintf(stderr, - "error: no feature operation specified\n"); - usage(); - } - - subcommand = argv[0]; - if (strcmp(subcommand, "stat") == 0) { - zhack_do_feature_stat(argc, argv); - } else if (strcmp(subcommand, "enable") == 0) { - zhack_do_feature_enable(argc, argv); - } else if (strcmp(subcommand, "ref") == 0) { - zhack_do_feature_ref(argc, argv); - } else { - (void) fprintf(stderr, "error: unknown subcommand: %s\n", - subcommand); - usage(); - } - - return (0); -} - -#define MAX_NUM_PATHS 1024 - -int -main(int argc, char **argv) -{ - extern void zfs_prop_init(void); - - char *path[MAX_NUM_PATHS]; - const char *subcommand; - int rv = 0; - char c; - - g_importargs.path = path; - - dprintf_setup(&argc, argv); - zfs_prop_init(); - - while ((c = getopt(argc, argv, "c:d:")) != -1) { - switch (c) { - case 'c': - g_importargs.cachefile = optarg; - break; - case 'd': - assert(g_importargs.paths < MAX_NUM_PATHS); - g_importargs.path[g_importargs.paths++] = optarg; - break; - default: - usage(); - break; - } - } - - argc -= optind; - argv += optind; - optind = 1; - - if (argc == 0) { - (void) fprintf(stderr, "error: no command specified\n"); - usage(); - } - - subcommand = argv[0]; - - if (strcmp(subcommand, "feature") == 0) { - rv = zhack_do_feature(argc, argv); - } else { - (void) fprintf(stderr, "error: unknown subcommand: %s\n", - subcommand); - usage(); - } - - if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) { - fatal(NULL, FTAG, "pool export failed; " - "changes may not be committed to disk\n"); - } - - libzfs_fini(g_zfs); - kernel_fini(); - - return (rv); -} diff --git a/cddl/contrib/opensolaris/cmd/zinject/translate.c b/cddl/contrib/opensolaris/cmd/zinject/translate.c deleted file mode 100644 index 
99a3d0ca4ff3..000000000000 --- a/cddl/contrib/opensolaris/cmd/zinject/translate.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "zinject.h" - -extern void kernel_init(int); -extern void kernel_fini(void); - -static int debug; - -static void -ziprintf(const char *fmt, ...) -{ - va_list ap; - - if (!debug) - return; - - va_start(ap, fmt); - (void) vprintf(fmt, ap); - va_end(ap); -} - -static void -compress_slashes(const char *src, char *dest) -{ - while (*src != '\0') { - *dest = *src++; - while (*dest == '/' && *src == '/') - ++src; - ++dest; - } - *dest = '\0'; -} - -/* - * Given a full path to a file, translate into a dataset name and a relative - * path within the dataset. 'dataset' must be at least MAXNAMELEN characters, - * and 'relpath' must be at least MAXPATHLEN characters. 
We also pass a stat64 - * buffer, which we need later to get the object ID. - */ -static int -parse_pathname(const char *inpath, char *dataset, char *relpath, - struct stat64 *statbuf) -{ - struct statfs sfs; - const char *rel; - char fullpath[MAXPATHLEN]; - - compress_slashes(inpath, fullpath); - - if (fullpath[0] != '/') { - (void) fprintf(stderr, "invalid object '%s': must be full " - "path\n", fullpath); - usage(); - return (-1); - } - - if (strlen(fullpath) >= MAXPATHLEN) { - (void) fprintf(stderr, "invalid object; pathname too long\n"); - return (-1); - } - - if (stat64(fullpath, statbuf) != 0) { - (void) fprintf(stderr, "cannot open '%s': %s\n", - fullpath, strerror(errno)); - return (-1); - } - - if (statfs(fullpath, &sfs) == -1) { - (void) fprintf(stderr, "cannot find mountpoint for '%s': %s\n", - fullpath, strerror(errno)); - return (-1); - } - - if (strcmp(sfs.f_fstypename, MNTTYPE_ZFS) != 0) { - (void) fprintf(stderr, "invalid path '%s': not a ZFS " - "filesystem\n", fullpath); - return (-1); - } - - if (strncmp(fullpath, sfs.f_mntonname, strlen(sfs.f_mntonname)) != 0) { - (void) fprintf(stderr, "invalid path '%s': mountpoint " - "doesn't match path\n", fullpath); - return (-1); - } - - (void) strcpy(dataset, sfs.f_mntfromname); - - rel = fullpath + strlen(sfs.f_mntonname); - if (rel[0] == '/') - rel++; - (void) strcpy(relpath, rel); - - return (0); -} - -/* - * Convert from a (dataset, path) pair into a (objset, object) pair. Note that - * we grab the object number from the inode number, since looking this up via - * libzpool is a real pain. - */ -/* ARGSUSED */ -static int -object_from_path(const char *dataset, const char *path, struct stat64 *statbuf, - zinject_record_t *record) -{ - objset_t *os; - int err; - - /* - * Before doing any libzpool operations, call sync() to ensure that the - * on-disk state is consistent with the in-core state. 
- */ - sync(); - - err = dmu_objset_own(dataset, DMU_OST_ZFS, B_TRUE, FTAG, &os); - if (err != 0) { - (void) fprintf(stderr, "cannot open dataset '%s': %s\n", - dataset, strerror(err)); - return (-1); - } - - record->zi_objset = dmu_objset_id(os); - record->zi_object = statbuf->st_ino; - - dmu_objset_disown(os, FTAG); - - return (0); -} - -/* - * Calculate the real range based on the type, level, and range given. - */ -static int -calculate_range(const char *dataset, err_type_t type, int level, char *range, - zinject_record_t *record) -{ - objset_t *os = NULL; - dnode_t *dn = NULL; - int err; - int ret = -1; - - /* - * Determine the numeric range from the string. - */ - if (range == NULL) { - /* - * If range is unspecified, set the range to [0,-1], which - * indicates that the whole object should be treated as an - * error. - */ - record->zi_start = 0; - record->zi_end = -1ULL; - } else { - char *end; - - /* XXX add support for suffixes */ - record->zi_start = strtoull(range, &end, 10); - - - if (*end == '\0') - record->zi_end = record->zi_start + 1; - else if (*end == ',') - record->zi_end = strtoull(end + 1, &end, 10); - - if (*end != '\0') { - (void) fprintf(stderr, "invalid range '%s': must be " - "a numeric range of the form 'start[,end]'\n", - range); - goto out; - } - } - - switch (type) { - case TYPE_DATA: - break; - - case TYPE_DNODE: - /* - * If this is a request to inject faults into the dnode, then we - * must translate the current (objset,object) pair into an - * offset within the metadnode for the objset. Specifying any - * kind of range with type 'dnode' is illegal. - */ - if (range != NULL) { - (void) fprintf(stderr, "range cannot be specified when " - "type is 'dnode'\n"); - goto out; - } - - record->zi_start = record->zi_object * sizeof (dnode_phys_t); - record->zi_end = record->zi_start + sizeof (dnode_phys_t); - record->zi_object = 0; - break; - } - - /* - * Get the dnode associated with object, so we can calculate the block - * size. 
- */ - if ((err = dmu_objset_own(dataset, DMU_OST_ANY, - B_TRUE, FTAG, &os)) != 0) { - (void) fprintf(stderr, "cannot open dataset '%s': %s\n", - dataset, strerror(err)); - goto out; - } - - if (record->zi_object == 0) { - dn = DMU_META_DNODE(os); - } else { - err = dnode_hold(os, record->zi_object, FTAG, &dn); - if (err != 0) { - (void) fprintf(stderr, "failed to hold dnode " - "for object %llu\n", - (u_longlong_t)record->zi_object); - goto out; - } - } - - - ziprintf("data shift: %d\n", (int)dn->dn_datablkshift); - ziprintf(" ind shift: %d\n", (int)dn->dn_indblkshift); - - /* - * Translate range into block IDs. - */ - if (record->zi_start != 0 || record->zi_end != -1ULL) { - record->zi_start >>= dn->dn_datablkshift; - record->zi_end >>= dn->dn_datablkshift; - } - - /* - * Check level, and then translate level 0 blkids into ranges - * appropriate for level of indirection. - */ - record->zi_level = level; - if (level > 0) { - ziprintf("level 0 blkid range: [%llu, %llu]\n", - record->zi_start, record->zi_end); - - if (level >= dn->dn_nlevels) { - (void) fprintf(stderr, "level %d exceeds max level " - "of object (%d)\n", level, dn->dn_nlevels - 1); - goto out; - } - - if (record->zi_start != 0 || record->zi_end != 0) { - int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - for (; level > 0; level--) { - record->zi_start >>= shift; - record->zi_end >>= shift; - } - } - } - - ret = 0; -out: - if (dn) { - if (dn != DMU_META_DNODE(os)) - dnode_rele(dn, FTAG); - } - if (os) - dmu_objset_disown(os, FTAG); - - return (ret); -} - -int -translate_record(err_type_t type, const char *object, const char *range, - int level, zinject_record_t *record, char *poolname, char *dataset) -{ - char path[MAXPATHLEN]; - char *slash; - struct stat64 statbuf; - int ret = -1; - - kernel_init(FREAD); - - debug = (getenv("ZINJECT_DEBUG") != NULL); - - ziprintf("translating: %s\n", object); - - if (MOS_TYPE(type)) { - /* - * MOS objects are treated specially. 
- */ - switch (type) { - case TYPE_MOS: - record->zi_type = 0; - break; - case TYPE_MOSDIR: - record->zi_type = DMU_OT_OBJECT_DIRECTORY; - break; - case TYPE_METASLAB: - record->zi_type = DMU_OT_OBJECT_ARRAY; - break; - case TYPE_CONFIG: - record->zi_type = DMU_OT_PACKED_NVLIST; - break; - case TYPE_BPOBJ: - record->zi_type = DMU_OT_BPOBJ; - break; - case TYPE_SPACEMAP: - record->zi_type = DMU_OT_SPACE_MAP; - break; - case TYPE_ERRLOG: - record->zi_type = DMU_OT_ERROR_LOG; - break; - } - - dataset[0] = '\0'; - (void) strcpy(poolname, object); - return (0); - } - - /* - * Convert a full path into a (dataset, file) pair. - */ - if (parse_pathname(object, dataset, path, &statbuf) != 0) - goto err; - - ziprintf(" dataset: %s\n", dataset); - ziprintf(" path: %s\n", path); - - /* - * Convert (dataset, file) into (objset, object) - */ - if (object_from_path(dataset, path, &statbuf, record) != 0) - goto err; - - ziprintf("raw objset: %llu\n", record->zi_objset); - ziprintf("raw object: %llu\n", record->zi_object); - - /* - * For the given object, calculate the real (type, level, range) - */ - if (calculate_range(dataset, type, level, (char *)range, record) != 0) - goto err; - - ziprintf(" objset: %llu\n", record->zi_objset); - ziprintf(" object: %llu\n", record->zi_object); - if (record->zi_start == 0 && - record->zi_end == -1ULL) - ziprintf(" range: all\n"); - else - ziprintf(" range: [%llu, %llu]\n", record->zi_start, - record->zi_end); - - /* - * Copy the pool name - */ - (void) strcpy(poolname, dataset); - if ((slash = strchr(poolname, '/')) != NULL) - *slash = '\0'; - - ret = 0; - -err: - kernel_fini(); - return (ret); -} - -int -translate_raw(const char *str, zinject_record_t *record) -{ - /* - * A raw bookmark of the form objset:object:level:blkid, where each - * number is a hexidecimal value. 
- */ - if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset, - (u_longlong_t *)&record->zi_object, &record->zi_level, - (u_longlong_t *)&record->zi_start) != 4) { - (void) fprintf(stderr, "bad raw spec '%s': must be of the form " - "'objset:object:level:blkid'\n", str); - return (-1); - } - - record->zi_end = record->zi_start; - - return (0); -} - -int -translate_device(const char *pool, const char *device, err_type_t label_type, - zinject_record_t *record) -{ - char *end; - zpool_handle_t *zhp; - nvlist_t *tgt; - boolean_t isspare, iscache; - - /* - * Given a device name or GUID, create an appropriate injection record - * with zi_guid set. - */ - if ((zhp = zpool_open(g_zfs, pool)) == NULL) - return (-1); - - record->zi_guid = strtoull(device, &end, 16); - if (record->zi_guid == 0 || *end != '\0') { - tgt = zpool_find_vdev(zhp, device, &isspare, &iscache, NULL); - - if (tgt == NULL) { - (void) fprintf(stderr, "cannot find device '%s' in " - "pool '%s'\n", device, pool); - return (-1); - } - - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, - &record->zi_guid) == 0); - } - - /* - * Device faults can take on three different forms: - * 1). delayed or hanging I/O - * 2). zfs label faults - * 3). 
generic disk faults - */ - if (record->zi_timer != 0) { - record->zi_cmd = ZINJECT_DELAY_IO; - } else if (label_type != TYPE_INVAL) { - record->zi_cmd = ZINJECT_LABEL_FAULT; - } else { - record->zi_cmd = ZINJECT_DEVICE_FAULT; - } - - switch (label_type) { - case TYPE_LABEL_UBERBLOCK: - record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]); - record->zi_end = record->zi_start + VDEV_UBERBLOCK_RING - 1; - break; - case TYPE_LABEL_NVLIST: - record->zi_start = offsetof(vdev_label_t, vl_vdev_phys); - record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1; - break; - case TYPE_LABEL_PAD1: - record->zi_start = offsetof(vdev_label_t, vl_pad1); - record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; - break; - case TYPE_LABEL_PAD2: - record->zi_start = offsetof(vdev_label_t, vl_be); - record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; - break; - } - return (0); -} diff --git a/cddl/contrib/opensolaris/cmd/zinject/zinject.c b/cddl/contrib/opensolaris/cmd/zinject/zinject.c deleted file mode 100644 index bf42bc483830..000000000000 --- a/cddl/contrib/opensolaris/cmd/zinject/zinject.c +++ /dev/null @@ -1,1093 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -/* - * ZFS Fault Injector - * - * This userland component takes a set of options and uses libzpool to translate - * from a user-visible object type and name to an internal representation. - * There are two basic types of faults: device faults and data faults. - * - * - * DEVICE FAULTS - * - * Errors can be injected into a particular vdev using the '-d' option. This - * option takes a path or vdev GUID to uniquely identify the device within a - * pool. There are two types of errors that can be injected, EIO and ENXIO, - * that can be controlled through the '-e' option. The default is ENXIO. For - * EIO failures, any attempt to read data from the device will return EIO, but - * subsequent attempt to reopen the device will succeed. For ENXIO failures, - * any attempt to read from the device will return EIO, but any attempt to - * reopen the device will also return ENXIO. - * For label faults, the -L option must be specified. This allows faults - * to be injected into either the nvlist, uberblock, pad1, or pad2 region - * of all the labels for the specified device. - * - * This form of the command looks like: - * - * zinject -d device [-e errno] [-L ] pool - * - * - * DATA FAULTS - * - * We begin with a tuple of the form: - * - * - * - * type A string describing the type of data to target. Each type - * implicitly describes how to interpret 'object'. Currently, - * the following values are supported: - * - * data User data for a file - * dnode Dnode for a file or directory - * - * The following MOS objects are special. 
Instead of injecting - * errors on a particular object or blkid, we inject errors across - * all objects of the given type. - * - * mos Any data in the MOS - * mosdir object directory - * config pool configuration - * bpobj blkptr list - * spacemap spacemap - * metaslab metaslab - * errlog persistent error log - * - * level Object level. Defaults to '0', not applicable to all types. If - * a range is given, this corresponds to the indirect block - * corresponding to the specific range. - * - * range A numerical range [start,end) within the object. Defaults to - * the full size of the file. - * - * object A string describing the logical location of the object. For - * files and directories (currently the only supported types), - * this is the path of the object on disk. - * - * This is translated, via libzpool, into the following internal representation: - * - * - * - * These types should be self-explanatory. This tuple is then passed to the - * kernel via a special ioctl() to initiate fault injection for the given - * object. Note that 'type' is not strictly necessary for fault injection, but - * is used when translating existing faults into a human-readable string. - * - * - * The command itself takes one of the forms: - * - * zinject - * zinject <-a | -u pool> - * zinject -c - * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] - * [-r range] - * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool - * - * With no arguments, the command prints all currently registered injection - * handlers, with their numeric identifiers. - * - * The '-c' option will clear the given handler, or all handlers if 'all' is - * specified. - * - * The '-e' option takes a string describing the errno to simulate. This must - * be either 'io' or 'checksum'. In most cases this will result in the same - * behavior, but RAID-Z will produce a different set of ereports for this - * situation. 
- * - * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is - * specified, then the ARC cache is flushed appropriately. If '-u' is - * specified, then the underlying SPA is unloaded. Either of these flags can be - * specified independently of any other handlers. The '-m' flag automatically - * does an unmount and remount of the underlying dataset to aid in flushing the - * cache. - * - * The '-f' flag controls the frequency of errors injected, expressed as a - * integer percentage between 1 and 100. The default is 100. - * - * The this form is responsible for actually injecting the handler into the - * framework. It takes the arguments described above, translates them to the - * internal tuple using libzpool, and then issues an ioctl() to register the - * handler. - * - * The final form can target a specific bookmark, regardless of whether a - * human-readable interface has been designed. It allows developers to specify - * a particular block by number. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#undef verify /* both libzfs.h and zfs_context.h want to define this */ - -#include "zinject.h" - -libzfs_handle_t *g_zfs; -int zfs_fd; - -#ifndef ECKSUM -#define ECKSUM EBADE -#endif - -static const char *errtable[TYPE_INVAL] = { - "data", - "dnode", - "mos", - "mosdir", - "metaslab", - "config", - "bpobj", - "spacemap", - "errlog", - "uber", - "nvlist", - "pad1", - "pad2" -}; - -static err_type_t -name_to_type(const char *arg) -{ - int i; - for (i = 0; i < TYPE_INVAL; i++) - if (strcmp(errtable[i], arg) == 0) - return (i); - - return (TYPE_INVAL); -} - -static const char * -type_to_name(uint64_t type) -{ - switch (type) { - case DMU_OT_OBJECT_DIRECTORY: - return ("mosdir"); - case DMU_OT_OBJECT_ARRAY: - return ("metaslab"); - case DMU_OT_PACKED_NVLIST: - return ("config"); - case DMU_OT_BPOBJ: - return ("bpobj"); - case DMU_OT_SPACE_MAP: - return ("spacemap"); - case 
DMU_OT_ERROR_LOG: - return ("errlog"); - default: - return ("-"); - } -} - - -/* - * Print usage message. - */ -void -usage(void) -{ - (void) printf( - "usage:\n" - "\n" - "\tzinject\n" - "\n" - "\t\tList all active injection records.\n" - "\n" - "\tzinject -c \n" - "\n" - "\t\tClear the particular record (if given a numeric ID), or\n" - "\t\tall records if 'all' is specificed.\n" - "\n" - "\tzinject -p pool\n" - "\n" - "\t\tInject a panic fault at the specified function. Only \n" - "\t\tfunctions which call spa_vdev_config_exit(), or \n" - "\t\tspa_vdev_exit() will trigger a panic.\n" - "\n" - "\tzinject -d device [-e errno] [-L ] [-F]\n" - "\t [-T pool\n" - "\n" - "\t\tInject a fault into a particular device or the device's\n" - "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n " - "\t\t'pad1', or 'pad2'.\n" - "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n" - "\n" - "\tzinject -d device -A pool\n" - "\n" - "\t\tPerform a specific action on a particular device\n" - "\n" - "\tzinject -d device -D latency:lanes pool\n" - "\n" - "\t\tAdd an artificial delay to IO requests on a particular\n" - "\t\tdevice, such that the requests take a minimum of 'latency'\n" - "\t\tmilliseconds to complete. Each delay has an associated\n" - "\t\tnumber of 'lanes' which defines the number of concurrent\n" - "\t\tIO requests that can be processed.\n" - "\n" - "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" - "\t\tthe device will only be able to service a single IO request\n" - "\t\tat a time with each request taking 10 ms to complete. 
So,\n" - "\t\tif only a single request is submitted every 10 ms, the\n" - "\t\taverage latency will be 10 ms; but if more than one request\n" - "\t\tis submitted every 10 ms, the average latency will be more\n" - "\t\tthan 10 ms.\n" - "\n" - "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" - "\t\tlanes (-D 10:2), then the device will be able to service\n" - "\t\ttwo requests at a time, each with a minimum latency of\n" - "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" - "\t\tthe average latency will be 10 ms; but if more than two\n" - "\t\trequests are submitted every 10 ms, the average latency\n" - "\t\twill be more than 10 ms.\n" - "\n" - "\t\tAlso note, these delays are additive. So two invocations\n" - "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" - "\t\tof '-D 10:2'. This also means, one can specify multiple\n" - "\t\tlanes with differing target latencies. For example, an\n" - "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" - "\t\tcreate 3 lanes on the device; one lane with a latency\n" - "\t\tof 10 ms and two lanes with a 25 ms latency.\n" - "\n" - "\tzinject -I [-s | -g ] pool\n" - "\n" - "\t\tCause the pool to stop writing blocks yet not\n" - "\t\treport errors for a duration. Simulates buggy hardware\n" - "\t\tthat fails to honor cache flush requests.\n" - "\t\tDefault duration is 30 seconds. The machine is panicked\n" - "\t\tat the end of the duration.\n" - "\n" - "\tzinject -b objset:object:level:blkid pool\n" - "\n" - "\t\tInject an error into pool 'pool' with the numeric bookmark\n" - "\t\tspecified by the remaining tuple. Each number is in\n" - "\t\thexidecimal, and only one block can be specified.\n" - "\n" - "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n" - "\t [-a] [-m] [-u] [-f freq] \n" - "\n" - "\t\tInject an error into the object specified by the '-t' option\n" - "\t\tand the object descriptor. 
The 'object' parameter is\n" - "\t\tinterperted depending on the '-t' option.\n" - "\n" - "\t\t-q\tQuiet mode. Only print out the handler number added.\n" - "\t\t-e\tInject a specific error. Must be either 'io' or\n" - "\t\t\t'checksum'. Default is 'io'.\n" - "\t\t-l\tInject error at a particular block level. Default is " - "0.\n" - "\t\t-m\tAutomatically remount underlying filesystem.\n" - "\t\t-r\tInject error over a particular logical range of an\n" - "\t\t\tobject. Will be translated to the appropriate blkid\n" - "\t\t\trange according to the object's properties.\n" - "\t\t-a\tFlush the ARC cache. Can be specified without any\n" - "\t\t\tassociated object.\n" - "\t\t-u\tUnload the associated pool. Can be specified with only\n" - "\t\t\ta pool object.\n" - "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n" - "\t\t\ta percentage between 1 and 100.\n" - "\n" - "\t-t data\t\tInject an error into the plain file contents of a\n" - "\t\t\tfile. The object must be specified as a complete path\n" - "\t\t\tto a file on a ZFS filesystem.\n" - "\n" - "\t-t dnode\tInject an error into the metadnode in the block\n" - "\t\t\tcorresponding to the dnode for a file or directory. The\n" - "\t\t\t'-r' option is incompatible with this mode. The object\n" - "\t\t\tis specified as a complete path to a file or directory\n" - "\t\t\ton a ZFS filesystem.\n" - "\n" - "\t-t \tInject errors into the MOS for objects of the given\n" - "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" - "\t\t\tspacemap, metaslab, errlog. 
The only valid is\n" - "\t\t\tthe poolname.\n"); -} - -static int -iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), - void *data) -{ - zfs_cmd_t zc = { 0 }; - int ret; - - while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) - if ((ret = func((int)zc.zc_guid, zc.zc_name, - &zc.zc_inject_record, data)) != 0) - return (ret); - - if (errno != ENOENT) { - (void) fprintf(stderr, "Unable to list handlers: %s\n", - strerror(errno)); - return (-1); - } - - return (0); -} - -static int -print_data_handler(int id, const char *pool, zinject_record_t *record, - void *data) -{ - int *count = data; - - if (record->zi_guid != 0 || record->zi_func[0] != '\0') - return (0); - - if (*count == 0) { - (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n", - "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE"); - (void) printf("--- --------------- ------ " - "------ -------- --- ---------------\n"); - } - - *count += 1; - - (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool, - (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object, - type_to_name(record->zi_type), record->zi_level); - - if (record->zi_start == 0 && - record->zi_end == -1ULL) - (void) printf("all\n"); - else - (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, - (u_longlong_t)record->zi_end); - - return (0); -} - -static int -print_device_handler(int id, const char *pool, zinject_record_t *record, - void *data) -{ - int *count = data; - - if (record->zi_guid == 0 || record->zi_func[0] != '\0') - return (0); - - if (record->zi_cmd == ZINJECT_DELAY_IO) - return (0); - - if (*count == 0) { - (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); - (void) printf("--- --------------- ----------------\n"); - } - - *count += 1; - - (void) printf("%3d %-15s %llx\n", id, pool, - (u_longlong_t)record->zi_guid); - - return (0); -} - -static int -print_delay_handler(int id, const char *pool, zinject_record_t *record, - void *data) -{ - int *count = data; - - if 
(record->zi_guid == 0 || record->zi_func[0] != '\0') - return (0); - - if (record->zi_cmd != ZINJECT_DELAY_IO) - return (0); - - if (*count == 0) { - (void) printf("%3s %-15s %-15s %-15s %s\n", - "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); - (void) printf("--- --------------- --------------- " - "--------------- ----------------\n"); - } - - *count += 1; - - (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, - (u_longlong_t)NSEC2MSEC(record->zi_timer), - (u_longlong_t)record->zi_nlanes, - (u_longlong_t)record->zi_guid); - - return (0); -} - -static int -print_panic_handler(int id, const char *pool, zinject_record_t *record, - void *data) -{ - int *count = data; - - if (record->zi_func[0] == '\0') - return (0); - - if (*count == 0) { - (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION"); - (void) printf("--- --------------- ----------------\n"); - } - - *count += 1; - - (void) printf("%3d %-15s %s\n", id, pool, record->zi_func); - - return (0); -} - -/* - * Print all registered error handlers. Returns the number of handlers - * registered. 
- */ -static int -print_all_handlers(void) -{ - int count = 0, total = 0; - - (void) iter_handlers(print_device_handler, &count); - if (count > 0) { - total += count; - (void) printf("\n"); - count = 0; - } - - (void) iter_handlers(print_delay_handler, &count); - if (count > 0) { - total += count; - (void) printf("\n"); - count = 0; - } - - (void) iter_handlers(print_data_handler, &count); - if (count > 0) { - total += count; - (void) printf("\n"); - count = 0; - } - - (void) iter_handlers(print_panic_handler, &count); - - return (count + total); -} - -/* ARGSUSED */ -static int -cancel_one_handler(int id, const char *pool, zinject_record_t *record, - void *data) -{ - zfs_cmd_t zc = { 0 }; - - zc.zc_guid = (uint64_t)id; - - if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { - (void) fprintf(stderr, "failed to remove handler %d: %s\n", - id, strerror(errno)); - return (1); - } - - return (0); -} - -/* - * Remove all fault injection handlers. - */ -static int -cancel_all_handlers(void) -{ - int ret = iter_handlers(cancel_one_handler, NULL); - - if (ret == 0) - (void) printf("removed all registered handlers\n"); - - return (ret); -} - -/* - * Remove a specific fault injection handler. - */ -static int -cancel_handler(int id) -{ - zfs_cmd_t zc = { 0 }; - - zc.zc_guid = (uint64_t)id; - - if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { - (void) fprintf(stderr, "failed to remove handler %d: %s\n", - id, strerror(errno)); - return (1); - } - - (void) printf("removed handler %d\n", id); - - return (0); -} - -/* - * Register a new fault injection handler. 
- */ -static int -register_handler(const char *pool, int flags, zinject_record_t *record, - int quiet) -{ - zfs_cmd_t zc = { 0 }; - - (void) strcpy(zc.zc_name, pool); - zc.zc_inject_record = *record; - zc.zc_guid = flags; - - if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) { - (void) fprintf(stderr, "failed to add handler: %s\n", - strerror(errno)); - return (1); - } - - if (flags & ZINJECT_NULL) - return (0); - - if (quiet) { - (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); - } else { - (void) printf("Added handler %llu with the following " - "properties:\n", (u_longlong_t)zc.zc_guid); - (void) printf(" pool: %s\n", pool); - if (record->zi_guid) { - (void) printf(" vdev: %llx\n", - (u_longlong_t)record->zi_guid); - } else if (record->zi_func[0] != '\0') { - (void) printf(" panic function: %s\n", - record->zi_func); - } else if (record->zi_duration > 0) { - (void) printf(" time: %lld seconds\n", - (u_longlong_t)record->zi_duration); - } else if (record->zi_duration < 0) { - (void) printf(" txgs: %lld \n", - (u_longlong_t)-record->zi_duration); - } else { - (void) printf("objset: %llu\n", - (u_longlong_t)record->zi_objset); - (void) printf("object: %llu\n", - (u_longlong_t)record->zi_object); - (void) printf(" type: %llu\n", - (u_longlong_t)record->zi_type); - (void) printf(" level: %d\n", record->zi_level); - if (record->zi_start == 0 && - record->zi_end == -1ULL) - (void) printf(" range: all\n"); - else - (void) printf(" range: [%llu, %llu)\n", - (u_longlong_t)record->zi_start, - (u_longlong_t)record->zi_end); - } - } - - return (0); -} - -int -perform_action(const char *pool, zinject_record_t *record, int cmd) -{ - zfs_cmd_t zc = { 0 }; - - ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); - (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); - zc.zc_guid = record->zi_guid; - zc.zc_cookie = cmd; - - if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) - return (0); - - return (1); -} - -static int -parse_delay(char *str, uint64_t *delay, 
uint64_t *nlanes) -{ - unsigned long scan_delay; - unsigned long scan_nlanes; - - if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) - return (1); - - /* - * We explicitly disallow a delay of zero here, because we key - * off this value being non-zero in translate_device(), to - * determine if the fault is a ZINJECT_DELAY_IO fault or not. - */ - if (scan_delay == 0) - return (1); - - /* - * The units for the CLI delay parameter is milliseconds, but - * the data passed to the kernel is interpreted as nanoseconds. - * Thus we scale the milliseconds to nanoseconds here, and this - * nanosecond value is used to pass the delay to the kernel. - */ - *delay = MSEC2NSEC(scan_delay); - *nlanes = scan_nlanes; - - return (0); -} - -int -main(int argc, char **argv) -{ - int c; - char *range = NULL; - char *cancel = NULL; - char *end; - char *raw = NULL; - char *device = NULL; - int level = 0; - int quiet = 0; - int error = 0; - int domount = 0; - int io_type = ZIO_TYPES; - int action = VDEV_STATE_UNKNOWN; - err_type_t type = TYPE_INVAL; - err_type_t label = TYPE_INVAL; - zinject_record_t record = { 0 }; - char pool[MAXNAMELEN]; - char dataset[MAXNAMELEN]; - zfs_handle_t *zhp; - int nowrites = 0; - int dur_txg = 0; - int dur_secs = 0; - int ret; - int flags = 0; - - if ((g_zfs = libzfs_init()) == NULL) { - (void) fprintf(stderr, "internal error: failed to " - "initialize ZFS library\n"); - return (1); - } - - libzfs_print_on_error(g_zfs, B_TRUE); - - if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { - (void) fprintf(stderr, "failed to open ZFS device\n"); - return (1); - } - - if (argc == 1) { - /* - * No arguments. Print the available handlers. If there are no - * available handlers, direct the user to '-h' for help - * information. 
- */ - if (print_all_handlers() == 0) { - (void) printf("No handlers registered.\n"); - (void) printf("Run 'zinject -h' for usage " - "information.\n"); - } - - return (0); - } - - while ((c = getopt(argc, argv, - ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { - switch (c) { - case 'a': - flags |= ZINJECT_FLUSH_ARC; - break; - case 'A': - if (strcasecmp(optarg, "degrade") == 0) { - action = VDEV_STATE_DEGRADED; - } else if (strcasecmp(optarg, "fault") == 0) { - action = VDEV_STATE_FAULTED; - } else { - (void) fprintf(stderr, "invalid action '%s': " - "must be 'degrade' or 'fault'\n", optarg); - usage(); - return (1); - } - break; - case 'b': - raw = optarg; - break; - case 'c': - cancel = optarg; - break; - case 'd': - device = optarg; - break; - case 'D': - ret = parse_delay(optarg, &record.zi_timer, - &record.zi_nlanes); - if (ret != 0) { - (void) fprintf(stderr, "invalid i/o delay " - "value: '%s'\n", optarg); - usage(); - return (1); - } - break; - case 'e': - if (strcasecmp(optarg, "io") == 0) { - error = EIO; - } else if (strcasecmp(optarg, "checksum") == 0) { - error = ECKSUM; - } else if (strcasecmp(optarg, "nxio") == 0) { - error = ENXIO; - } else if (strcasecmp(optarg, "dtl") == 0) { - error = ECHILD; - } else { - (void) fprintf(stderr, "invalid error type " - "'%s': must be 'io', 'checksum' or " - "'nxio'\n", optarg); - usage(); - return (1); - } - break; - case 'f': - record.zi_freq = atoi(optarg); - if (record.zi_freq < 1 || record.zi_freq > 100) { - (void) fprintf(stderr, "frequency range must " - "be in the range (0, 100]\n"); - return (1); - } - break; - case 'F': - record.zi_failfast = B_TRUE; - break; - case 'g': - dur_txg = 1; - record.zi_duration = (int)strtol(optarg, &end, 10); - if (record.zi_duration <= 0 || *end != '\0') { - (void) fprintf(stderr, "invalid duration '%s': " - "must be a positive integer\n", optarg); - usage(); - return (1); - } - /* store duration of txgs as its negative */ - record.zi_duration *= -1; - break; - case 
'h': - usage(); - return (0); - case 'I': - /* default duration, if one hasn't yet been defined */ - nowrites = 1; - if (dur_secs == 0 && dur_txg == 0) - record.zi_duration = 30; - break; - case 'l': - level = (int)strtol(optarg, &end, 10); - if (*end != '\0') { - (void) fprintf(stderr, "invalid level '%s': " - "must be an integer\n", optarg); - usage(); - return (1); - } - break; - case 'm': - domount = 1; - break; - case 'p': - (void) strlcpy(record.zi_func, optarg, - sizeof (record.zi_func)); - record.zi_cmd = ZINJECT_PANIC; - break; - case 'q': - quiet = 1; - break; - case 'r': - range = optarg; - break; - case 's': - dur_secs = 1; - record.zi_duration = (int)strtol(optarg, &end, 10); - if (record.zi_duration <= 0 || *end != '\0') { - (void) fprintf(stderr, "invalid duration '%s': " - "must be a positive integer\n", optarg); - usage(); - return (1); - } - break; - case 'T': - if (strcasecmp(optarg, "read") == 0) { - io_type = ZIO_TYPE_READ; - } else if (strcasecmp(optarg, "write") == 0) { - io_type = ZIO_TYPE_WRITE; - } else if (strcasecmp(optarg, "free") == 0) { - io_type = ZIO_TYPE_FREE; - } else if (strcasecmp(optarg, "claim") == 0) { - io_type = ZIO_TYPE_CLAIM; - } else if (strcasecmp(optarg, "all") == 0) { - io_type = ZIO_TYPES; - } else { - (void) fprintf(stderr, "invalid I/O type " - "'%s': must be 'read', 'write', 'free', " - "'claim' or 'all'\n", optarg); - usage(); - return (1); - } - break; - case 't': - if ((type = name_to_type(optarg)) == TYPE_INVAL && - !MOS_TYPE(type)) { - (void) fprintf(stderr, "invalid type '%s'\n", - optarg); - usage(); - return (1); - } - break; - case 'u': - flags |= ZINJECT_UNLOAD_SPA; - break; - case 'L': - if ((label = name_to_type(optarg)) == TYPE_INVAL && - !LABEL_TYPE(type)) { - (void) fprintf(stderr, "invalid label type " - "'%s'\n", optarg); - usage(); - return (1); - } - break; - case ':': - (void) fprintf(stderr, "option -%c requires an " - "operand\n", optopt); - usage(); - return (1); - case '?': - (void) 
fprintf(stderr, "invalid option '%c'\n", - optopt); - usage(); - return (2); - } - } - - argc -= optind; - argv += optind; - - if (record.zi_duration != 0) - record.zi_cmd = ZINJECT_IGNORED_WRITES; - - if (cancel != NULL) { - /* - * '-c' is invalid with any other options. - */ - if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { - (void) fprintf(stderr, "cancel (-c) incompatible with " - "any other options\n"); - usage(); - return (2); - } - if (argc != 0) { - (void) fprintf(stderr, "extraneous argument to '-c'\n"); - usage(); - return (2); - } - - if (strcmp(cancel, "all") == 0) { - return (cancel_all_handlers()); - } else { - int id = (int)strtol(cancel, &end, 10); - if (*end != '\0') { - (void) fprintf(stderr, "invalid handle id '%s':" - " must be an integer or 'all'\n", cancel); - usage(); - return (1); - } - return (cancel_handler(id)); - } - } - - if (device != NULL) { - /* - * Device (-d) injection uses a completely different mechanism - * for doing injection, so handle it separately here. 
- */ - if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { - (void) fprintf(stderr, "device (-d) incompatible with " - "data error injection\n"); - usage(); - return (2); - } - - if (argc != 1) { - (void) fprintf(stderr, "device (-d) injection requires " - "a single pool name\n"); - usage(); - return (2); - } - - (void) strcpy(pool, argv[0]); - dataset[0] = '\0'; - - if (error == ECKSUM) { - (void) fprintf(stderr, "device error type must be " - "'io' or 'nxio'\n"); - return (1); - } - - record.zi_iotype = io_type; - if (translate_device(pool, device, label, &record) != 0) - return (1); - if (!error) - error = ENXIO; - - if (action != VDEV_STATE_UNKNOWN) - return (perform_action(pool, &record, action)); - - } else if (raw != NULL) { - if (range != NULL || type != TYPE_INVAL || level != 0 || - record.zi_cmd != ZINJECT_UNINITIALIZED) { - (void) fprintf(stderr, "raw (-b) format with " - "any other options\n"); - usage(); - return (2); - } - - if (argc != 1) { - (void) fprintf(stderr, "raw (-b) format expects a " - "single pool name\n"); - usage(); - return (2); - } - - (void) strcpy(pool, argv[0]); - dataset[0] = '\0'; - - if (error == ENXIO) { - (void) fprintf(stderr, "data error type must be " - "'checksum' or 'io'\n"); - return (1); - } - - record.zi_cmd = ZINJECT_DATA_FAULT; - if (translate_raw(raw, &record) != 0) - return (1); - if (!error) - error = EIO; - } else if (record.zi_cmd == ZINJECT_PANIC) { - if (raw != NULL || range != NULL || type != TYPE_INVAL || - level != 0 || device != NULL) { - (void) fprintf(stderr, "panic (-p) incompatible with " - "other options\n"); - usage(); - return (2); - } - - if (argc < 1 || argc > 2) { - (void) fprintf(stderr, "panic (-p) injection requires " - "a single pool name and an optional id\n"); - usage(); - return (2); - } - - (void) strcpy(pool, argv[0]); - if (argv[1] != NULL) - record.zi_type = atoi(argv[1]); - dataset[0] = '\0'; - } else if (record.zi_cmd == 
ZINJECT_IGNORED_WRITES) { - if (nowrites == 0) { - (void) fprintf(stderr, "-s or -g meaningless " - "without -I (ignore writes)\n"); - usage(); - return (2); - } else if (dur_secs && dur_txg) { - (void) fprintf(stderr, "choose a duration either " - "in seconds (-s) or a number of txgs (-g) " - "but not both\n"); - usage(); - return (2); - } else if (argc != 1) { - (void) fprintf(stderr, "ignore writes (-I) " - "injection requires a single pool name\n"); - usage(); - return (2); - } - - (void) strcpy(pool, argv[0]); - dataset[0] = '\0'; - } else if (type == TYPE_INVAL) { - if (flags == 0) { - (void) fprintf(stderr, "at least one of '-b', '-d', " - "'-t', '-a', '-p', '-I' or '-u' " - "must be specified\n"); - usage(); - return (2); - } - - if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { - (void) strcpy(pool, argv[0]); - dataset[0] = '\0'; - } else if (argc != 0) { - (void) fprintf(stderr, "extraneous argument for " - "'-f'\n"); - usage(); - return (2); - } - - flags |= ZINJECT_NULL; - } else { - if (argc != 1) { - (void) fprintf(stderr, "missing object\n"); - usage(); - return (2); - } - - if (error == ENXIO) { - (void) fprintf(stderr, "data error type must be " - "'checksum' or 'io'\n"); - return (1); - } - - record.zi_cmd = ZINJECT_DATA_FAULT; - if (translate_record(type, argv[0], range, level, &record, pool, - dataset) != 0) - return (1); - if (!error) - error = EIO; - } - - /* - * If this is pool-wide metadata, unmount everything. The ioctl() will - * unload the pool, so that we trigger spa-wide reopen of metadata next - * time we access the pool. 
- */ - if (dataset[0] != '\0' && domount) { - if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) - return (1); - - if (zfs_unmount(zhp, NULL, 0) != 0) - return (1); - } - - record.zi_error = error; - - ret = register_handler(pool, flags, &record, quiet); - - if (dataset[0] != '\0' && domount) - ret = (zfs_mount(zhp, NULL, 0) != 0); - - libzfs_fini(g_zfs); - - return (ret); -} diff --git a/cddl/contrib/opensolaris/cmd/zinject/zinject.h b/cddl/contrib/opensolaris/cmd/zinject/zinject.h deleted file mode 100644 index 46fdcad8b31f..000000000000 --- a/cddl/contrib/opensolaris/cmd/zinject/zinject.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#ifndef _ZINJECT_H -#define _ZINJECT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - TYPE_DATA, /* plain file contents */ - TYPE_DNODE, /* metadnode contents */ - TYPE_MOS, /* all MOS data */ - TYPE_MOSDIR, /* MOS object directory */ - TYPE_METASLAB, /* metaslab objects */ - TYPE_CONFIG, /* MOS config */ - TYPE_BPOBJ, /* block pointer list */ - TYPE_SPACEMAP, /* space map objects */ - TYPE_ERRLOG, /* persistent error log */ - TYPE_LABEL_UBERBLOCK, /* label specific uberblock */ - TYPE_LABEL_NVLIST, /* label specific nvlist */ - TYPE_LABEL_PAD1, /* label specific 8K pad1 area */ - TYPE_LABEL_PAD2, /* label specific 8K pad2 area */ - TYPE_INVAL -} err_type_t; - -#define MOS_TYPE(t) \ - ((t) >= TYPE_MOS && (t) < TYPE_LABEL_UBERBLOCK) - -#define LABEL_TYPE(t) \ - ((t) >= TYPE_LABEL_UBERBLOCK && (t) < TYPE_INVAL) - -int translate_record(err_type_t type, const char *object, const char *range, - int level, zinject_record_t *record, char *poolname, char *dataset); -int translate_raw(const char *raw, zinject_record_t *record); -int translate_device(const char *pool, const char *device, - err_type_t label_type, zinject_record_t *record); -void usage(void); - -extern libzfs_handle_t *g_zfs; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZINJECT_H */ diff --git a/cddl/contrib/opensolaris/cmd/zlook/zlook.c b/cddl/contrib/opensolaris/cmd/zlook/zlook.c deleted file mode 100644 index 29a6559f9023..000000000000 --- a/cddl/contrib/opensolaris/cmd/zlook/zlook.c +++ /dev/null @@ -1,411 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -/* - * This is a test program that uses ioctls to the ZFS Unit Test driver - * to perform readdirs or lookups using flags not normally available - * to user-land programs. This allows testing of the flags' - * behavior outside of a complicated consumer, such as the SMB driver. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define _KERNEL - -#include -#include - -#undef _KERNEL - -#define MAXBUF (64 * 1024) -#define BIGBUF 4096 -#define LILBUF (sizeof (dirent_t)) - -#define DIRENT_NAMELEN(reclen) \ - ((reclen) - (offsetof(dirent_t, d_name[0]))) - -static void -usage(char *pnam) -{ - (void) fprintf(stderr, "Usage:\n %s -l [-is] dir-to-look-in " - "file-in-dir [xfile-on-file]\n", pnam); - (void) fprintf(stderr, " %s -i [-ls] dir-to-look-in " - "file-in-dir [xfile-on-file]\n", pnam); - (void) fprintf(stderr, " %s -s [-il] dir-to-look-in " - "file-in-dir [xfile-on-file]\n", pnam); - (void) fprintf(stderr, "\t Perform a lookup\n"); - (void) fprintf(stderr, "\t -l == lookup\n"); - (void) fprintf(stderr, "\t -i == request FIGNORECASE\n"); - (void) fprintf(stderr, "\t -s == request stat(2) and xvattr info\n"); - (void) fprintf(stderr, " %s -r [-ea] [-b buffer-size-in-bytes] " - "dir-to-look-in [file-in-dir]\n", pnam); - (void) fprintf(stderr, " %s -e [-ra] [-b buffer-size-in-bytes] " - "dir-to-look-in [file-in-dir]\n", pnam); - (void) fprintf(stderr, " %s -a [-re] [-b buffer-size-in-bytes] " - "dir-to-look-in 
[file-in-dir]\n", pnam); - (void) fprintf(stderr, "\t Perform a readdir\n"); - (void) fprintf(stderr, "\t -r == readdir\n"); - (void) fprintf(stderr, "\t -e == request extended entries\n"); - (void) fprintf(stderr, "\t -a == request access filtering\n"); - (void) fprintf(stderr, "\t -b == buffer size (default 4K)\n"); - (void) fprintf(stderr, " %s -A path\n", pnam); - (void) fprintf(stderr, "\t Look up _PC_ACCESS_FILTERING " - "for path with pathconf(2)\n"); - (void) fprintf(stderr, " %s -E path\n", pnam); - (void) fprintf(stderr, "\t Look up _PC_SATTR_EXISTS " - "for path with pathconf(2)\n"); - (void) fprintf(stderr, " %s -S path\n", pnam); - (void) fprintf(stderr, "\t Look up _PC_SATTR_EXISTS " - "for path with pathconf(2)\n"); - exit(EINVAL); -} - -static void -print_extd_entries(zut_readdir_t *r) -{ - struct edirent *eodp; - char *bufstart; - - eodp = (edirent_t *)(uintptr_t)r->zr_buf; - bufstart = (char *)eodp; - while ((char *)eodp < bufstart + r->zr_bytes) { - char *blanks = " "; - int i = 0; - while (i < EDIRENT_NAMELEN(eodp->ed_reclen)) { - if (!eodp->ed_name[i]) - break; - (void) printf("%c", eodp->ed_name[i++]); - } - if (i < 16) - (void) printf("%.*s", 16 - i, blanks); - (void) printf("\t%x\n", eodp->ed_eflags); - eodp = (edirent_t *)((intptr_t)eodp + eodp->ed_reclen); - } -} - -static void -print_entries(zut_readdir_t *r) -{ - dirent64_t *dp; - char *bufstart; - - dp = (dirent64_t *)(intptr_t)r->zr_buf; - bufstart = (char *)dp; - while ((char *)dp < bufstart + r->zr_bytes) { - int i = 0; - while (i < DIRENT_NAMELEN(dp->d_reclen)) { - if (!dp->d_name[i]) - break; - (void) printf("%c", dp->d_name[i++]); - } - (void) printf("\n"); - dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen); - } -} - -static void -print_stats(struct stat64 *sb) -{ - char timebuf[512]; - - (void) printf("st_mode\t\t\t%04lo\n", (unsigned long)sb->st_mode); - (void) printf("st_ino\t\t\t%llu\n", (unsigned long long)sb->st_ino); - (void) printf("st_nlink\t\t%lu\n", (unsigned 
long)sb->st_nlink); - (void) printf("st_uid\t\t\t%d\n", sb->st_uid); - (void) printf("st_gid\t\t\t%d\n", sb->st_gid); - (void) printf("st_size\t\t\t%lld\n", (long long)sb->st_size); - (void) printf("st_blksize\t\t%ld\n", (long)sb->st_blksize); - (void) printf("st_blocks\t\t%lld\n", (long long)sb->st_blocks); - - timebuf[0] = 0; - if (ctime_r(&sb->st_atime, timebuf, 512)) { - (void) printf("st_atime\t\t"); - (void) printf("%s", timebuf); - } - timebuf[0] = 0; - if (ctime_r(&sb->st_mtime, timebuf, 512)) { - (void) printf("st_mtime\t\t"); - (void) printf("%s", timebuf); - } - timebuf[0] = 0; - if (ctime_r(&sb->st_ctime, timebuf, 512)) { - (void) printf("st_ctime\t\t"); - (void) printf("%s", timebuf); - } -} - -static void -print_xvs(uint64_t xvs) -{ - uint_t bits; - int idx = 0; - - if (xvs == 0) - return; - - (void) printf("-------------------\n"); - (void) printf("Attribute bit(s) set:\n"); - (void) printf("-------------------\n"); - - bits = xvs & ((1 << F_ATTR_ALL) - 1); - while (bits) { - uint_t rest = bits >> 1; - if (bits & 1) { - (void) printf("%s", attr_to_name((f_attr_t)idx)); - if (rest) - (void) printf(", "); - } - idx++; - bits = rest; - } - (void) printf("\n"); -} - -int -main(int argc, char **argv) -{ - zut_lookup_t lk = {0}; - zut_readdir_t rd = {0}; - boolean_t checking = B_FALSE; - boolean_t looking = B_FALSE; - boolean_t reading = B_FALSE; - boolean_t bflag = B_FALSE; - long rddir_bufsize = BIGBUF; - int error = 0; - int check; - int fd; - int c; - - while ((c = getopt(argc, argv, "lisaerb:ASE")) != -1) { - switch (c) { - case 'l': - looking = B_TRUE; - break; - case 'i': - lk.zl_reqflags |= ZUT_IGNORECASE; - looking = B_TRUE; - break; - case 's': - lk.zl_reqflags |= ZUT_GETSTAT; - looking = B_TRUE; - break; - case 'a': - rd.zr_reqflags |= ZUT_ACCFILTER; - reading = B_TRUE; - break; - case 'e': - rd.zr_reqflags |= ZUT_EXTRDDIR; - reading = B_TRUE; - break; - case 'r': - reading = B_TRUE; - break; - case 'b': - reading = B_TRUE; - bflag = B_TRUE; - 
rddir_bufsize = strtol(optarg, NULL, 0); - break; - case 'A': - checking = B_TRUE; - check = _PC_ACCESS_FILTERING; - break; - case 'S': - checking = B_TRUE; - check = _PC_SATTR_ENABLED; - break; - case 'E': - checking = B_TRUE; - check = _PC_SATTR_EXISTS; - break; - case '?': - default: - usage(argv[0]); /* no return */ - } - } - - if ((checking && looking) || (checking && reading) || - (looking && reading) || (!reading && bflag) || - (!checking && !reading && !looking)) - usage(argv[0]); /* no return */ - - if (rddir_bufsize < LILBUF || rddir_bufsize > MAXBUF) { - (void) fprintf(stderr, "Sorry, buffer size " - "must be >= %d and less than or equal to %d bytes.\n", - (int)LILBUF, MAXBUF); - exit(EINVAL); - } - - if (checking) { - char pathbuf[MAXPATHLEN]; - long result; - - if (argc - optind < 1) - usage(argv[0]); /* no return */ - (void) strlcpy(pathbuf, argv[optind], MAXPATHLEN); - result = pathconf(pathbuf, check); - (void) printf("pathconf(2) check for %s\n", pathbuf); - switch (check) { - case _PC_SATTR_ENABLED: - (void) printf("System attributes "); - if (result != 0) - (void) printf("Enabled\n"); - else - (void) printf("Not enabled\n"); - break; - case _PC_SATTR_EXISTS: - (void) printf("System attributes "); - if (result != 0) - (void) printf("Exist\n"); - else - (void) printf("Do not exist\n"); - break; - case _PC_ACCESS_FILTERING: - (void) printf("Access filtering "); - if (result != 0) - (void) printf("Available\n"); - else - (void) printf("Not available\n"); - break; - } - return (result); - } - - if ((fd = open(ZUT_DEV, O_RDONLY)) < 0) { - perror(ZUT_DEV); - return (ENXIO); - } - - if (reading) { - char *buf; - - if (argc - optind < 1) - usage(argv[0]); /* no return */ - - (void) strlcpy(rd.zr_dir, argv[optind], MAXPATHLEN); - if (argc - optind > 1) { - (void) strlcpy(rd.zr_file, argv[optind + 1], - MAXNAMELEN); - rd.zr_reqflags |= ZUT_XATTR; - } - - if ((buf = malloc(rddir_bufsize)) == NULL) { - error = errno; - perror("malloc"); - (void) close(fd); - 
return (error); - } - - rd.zr_buf = (uint64_t)(uintptr_t)buf; - rd.zr_buflen = rddir_bufsize; - - while (!rd.zr_eof) { - int ierr; - - if ((ierr = ioctl(fd, ZUT_IOC_READDIR, &rd)) != 0) { - (void) fprintf(stderr, - "IOCTL error: %s (%d)\n", - strerror(ierr), ierr); - free(buf); - (void) close(fd); - return (ierr); - } - if (rd.zr_retcode) { - (void) fprintf(stderr, - "readdir result: %s (%d)\n", - strerror(rd.zr_retcode), rd.zr_retcode); - free(buf); - (void) close(fd); - return (rd.zr_retcode); - } - if (rd.zr_reqflags & ZUT_EXTRDDIR) - print_extd_entries(&rd); - else - print_entries(&rd); - } - free(buf); - } else { - int ierr; - - if (argc - optind < 2) - usage(argv[0]); /* no return */ - - (void) strlcpy(lk.zl_dir, argv[optind], MAXPATHLEN); - (void) strlcpy(lk.zl_file, argv[optind + 1], MAXNAMELEN); - if (argc - optind > 2) { - (void) strlcpy(lk.zl_xfile, - argv[optind + 2], MAXNAMELEN); - lk.zl_reqflags |= ZUT_XATTR; - } - - if ((ierr = ioctl(fd, ZUT_IOC_LOOKUP, &lk)) != 0) { - (void) fprintf(stderr, - "IOCTL error: %s (%d)\n", - strerror(ierr), ierr); - (void) close(fd); - return (ierr); - } - - (void) printf("\nLookup of "); - if (lk.zl_reqflags & ZUT_XATTR) { - (void) printf("extended attribute \"%s\" of ", - lk.zl_xfile); - } - (void) printf("file \"%s\" ", lk.zl_file); - (void) printf("in directory \"%s\" ", lk.zl_dir); - if (lk.zl_retcode) { - (void) printf("failed: %s (%d)\n", - strerror(lk.zl_retcode), lk.zl_retcode); - (void) close(fd); - return (lk.zl_retcode); - } - - (void) printf("succeeded.\n"); - if (lk.zl_reqflags & ZUT_IGNORECASE) { - (void) printf("----------------------------\n"); - (void) printf("dirent flags: 0x%0x\n", lk.zl_deflags); - (void) printf("real name: %s\n", lk.zl_real); - } - if (lk.zl_reqflags & ZUT_GETSTAT) { - (void) printf("----------------------------\n"); - print_stats(&lk.zl_statbuf); - print_xvs(lk.zl_xvattrs); - } - } - - (void) close(fd); - return (0); -} diff --git 
a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 deleted file mode 100644 index 980d4da0e31b..000000000000 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 +++ /dev/null @@ -1,674 +0,0 @@ -'\" te -.\" Copyright (c) 2012, Martin Matuska . -.\" All Rights Reserved. -.\" -.\" The contents of this file are subject to the terms of the -.\" Common Development and Distribution License (the "License"). -.\" You may not use this file except in compliance with the License. -.\" -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -.\" or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions -.\" and limitations under the License. -.\" -.\" When distributing Covered Code, include this CDDL HEADER in each -.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. -.\" If applicable, add the following below this CDDL HEADER, with the -.\" fields enclosed by brackets "[]" replaced with your own identifying -.\" information: Portions Copyright [yyyy] [name of copyright owner] -.\" -.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved. -.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -.\" Copyright (c) 2013, Joyent, Inc. All rights reserved. -.\" -.\" $FreeBSD$ -.\" -.Dd August 16, 2019 -.Dt ZPOOL-FEATURES 7 -.Os -.Sh NAME -.Nm zpool-features -.Nd ZFS pool feature descriptions -.Sh DESCRIPTION -ZFS pool on\-disk format versions are specified via "features" which replace -the old on\-disk format numbers (the last supported on\-disk format number is -28). -To enable a feature on a pool use the -.Cm upgrade -subcommand of the -.Xr zpool 8 -command, or set the -.Sy feature@feature_name -property to -.Ar enabled . -.Pp -The pool format does not affect file system version compatibility or the ability -to send file systems between pools. 
-.Pp -Since most features can be enabled independently of each other the on\-disk -format of the pool is specified by the set of all features marked as -.Sy active -on the pool. -If the pool was created by another software version this set may -include unsupported features. -.Ss Identifying features -Every feature has a guid of the form -.Sy com.example:feature_name . -The reverse DNS name ensures that the feature's guid is unique across all ZFS -implementations. -When unsupported features are encountered on a pool they will -be identified by their guids. -Refer to the documentation for the ZFS implementation that created the pool -for information about those features. -.Pp -Each supported feature also has a short name. -By convention a feature's short name is the portion of its guid which follows -the ':' (e.g. -.Sy com.example:feature_name -would have the short name -.Sy feature_name ), -however a feature's short name may differ across ZFS implementations if -following the convention would result in name conflicts. -.Ss Feature states -Features can be in one of three states: -.Bl -tag -width "XXXXXXXX" -.It Sy active -This feature's on\-disk format changes are in effect on the pool. -Support for this feature is required to import the pool in read\-write mode. -If this feature is not read-only compatible, support is also required to -import the pool in read\-only mode (see "Read\-only compatibility"). -.It Sy enabled -An administrator has marked this feature as enabled on the pool, but the -feature's on\-disk format changes have not been made yet. -The pool can still be imported by software that does not support this feature, -but changes may be made to the on\-disk format at any time which will move -the feature to the -.Sy active -state. -Some features may support returning to the -.Sy enabled -state after becoming -.Sy active . -See feature\-specific documentation for details. 
-.It Sy disabled -This feature's on\-disk format changes have not been made and will not be made -unless an administrator moves the feature to the -.Sy enabled -state. -Features cannot be disabled once they have been enabled. -.El -.Pp -The state of supported features is exposed through pool properties of the form -.Sy feature@short_name . -.Ss Read\-only compatibility -Some features may make on\-disk format changes that do not interfere with other -software's ability to read from the pool. -These features are referred to as "read\-only compatible". -If all unsupported features on a pool are read\-only compatible, the pool can -be imported in read\-only mode by setting the -.Sy readonly -property during import (see -.Xr zpool 8 -for details on importing pools). -.Ss Unsupported features -For each unsupported feature enabled on an imported pool a pool property -named -.Sy unsupported@feature_guid -will indicate why the import was allowed despite the unsupported feature. -Possible values for this property are: -.Bl -tag -width "XXXXXXXX" -.It Sy inactive -The feature is in the -.Sy enabled -state and therefore the pool's on\-disk format is still compatible with -software that does not support this feature. -.It Sy readonly -The feature is read\-only compatible and the pool has been imported in -read\-only mode. -.El -.Ss Feature dependencies -Some features depend on other features being enabled in order to function -properly. -Enabling a feature will automatically enable any features it depends on. -.Sh FEATURES -The following features are supported on this system: -.Bl -tag -width "XXXXXXXX" -.It Sy async_destroy -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:async_destroy" -.It GUID Ta com.delphix:async_destroy -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta none -.El -.Pp -Destroying a file system requires traversing all of its data in order to -return its used space to the pool. 
-Without -.Sy async_destroy -the file system is not fully removed until all space has been reclaimed. -If the destroy operation is interrupted by a reboot or power outage the next -attempt to open the pool will need to complete the destroy operation -synchronously. -.Pp -When -.Sy async_destroy -is enabled the file system's data will be reclaimed by a background process, -allowing the destroy operation to complete without traversing the entire file -system. -The background process is able to resume interrupted destroys after the pool -has been opened, eliminating the need to finish interrupted destroys as part -of the open operation. -The amount of space remaining to be reclaimed by the background process is -available through the -.Sy freeing -property. -.Pp -This feature is only -.Sy active -while -.Sy freeing -is non\-zero. -.It Sy empty_bpobj -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:empty_bpobj" -.It GUID Ta com.delphix:empty_bpobj -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta none -.El -.Pp -This feature increases the performance of creating and using a large number -of snapshots of a single filesystem or volume, and also reduces the disk -space required. -.Pp -When there are many snapshots, each snapshot uses many Block Pointer Objects -.Pq bpobj's -to track blocks associated with that snapshot. -However, in common use cases, most of these bpobj's are empty. -This feature allows us to create each bpobj on-demand, thus eliminating the -empty bpobjs. -.Pp -This feature is -.Sy active -while there are any filesystems, volumes, or snapshots which were created -after enabling this feature. -.It Sy filesystem_limits -.Bl -column "READ\-ONLY COMPATIBLE" "com.joyent:filesystem_limits" -.It GUID Ta com.joyent:filesystem_limits -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta extensible_dataset -.El -.Pp -This feature enables filesystem and snapshot limits. 
-These limits can be used -to control how many filesystems and/or snapshots can be created at the point in -the tree on which the limits are set. -.Pp -This feature is -.Sy active -once either of the limit properties has been -set on a dataset. -Once activated the feature is never deactivated. -.It Sy lz4_compress -.Bl -column "READ\-ONLY COMPATIBLE" "org.illumos:lz4_compress" -.It GUID Ta org.illumos:lz4_compress -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta none -.El -.Pp -.Sy lz4 -is a high-performance real-time compression algorithm that -features significantly faster compression and decompression as well as a -higher compression ratio than the older -.Sy lzjb -compression. -Typically, -.Sy lz4 -compression is approximately 50% faster on -compressible data and 200% faster on incompressible data than -.Sy lzjb . -It is also approximately 80% faster on decompression, while -giving approximately 10% better compression ratio. -.Pp -When the -.Sy lz4_compress -feature is set to -.Sy enabled , -the -administrator can turn on -.Sy lz4 -compression on any dataset on the -pool using the -.Xr zfs 8 -command. -Also, all newly written metadata -will be compressed with -.Sy lz4 -algorithm. -Since this feature is not read-only compatible, this -operation will render the pool unimportable on systems without support -for the -.Sy lz4_compress -feature. -Booting off of -.Sy lz4 --compressed root pools is supported. -.Pp -This feature becomes -.Sy active -as soon as it is enabled and will -never return to being -.Sy enabled . -.It Sy multi_vdev_crash_dump -.Bl -column "READ\-ONLY COMPATIBLE" "com.joyent:multi_vdev_crash_dump" -.It GUID Ta com.joyent:multi_vdev_crash_dump -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta none -.El -.Pp -This feature allows a dump device to be configured with a pool comprised -of multiple vdevs. -Those vdevs may be arranged in any mirrored or raidz -configuration. -.\" TODO: this is not yet supported on FreeBSD. 
-.\" .Pp -.\" When the -.\" .Sy multi_vdev_crash_dump -.\" feature is set to -.\" .Sy enabled , -.\" the administrator can use the -.\" .Xr dumpon 8 -.\" command to configure a -.\" dump device on a pool comprised of multiple vdevs. -.It Sy spacemap_histogram -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:spacemap_histogram" -.It GUID Ta com.delphix:spacemap_histogram -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta none -.El -.Pp -This feature allows ZFS to maintain more information about how free space -is organized within the pool. -If this feature is -.Sy enabled , -ZFS will -set this feature to -.Sy active -when a new space map object is created or -an existing space map is upgraded to the new format. -Once the feature is -.Sy active , -it will remain in that state until the pool is destroyed. -.It Sy extensible_dataset -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:extensible_dataset" -.It GUID Ta com.delphix:extensible_dataset -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta none -.El -.Pp -This feature allows more flexible use of internal ZFS data structures, -and exists for other features to depend on. -.Pp -This feature will be -.Sy active -when the first dependent feature uses it, -and will be returned to the -.Sy enabled -state when all datasets that use -this feature are destroyed. -.It Sy bookmarks -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:bookmarks" -.It GUID Ta com.delphix:bookmarks -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta extensible_dataset -.El -.Pp -This feature enables use of the -.Nm zfs -.Cm bookmark -subcommand. -.Pp -This feature is -.Sy active -while any bookmarks exist in the pool. -All bookmarks in the pool can be listed by running -.Nm zfs -.Cm list -.Fl t No bookmark Fl r Ar poolname . 
-.It Sy enabled_txg -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:enabled_txg" -.It GUID Ta com.delphix:enabled_txg -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta none -.El -.Pp -Once this feature is enabled ZFS records the transaction group number -in which new features are enabled. -This has no user-visible impact, -but other features may depend on this feature. -.Pp -This feature becomes -.Sy active -as soon as it is enabled and will -never return to being -.Sy enabled . -.It Sy hole_birth -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:hole_birth" -.It GUID Ta com.delphix:hole_birth -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta enabled_txg -.El -.Pp -This feature improves performance of incremental sends -.Pq Dq zfs send -i -and receives for objects with many holes. -The most common case of -hole-filled objects is zvols. -.Pp -An incremental send stream from snapshot -.Sy A -to snapshot -.Sy B -contains information about every block that changed between -.Sy A -and -.Sy B . -Blocks which did not change between those snapshots can be -identified and omitted from the stream using a piece of metadata called -the 'block birth time', but birth times are not recorded for holes -.Pq blocks filled only with zeroes . -Since holes created after -.Sy A -cannot be -distinguished from holes created before -.Sy A , -information about every -hole in the entire filesystem or zvol is included in the send stream. -.Pp -For workloads where holes are rare this is not a problem. -However, when -incrementally replicating filesystems or zvols with many holes -.Pq for example a zvol formatted with another filesystem -a lot of time will -be spent sending and receiving unnecessary information about holes that -already exist on the receiving side. -.Pp -Once the -.Sy hole_birth -feature has been enabled the block birth times -of all new holes will be recorded. 
-Incremental sends between snapshots -created after this feature is enabled will use this new metadata to avoid -sending information about holes that already exist on the receiving side. -.Pp -This feature becomes -.Sy active -as soon as it is enabled and will -never return to being -.Sy enabled . -.It Sy embedded_data -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:embedded_data" -.It GUID Ta com.delphix:embedded_data -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta none -.El -.Pp -This feature improves the performance and compression ratio of -highly-compressible blocks. -Blocks whose contents can compress to 112 bytes -or smaller can take advantage of this feature. -.Pp -When this feature is enabled, the contents of highly-compressible blocks are -stored in the block "pointer" itself -.Po a misnomer in this case, as it contains -the compressed data, rather than a pointer to its location on disk -.Pc . -Thus -the space of the block -.Pq one sector, typically 512 bytes or 4KB -is saved, -and no additional i/o is needed to read and write the data block. -.Pp -This feature becomes -.Sy active -as soon as it is enabled and will -never return to being -.Sy enabled . -.It Sy zpool_checkpoint -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:zpool_checkpoint" -.It GUID Ta com.delphix:zpool_checkpoint -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta none -.El -.Pp -This feature enables the "zpool checkpoint" subcommand that can -checkpoint the state of the pool at the time it was issued and later -rewind back to it or discard it. -.Pp -This feature becomes -.Sy active -when the "zpool checkpoint" command is used to checkpoint the pool. -The feature will only return back to being -.Sy enabled -when the pool is rewound or the checkpoint has been discarded. 
-.It Sy device_removal -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:device_removal" -.It GUID Ta com.delphix:device_removal -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta none -.El -.Pp -This feature enables the "zpool remove" subcommand to remove top-level -vdevs, evacuating them to reduce the total size of the pool. -.Pp -This feature becomes -.Sy active -when the "zpool remove" command is used -on a top-level vdev, and will never return to being -.Sy enabled . -.It Sy obsolete_counts -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:obsolete_counts" -.It GUID Ta com.delphix:obsolete_counts -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta device_removal -.El -.Pp -This feature is an enhancement of device_removal, which will over time -reduce the memory used to track removed devices. When indirect blocks -are freed or remapped, we note that their part of the indirect mapping -is "obsolete", i.e. no longer needed. See also the "zfs remap" -subcommand in -.Xr zfs 8 . - -This feature becomes -.Sy active -when the "zpool remove" command is -used on a top-level vdev, and will never return to being -.Sy enabled . -.It Sy spacemap_v2 -.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:spacemap_v2" -.It GUID Ta com.delphix:spacemap_v2 -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta none -.El -.Pp -This feature enables the use of the new space map encoding which -consists of two words (instead of one) whenever it is advantageous. -The new encoding allows space maps to represent large regions of -space more efficiently on-disk while also increasing their maximum -addressable offset. -.Pp -This feature becomes -.Sy active -as soon as it is enabled and will -never return to being -.Sy enabled . 
-.It Sy large_blocks -.Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_block" -.It GUID Ta org.open-zfs:large_block -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta extensible_dataset -.El -.Pp -The -.Sy large_block -feature allows the record size on a dataset to be -set larger than 128KB. -.Pp -This feature becomes -.Sy active -once a -.Sy recordsize -property has been set larger than 128KB, and will return to being -.Sy enabled -once all filesystems that have ever had their recordsize larger than 128KB -are destroyed. -.Pp -Booting from datasets that use the -.Sy large_block -feature is supported by the -.Fx -boot loader. -.It Sy large_dnode -.Bl -column "READ\-ONLY COMPATIBLE" "org.zfsonlinux:large_dnode" -.It GUID Ta org.zfsonlinux:large_dnode -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta extensible_dataset -.El -.Pp -The -.Sy large_dnode -feature allows the size of dnodes in a dataset to be set larger than 512B. -.Pp -This feature becomes -.Sy active -once a dataset contains an object with a dnode larger than 512B, -which occurs as a result of setting the -.Sy dnodesize -dataset property to a value other than -.Sy legacy . -The feature will return to being -.Sy enabled -once all filesystems that have ever contained a dnode larger than 512B are -destroyed. -Large dnodes allow more data to be stored in the bonus buffer, thus potentially -improving performance by avoiding the use of spill blocks. -.It Sy sha512 -.Bl -column "READ\-ONLY COMPATIBLE" "org.illumos:sha512" -.It GUID Ta org.illumos:sha512 -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta extensible_dataset -.El -.Pp -The -.Sy sha512 -feature enables the use of the SHA-512/256 truncated hash algorithm -.Pq FIPS 180-4 -for checksum and dedup. 
-The native 64-bit arithmetic of SHA-512 provides an approximate 50% -performance boost over SHA-256 on 64-bit hardware and is thus a good -minimum-change replacement candidate for systems where hash performance is -important, but these systems cannot for whatever reason utilize the faster -.Sy skein -algorithms. -.Pp -When the -.Sy sha512 -feature is set to -.Sy enabled , -the administrator can turn on the -.Sy sha512 -checksum on any dataset using the -.Dl # zfs set checksum=sha512 Ar dataset -command. -This feature becomes -.Sy active -once a -.Sy checksum -property has been set to -.Sy sha512 , -and will return to being -.Sy enabled -once all filesystems that have ever had their checksum set to -.Sy sha512 -are destroyed. -.It Sy skein -.Bl -column "READ\-ONLY COMPATIBLE" "org.illumos:skein" -.It GUID Ta org.illumos:skein -.It READ\-ONLY COMPATIBLE Ta no -.It DEPENDENCIES Ta extensible_dataset -.El -.Pp -The -.Sy skein -feature enables the use of the Skein hash algorithm for checksum and dedup. -Skein is a high-performance secure hash algorithm that was a finalist in the -NIST SHA-3 competition. -It provides a very high security margin and high performance on 64-bit hardware -.Pq 80% faster than SHA-256 . -This implementation also utilizes the new salted checksumming functionality in -ZFS, which means that the checksum is pre-seeded with a secret 256-bit random -key -.Pq stored on the pool -before being fed the data block to be checksummed. -Thus the produced checksums are unique to a given pool, preventing hash -collision attacks on systems with dedup. -.Pp -When the -.Sy skein -feature is set to -.Sy enabled , -the administrator can turn on the -.Sy skein -checksum on any dataset using the -.Dl # zfs set checksum=skein Ar dataset -command. -This feature becomes -.Sy active -once a -.Sy checksum -property has been set to -.Sy skein , -and will return to being -.Sy enabled -once all filesystems that have ever had their checksum set to -.Sy skein -are destroyed. 
-Booting off of pools using -.Sy skein -is supported. -.It Sy allocation_classes -.Bl -column "READ\-ONLY COMPATIBLE" "com.intel:allocation_classes" -.It GUID Ta com.intel:allocation_classes -.It READ\-ONLY COMPATIBLE Ta yes -.It DEPENDENCIES Ta none -.El -.Pp -This feature enables support for separate allocation classes. -.Pp -This feature becomes -.Sy active -when a dedicated allocation class vdev -(dedup or special) is created with -.Dq zpool create -or -.Dq zpool add . -With device removal, it can be returned to the -.Sy enabled -state if all the top-level vdevs from an allocation class are removed. -.El -.Sh SEE ALSO -.Xr zpool 8 -.Sh AUTHORS -This manual page is a -.Xr mdoc 7 -reimplementation of the -.Tn illumos -manual page -.Em zpool-features(5) , -modified and customized for -.Fx -and licensed under the Common Development and Distribution License -.Pq Tn CDDL . -.Pp -The -.Xr mdoc 7 -implementation of this manual page was initially written by -.An Martin Matuska Aq mm@FreeBSD.org . diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool.8 b/cddl/contrib/opensolaris/cmd/zpool/zpool.8 deleted file mode 100644 index f5caffb95d79..000000000000 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool.8 +++ /dev/null @@ -1,2485 +0,0 @@ -'\" te -.\" Copyright (c) 2012, Martin Matuska . -.\" Copyright (c) 2013-2014, Xin Li . -.\" All Rights Reserved. -.\" -.\" The contents of this file are subject to the terms of the -.\" Common Development and Distribution License (the "License"). -.\" You may not use this file except in compliance with the License. -.\" -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -.\" or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions -.\" and limitations under the License. -.\" -.\" When distributing Covered Code, include this CDDL HEADER in each -.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-.\" If applicable, add the following below this CDDL HEADER, with the -.\" fields enclosed by brackets "[]" replaced with your own identifying -.\" information: Portions Copyright [yyyy] [name of copyright owner] -.\" -.\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved. -.\" Copyright (c) 2011, Justin T. Gibbs -.\" Copyright (c) 2012, Glen Barber -.\" Copyright (c) 2012, 2017 by Delphix. All Rights Reserved. -.\" Copyright 2017 Nexenta Systems, Inc. -.\" Copyright (c) 2017 Datto Inc. -.\" Copyright (c) 2017 George Melikov. All Rights Reserved. -.\" Copyright 2019 Joyent, Inc. -.\" -.\" $FreeBSD$ -.\" -.Dd February 25, 2020 -.Dt ZPOOL 8 -.Os -.Sh NAME -.Nm zpool -.Nd configures ZFS storage pools -.Sh SYNOPSIS -.Nm -.Op Fl \&? -.Nm -.Cm add -.Op Fl fgLnP -.Ar pool vdev ... -.Nm -.Cm attach -.Op Fl f -.Ar pool device new_device -.Nm -.Cm checkpoint -.Op Fl d, -discard -.Ar pool -.Nm -.Cm clear -.Op Fl F Op Fl n -.Ar pool -.Op Ar device -.Nm -.Cm create -.Op Fl fnd -.Op Fl o Ar property Ns = Ns Ar value -.Ar ... -.Op Fl O Ar file-system-property Ns = Ns Ar value -.Ar ... -.Op Fl m Ar mountpoint -.Op Fl R Ar root -.Op Fl t Ar tempname -.Ar pool vdev ... -.Nm -.Cm destroy -.Op Fl f -.Ar pool -.Nm -.Cm detach -.Ar pool device -.Nm -.Cm export -.Op Fl f -.Ar pool ... -.Nm -.Cm get -.Op Fl Hp -.Op Fl o Ar field Ns Op , Ns Ar ... -.Ar all | property Ns Op , Ns Ar ... -.Ar pool ... -.Nm -.Cm history -.Op Fl il -.Op Ar pool -.Ar ... -.Nm -.Cm import -.Op Fl d Ar dir | Fl c Ar cachefile -.Op Fl D -.Nm -.Cm import -.Op Fl o Ar mntopts -.Op Fl o Ar property Ns = Ns Ar value -.Ar ... -.Op Fl -rewind-to-checkpoint -.Op Fl d Ar dir | Fl c Ar cachefile -.Op Fl D -.Op Fl f -.Op Fl m -.Op Fl N -.Op Fl R Ar root -.Op Fl F Op Fl n -.Fl a -.Nm -.Cm import -.Op Fl o Ar mntopts -.Op Fl o Ar property Ns = Ns Ar value -.Ar ... 
-.Op Fl -rewind-to-checkpoint -.Op Fl d Ar dir | Fl c Ar cachefile -.Op Fl D -.Op Fl f -.Op Fl m -.Op Fl N -.Op Fl R Ar root -.Op Fl t -.Op Fl F Op Fl n -.Ar pool | id -.Op Ar newpool -.Nm -.Cm initialize -.Op Fl cs -.Ar pool -.Op Ar device Ns ... -.Nm -.Cm iostat -.Op Fl v -.Op Fl T Cm d Ns | Ns Cm u -.Op Fl gLP -.Op Ar pool -.Ar ... -.Op Ar interval Op Ar count -.Nm -.Cm labelclear -.Op Fl f -.Ar device -.Nm -.Cm list -.Op Fl HgLpPv -.Op Fl o Ar property Ns Op , Ns Ar ... -.Op Fl T Cm d Ns | Ns Cm u -.Op Ar pool -.Ar ... -.Op Ar interval Op Ar count -.Nm -.Cm offline -.Op Fl t -.Ar pool device ... -.Nm -.Cm online -.Op Fl e -.Ar pool device ... -.Nm -.Cm reguid -.Ar pool -.Nm -.Cm remove -.Op Fl np -.Ar pool device ... -.Nm -.Cm remove -.Fl s -.Ar pool -.Nm -.Cm reopen -.Ar pool -.Nm -.Cm replace -.Op Fl f -.Ar pool device -.Op Ar new_device -.Nm -.Cm scrub -.Op Fl s | Fl p -.Ar pool ... -.Nm -.Cm set -.Ar property Ns = Ns Ar value pool -.Nm -.Cm split -.Op Fl gLnP -.Op Fl R Ar altroot -.Op Fl o Ar mntopts -.Op Fl o Ar property Ns = Ns Ar value -.Ar pool newpool -.Op Ar device ... -.Nm -.Cm status -.Op Fl DgLPvx -.Op Fl T Cm d Ns | Ns Cm u -.Op Ar pool -.Ar ... -.Op Ar interval Op Ar count -.Nm -.Cm sync -.Oo Ar pool Oc Ns ... -.Nm -.Cm upgrade -.Op Fl v -.Nm -.Cm upgrade -.Op Fl V Ar version -.Fl a | Ar pool ... -.Sh DESCRIPTION -The -.Nm -command configures -.Tn ZFS -storage pools. A storage pool is a collection of devices that provides physical -storage and data replication for -.Tn ZFS -datasets. -.Pp -All datasets within a storage pool share the same space. See -.Xr zfs 8 -for information on managing datasets. -.Ss Virtual Devices (vdevs) -A -.Qq virtual device -.Pq No vdev -describes a single device or a collection of devices organized according to -certain performance and fault characteristics. The following virtual devices -are supported: -.Bl -tag -width "XXXXXX" -.It Sy disk -A block device, typically located under -.Pa /dev . 
-.Tn ZFS -can use individual slices or partitions, though the recommended mode of -operation is to use whole disks. A disk can be specified by a full path to the -device or the -.Xr geom 4 -provider name. When given a whole disk, -.Tn ZFS -automatically labels the disk, if necessary. -.It Sy file -A regular file. The use of files as a backing store is strongly discouraged. It -is designed primarily for experimental purposes, as the fault tolerance of a -file is only as good the file system of which it is a part. A file must be -specified by a full path. -.It Sy mirror -A mirror of two or more devices. Data is replicated in an identical fashion -across all components of a mirror. A mirror with -.Em N -disks of size -.Em X -can hold -.Em X -bytes and can withstand -.Pq Em N-1 -devices failing before data integrity is compromised. -.It Sy raidz -(or -.Sy raidz1 raidz2 raidz3 ) . -A variation on -.Sy RAID-5 -that allows for better distribution of parity and eliminates the -.Qq Sy RAID-5 -write hole (in which data and parity become inconsistent after a power loss). -Data and parity is striped across all disks within a -.No raidz -group. -.Pp -A -.No raidz -group can have single-, double- , or triple parity, meaning that the -.No raidz -group can sustain one, two, or three failures, respectively, without -losing any data. The -.Sy raidz1 No vdev -type specifies a single-parity -.No raidz -group; the -.Sy raidz2 No vdev -type specifies a double-parity -.No raidz -group; and the -.Sy raidz3 No vdev -type specifies a triple-parity -.No raidz -group. The -.Sy raidz No vdev -type is an alias for -.Sy raidz1 . -.Pp -A -.No raidz -group with -.Em N -disks of size -.Em X -with -.Em P -parity disks can hold approximately -.Sm off -.Pq Em N-P -*X -.Sm on -bytes and can withstand -.Em P -device(s) failing before data integrity is compromised. The minimum number of -devices in a -.No raidz -group is one more than the number of parity disks. 
The -recommended number is between 3 and 9 to help increase performance. -.It Sy spare -A special -.No pseudo- Ns No vdev -which keeps track of available hot spares for a pool. -For more information, see the -.Qq Sx Hot Spares -section. -.It Sy log -A separate-intent log device. If more than one log device is specified, then -writes are load-balanced between devices. Log devices can be mirrored. However, -.No raidz -.No vdev -types are not supported for the intent log. For more information, -see the -.Qq Sx Intent Log -section. -.It Sy dedup -A device dedicated solely for allocating dedup data. -The redundancy of this device should match the redundancy of the other normal -devices in the pool. -If more than one dedup device is specified, then allocations are load-balanced -between devices. -.It Sy special -A device dedicated solely for allocating various kinds of internal metadata, -and optionally small file data. -The redundancy of this device should match the redundancy of the other normal -devices in the pool. -If more than one special device is specified, then allocations are -load-balanced between devices. -.Pp -For more information on special allocations, see the -.Sx Special Allocation Class -section. -.It Sy cache -A device used to cache storage pool data. -A cache device cannot be configured as a mirror or raidz group. -For more information, see the -.Qq Sx Cache Devices -section. -.El -.Pp -Virtual devices cannot be nested, so a mirror or -.No raidz -virtual device can only -contain files or disks. Mirrors of mirrors (or other combinations) are not -allowed. -.Pp -A pool can have any number of virtual devices at the top of the configuration -(known as -.Qq root -.No vdev Ns s). -Data is dynamically distributed across all top-level devices to balance data -among devices. As new virtual devices are added, -.Tn ZFS -automatically places data on the newly available devices. 
-.Pp -Virtual devices are specified one at a time on the command line, separated by -whitespace. The keywords -.Qq mirror -and -.Qq raidz -are used to distinguish where a group ends and another begins. For example, the -following creates two root -.No vdev Ns s, -each a mirror of two disks: -.Bd -literal -offset 2n -.Li # Ic zpool create mypool mirror da0 da1 mirror da2 da3 -.Ed -.Ss Device Failure and Recovery -.Tn ZFS -supports a rich set of mechanisms for handling device failure and data -corruption. All metadata and data is checksummed, and -.Tn ZFS -automatically repairs bad data from a good copy when corruption is detected. -.Pp -In order to take advantage of these features, a pool must make use of some form -of redundancy, using either mirrored or -.No raidz -groups. While -.Tn ZFS -supports running in a non-redundant configuration, where each root -.No vdev -is simply a disk or file, this is strongly discouraged. A single case of bit -corruption can render some or all of your data unavailable. -.Pp -A pool's health status is described by one of three states: online, degraded, -or faulted. An online pool has all devices operating normally. A degraded pool -is one in which one or more devices have failed, but the data is still -available due to a redundant configuration. A faulted pool has corrupted -metadata, or one or more faulted devices, and insufficient replicas to continue -functioning. -.Pp -The health of the top-level -.No vdev , -such as mirror or -.No raidz -device, is -potentially impacted by the state of its associated -.No vdev Ns s, -or component devices. A top-level -.No vdev -or component device is in one of the following states: -.Bl -tag -width "DEGRADED" -.It Sy DEGRADED -One or more top-level -.No vdev Ns s -is in the degraded state because one or more -component devices are offline. Sufficient replicas exist to continue -functioning. 
-.Pp -One or more component devices is in the degraded or faulted state, but -sufficient replicas exist to continue functioning. The underlying conditions -are as follows: -.Bl -bullet -offset 2n -.It -The number of checksum errors exceeds acceptable levels and the device is -degraded as an indication that something may be wrong. -.Tn ZFS -continues to use the device as necessary. -.It -The number of -.Tn I/O -errors exceeds acceptable levels. The device could not be -marked as faulted because there are insufficient replicas to continue -functioning. -.El -.It Sy FAULTED -One or more top-level -.No vdev Ns s -is in the faulted state because one or more -component devices are offline. Insufficient replicas exist to continue -functioning. -.Pp -One or more component devices is in the faulted state, and insufficient -replicas exist to continue functioning. The underlying conditions are as -follows: -.Bl -bullet -offset 2n -.It -The device could be opened, but the contents did not match expected values. -.It -The number of -.Tn I/O -errors exceeds acceptable levels and the device is faulted to -prevent further use of the device. -.El -.It Sy OFFLINE -The device was explicitly taken offline by the -.Qq Nm Cm offline -command. -.It Sy ONLINE -The device is online and functioning. -.It Sy REMOVED -The device was physically removed while the system was running. Device removal -detection is hardware-dependent and may not be supported on all platforms. -.It Sy UNAVAIL -The device could not be opened. If a pool is imported when a device was -unavailable, then the device will be identified by a unique identifier instead -of its path since the path was never correct in the first place. -.El -.Pp -If a device is removed and later reattached to the system, -.Tn ZFS -attempts to put the device online automatically. Device attach detection is -hardware-dependent and might not be supported on all platforms. 
-.Ss Hot Spares -.Tn ZFS -allows devices to be associated with pools as -.Qq hot spares . -These devices are not actively used in the pool, but when an active device -fails, it is automatically replaced by a hot spare. To create a pool with hot -spares, specify a -.Qq spare -.No vdev -with any number of devices. For example, -.Bd -literal -offset 2n -.Li # Ic zpool create pool mirror da0 da1 spare da2 da3 -.Ed -.Pp -Spares can be shared across multiple pools, and can be added with the -.Qq Nm Cm add -command and removed with the -.Qq Nm Cm remove -command. Once a spare replacement is initiated, a new "spare" -.No vdev -is created -within the configuration that will remain there until the original device is -replaced. At this point, the hot spare becomes available again if another -device fails. -.Pp -If a pool has a shared spare that is currently being used, the pool can not be -exported since other pools may use this shared spare, which may lead to -potential data corruption. -.Pp -Shared spares add some risk. -If the pools are imported on different hosts, and both pools suffer a device -failure at the same time, both could attempt to use the spare at the same time. -This may not be detected, resulting in data corruption. -.Pp -An in-progress spare replacement can be cancelled by detaching the hot spare. -If the original faulted device is detached, then the hot spare assumes its -place in the configuration, and is removed from the spare list of all active -pools. -.Pp -Spares cannot replace log devices. -.Pp -This feature requires a userland helper. -FreeBSD provides -.Xr zfsd 8 -for this purpose. -It must be manually enabled by adding -.Va zfsd_enable="YES" -to -.Pa /etc/rc.conf . -.Ss Intent Log -The -.Tn ZFS -Intent Log -.Pq Tn ZIL -satisfies -.Tn POSIX -requirements for synchronous transactions. For instance, databases often -require their transactions to be on stable storage devices when returning from -a system call. 
-.Tn NFS -and other applications can also use -.Xr fsync 2 -to ensure data stability. By default, the intent log is allocated from blocks -within the main pool. However, it might be possible to get better performance -using separate intent log devices such as -.Tn NVRAM -or a dedicated disk. For example: -.Bd -literal -offset 2n -.Li # Ic zpool create pool da0 da1 log da2 -.Ed -.Pp -Multiple log devices can also be specified, and they can be mirrored. See the -.Sx EXAMPLES -section for an example of mirroring multiple log devices. -.Pp -Log devices can be added, replaced, attached, detached, imported and exported -as part of the larger pool. -Mirrored devices can be removed by specifying the top-level mirror vdev. -.Ss Cache devices -Devices can be added to a storage pool as "cache devices." These devices -provide an additional layer of caching between main memory and disk. For -read-heavy workloads, where the working set size is much larger than what can -be cached in main memory, using cache devices allow much more of this working -set to be served from low latency media. Using cache devices provides the -greatest performance improvement for random read-workloads of mostly static -content. -.Pp -To create a pool with cache devices, specify a "cache" -.No vdev -with any number of devices. For example: -.Bd -literal -offset 2n -.Li # Ic zpool create pool da0 da1 cache da2 da3 -.Ed -.Pp -Cache devices cannot be mirrored or part of a -.No raidz -configuration. If a read -error is encountered on a cache device, that read -.Tn I/O -is reissued to the original storage pool device, which might be part of a -mirrored or -.No raidz -configuration. -.Pp -The content of the cache devices is considered volatile, as is the case with -other system caches. 
-.Ss Pool checkpoint -Before starting critical procedures that include destructive actions (e.g -.Nm zfs Cm destroy -), an administrator can checkpoint the pool's state and in the case of a -mistake or failure, rewind the entire pool back to the checkpoint. -Otherwise, the checkpoint can be discarded when the procedure has completed -successfully. -.Pp -A pool checkpoint can be thought of as a pool-wide snapshot and should be used -with care as it contains every part of the pool's state, from properties to vdev -configuration. -Thus, while a pool has a checkpoint certain operations are not allowed. -Specifically, vdev removal/attach/detach, mirror splitting, and -changing the pool's guid. -Adding a new vdev is supported but in the case of a rewind it will have to be -added again. -Finally, users of this feature should keep in mind that scrubs in a pool that -has a checkpoint do not repair checkpointed data. -.Pp -To create a checkpoint for a pool: -.Bd -literal -# zpool checkpoint pool -.Ed -.Pp -To later rewind to its checkpointed state, you need to first export it and -then rewind it during import: -.Bd -literal -# zpool export pool -# zpool import --rewind-to-checkpoint pool -.Ed -.Pp -To discard the checkpoint from a pool: -.Bd -literal -# zpool checkpoint -d pool -.Ed -.Pp -Dataset reservations (controlled by the -.Nm reservation -or -.Nm refreservation -zfs properties) may be unenforceable while a checkpoint exists, because the -checkpoint is allowed to consume the dataset's reservation. -Finally, data that is part of the checkpoint but has been freed in the -current state of the pool won't be scanned during a scrub. -.Ss Special Allocation Class -The allocations in the special class are dedicated to specific block types. -By default this includes all metadata, the indirect blocks of user data, and -any dedup data. -The class can also be provisioned to accept a limited percentage of small file -data blocks. 
-.Pp -A pool must always have at least one general (non-specified) vdev before -other devices can be assigned to the special class. -If the special class becomes full, then allocations intended for it will spill -back into the normal class. -.Pp -Dedup data can be excluded from the special class by setting the -.Sy vfs.zfs.ddt_data_is_special -sysctl to false (0). -.Pp -Inclusion of small file blocks in the special class is opt-in. -Each dataset can control the size of small file blocks allowed in the special -class by setting the -.Sy special_small_blocks -dataset property. -It defaults to zero so you must opt-in by setting it to a non-zero value. -See -.Xr zfs 1M -for more info on setting this property. -.Ss Properties -Each pool has several properties associated with it. Some properties are -read-only statistics while others are configurable and change the behavior of -the pool. The following are read-only properties: -.Bl -tag -width "dedupratio" -.It Sy allocated -Amount of storage space used within the pool. -.It Sy capacity -Percentage of pool space used. This property can also be referred to by its -shortened column name, "cap". -.It Sy dedupratio -The deduplication ratio specified for a pool, expressed as a multiplier. -For example, a -.Sy dedupratio -value of 1.76 indicates that 1.76 units of data were stored but only 1 unit of disk space was actually consumed. See -.Xr zfs 8 -for a description of the deduplication feature. -.It Sy expandsize -Amount of uninitialized space within the pool or device that can be used to -increase the total capacity of the pool. -Uninitialized space consists of -any space on an EFI labeled vdev which has not been brought online -.Pq i.e. zpool online -e . -This space occurs when a LUN is dynamically expanded. -.It Sy fragmentation -The amount of fragmentation in the pool. -.It Sy free -Number of blocks within the pool that are not allocated. 
-.It Sy freeing -After a file system or snapshot is destroyed, the space it was using is -returned to the pool asynchronously. -.Sy freeing -is the amount of space remaining to be reclaimed. -Over time -.Sy freeing -will decrease while -.Sy free -increases. -.It Sy guid -A unique identifier for the pool. -.It Sy health -The current health of the pool. Health can be -.Qq Sy ONLINE , -.Qq Sy DEGRADED , -.Qq Sy FAULTED , -.Qq Sy OFFLINE , -.Qq Sy REMOVED , -or -.Qq Sy UNAVAIL . -.It Sy size -Total size of the storage pool. -.It Sy unsupported@ Ns Ar feature_guid -Information about unsupported features that are enabled on the pool. -See -.Xr zpool-features 7 -for details. -.El -.Pp -The space usage properties report actual physical space available to the -storage pool. The physical space can be different from the total amount of -space that any contained datasets can actually use. The amount of space used in -a -.No raidz -configuration depends on the characteristics of the data being written. -In addition, -.Tn ZFS -reserves some space for internal accounting that the -.Xr zfs 8 -command takes into account, but the -.Xr zpool 8 -command does not. For non-full pools of a reasonable size, these effects should -be invisible. For small pools, or pools that are close to being completely -full, these discrepancies may become more noticeable. -.Pp -The following property can be set at creation time and import time: -.Bl -tag -width 2n -.It Sy altroot -Alternate root directory. If set, this directory is prepended to any mount -points within the pool. This can be used when examining an unknown pool where -the mount points cannot be trusted, or in an alternate boot environment, where -the typical paths are not valid. -.Sy altroot -is not a persistent property. It is valid only while the system is up. -Setting -.Sy altroot -defaults to using -.Cm cachefile=none , -though this may be overridden using an explicit setting. 
-.El -.Pp -The following property can only be set at import time: -.Bl -tag -width 2n -.It Sy readonly Ns = Ns Cm on No | Cm off -If set to -.Cm on , -pool will be imported in read-only mode with the following restrictions: -.Bl -bullet -offset 2n -.It -Synchronous data in the intent log will not be accessible -.It -Properties of the pool can not be changed -.It -Datasets of this pool can only be mounted read-only -.It -To write to a read-only pool, a export and import of the pool is required. -.El -.Pp -This property can also be referred to by its shortened column name, -.Sy rdonly . -.El -.Pp -The following properties can be set at creation time and import time, and later -changed with the -.Ic zpool set -command: -.Bl -tag -width 2n -.It Sy autoexpand Ns = Ns Cm on No | Cm off -Controls automatic pool expansion when the underlying LUN is grown. If set to -.Qq Cm on , -the pool will be resized according to the size of the expanded -device. If the device is part of a mirror or -.No raidz -then all devices within that -.No mirror/ Ns No raidz -group must be expanded before the new space is made available to -the pool. The default behavior is -.Qq off . -This property can also be referred to by its shortened column name, -.Sy expand . -.It Sy autoreplace Ns = Ns Cm on No | Cm off -Controls automatic device replacement. If set to -.Qq Cm off , -device replacement must be initiated by the administrator by using the -.Qq Nm Cm replace -command. If set to -.Qq Cm on , -any new device, found in the same -physical location as a device that previously belonged to the pool, is -automatically formatted and replaced. The default behavior is -.Qq Cm off . -This property can also be referred to by its shortened column name, "replace". -.It Sy bootfs Ns = Ns Ar pool Ns / Ns Ar dataset -Identifies the default bootable dataset for the root pool. This property is -expected to be set mainly by the installation and upgrade programs. 
-.It Sy cachefile Ns = Ns Ar path No | Cm none -Controls the location of where the pool configuration is cached. Discovering -all pools on system startup requires a cached copy of the configuration data -that is stored on the root file system. All pools in this cache are -automatically imported when the system boots. Some environments, such as -install and clustering, need to cache this information in a different location -so that pools are not automatically imported. Setting this property caches the -pool configuration in a different location that can later be imported with -.Qq Nm Cm import Fl c . -Setting it to the special value -.Qq Cm none -creates a temporary pool that is never cached, and the special value -.Cm '' -(empty string) uses the default location. -.It Sy comment Ns = Ns Ar text -A text string consisting of printable ASCII characters that will be stored -such that it is available even if the pool becomes faulted. -An administrator can provide additional information about a pool using this -property. -.It Sy dedupditto Ns = Ns Ar number -Threshold for the number of block ditto copies. If the reference count for a -deduplicated block increases above this number, a new ditto copy of this block -is automatically stored. Default setting is -.Cm 0 -which causes no ditto copies to be created for deduplicated blocks. -The miniumum legal nonzero setting is 100. -.It Sy delegation Ns = Ns Cm on No | Cm off -Controls whether a non-privileged user is granted access based on the dataset -permissions defined on the dataset. See -.Xr zfs 8 -for more information on -.Tn ZFS -delegated administration. -.It Sy failmode Ns = Ns Cm wait No | Cm continue No | Cm panic -Controls the system behavior in the event of catastrophic pool failure. This -condition is typically a result of a loss of connectivity to the underlying -storage device(s) or a failure of all devices within the pool. 
The behavior of -such an event is determined as follows: -.Bl -tag -width indent -.It Sy wait -Blocks all -.Tn I/O -access until the device connectivity is recovered and the errors are cleared. -This is the default behavior. -.It Sy continue -Returns -.Em EIO -to any new write -.Tn I/O -requests but allows reads to any of the remaining healthy devices. Any write -requests that have yet to be committed to disk would be blocked. -.It Sy panic -Prints out a message to the console and generates a system crash dump. -.El -.It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled -The value of this property is the current state of -.Ar feature_name . -The only valid value when setting this property is -.Sy enabled -which moves -.Ar feature_name -to the enabled state. -See -.Xr zpool-features 7 -for details on feature states. -.It Sy listsnapshots Ns = Ns Cm on No | Cm off -Controls whether information about snapshots associated with this pool is -output when -.Qq Nm zfs Cm list -is run without the -.Fl t -option. The default value is -.Cm off . -This property can also be referred to by its shortened name, -.Sy listsnaps . -.It Sy multihost Ns = Ns Sy on No | Sy off -Controls whether a pool activity check should be performed during -.Nm zpool Cm import . -When a pool is determined to be active it cannot be imported, even with the -.Fl f -option. -This property is intended to be used in failover configurations -where multiple hosts have access to a pool on shared storage. -.Pp -Multihost provides protection on import only. -It does not protect against an -individual device being used in multiple pools, regardless of the type of vdev. -See the discussion under -.Sy zpool create. -.Pp -When this property is on, periodic writes to storage occur to show the pool is -in use. -See -.Sy vfs.zfs.multihost_interval -sysctl. -In order to enable this property each host must set a unique hostid. -The default value is -.Sy off . 
-.It Sy version Ns = Ns Ar version -The current on-disk version of the pool. This can be increased, but never -decreased. The preferred method of updating pools is with the -.Qq Nm Cm upgrade -command, though this property can be used when a specific version is needed -for backwards compatibility. -Once feature flags is enabled on a pool this property will no longer have a -value. -.El -.Sh SUBCOMMANDS -All subcommands that modify state are logged persistently to the pool in their -original form. -.Pp -The -.Nm -command provides subcommands to create and destroy storage pools, add capacity -to storage pools, and provide information about the storage pools. The following -subcommands are supported: -.Bl -tag -width 2n -.It Xo -.Nm -.Op Fl \&? -.Xc -.Pp -Displays a help message. -.It Xo -.Nm -.Cm add -.Op Fl fgLnP -.Ar pool vdev ... -.Xc -.Pp -Adds the specified virtual devices to the given pool. The -.No vdev -specification is described in the -.Qq Sx Virtual Devices -section. The behavior of the -.Fl f -option, and the device checks performed are described in the -.Qq Nm Cm create -subcommand. -.Bl -tag -width indent -.It Fl f -Forces use of -.Ar vdev , -even if they appear in use or specify a conflicting replication level. -Not all devices can be overridden in this manner. -.It Fl g -Display -.Ar vdev , -GUIDs instead of the normal device names. -These GUIDs can be used in place of -device names for the zpool detach/offline/remove/replace commands. -.It Fl L -Display real paths for -.Ar vdev Ns s -resolving all symbolic links. -This can be used to look up the current block -device name regardless of the /dev/disk/ path used to open it. -.It Fl n -Displays the configuration that would be used without actually adding the -.Ar vdev Ns s. -The actual pool creation can still fail due to insufficient privileges or -device sharing. -.It Fl P -Display real paths for -.Ar vdev Ns s -instead of only the last component of the path. 
-This can be used in conjunction with the -.Fl L -flag. -.El -.It Xo -.Nm -.Cm attach -.Op Fl f -.Ar pool device new_device -.Xc -.Pp -Attaches -.Ar new_device -to an existing -.Sy zpool -device. The existing device cannot be part of a -.No raidz -configuration. If -.Ar device -is not currently part of a mirrored configuration, -.Ar device -automatically transforms into a two-way mirror of -.Ar device No and Ar new_device . -If -.Ar device -is part of a two-way mirror, attaching -.Ar new_device -creates a three-way mirror, and so on. In either case, -.Ar new_device -begins to resilver immediately. -.Bl -tag -width indent -.It Fl f -Forces use of -.Ar new_device , -even if its appears to be in use. Not all devices can be overridden in this -manner. -.El -.It Xo -.Nm -.Cm checkpoint -.Op Fl d, -discard -.Ar pool -.Xc -Checkpoints the current state of -.Ar pool -, which can be later restored by -.Nm zpool Cm import --rewind-to-checkpoint . -The existence of a checkpoint in a pool prohibits the following -.Nm zpool -commands: -.Cm remove , -.Cm attach , -.Cm detach , -.Cm split , -and -.Cm reguid . -In addition, it may break reservation boundaries if the pool lacks free -space. -The -.Nm zpool Cm status -command indicates the existence of a checkpoint or the progress of discarding a -checkpoint from a pool. -The -.Nm zpool Cm list -command reports how much space the checkpoint takes from the pool. -.Bl -tag -width Ds -.It Fl d, -discard -Discards an existing checkpoint from -.Ar pool . -.El -.It Xo -.Nm -.Cm clear -.Op Fl F Op Fl n -.Ar pool -.Op Ar device -.Xc -.Pp -Clears device errors in a pool. -If no arguments are specified, all device errors within the pool are cleared. -If one or more devices is specified, only those errors associated with the -specified device or devices are cleared. -If multihost is enabled, and the pool has been suspended, this will not -resume I/O. 
-While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. -.Bl -tag -width indent -.It Fl F -Initiates recovery mode for an unopenable pool. Attempts to discard the last -few transactions in the pool to return it to an openable state. Not all damaged -pools can be recovered by using this option. If successful, the data from the -discarded transactions is irretrievably lost. -.It Fl n -Used in combination with the -.Fl F -flag. Check whether discarding transactions would make the pool openable, but -do not actually discard any transactions. -.El -.It Xo -.Nm -.Cm create -.Op Fl fnd -.Op Fl o Ar property Ns = Ns Ar value -.Ar ... -.Op Fl O Ar file-system-property Ns = Ns Ar value -.Ar ... -.Op Fl m Ar mountpoint -.Op Fl R Ar root -.Op Fl t Ar tempname -.Ar pool vdev ... -.Xc -.Pp -Creates a new storage pool containing the virtual devices specified on the -command line. The pool name must begin with a letter, and can only contain -alphanumeric characters as well as underscore ("_"), dash ("-"), and period -("."). The pool names "mirror", "raidz", "spare" and "log" are reserved, as are -names beginning with the pattern "c[0-9]". The -.No vdev -specification is described in the -.Qq Sx Virtual Devices -section. -.Pp -The command attempts to verify that each device specified is accessible and not -currently in use by another subsystem. -However this check is not robust enough -to detect simultaneous attempts to use a new device in different pools, even if -.Sy multihost -is -.Sy enabled. -The -administrator must ensure that simultaneous invocations of any combination of -.Sy zpool replace , -.Sy zpool create , -.Sy zpool add , -or -.Sy zpool labelclear , -do not refer to the same device. -Using the same device in two pools will -result in pool corruption. -.Pp -There are some uses, such as being currently mounted, or specified as the -dedicated dump device, that prevents a device from ever being used by ZFS. 
-Other uses, such as having a preexisting UFS file system, can be overridden -with the -.Fl f -option. -.Pp -The command also checks that the replication strategy for the pool is -consistent. An attempt to combine redundant and non-redundant storage in a -single pool, or to mix disks and files, results in an error unless -.Fl f -is specified. The use of differently sized devices within a single -.No raidz -or mirror group is also flagged as an error unless -.Fl f -is specified. -.Pp -Unless the -.Fl R -option is specified, the default mount point is -.Qq Pa /pool . -The mount point must not exist or must be empty, or else the -root dataset cannot be mounted. This can be overridden with the -.Fl m -option. -.Pp -By default all supported features are enabled on the new pool unless the -.Fl d -option is specified. -.Bl -tag -width indent -.It Fl f -Forces use of -.Ar vdev Ns s, -even if they appear in use or specify a conflicting replication level. -Not all devices can be overridden in this manner. -.It Fl n -Displays the configuration that would be used without actually creating the -pool. The actual pool creation can still fail due to insufficient privileges or -device sharing. -.It Fl d -Do not enable any features on the new pool. -Individual features can be enabled by setting their corresponding properties -to -.Sy enabled -with the -.Fl o -option. -See -.Xr zpool-features 7 -for details about feature properties. -.It Xo -.Fl o Ar property Ns = Ns Ar value -.Op Fl o Ar property Ns = Ns Ar value -.Ar ... -.Xc -Sets the given pool properties. See the -.Qq Sx Properties -section for a list of valid properties that can be set. -.It Xo -.Fl O -.Ar file-system-property Ns = Ns Ar value -.Op Fl O Ar file-system-property Ns = Ns Ar value -.Ar ... -.Xc -Sets the given file system properties in the root file system of the pool. See -.Xr zfs 8 Properties -for a list of valid properties that -can be set. 
-.It Fl R Ar root -Equivalent to -.Qq Fl o Cm cachefile=none,altroot= Ns Pa root -.It Fl m Ar mountpoint -Sets the mount point for the root dataset. The default mount point is -.Qq Pa /pool -or -.Qq Cm altroot Ns Pa /pool -if -.Sy altroot -is specified. The mount point must be an absolute path, -.Qq Cm legacy , -or -.Qq Cm none . -For more information on dataset mount points, see -.Xr zfs 8 . -.It Fl t Ar tempname -Sets the in-core pool name to -.Pa tempname -while the on-disk name will be the name specified as the pool name -.Pa pool . -This will set the default -.Sy cachefile -property to -.Sy none . -This is intended to handle name space collisions when creating pools -for other systems, such as virtual machines or physical machines -whose pools live on network block devices. -.El -.It Xo -.Nm -.Cm destroy -.Op Fl f -.Ar pool -.Xc -.Pp -Destroys the given pool, freeing up any devices for other use. This command -tries to unmount any active datasets before destroying the pool. -.Bl -tag -width indent -.It Fl f -Forces any active datasets contained within the pool to be unmounted. -.El -.It Xo -.Nm -.Cm detach -.Ar pool device -.Xc -.Pp -Detaches -.Ar device -from a mirror. The operation is refused if there are no other valid replicas -of the data. -.It Xo -.Nm -.Cm export -.Op Fl f -.Ar pool ... -.Xc -.Pp -Exports the given pools from the system. All devices are marked as exported, -but are still considered in use by other subsystems. The devices can be moved -between systems (even those of different endianness) and imported as long as a -sufficient number of devices are present. -.Pp -Before exporting the pool, all datasets within the pool are unmounted. A pool -can not be exported if it has a shared spare that is currently being used. -.Pp -For pools to be portable, you must give the -.Nm -command whole disks, not just slices, so that -.Tn ZFS -can label the disks with portable -.Sy EFI -labels. 
Otherwise, disk drivers on platforms of different endianness will not -recognize the disks. -.Bl -tag -width indent -.It Fl f -Forcefully unmount all datasets, using the -.Qq Nm unmount Fl f -command. -.Pp -This command will forcefully export the pool even if it has a shared spare that -is currently being used. This may lead to potential data corruption. -.El -.It Xo -.Nm -.Cm get -.Op Fl Hp -.Op Fl o Ar field Ns Op , Ns Ar ... -.Ar all | property Ns Op , Ns Ar ... -.Ar pool ... -.Xc -.Pp -Retrieves the given list of properties (or all properties if -.Qq Cm all -is used) for the specified storage pool(s). These properties are displayed with -the following fields: -.Bl -column -offset indent "property" -.It name Ta Name of storage pool -.It property Ta Property name -.It value Ta Property value -.It source Ta Property source, either 'default' or 'local'. -.El -.Pp -See the -.Qq Sx Properties -section for more information on the available pool properties. -.It Fl H -Scripted mode. Do not display headers, and separate fields by a single tab -instead of arbitrary space. -.It Fl p -Display numbers in parsable (exact) values. -.It Fl o Ar field -A comma-separated list of columns to display. -.Sy name Ns , Ns -.Sy property Ns , Ns -.Sy value Ns , Ns -.Sy source -is the default value. -.It Xo -.Nm -.Cm history -.Op Fl il -.Op Ar pool -.Ar ... -.Xc -.Pp -Displays the command history of the specified pools or all pools if no pool is -specified. -.Bl -tag -width indent -.It Fl i -Displays internally logged -.Tn ZFS -events in addition to user initiated events. -.It Fl l -Displays log records in long format, which in addition to standard format -includes, the user name, the hostname, and the zone in which the operation was -performed. -.El -.It Xo -.Nm -.Cm import -.Op Fl d Ar dir | Fl c Ar cachefile -.Op Fl D -.Xc -.Pp -Lists pools available to import. If the -.Fl d -option is not specified, this command searches for devices in -.Qq Pa /dev . 
-The -.Fl d -option can be specified multiple times, and all directories are searched. If -the device appears to be part of an exported pool, this command displays a -summary of the pool with the name of the pool, a numeric identifier, as well as -the -.No vdev -layout and current health of the device for each device or file. -Destroyed pools, pools that were previously destroyed with the -.Qq Nm Cm destroy -command, are not listed unless the -.Fl D -option is specified. -.Pp -The numeric identifier is unique, and can be used instead of the pool name when -multiple exported pools of the same name are available. -.Bl -tag -width indent -.It Fl c Ar cachefile -Reads configuration from the given -.Ar cachefile -that was created with the -.Qq Sy cachefile -pool property. This -.Ar cachefile -is used instead of searching for devices. -.It Fl d Ar dir -Searches for devices or files in -.Ar dir . -The -.Fl d -option can be specified multiple times. -.It Fl D -Lists destroyed pools only. -.El -.It Xo -.Nm -.Cm import -.Op Fl o Ar mntopts -.Op Fl o Ar property Ns = Ns Ar value -.Ar ... -.Op Fl d Ar dir | Fl c Ar cachefile -.Op Fl D -.Op Fl f -.Op Fl m -.Op Fl N -.Op Fl R Ar root -.Op Fl F Op Fl n -.Fl a -.Xc -.Pp -Imports all pools found in the search directories. Identical to the previous -command, except that all pools with a sufficient number of devices available -are imported. Destroyed pools, pools that were previously destroyed with the -.Qq Nm Cm destroy -command, will not be imported unless the -.Fl D -option is specified. -.Bl -tag -width indent -.It Fl o Ar mntopts -Comma-separated list of mount options to use when mounting datasets within the -pool. See -.Xr zfs 8 -for a description of dataset properties and mount options. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property on the imported pool. See the -.Qq Sx Properties -section for more information on the available pool properties. 
-.It Fl c Ar cachefile -Reads configuration from the given -.Ar cachefile -that was created with the -.Qq Sy cachefile -pool property. This -.Ar cachefile -is used instead of searching for devices. -.It Fl d Ar dir -Searches for devices or files in -.Ar dir . -The -.Fl d -option can be specified multiple times. This option is incompatible with the -.Fl c -option. -.It Fl D -Imports destroyed pools only. The -.Fl f -option is also required. -.It Fl f -Forces import, even if the pool appears to be potentially active. -.It Fl m -Allows a pool to import when there is a missing log device. Recent transactions -can be lost because the log device will be discarded. -.It Fl N -Import the pool without mounting any file systems. -.It Fl R Ar root -Sets the -.Qq Sy cachefile -property to -.Qq Cm none -and the -.Qq Sy altroot -property to -.Qq Ar root -.It Fl F -Recovery mode for a non-importable pool. Attempt to return the pool to an -importable state by discarding the last few transactions. Not all damaged pools -can be recovered by using this option. If successful, the data from the -discarded transactions is irretrievably lost. This option is ignored if the -pool is importable or already imported. -.It Fl n -Used with the -.Fl F -recovery option. Determines whether a non-importable pool can be made -importable again, but does not actually perform the pool recovery. For more -details about pool recovery mode, see the -.Fl F -option, above. -.It Fl a -Searches for and imports all pools found. -.El -.It Xo -.Nm -.Cm import -.Op Fl o Ar mntopts -.Op Fl o Ar property Ns = Ns Ar value -.Ar ... -.Op Fl d Ar dir | Fl c Ar cachefile -.Op Fl D -.Op Fl f -.Op Fl m -.Op Fl N -.Op Fl R Ar root -.Op Fl t -.Op Fl F Op Fl n -.Ar pool | id -.Op Ar newpool -.Xc -.Pp -Imports a specific pool. A pool can be identified by its name or the numeric -identifier. If -.Ar newpool -is specified, the pool is imported using the name -.Ar newpool . 
-Otherwise, it is imported with the same name as its exported name. -.Pp -If a device is removed from a system without running -.Qq Nm Cm export -first, the device appears as potentially active. It cannot be determined if -this was a failed export, or whether the device is really in use from another -host. To import a pool in this state, the -.Fl f -option is required. -.Bl -tag -width indent -.It Fl o Ar mntopts -Comma-separated list of mount options to use when mounting datasets within the -pool. See -.Xr zfs 8 -for a description of dataset properties and mount options. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property on the imported pool. See the -.Qq Sx Properties -section for more information on the available pool properties. -.It Fl c Ar cachefile -Reads configuration from the given -.Ar cachefile -that was created with the -.Qq Sy cachefile -pool property. This -.Ar cachefile -is used instead of searching for devices. -.It Fl d Ar dir -Searches for devices or files in -.Ar dir . -The -.Fl d -option can be specified multiple times. This option is incompatible with the -.Fl c -option. -.It Fl D -Imports destroyed pools only. The -.Fl f -option is also required. -.It Fl f -Forces import, even if the pool appears to be potentially active. -.It Fl m -Allows a pool to import when there is a missing log device. Recent transactions -can be lost because the log device will be discarded. -.It Fl N -Import the pool without mounting any file systems. -.It Fl R Ar root -Equivalent to -.Qq Fl o Cm cachefile=none,altroot= Ns Pa root -.It Fl t -Used with -.Ar newpool . -Specifies that -.Ar newpool -is temporary. -Temporary pool names last until export. -Ensures that the original pool name will be used in all label updates and -therefore is retained upon export. -Will also set -.Sy cachefile -property to -.Sy none -when not explicitly specified. -.It Fl F -Recovery mode for a non-importable pool. 
Attempt to return the pool to an -importable state by discarding the last few transactions. Not all damaged pools -can be recovered by using this option. If successful, the data from the -discarded transactions is irretrievably lost. This option is ignored if the -pool is importable or already imported. -.It Fl n -Used with the -.Fl F -recovery option. Determines whether a non-importable pool can be made -importable again, but does not actually perform the pool recovery. For more -details about pool recovery mode, see the -.Fl F -option, above. -.It Fl -rewind-to-checkpoint -Rewinds pool to the checkpointed state. -Once the pool is imported with this flag there is no way to undo the rewind. -All changes and data that were written after the checkpoint are lost! -The only exception is when the -.Sy readonly -mounting option is enabled. -In this case, the checkpointed state of the pool is opened and an -administrator can see how the pool would look like if they were -to fully rewind. -.El -.It Xo -.Nm -.Cm initialize -.Op Fl cs -.Ar pool -.Op Ar device Ns ... -.Xc -Begins initializing by writing to all unallocated regions on the specified -devices, or all eligible devices in the pool if no individual devices are -specified. -Only leaf data or log devices may be initialized. -.Bl -tag -width Ds -.It Fl c, -cancel -Cancel initializing on the specified devices, or all eligible devices if none -are specified. -If one or more target devices are invalid or are not currently being -initialized, the command will fail and no cancellation will occur on any device. -.It Fl s -suspend -Suspend initializing on the specified devices, or all eligible devices if none -are specified. -If one or more target devices are invalid or are not currently being -initialized, the command will fail and no suspension will occur on any device. -Initializing can then be resumed by running -.Nm zpool Cm initialize -with no flags on the relevant target devices. 
-.El -.It Xo -.Nm -.Cm iostat -.Op Fl T Cm d Ns | Ns Cm u -.Op Fl gLPv -.Op Ar pool -.Ar ... -.Op Ar interval Op Ar count -.Xc -.Pp -Displays -.Tn I/O -statistics for the given pools. When given an interval, the statistics are -printed every -.Ar interval -seconds until -.Sy Ctrl-C -is pressed. If no -.Ar pools -are specified, statistics for every pool in the system is shown. If -.Ar count -is specified, the command exits after -.Ar count -reports are printed. -.Bl -tag -width indent -.It Fl T Cm d Ns | Ns Cm u -Print a timestamp. -.Pp -Use modifier -.Cm d -for standard date format. See -.Xr date 1 . -Use modifier -.Cm u -for unixtime -.Pq equals Qq Ic date +%s . -.It Fl g -Display vdev GUIDs instead of the normal device names. -These GUIDs can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl L -Display real paths for vdevs resolving all symbolic links. -This can be used to look up the current block device name regardless of the -.Pa /dev/disk/ -path used to open it. -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. -This can be used in conjunction with the -.Fl L -flag. -.It Fl v -Verbose statistics. -Reports usage statistics for individual vdevs within the -pool, in addition to the pool-wide statistics. -.El -.It Xo -.Nm -.Cm labelclear -.Op Fl f -.Ar device -.Xc -.Pp -Removes -.Tn ZFS -label information from the specified -.Ar device . -The -.Ar device -must not be part of an active pool configuration. -.Bl -tag -width indent -.It Fl f -Treat exported or foreign devices as inactive. -.El -.It Xo -.Nm -.Cm list -.Op Fl HgLpPv -.Op Fl o Ar property Ns Op , Ns Ar ... -.Op Fl T Cm d Ns | Ns Cm u -.Op Ar pool -.Ar ... -.Op Ar interval Op Ar count -.Xc -.Pp -Lists the given pools along with a health status and space usage. If no -.Ar pools -are specified, all pools in the system are listed. 
-.Pp -When given an interval, the output is printed every -.Ar interval -seconds until -.Sy Ctrl-C -is pressed. If -.Ar count -is specified, the command exits after -.Ar count -reports are printed. -.Bl -tag -width indent -.It Fl T Cm d Ns | Ns Cm u -Print a timestamp. -.Pp -Use modifier -.Cm d -for standard date format. See -.Xr date 1 . -Use modifier -.Cm u -for unixtime -.Pq equals Qq Ic date +%s . -.It Fl g -Display vdev GUIDs instead of the normal device names. -These GUIDs can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl H -Scripted mode. Do not display headers, and separate fields by a single tab -instead of arbitrary space. -.It Fl L -Display real paths for vdevs resolving all symbolic links. -This can be used to look up the current block device name regardless of the -/dev/disk/ path used to open it. -.It Fl p -Display numbers in parsable -.Pq exact -values. -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. -This can be used in conjunction with the -.Fl L -flag. -.It Fl v -Verbose statistics. Reports usage statistics for individual -.Em vdevs -within -the pool, in addition to the pool-wide statistics. -.It Fl o Ar property Ns Op , Ns Ar ... -Comma-separated list of properties to display. See the -.Qq Sx Properties -section for a list of valid properties. The default list is -.Sy name , -.Sy size , -.Sy allocated , -.Sy free , -.Sy checkpoint , -.Sy expandsize , -.Sy fragmentation , -.Sy capacity , -.Sy dedupratio , -.Sy health , -.Sy altroot . -.It Fl T Cm d Ns | Ns Cm u -Print a timestamp. -.Pp -Use modifier -.Cm d -for standard date format. See -.Xr date 1 . -Use modifier -.Cm u -for unixtime -.Pq equals Qq Ic date +%s . -.El -.It Xo -.Nm -.Cm offline -.Op Fl t -.Ar pool device ... -.Xc -.Pp -Takes the specified physical device offline. While the -.Ar device -is offline, no attempt is made to read or write to the device. -.Bl -tag -width indent -.It Fl t -Temporary. 
Upon reboot, the specified physical device reverts to its previous -state. -.El -.It Xo -.Nm -.Cm online -.Op Fl e -.Ar pool device ... -.Xc -.Pp -Brings the specified physical device online. -.Pp -This command is not applicable to spares or cache devices. -.Bl -tag -width indent -.It Fl e -Expand the device to use all available space. If the device is part of a mirror -or -.No raidz -then all devices must be expanded before the new space will become -available to the pool. -.El -.It Xo -.Nm -.Cm reguid -.Ar pool -.Xc -.Pp -Generates a new unique identifier for the pool. You must ensure that all -devices in this pool are online and healthy before performing this action. -.It Xo -.Nm -.Cm remove -.Op Fl np -.Ar pool device ... -.Xc -.Pp -Removes the specified device from the pool. -This command currently only supports removing hot spares, cache, log -devices and mirrored top-level vdevs (mirror of leaf devices); but not raidz. -.Pp -Removing a top-level vdev reduces the total amount of space in the storage pool. -The specified device will be evacuated by copying all allocated space from it to -the other devices in the pool. -In this case, the -.Nm zpool Cm remove -command initiates the removal and returns, while the evacuation continues in -the background. -The removal progress can be monitored with -.Nm zpool Cm status. -This feature must be enabled to be used, see -.Xr zpool-features 7 -.Pp -A mirrored top-level device (log or data) can be removed by specifying the -top-level mirror for the same. -Non-log devices or data devices that are part of a mirrored configuration can -be removed using the -.Qq Nm Cm detach -command. -.Bl -tag -width Ds -.It Fl n -Do not actually perform the removal ("no-op"). -Instead, print the estimated amount of memory that will be used by the -mapping table after the removal completes. -This is nonzero only for top-level vdevs. 
-.El -.Bl -tag -width Ds -.It Fl p -Used in conjunction with the -.Fl n -flag, displays numbers as parsable (exact) values. -.El -.It Xo -.Nm -.Cm remove -.Fl s -.Ar pool -.Xc -.Pp -Stops and cancels an in-progress removal of a top-level vdev. -.It Xo -.Nm -.Cm reopen -.Ar pool -.Xc -.Pp -Reopen all the vdevs associated with the pool. -.It Xo -.Nm -.Cm replace -.Op Fl f -.Ar pool device -.Op Ar new_device -.Xc -.Pp -Replaces -.Ar old_device -with -.Ar new_device . -This is equivalent to attaching -.Ar new_device , -waiting for it to resilver, and then detaching -.Ar old_device . -.Pp -The size of -.Ar new_device -must be greater than or equal to the minimum size -of all the devices in a mirror or -.No raidz -configuration. -.Pp -.Ar new_device -is required if the pool is not redundant. If -.Ar new_device -is not specified, it defaults to -.Ar old_device . -This form of replacement is useful after an existing disk has failed and has -been physically replaced. In this case, the new disk may have the same -.Pa /dev -path as the old device, even though it is actually a different disk. -.Tn ZFS -recognizes this. -.Bl -tag -width indent -.It Fl f -Forces use of -.Ar new_device , -even if its appears to be in use. Not all devices can be overridden in this -manner. -.El -.It Xo -.Nm -.Cm scrub -.Op Fl s | Fl p -.Ar pool ... -.Xc -.Pp -Begins a scrub or resumes a paused scrub. -The scrub examines all data in the specified pools to verify that it checksums -correctly. -For replicated -.Pq mirror or raidz -devices, ZFS automatically repairs any damage discovered during the scrub. -The -.Nm zpool Cm status -command reports the progress of the scrub and summarizes the results of the -scrub upon completion. -.Pp -Scrubbing and resilvering are very similar operations. 
-The difference is that resilvering only examines data that ZFS knows to be out -of date -.Po -for example, when attaching a new device to a mirror or replacing an existing -device -.Pc , -whereas scrubbing examines all data to discover silent errors due to hardware -faults or disk failure. -.Pp -Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows -one at a time. -If a scrub is paused, the -.Nm zpool Cm scrub -resumes it. -If a resilver is in progress, ZFS does not allow a scrub to be started until the -resilver completes. -.Bl -tag -width Ds -.It Fl s -Stop scrubbing. -.El -.Bl -tag -width Ds -.It Fl p -Pause scrubbing. -Scrub pause state and progress are periodically synced to disk. -If the system is restarted or pool is exported during a paused scrub, -even after import, scrub will remain paused until it is resumed. -Once resumed the scrub will pick up from the place where it was last -checkpointed to disk. -To resume a paused scrub issue -.Nm zpool Cm scrub -again. -.El -.It Xo -.Nm -.Cm set -.Ar property Ns = Ns Ar value pool -.Xc -.Pp -Sets the given property on the specified pool. See the -.Qq Sx Properties -section for more information on what properties can be set and acceptable -values. -.It Xo -.Nm -.Cm split -.Op Fl gLnP -.Op Fl R Ar altroot -.Op Fl o Ar mntopts -.Op Fl o Ar property Ns = Ns Ar value -.Ar pool newpool -.Op Ar device ... -.Xc -.Pp -Splits off one disk from each mirrored top-level -.No vdev -in a pool and creates a new pool from the split-off disks. The original pool -must be made up of one or more mirrors and must not be in the process of -resilvering. The -.Cm split -subcommand chooses the last device in each mirror -.No vdev -unless overridden by a device specification on the command line. 
-.Pp -When using a -.Ar device -argument, -.Cm split -includes the specified device(s) in a new pool and, should any devices remain -unspecified, assigns the last device in each mirror -.No vdev -to that pool, as it does normally. If you are uncertain about the outcome of a -.Cm split -command, use the -.Fl n -("dry-run") option to ensure your command will have the effect you intend. -.Bl -tag -width indent -.It Fl R Ar altroot -Automatically import the newly created pool after splitting, using the -specified -.Ar altroot -parameter for the new pool's alternate root. See the -.Sy altroot -description in the -.Qq Sx Properties -section, above. -.It Fl g -Display vdev GUIDs instead of the normal device names. -These GUIDs can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl L -Display real paths for vdevs resolving all symbolic links. -This can be used to look up the current block device name regardless of the -.Pa /dev/disk/ -path used to open it. -.It Fl n -Displays the configuration that would be created without actually splitting the -pool. The actual pool split could still fail due to insufficient privileges or -device status. -.It Fl o Ar mntopts -Comma-separated list of mount options to use when mounting datasets within the -pool. See -.Xr zfs 8 -for a description of dataset properties and mount options. Valid only in -conjunction with the -.Fl R -option. -.It Fl o Ar property Ns = Ns Ar value -Sets the specified property on the new pool. See the -.Qq Sx Properties -section, above, for more information on the available pool properties. -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. -This can be used in conjunction with the -.Fl L -flag. -.El -.It Xo -.Nm -.Cm status -.Op Fl DgLPvx -.Op Fl T Cm d Ns | Ns Cm u -.Op Ar pool -.Ar ... -.Op Ar interval Op Ar count -.Xc -.Pp -Displays the detailed health status for the given pools. 
If no -.Ar pool -is specified, then the status of each pool in the system is displayed. For more -information on pool and device health, see the -.Qq Sx Device Failure and Recovery -section. -.Pp -When given an interval, the output is printed every -.Ar interval -seconds until -.Sy Ctrl-C -is pressed. If -.Ar count -is specified, the command exits after -.Ar count -reports are printed. -.Pp -If a scrub or resilver is in progress, this command reports the percentage -done and the estimated time to completion. Both of these are only approximate, -because the amount of data in the pool and the other workloads on the system -can change. -.Bl -tag -width indent -.It Fl D -Display a histogram of deduplication statistics, showing the allocated -.Pq physically present on disk -and referenced -.Pq logically referenced in the pool -block counts and sizes by reference count. -.It Fl g -Display vdev GUIDs instead of the normal device names. -These GUIDs can be used in place of device names for the zpool -detach/offline/remove/replace commands. -.It Fl L -Display real paths for vdevs resolving all symbolic links. -This can be used to look up the current block device name regardless of the -.Pa /dev/disk/ -path used to open it. -.It Fl P -Display full paths for vdevs instead of only the last component of -the path. -This can be used in conjunction with the -.Fl L -flag. -.It Fl T Cm d Ns | Ns Cm u -Print a timestamp. -.Pp -Use modifier -.Cm d -for standard date format. See -.Xr date 1 . -Use modifier -.Cm u -for unixtime -.Pq equals Qq Ic date +%s . -.It Fl v -Displays verbose data error information, printing out a complete list of all -data errors since the last complete pool scrub. -.It Fl x -Only display status for pools that are exhibiting errors or are otherwise -unavailable. -Warnings about pools not using the latest on-disk format, having non-native -block size or disabled features will not be included. -.El -.It Xo -.Nm -.Cm sync -.Oo Ar pool Oc Ns ... 
-.Xc -Forces all in-core dirty data to be written to the primary pool storage and -not the ZIL. -It will also update administrative information including quota reporting. -Without arguments, -.Nm zpool Cm sync -will sync all pools on the system. -Otherwise, it will only sync the specified -.Ar pool . -.It Xo -.Nm -.Cm upgrade -.Op Fl v -.Xc -.Pp -Displays pools which do not have all supported features enabled and pools -formatted using a legacy -.Tn ZFS -version number. -These pools can continue to be used, but some features may not be available. -Use -.Nm Cm upgrade Fl a -to enable all features on all pools. -.Bl -tag -width indent -.It Fl v -Displays legacy -.Tn ZFS -versions supported by the current software. -See -.Xr zpool-features 7 -for a description of feature flags features supported by the current software. -.El -.It Xo -.Nm -.Cm upgrade -.Op Fl V Ar version -.Fl a | Ar pool ... -.Xc -.Pp -Enables all supported features on the given pool. -Once this is done, the pool will no longer be accessible on systems that do -not support feature flags. -See -.Xr zpool-features 7 -for details on compatibility with systems that support feature flags, but do -not support all features enabled on the pool. -.Bl -tag -width indent -.It Fl a -Enables all supported features on all pools. -.It Fl V Ar version -Upgrade to the specified legacy version. If the -.Fl V -flag is specified, no features will be enabled on the pool. -This option can only be used to increase version number up to the last -supported legacy version number. -.El -.El -.Sh EXIT STATUS -The following exit values are returned: -.Bl -tag -offset 2n -width 2n -.It 0 -Successful completion. -.It 1 -An error occurred. -.It 2 -Invalid command line options were specified. -.El -.Sh ENVIRONMENT VARIABLES -.Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS" -.It Ev ZPOOL_VDEV_NAME_GUID -Cause -.Nm zpool -subcommands to output vdev guids by default. 
-This behavior is identical to the -.Nm zpool status -g -command line option. -.It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS -Cause -.Nm zpool -subcommands to follow links for vdev names by default. -This behavior is identical to the -.Nm zpool status -L -command line option. -.It Ev ZPOOL_VDEV_NAME_PATH -Cause -.Nm zpool -subcommands to output full vdev path names by default. -This behavior is identical to the -.Nm zpool status -P -command line option. -.El -.Sh EXAMPLES -.Bl -tag -width 0n -.It Sy Example 1 No Creating a RAID-Z Storage Pool -.Pp -The following command creates a pool with a single -.No raidz -root -.No vdev -that consists of six disks. -.Bd -literal -offset 2n -.Li # Ic zpool create tank raidz da0 da1 da2 da3 da4 da5 -.Ed -.It Sy Example 2 No Creating a Mirrored Storage Pool -.Pp -The following command creates a pool with two mirrors, where each mirror -contains two disks. -.Bd -literal -offset 2n -.Li # Ic zpool create tank mirror da0 da1 mirror da2 da3 -.Ed -.It Sy Example 3 No Creating a Tn ZFS No Storage Pool by Using Partitions -.Pp -The following command creates an unmirrored pool using two GPT partitions. -.Bd -literal -offset 2n -.Li # Ic zpool create tank da0p3 da1p3 -.Ed -.It Sy Example 4 No Creating a Tn ZFS No Storage Pool by Using Files -.Pp -The following command creates an unmirrored pool using files. While not -recommended, a pool based on files can be useful for experimental purposes. -.Bd -literal -offset 2n -.Li # Ic zpool create tank /path/to/file/a /path/to/file/b -.Ed -.It Sy Example 5 No Adding a Mirror to a Tn ZFS No Storage Pool -.Pp -The following command adds two mirrored disks to the pool -.Em tank , -assuming the pool is already made up of two-way mirrors. The additional space -is immediately available to any datasets within the pool. 
-.Bd -literal -offset 2n -.Li # Ic zpool add tank mirror da2 da3 -.Ed -.It Sy Example 6 No Listing Available Tn ZFS No Storage Pools -.Pp -The following command lists all available pools on the system. -.Bd -literal -offset 2n -.Li # Ic zpool list -NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT -pool 2.70T 473G 2.24T 33% - 17% 1.00x ONLINE - -test 1.98G 89.5K 1.98G 48% - 0% 1.00x ONLINE - -.Ed -.It Sy Example 7 No Listing All Properties for a Pool -.Pp -The following command lists all the properties for a pool. -.Bd -literal -offset 2n -.Li # Ic zpool get all pool -pool size 2.70T - -pool capacity 17% - -pool altroot - default -pool health ONLINE - -pool guid 2501120270416322443 default -pool version 28 default -pool bootfs pool/root local -pool delegation on default -pool autoreplace off default -pool cachefile - default -pool failmode wait default -pool listsnapshots off default -pool autoexpand off default -pool dedupditto 0 default -pool dedupratio 1.00x - -pool free 2.24T - -pool allocated 473G - -pool readonly off - -.Ed -.It Sy Example 8 No Destroying a Tn ZFS No Storage Pool -.Pp -The following command destroys the pool -.Qq Em tank -and any datasets contained within. -.Bd -literal -offset 2n -.Li # Ic zpool destroy -f tank -.Ed -.It Sy Example 9 No Exporting a Tn ZFS No Storage Pool -.Pp -The following command exports the devices in pool -.Em tank -so that they can be relocated or later imported. -.Bd -literal -offset 2n -.Li # Ic zpool export tank -.Ed -.It Sy Example 10 No Importing a Tn ZFS No Storage Pool -.Pp -The following command displays available pools, and then imports the pool -.Qq Em tank -for use on the system. -.Pp -The results from this command are similar to the following: -.Bd -literal -offset 2n -.Li # Ic zpool import - - pool: tank - id: 15451357997522795478 - state: ONLINE -action: The pool can be imported using its name or numeric identifier. 
-config: - - tank ONLINE - mirror ONLINE - da0 ONLINE - da1 ONLINE -.Ed -.It Xo -.Sy Example 11 -Upgrading All -.Tn ZFS -Storage Pools to the Current Version -.Xc -.Pp -The following command upgrades all -.Tn ZFS -Storage pools to the current version of -the software. -.Bd -literal -offset 2n -.Li # Ic zpool upgrade -a -This system is currently running ZFS pool version 28. -.Ed -.It Sy Example 12 No Managing Hot Spares -.Pp -The following command creates a new pool with an available hot spare: -.Bd -literal -offset 2n -.Li # Ic zpool create tank mirror da0 da1 spare da2 -.Ed -.Pp -If one of the disks were to fail, the pool would be reduced to the degraded -state. The failed device can be replaced using the following command: -.Bd -literal -offset 2n -.Li # Ic zpool replace tank da0 da2 -.Ed -.Pp -Once the data has been resilvered, the spare is automatically removed and is -made available should another device fails. The hot spare can be permanently -removed from the pool using the following command: -.Bd -literal -offset 2n -.Li # Ic zpool remove tank da2 -.Ed -.It Xo -.Sy Example 13 -Creating a -.Tn ZFS -Pool with Mirrored Separate Intent Logs -.Xc -.Pp -The following command creates a -.Tn ZFS -storage pool consisting of two, two-way -mirrors and mirrored log devices: -.Bd -literal -offset 2n -.Li # Ic zpool create pool mirror da0 da1 mirror da2 da3 log mirror da4 da5 -.Ed -.It Sy Example 14 No Adding Cache Devices to a Tn ZFS No Pool -.Pp -The following command adds two disks for use as cache devices to a -.Tn ZFS -storage pool: -.Bd -literal -offset 2n -.Li # Ic zpool add pool cache da2 da3 -.Ed -.Pp -Once added, the cache devices gradually fill with content from main memory. -Depending on the size of your cache devices, it could take over an hour for -them to fill. 
Capacity and reads can be monitored using the -.Cm iostat -subcommand as follows: -.Bd -literal -offset 2n -.Li # Ic zpool iostat -v pool 5 -.Ed -.It Xo -.Sy Example 15 -Displaying expanded space on a device -.Xc -.Pp -The following command dipslays the detailed information for the -.Em data -pool. -This pool is comprised of a single -.Em raidz -vdev where one of its -devices increased its capacity by 10GB. -In this example, the pool will not -be able to utilized this extra capacity until all the devices under the -.Em raidz -vdev have been expanded. -.Bd -literal -offset 2n -.Li # Ic zpool list -v data -NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT -data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE - - raidz1 23.9G 14.6G 9.30G 48% - - ada0 - - - - - - ada1 - - - - 10G - ada2 - - - - - -.Ed -.It Xo -.Sy Example 16 -Removing a Mirrored top-level (Log or Data) Device -.Xc -.Pp -The following commands remove the mirrored log device -.Sy mirror-2 -and mirrored top-level data device -.Sy mirror-1 . -.Pp -Given this configuration: -.Bd -literal -offset 2n - pool: tank - state: ONLINE - scrub: none requested - config: - - NAME STATE READ WRITE CKSUM - tank ONLINE 0 0 0 - mirror-0 ONLINE 0 0 0 - da0 ONLINE 0 0 0 - da1 ONLINE 0 0 0 - mirror-1 ONLINE 0 0 0 - da2 ONLINE 0 0 0 - da3 ONLINE 0 0 0 - logs - mirror-2 ONLINE 0 0 0 - da4 ONLINE 0 0 0 - da5 ONLINE 0 0 0 -.Ed -.Pp -The command to remove the mirrored log -.Em mirror-2 -is: -.Bd -literal -offset 2n -.Li # Ic zpool remove tank mirror-2 -.Ed -.Pp -The command to remove the mirrored data -.Em mirror-1 -is: -.Bd -literal -offset 2n -.Li # Ic zpool remove tank mirror-1 -.Ed -.It Xo -.Sy Example 17 -Recovering a Faulted -.Tn ZFS -Pool -.Xc -.Pp -If a pool is faulted but recoverable, a message indicating this state is -provided by -.Qq Nm Cm status -if the pool was cached (see the -.Fl c Ar cachefile -argument above), or as part of the error output from a failed -.Qq Nm Cm import -of the pool. 
-.Pp -Recover a cached pool with the -.Qq Nm Cm clear -command: -.Bd -literal -offset 2n -.Li # Ic zpool clear -F data -Pool data returned to its state as of Tue Sep 08 13:23:35 2009. -Discarded approximately 29 seconds of transactions. -.Ed -.Pp -If the pool configuration was not cached, use -.Qq Nm Cm import -with the recovery mode flag: -.Bd -literal -offset 2n -.Li # Ic zpool import -F data -Pool data returned to its state as of Tue Sep 08 13:23:35 2009. -Discarded approximately 29 seconds of transactions. -.Ed -.El -.Sh SEE ALSO -.Xr zpool-features 7 , -.Xr zfs 8 , -.Xr zfsd 8 -.Sh HISTORY -The -.Nm -utility first appeared in -.Fx 7.0 . -.Sh AUTHORS -This manual page is a -.Xr mdoc 7 -reimplementation of the -.Tn OpenSolaris -manual page -.Em zpool(1M) , -modified and customized for -.Fx -and licensed under the Common Development and Distribution License -.Pq Tn CDDL . -.Pp -The -.Xr mdoc 7 -implementation of this manual page was initially written by -.An Martin Matuska Aq mm@FreeBSD.org . diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_iter.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_iter.c deleted file mode 100644 index 2f7de933ed41..000000000000 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_iter.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2016 Igor Kozhukhov . - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "zpool_util.h" - -/* - * Private interface for iterating over pools specified on the command line. - * Most consumers will call for_each_pool, but in order to support iostat, we - * allow fined grained control through the zpool_list_t interface. - */ - -typedef struct zpool_node { - zpool_handle_t *zn_handle; - uu_avl_node_t zn_avlnode; - int zn_mark; -} zpool_node_t; - -struct zpool_list { - boolean_t zl_findall; - uu_avl_t *zl_avl; - uu_avl_pool_t *zl_pool; - zprop_list_t **zl_proplist; -}; - -/* ARGSUSED */ -static int -zpool_compare(const void *larg, const void *rarg, void *unused) -{ - zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle; - zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle; - const char *lname = zpool_get_name(l); - const char *rname = zpool_get_name(r); - - return (strcmp(lname, rname)); -} - -/* - * Callback function for pool_list_get(). Adds the given pool to the AVL tree - * of known pools. 
- */ -static int -add_pool(zpool_handle_t *zhp, void *data) -{ - zpool_list_t *zlp = data; - zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); - uu_avl_index_t idx; - - node->zn_handle = zhp; - uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); - if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { - if (zlp->zl_proplist && - zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) { - zpool_close(zhp); - free(node); - return (-1); - } - uu_avl_insert(zlp->zl_avl, node, idx); - } else { - zpool_close(zhp); - free(node); - return (-1); - } - - return (0); -} - -/* - * Create a list of pools based on the given arguments. If we're given no - * arguments, then iterate over all pools in the system and add them to the AVL - * tree. Otherwise, add only those pool explicitly specified on the command - * line. - */ -zpool_list_t * -pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) -{ - zpool_list_t *zlp; - - zlp = safe_malloc(sizeof (zpool_list_t)); - - zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t), - offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT); - - if (zlp->zl_pool == NULL) - zpool_no_memory(); - - if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL, - UU_DEFAULT)) == NULL) - zpool_no_memory(); - - zlp->zl_proplist = proplist; - - if (argc == 0) { - (void) zpool_iter(g_zfs, add_pool, zlp); - zlp->zl_findall = B_TRUE; - } else { - int i; - - for (i = 0; i < argc; i++) { - zpool_handle_t *zhp; - - if ((zhp = zpool_open_canfail(g_zfs, argv[i])) != - NULL) { - if (add_pool(zhp, zlp) != 0) - *err = B_TRUE; - } else { - *err = B_TRUE; - } - } - } - - return (zlp); -} - -/* - * Search for any new pools, adding them to the list. We only add pools when no - * options were given on the command line. Otherwise, we keep the list fixed as - * those that were explicitly specified. 
- */ -void -pool_list_update(zpool_list_t *zlp) -{ - if (zlp->zl_findall) - (void) zpool_iter(g_zfs, add_pool, zlp); -} - -/* - * Iterate over all pools in the list, executing the callback for each - */ -int -pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, - void *data) -{ - zpool_node_t *node, *next_node; - int ret = 0; - - for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) { - next_node = uu_avl_next(zlp->zl_avl, node); - if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL || - unavail) - ret |= func(node->zn_handle, data); - } - - return (ret); -} - -/* - * Remove the given pool from the list. When running iostat, we want to remove - * those pools that no longer exist. - */ -void -pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) -{ - zpool_node_t search, *node; - - search.zn_handle = zhp; - if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { - uu_avl_remove(zlp->zl_avl, node); - zpool_close(node->zn_handle); - free(node); - } -} - -/* - * Free all the handles associated with this list. - */ -void -pool_list_free(zpool_list_t *zlp) -{ - uu_avl_walk_t *walk; - zpool_node_t *node; - - if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) { - (void) fprintf(stderr, - gettext("internal error: out of memory")); - exit(1); - } - - while ((node = uu_avl_walk_next(walk)) != NULL) { - uu_avl_remove(zlp->zl_avl, node); - zpool_close(node->zn_handle); - free(node); - } - - uu_avl_walk_end(walk); - uu_avl_destroy(zlp->zl_avl); - uu_avl_pool_destroy(zlp->zl_pool); - - free(zlp); -} - -/* - * Returns the number of elements in the pool list. - */ -int -pool_list_count(zpool_list_t *zlp) -{ - return (uu_avl_numnodes(zlp->zl_avl)); -} - -/* - * High level function which iterates over all pools given on the command line, - * using the pool_list_* interfaces. 
- */ -int -for_each_pool(int argc, char **argv, boolean_t unavail, - zprop_list_t **proplist, zpool_iter_f func, void *data) -{ - zpool_list_t *list; - int ret = 0; - - if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL) - return (1); - - if (pool_list_iter(list, unavail, func, data) != 0) - ret = 1; - - pool_list_free(list); - - return (ret); -} diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c deleted file mode 100644 index 35c3db7893df..000000000000 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c +++ /dev/null @@ -1,6742 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2012 by Frederik Wessels. All rights reserved. - * Copyright (c) 2012 Martin Matuska . All rights reserved. - * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. - * Copyright 2016 Igor Kozhukhov . - * Copyright 2016 Nexenta Systems, Inc. - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "zpool_util.h" -#include "zfs_comutil.h" -#include "zfeature_common.h" - -#include "statcommon.h" - -libzfs_handle_t *g_zfs; - -static int zpool_do_create(int, char **); -static int zpool_do_destroy(int, char **); - -static int zpool_do_add(int, char **); -static int zpool_do_remove(int, char **); -static int zpool_do_labelclear(int, char **); - -static int zpool_do_checkpoint(int, char **); - -static int zpool_do_list(int, char **); -static int zpool_do_iostat(int, char **); -static int zpool_do_status(int, char **); - -static int zpool_do_online(int, char **); -static int zpool_do_offline(int, char **); -static int zpool_do_clear(int, char **); -static int zpool_do_reopen(int, char **); - -static int zpool_do_reguid(int, char **); - -static int zpool_do_attach(int, char **); -static int zpool_do_detach(int, char **); -static int zpool_do_replace(int, char **); -static int zpool_do_split(int, char **); - -static int zpool_do_initialize(int, char **); -static int zpool_do_scrub(int, char **); - -static int zpool_do_import(int, char **); -static int zpool_do_export(int, char **); - -static int zpool_do_upgrade(int, char **); - -static int zpool_do_history(int, char **); - -static int zpool_do_get(int, char **); -static int zpool_do_set(int, char **); - -static int zpool_do_sync(int, char **); - -/* - * These libumem hooks provide a reasonable set of defaults for the allocator's - * debugging facilities. 
- */ - -#ifdef DEBUG -const char * -_umem_debug_init(void) -{ - return ("default,verbose"); /* $UMEM_DEBUG setting */ -} - -const char * -_umem_logging_init(void) -{ - return ("fail,contents"); /* $UMEM_LOGGING setting */ -} -#endif - -typedef enum { - HELP_ADD, - HELP_ATTACH, - HELP_CLEAR, - HELP_CREATE, - HELP_CHECKPOINT, - HELP_DESTROY, - HELP_DETACH, - HELP_EXPORT, - HELP_HISTORY, - HELP_IMPORT, - HELP_IOSTAT, - HELP_LABELCLEAR, - HELP_LIST, - HELP_OFFLINE, - HELP_ONLINE, - HELP_REPLACE, - HELP_REMOVE, - HELP_INITIALIZE, - HELP_SCRUB, - HELP_STATUS, - HELP_UPGRADE, - HELP_GET, - HELP_SET, - HELP_SPLIT, - HELP_SYNC, - HELP_REGUID, - HELP_REOPEN -} zpool_help_t; - - -typedef struct zpool_command { - const char *name; - int (*func)(int, char **); - zpool_help_t usage; -} zpool_command_t; - -/* - * Master command table. Each ZFS command has a name, associated function, and - * usage message. The usage messages need to be internationalized, so we have - * to have a function to return the usage message based on a command index. - * - * These commands are organized according to how they are displayed in the usage - * message. An empty command (one with a NULL name) indicates an empty line in - * the generic usage message. 
- */ -static zpool_command_t command_table[] = { - { "create", zpool_do_create, HELP_CREATE }, - { "destroy", zpool_do_destroy, HELP_DESTROY }, - { NULL }, - { "add", zpool_do_add, HELP_ADD }, - { "remove", zpool_do_remove, HELP_REMOVE }, - { NULL }, - { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, - { NULL }, - { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, - { NULL }, - { "list", zpool_do_list, HELP_LIST }, - { "iostat", zpool_do_iostat, HELP_IOSTAT }, - { "status", zpool_do_status, HELP_STATUS }, - { NULL }, - { "online", zpool_do_online, HELP_ONLINE }, - { "offline", zpool_do_offline, HELP_OFFLINE }, - { "clear", zpool_do_clear, HELP_CLEAR }, - { "reopen", zpool_do_reopen, HELP_REOPEN }, - { NULL }, - { "attach", zpool_do_attach, HELP_ATTACH }, - { "detach", zpool_do_detach, HELP_DETACH }, - { "replace", zpool_do_replace, HELP_REPLACE }, - { "split", zpool_do_split, HELP_SPLIT }, - { NULL }, - { "initialize", zpool_do_initialize, HELP_INITIALIZE }, - { "scrub", zpool_do_scrub, HELP_SCRUB }, - { NULL }, - { "import", zpool_do_import, HELP_IMPORT }, - { "export", zpool_do_export, HELP_EXPORT }, - { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, - { "reguid", zpool_do_reguid, HELP_REGUID }, - { NULL }, - { "history", zpool_do_history, HELP_HISTORY }, - { "get", zpool_do_get, HELP_GET }, - { "set", zpool_do_set, HELP_SET }, - { "sync", zpool_do_sync, HELP_SYNC }, -}; - -#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) - -#define VDEV_ALLOC_CLASS_LOGS "logs" - -static zpool_command_t *current_command; -static char history_str[HIS_MAX_RECORD_LEN]; -static boolean_t log_history = B_TRUE; -static uint_t timestamp_fmt = NODATE; - -static const char * -get_usage(zpool_help_t idx) -{ - switch (idx) { - case HELP_ADD: - return (gettext("\tadd [-fgLnP] ...\n")); - case HELP_ATTACH: - return (gettext("\tattach [-f] " - "\n")); - case HELP_CLEAR: - return (gettext("\tclear [-nF] [device]\n")); - case HELP_CREATE: - return 
(gettext("\tcreate [-fnd] [-B] " - "[-o property=value] ... \n" - "\t [-O file-system-property=value] ...\n" - "\t [-m mountpoint] [-R root] [-t tempname] " - " ...\n")); - case HELP_CHECKPOINT: - return (gettext("\tcheckpoint [--discard] ...\n")); - case HELP_DESTROY: - return (gettext("\tdestroy [-f] \n")); - case HELP_DETACH: - return (gettext("\tdetach \n")); - case HELP_EXPORT: - return (gettext("\texport [-f] ...\n")); - case HELP_HISTORY: - return (gettext("\thistory [-il] [] ...\n")); - case HELP_IMPORT: - return (gettext("\timport [-d dir] [-D]\n" - "\timport [-o mntopts] [-o property=value] ... \n" - "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " - "[-R root] [-F [-n]] -a\n" - "\timport [-o mntopts] [-o property=value] ... \n" - "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " - "[-R root] [-F [-n]] [-t]\n" - "\t [--rewind-to-checkpoint] [newpool]\n")); - case HELP_IOSTAT: - return (gettext("\tiostat [-gLPv] [-T d|u] [pool] ... " - "[interval [count]]\n")); - case HELP_LABELCLEAR: - return (gettext("\tlabelclear [-f] \n")); - case HELP_LIST: - return (gettext("\tlist [-gHLpPv] [-o property[,...]] " - "[-T d|u] [pool] ... [interval [count]]\n")); - case HELP_OFFLINE: - return (gettext("\toffline [-t] ...\n")); - case HELP_ONLINE: - return (gettext("\tonline [-e] ...\n")); - case HELP_REPLACE: - return (gettext("\treplace [-f] " - "[new-device]\n")); - case HELP_REMOVE: - return (gettext("\tremove [-nps] ...\n")); - case HELP_REOPEN: - return (gettext("\treopen \n")); - case HELP_INITIALIZE: - return (gettext("\tinitialize [-cs] [ ...]\n")); - case HELP_SCRUB: - return (gettext("\tscrub [-s | -p] ...\n")); - case HELP_STATUS: - return (gettext("\tstatus [-DgLPvx] [-T d|u] [pool] ... 
" - "[interval [count]]\n")); - case HELP_UPGRADE: - return (gettext("\tupgrade [-v]\n" - "\tupgrade [-V version] <-a | pool ...>\n")); - case HELP_GET: - return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " - "<\"all\" | property[,...]> ...\n")); - case HELP_SET: - return (gettext("\tset \n")); - case HELP_SPLIT: - return (gettext("\tsplit [-gLnP] [-R altroot] [-o mntopts]\n" - "\t [-o property=value] " - "[ ...]\n")); - case HELP_REGUID: - return (gettext("\treguid \n")); - case HELP_SYNC: - return (gettext("\tsync [pool] ...\n")); - } - - abort(); - /* NOTREACHED */ -} - - -/* - * Callback routine that will print out a pool property value. - */ -static int -print_prop_cb(int prop, void *cb) -{ - FILE *fp = cb; - - (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); - - if (zpool_prop_readonly(prop)) - (void) fprintf(fp, " NO "); - else - (void) fprintf(fp, " YES "); - - if (zpool_prop_values(prop) == NULL) - (void) fprintf(fp, "-\n"); - else - (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); - - return (ZPROP_CONT); -} - -/* - * Display usage message. If we're inside a command, display only the usage for - * that command. Otherwise, iterate over the entire command table and display - * a complete usage message. - */ -void -usage(boolean_t requested) -{ - FILE *fp = requested ? 
stdout : stderr; - - if (current_command == NULL) { - int i; - - (void) fprintf(fp, gettext("usage: zpool command args ...\n")); - (void) fprintf(fp, - gettext("where 'command' is one of the following:\n\n")); - - for (i = 0; i < NCOMMAND; i++) { - if (command_table[i].name == NULL) - (void) fprintf(fp, "\n"); - else - (void) fprintf(fp, "%s", - get_usage(command_table[i].usage)); - } - } else { - (void) fprintf(fp, gettext("usage:\n")); - (void) fprintf(fp, "%s", get_usage(current_command->usage)); - } - - if (current_command != NULL && - ((strcmp(current_command->name, "set") == 0) || - (strcmp(current_command->name, "get") == 0) || - (strcmp(current_command->name, "list") == 0))) { - - (void) fprintf(fp, - gettext("\nthe following properties are supported:\n")); - - (void) fprintf(fp, "\n\t%-19s %s %s\n\n", - "PROPERTY", "EDIT", "VALUES"); - - /* Iterate over all properties */ - (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, - ZFS_TYPE_POOL); - - (void) fprintf(fp, "\t%-19s ", "feature@..."); - (void) fprintf(fp, "YES disabled | enabled | active\n"); - - (void) fprintf(fp, gettext("\nThe feature@ properties must be " - "appended with a feature name.\nSee zpool-features(7).\n")); - } - - /* - * See comments at end of main(). - */ - if (getenv("ZFS_ABORT") != NULL) { - (void) printf("dumping core by request\n"); - abort(); - } - - exit(requested ? 
0 : 2); -} - -/* - * print a pool vdev config for dry runs - */ -static void -print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, - const char *match, int name_flags) -{ - nvlist_t **child; - uint_t c, children; - char *vname; - boolean_t printed = B_FALSE; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - if (name != NULL) - (void) printf("\t%*s%s\n", indent, "", name); - return; - } - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; - char *class = ""; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if (is_log) - class = VDEV_ALLOC_BIAS_LOG; - (void) nvlist_lookup_string(child[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, &class); - if (strcmp(match, class) != 0) - continue; - - if (!printed && name != NULL) { - (void) printf("\t%*s%s\n", indent, "", name); - printed = B_TRUE; - } - vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); - print_vdev_tree(zhp, vname, child[c], indent + 2, "", - name_flags); - free(vname); - } -} - -static boolean_t -prop_list_contains_feature(nvlist_t *proplist) -{ - nvpair_t *nvp; - for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; - nvp = nvlist_next_nvpair(proplist, nvp)) { - if (zpool_prop_feature(nvpair_name(nvp))) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Add a property pair (name, string-value) into a property nvlist. 
- */ -static int -add_prop_list(const char *propname, char *propval, nvlist_t **props, - boolean_t poolprop) -{ - zpool_prop_t prop = ZPROP_INVAL; - zfs_prop_t fprop; - nvlist_t *proplist; - const char *normnm; - char *strval; - - if (*props == NULL && - nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { - (void) fprintf(stderr, - gettext("internal error: out of memory\n")); - return (1); - } - - proplist = *props; - - if (poolprop) { - const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); - - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL && - !zpool_prop_feature(propname)) { - (void) fprintf(stderr, gettext("property '%s' is " - "not a valid pool property\n"), propname); - return (2); - } - - /* - * feature@ properties and version should not be specified - * at the same time. - */ - if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && - nvlist_exists(proplist, vname)) || - (prop == ZPOOL_PROP_VERSION && - prop_list_contains_feature(proplist))) { - (void) fprintf(stderr, gettext("'feature@' and " - "'version' properties cannot be specified " - "together\n")); - return (2); - } - - - if (zpool_prop_feature(propname)) - normnm = propname; - else - normnm = zpool_prop_to_name(prop); - } else { - if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { - normnm = zfs_prop_to_name(fprop); - } else { - normnm = propname; - } - } - - if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && - prop != ZPOOL_PROP_CACHEFILE) { - (void) fprintf(stderr, gettext("property '%s' " - "specified multiple times\n"), propname); - return (2); - } - - if (nvlist_add_string(proplist, normnm, propval) != 0) { - (void) fprintf(stderr, gettext("internal " - "error: out of memory\n")); - return (1); - } - - return (0); -} - -/* - * Set a default property pair (name, string-value) in a property nvlist - */ -static int -add_prop_list_default(const char *propname, char *propval, nvlist_t **props, - boolean_t poolprop) -{ - char *pval; - - if 
(nvlist_lookup_string(*props, propname, &pval) == 0) - return (0); - - return (add_prop_list(propname, propval, props, poolprop)); -} - -/* - * zpool add [-fgLnP] [-o property=value] ... - * - * -f Force addition of devices, even if they appear in use - * -g Display guid for individual vdev name. - * -L Follow links when resolving vdev path name. - * -n Do not add the devices, but display the resulting layout if - * they were to be added. - * -P Display full path for vdev name. - * - * Adds the given vdevs to 'pool'. As with create, the bulk of this work is - * handled by get_vdev_spec(), which constructs the nvlist needed to pass to - * libzfs. - */ -int -zpool_do_add(int argc, char **argv) -{ - boolean_t force = B_FALSE; - boolean_t dryrun = B_FALSE; - int name_flags = 0; - int c; - nvlist_t *nvroot; - char *poolname; - zpool_boot_label_t boot_type; - uint64_t boot_size; - int ret; - zpool_handle_t *zhp; - nvlist_t *config; - - /* check options */ - while ((c = getopt(argc, argv, "fgLnP")) != -1) { - switch (c) { - case 'f': - force = B_TRUE; - break; - case 'g': - name_flags |= VDEV_NAME_GUID; - break; - case 'L': - name_flags |= VDEV_NAME_FOLLOW_LINKS; - break; - case 'n': - dryrun = B_TRUE; - break; - case 'P': - name_flags |= VDEV_NAME_PATH; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); - usage(B_FALSE); - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing vdev specification\n")); - usage(B_FALSE); - } - - poolname = argv[0]; - - argc--; - argv++; - - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) - return (1); - - if ((config = zpool_get_config(zhp, NULL)) == NULL) { - (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), - poolname); - zpool_close(zhp); - return (1); - } - - if 
(zpool_is_bootable(zhp)) - boot_type = ZPOOL_COPY_BOOT_LABEL; - else - boot_type = ZPOOL_NO_BOOT_LABEL; - - /* pass off to get_vdev_spec for processing */ - boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL); - nvroot = make_root_vdev(zhp, force, !force, B_FALSE, dryrun, - boot_type, boot_size, argc, argv); - if (nvroot == NULL) { - zpool_close(zhp); - return (1); - } - - if (dryrun) { - nvlist_t *poolnvroot; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &poolnvroot) == 0); - - (void) printf(gettext("would update '%s' to the following " - "configuration:\n"), zpool_get_name(zhp)); - - /* print original main pool and new tree */ - print_vdev_tree(zhp, poolname, poolnvroot, 0, "", - name_flags | VDEV_NAME_TYPE_ID); - print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); - - /* print other classes: 'dedup', 'special', and 'log' */ - print_vdev_tree(zhp, "dedup", poolnvroot, 0, - VDEV_ALLOC_BIAS_DEDUP, name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, - name_flags); - - print_vdev_tree(zhp, "special", poolnvroot, 0, - VDEV_ALLOC_BIAS_SPECIAL, name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, - name_flags); - - print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, - name_flags); - print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, - name_flags); - - ret = 0; - } else { - ret = (zpool_add(zhp, nvroot) != 0); - } - - nvlist_free(nvroot); - zpool_close(zhp); - - return (ret); -} - -/* - * zpool remove ... - * - * Removes the given vdev from the pool. 
- */ -int -zpool_do_remove(int argc, char **argv) -{ - char *poolname; - int i, ret = 0; - zpool_handle_t *zhp; - boolean_t stop = B_FALSE; - boolean_t noop = B_FALSE; - boolean_t parsable = B_FALSE; - char c; - - /* check options */ - while ((c = getopt(argc, argv, "nps")) != -1) { - switch (c) { - case 'n': - noop = B_TRUE; - break; - case 'p': - parsable = B_TRUE; - break; - case 's': - stop = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); - usage(B_FALSE); - } - - poolname = argv[0]; - - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) - return (1); - - if (stop && noop) { - (void) fprintf(stderr, gettext("stop request ignored\n")); - return (0); - } - - if (stop) { - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - if (zpool_vdev_remove_cancel(zhp) != 0) - ret = 1; - } else { - if (argc < 2) { - (void) fprintf(stderr, gettext("missing device\n")); - usage(B_FALSE); - } - - for (i = 1; i < argc; i++) { - if (noop) { - uint64_t size; - - if (zpool_vdev_indirect_size(zhp, argv[i], - &size) != 0) { - ret = 1; - break; - } - if (parsable) { - (void) printf("%s %llu\n", - argv[i], size); - } else { - char valstr[32]; - zfs_nicenum(size, valstr, - sizeof (valstr)); - (void) printf("Memory that will be " - "used after removing %s: %s\n", - argv[i], valstr); - } - } else { - if (zpool_vdev_remove(zhp, argv[i]) != 0) - ret = 1; - } - } - } - - return (ret); -} - -/* - * zpool labelclear [-f] - * - * -f Force clearing the label for the vdevs which are members of - * the exported or foreign pools. - * - * Verifies that the vdev is not active and zeros out the label information - * on the device. 
- */ -int -zpool_do_labelclear(int argc, char **argv) -{ - char vdev[MAXPATHLEN]; - char *name = NULL; - struct stat st; - int c, fd, ret = 0; - nvlist_t *config; - pool_state_t state; - boolean_t inuse = B_FALSE; - boolean_t force = B_FALSE; - - /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { - switch (c) { - case 'f': - force = B_TRUE; - break; - default: - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get vdev name */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing vdev name\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - /* - * Check if we were given absolute path and use it as is. - * Otherwise if the provided vdev name doesn't point to a file, - * try prepending dsk path and appending s0. - */ - (void) strlcpy(vdev, argv[0], sizeof (vdev)); - if (vdev[0] != '/' && stat(vdev, &st) != 0) { - char *s; - - (void) snprintf(vdev, sizeof (vdev), "%s/%s", -#ifdef illumos - ZFS_DISK_ROOT, argv[0]); - if ((s = strrchr(argv[0], 's')) == NULL || - !isdigit(*(s + 1))) - (void) strlcat(vdev, "s0", sizeof (vdev)); -#else - "/dev", argv[0]); -#endif - if (stat(vdev, &st) != 0) { - (void) fprintf(stderr, gettext( - "failed to find device %s, try specifying absolute " - "path instead\n"), argv[0]); - return (1); - } - } - - if ((fd = open(vdev, O_RDWR)) < 0) { - (void) fprintf(stderr, gettext("failed to open %s: %s\n"), - vdev, strerror(errno)); - return (1); - } - - if (zpool_read_label(fd, &config) != 0) { - (void) fprintf(stderr, - gettext("failed to read label from %s\n"), vdev); - return (1); - } - nvlist_free(config); - - ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); - if (ret != 0) { - (void) fprintf(stderr, - gettext("failed to check state for %s\n"), vdev); - return (1); - } - - if (!inuse) - goto wipe_label; - - switch (state) { - default: - case 
POOL_STATE_ACTIVE: - case POOL_STATE_SPARE: - case POOL_STATE_L2CACHE: - (void) fprintf(stderr, gettext( - "%s is a member (%s) of pool \"%s\"\n"), - vdev, zpool_pool_state_to_name(state), name); - ret = 1; - goto errout; - - case POOL_STATE_EXPORTED: - if (force) - break; - (void) fprintf(stderr, gettext( - "use '-f' to override the following error:\n" - "%s is a member of exported pool \"%s\"\n"), - vdev, name); - ret = 1; - goto errout; - - case POOL_STATE_POTENTIALLY_ACTIVE: - if (force) - break; - (void) fprintf(stderr, gettext( - "use '-f' to override the following error:\n" - "%s is a member of potentially active pool \"%s\"\n"), - vdev, name); - ret = 1; - goto errout; - - case POOL_STATE_DESTROYED: - /* inuse should never be set for a destroyed pool */ - assert(0); - break; - } - -wipe_label: - ret = zpool_clear_label(fd); - if (ret != 0) { - (void) fprintf(stderr, - gettext("failed to clear label for %s\n"), vdev); - } - -errout: - free(name); - (void) close(fd); - - return (ret); -} - -/* - * zpool create [-fnd] [-B] [-o property=value] ... - * [-O file-system-property=value] ... - * [-R root] [-m mountpoint] [-t tempname] ... - * - * -B Create boot partition. - * -f Force creation, even if devices appear in use - * -n Do not create the pool, but display the resulting layout if it - * were to be created. - * -R Create a pool under an alternate root - * -m Set default mountpoint for the root dataset. By default it's - * '/' - * -t Use the temporary name until the pool is exported. - * -o Set property=value. - * -d Don't automatically enable all supported pool features - * (individual features can be enabled with -o). - * -O Set fsproperty=value in the pool's root file system - * - * Creates the named pool according to the given vdev specification. The - * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. 
Once - * we get the nvlist back from get_vdev_spec(), we either print out the contents - * (if '-n' was specified), or pass it to libzfs to do the creation. - */ - -#define SYSTEM256 (256 * 1024 * 1024) -int -zpool_do_create(int argc, char **argv) -{ - boolean_t force = B_FALSE; - boolean_t dryrun = B_FALSE; - boolean_t enable_all_pool_feat = B_TRUE; - zpool_boot_label_t boot_type = ZPOOL_NO_BOOT_LABEL; - uint64_t boot_size = 0; - int c; - nvlist_t *nvroot = NULL; - char *poolname; - char *tname = NULL; - int ret = 1; - char *altroot = NULL; - char *mountpoint = NULL; - nvlist_t *fsprops = NULL; - nvlist_t *props = NULL; - char *propval; - - /* check options */ - while ((c = getopt(argc, argv, ":fndBR:m:o:O:t:")) != -1) { - switch (c) { - case 'f': - force = B_TRUE; - break; - case 'n': - dryrun = B_TRUE; - break; - case 'd': - enable_all_pool_feat = B_FALSE; - break; - case 'B': -#ifdef illumos - /* - * We should create the system partition. - * Also make sure the size is set. - */ - boot_type = ZPOOL_CREATE_BOOT_LABEL; - if (boot_size == 0) - boot_size = SYSTEM256; - break; -#else - (void) fprintf(stderr, - gettext("option '%c' is not supported\n"), - optopt); - goto badusage; -#endif - case 'R': - altroot = optarg; - if (add_prop_list(zpool_prop_to_name( - ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) - goto errout; - if (add_prop_list_default(zpool_prop_to_name( - ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) - goto errout; - break; - case 'm': - /* Equivalent to -O mountpoint=optarg */ - mountpoint = optarg; - break; - case 'o': - if ((propval = strchr(optarg, '=')) == NULL) { - (void) fprintf(stderr, gettext("missing " - "'=' for -o option\n")); - goto errout; - } - *propval = '\0'; - propval++; - - if (add_prop_list(optarg, propval, &props, B_TRUE)) - goto errout; - - /* - * Get bootsize value for make_root_vdev(). 
- */ - if (zpool_name_to_prop(optarg) == ZPOOL_PROP_BOOTSIZE) { - if (zfs_nicestrtonum(g_zfs, propval, - &boot_size) < 0 || boot_size == 0) { - (void) fprintf(stderr, - gettext("bad boot partition size " - "'%s': %s\n"), propval, - libzfs_error_description(g_zfs)); - goto errout; - } - } - - /* - * If the user is creating a pool that doesn't support - * feature flags, don't enable any features. - */ - if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { - char *end; - u_longlong_t ver; - - ver = strtoull(propval, &end, 10); - if (*end == '\0' && - ver < SPA_VERSION_FEATURES) { - enable_all_pool_feat = B_FALSE; - } - } - if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) - altroot = propval; - break; - case 'O': - if ((propval = strchr(optarg, '=')) == NULL) { - (void) fprintf(stderr, gettext("missing " - "'=' for -O option\n")); - goto errout; - } - *propval = '\0'; - propval++; - - /* - * Mountpoints are checked and then added later. - * Uniquely among properties, they can be specified - * more than once, to avoid conflict with -m. - */ - if (0 == strcmp(optarg, - zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { - mountpoint = propval; - } else if (add_prop_list(optarg, propval, &fsprops, - B_FALSE)) { - goto errout; - } - break; - case 't': - /* - * Sanity check temporary pool name. 
- */ - if (strchr(optarg, '/') != NULL) { - (void) fprintf(stderr, gettext("cannot create " - "'%s': invalid character '/' in temporary " - "name\n"), optarg); - (void) fprintf(stderr, gettext("use 'zfs " - "create' to create a dataset\n")); - goto errout; - } - - if (add_prop_list(zpool_prop_to_name( - ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) - goto errout; - if (add_prop_list_default(zpool_prop_to_name( - ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) - goto errout; - tname = optarg; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - goto badusage; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - goto badusage; - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); - goto badusage; - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing vdev specification\n")); - goto badusage; - } - - poolname = argv[0]; - - /* - * As a special case, check for use of '/' in the name, and direct the - * user to use 'zfs create' instead. - */ - if (strchr(poolname, '/') != NULL) { - (void) fprintf(stderr, gettext("cannot create '%s': invalid " - "character '/' in pool name\n"), poolname); - (void) fprintf(stderr, gettext("use 'zfs create' to " - "create a dataset\n")); - goto errout; - } - - /* - * Make sure the bootsize is set when ZPOOL_CREATE_BOOT_LABEL is used, - * and not set otherwise. 
- */ - if (boot_type == ZPOOL_CREATE_BOOT_LABEL) { - const char *propname; - char *strptr, *buf = NULL; - int rv; - - propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE); - if (nvlist_lookup_string(props, propname, &strptr) != 0) { - (void) asprintf(&buf, "%" PRIu64, boot_size); - if (buf == NULL) { - (void) fprintf(stderr, - gettext("internal error: out of memory\n")); - goto errout; - } - rv = add_prop_list(propname, buf, &props, B_TRUE); - free(buf); - if (rv != 0) - goto errout; - } - } else { - const char *propname; - char *strptr; - - propname = zpool_prop_to_name(ZPOOL_PROP_BOOTSIZE); - if (nvlist_lookup_string(props, propname, &strptr) == 0) { - (void) fprintf(stderr, gettext("error: setting boot " - "partition size requires option '-B'\n")); - goto errout; - } - } - - /* pass off to get_vdev_spec for bulk processing */ - nvroot = make_root_vdev(NULL, force, !force, B_FALSE, dryrun, - boot_type, boot_size, argc - 1, argv + 1); - if (nvroot == NULL) - goto errout; - - /* make_root_vdev() allows 0 toplevel children if there are spares */ - if (!zfs_allocatable_devs(nvroot)) { - (void) fprintf(stderr, gettext("invalid vdev " - "specification: at least one toplevel vdev must be " - "specified\n")); - goto errout; - } - - if (altroot != NULL && altroot[0] != '/') { - (void) fprintf(stderr, gettext("invalid alternate root '%s': " - "must be an absolute path\n"), altroot); - goto errout; - } - - /* - * Check the validity of the mountpoint and direct the user to use the - * '-m' mountpoint option if it looks like its in use. - * Ignore the checks if the '-f' option is given. 
- */ - if (!force && (mountpoint == NULL || - (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && - strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0))) { - char buf[MAXPATHLEN]; - DIR *dirp; - - if (mountpoint && mountpoint[0] != '/') { - (void) fprintf(stderr, gettext("invalid mountpoint " - "'%s': must be an absolute path, 'legacy', or " - "'none'\n"), mountpoint); - goto errout; - } - - if (mountpoint == NULL) { - if (altroot != NULL) - (void) snprintf(buf, sizeof (buf), "%s/%s", - altroot, poolname); - else - (void) snprintf(buf, sizeof (buf), "/%s", - poolname); - } else { - if (altroot != NULL) - (void) snprintf(buf, sizeof (buf), "%s%s", - altroot, mountpoint); - else - (void) snprintf(buf, sizeof (buf), "%s", - mountpoint); - } - - if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { - (void) fprintf(stderr, gettext("mountpoint '%s' : " - "%s\n"), buf, strerror(errno)); - (void) fprintf(stderr, gettext("use '-m' " - "option to provide a different default\n")); - goto errout; - } else if (dirp) { - int count = 0; - - while (count < 3 && readdir(dirp) != NULL) - count++; - (void) closedir(dirp); - - if (count > 2) { - (void) fprintf(stderr, gettext("mountpoint " - "'%s' exists and is not empty\n"), buf); - (void) fprintf(stderr, gettext("use '-m' " - "option to provide a " - "different default\n")); - goto errout; - } - } - } - - /* - * Now that the mountpoint's validity has been checked, ensure that - * the property is set appropriately prior to creating the pool. - */ - if (mountpoint != NULL) { - ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), - mountpoint, &fsprops, B_FALSE); - if (ret != 0) - goto errout; - } - - ret = 1; - if (dryrun) { - /* - * For a dry run invocation, print out a basic message and run - * through all the vdevs in the list and print out in an - * appropriate hierarchy. 
- */ - (void) printf(gettext("would create '%s' with the " - "following layout:\n\n"), poolname); - - print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); - print_vdev_tree(NULL, "dedup", nvroot, 0, - VDEV_ALLOC_BIAS_DEDUP, 0); - print_vdev_tree(NULL, "special", nvroot, 0, - VDEV_ALLOC_BIAS_SPECIAL, 0); - print_vdev_tree(NULL, "logs", nvroot, 0, - VDEV_ALLOC_BIAS_LOG, 0); - - ret = 0; - } else { - /* - * Hand off to libzfs. - */ - if (enable_all_pool_feat) { - spa_feature_t i; - for (i = 0; i < SPA_FEATURES; i++) { - char propname[MAXPATHLEN]; - zfeature_info_t *feat = &spa_feature_table[i]; - - (void) snprintf(propname, sizeof (propname), - "feature@%s", feat->fi_uname); - - /* - * Skip feature if user specified it manually - * on the command line. - */ - if (nvlist_exists(props, propname)) - continue; - - ret = add_prop_list(propname, - ZFS_FEATURE_ENABLED, &props, B_TRUE); - if (ret != 0) - goto errout; - } - } - - ret = 1; - if (zpool_create(g_zfs, poolname, - nvroot, props, fsprops) == 0) { - zfs_handle_t *pool = zfs_open(g_zfs, - tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); - if (pool != NULL) { - if (zfs_mount(pool, NULL, 0) == 0) - ret = zfs_shareall(pool); - zfs_close(pool); - } - } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { - (void) fprintf(stderr, gettext("pool name may have " - "been omitted\n")); - } - } - -errout: - nvlist_free(nvroot); - nvlist_free(fsprops); - nvlist_free(props); - return (ret); -badusage: - nvlist_free(fsprops); - nvlist_free(props); - usage(B_FALSE); - return (2); -} - -/* - * zpool destroy - * - * -f Forcefully unmount any datasets - * - * Destroy the given pool. Automatically unmounts any datasets in the pool. 
- */ -int -zpool_do_destroy(int argc, char **argv) -{ - boolean_t force = B_FALSE; - int c; - char *pool; - zpool_handle_t *zhp; - int ret; - - /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { - switch (c) { - case 'f': - force = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - pool = argv[0]; - - if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { - /* - * As a special case, check for use of '/' in the name, and - * direct the user to use 'zfs destroy' instead. - */ - if (strchr(pool, '/') != NULL) - (void) fprintf(stderr, gettext("use 'zfs destroy' to " - "destroy a dataset\n")); - return (1); - } - - if (zpool_disable_datasets(zhp, force) != 0) { - (void) fprintf(stderr, gettext("could not destroy '%s': " - "could not unmount datasets\n"), zpool_get_name(zhp)); - return (1); - } - - /* The history must be logged as part of the export */ - log_history = B_FALSE; - - ret = (zpool_destroy(zhp, history_str) != 0); - - zpool_close(zhp); - - return (ret); -} - -/* - * zpool export [-f] ... - * - * -f Forcefully unmount datasets - * - * Export the given pools. By default, the command will attempt to cleanly - * unmount any active datasets within the pool. If the '-f' flag is specified, - * then the datasets will be forcefully unmounted. 
- */ -int -zpool_do_export(int argc, char **argv) -{ - boolean_t force = B_FALSE; - boolean_t hardforce = B_FALSE; - int c; - zpool_handle_t *zhp; - int ret; - int i; - - /* check options */ - while ((c = getopt(argc, argv, "fF")) != -1) { - switch (c) { - case 'f': - force = B_TRUE; - break; - case 'F': - hardforce = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* check arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool argument\n")); - usage(B_FALSE); - } - - ret = 0; - for (i = 0; i < argc; i++) { - if ((zhp = zpool_open_canfail(g_zfs, argv[i])) == NULL) { - ret = 1; - continue; - } - - if (zpool_disable_datasets(zhp, force) != 0) { - ret = 1; - zpool_close(zhp); - continue; - } - - /* The history must be logged as part of the export */ - log_history = B_FALSE; - - if (hardforce) { - if (zpool_export_force(zhp, history_str) != 0) - ret = 1; - } else if (zpool_export(zhp, force, history_str) != 0) { - ret = 1; - } - - zpool_close(zhp); - } - - return (ret); -} - -/* - * Given a vdev configuration, determine the maximum width needed for the device - * name column. 
- */ -static int -max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, - int name_flags) -{ - char *name; - nvlist_t **child; - uint_t c, children; - int ret; - - name = zpool_vdev_name(g_zfs, zhp, nv, name_flags | VDEV_NAME_TYPE_ID); - if (strlen(name) + depth > max) - max = strlen(name) + depth; - - free(name); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if ((ret = max_width(zhp, child[c], depth + 2, - max, name_flags)) > max) - max = ret; - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if ((ret = max_width(zhp, child[c], depth + 2, - max, name_flags)) > max) - max = ret; - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if ((ret = max_width(zhp, child[c], depth + 2, - max, name_flags)) > max) - max = ret; - } - - return (max); -} - -typedef struct spare_cbdata { - uint64_t cb_guid; - zpool_handle_t *cb_zhp; -} spare_cbdata_t; - -static boolean_t -find_vdev(nvlist_t *nv, uint64_t search) -{ - uint64_t guid; - nvlist_t **child; - uint_t c, children; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && - search == guid) - return (B_TRUE); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev(child[c], search)) - return (B_TRUE); - } - - return (B_FALSE); -} - -static int -find_spare(zpool_handle_t *zhp, void *data) -{ - spare_cbdata_t *cbp = data; - nvlist_t *config, *nvroot; - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - if (find_vdev(nvroot, cbp->cb_guid)) { - cbp->cb_zhp = zhp; - return (1); - } - - zpool_close(zhp); - return (0); -} - -typedef struct status_cbdata { - int cb_count; - int cb_name_flags; - int cb_namewidth; - boolean_t 
cb_allpools; - boolean_t cb_verbose; - boolean_t cb_explain; - boolean_t cb_first; - boolean_t cb_dedup_stats; - boolean_t cb_print_status; -} status_cbdata_t; - -/* - * Print out configuration state as requested by status_callback. - */ -static void -print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, - nvlist_t *nv, int depth, boolean_t isspare) -{ - nvlist_t **child; - uint_t c, vsc, children; - pool_scan_stat_t *ps = NULL; - vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6]; - char *vname; - uint64_t notpresent; - uint64_t ashift; - spare_cbdata_t spare_cb; - const char *state; - char *type; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - children = 0; - - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0); - - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); - - if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) - return; - - state = zpool_state_to_name(vs->vs_state, vs->vs_aux); - if (isspare) { - /* - * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for - * online drives. 
- */ - if (vs->vs_aux == VDEV_AUX_SPARED) - state = "INUSE"; - else if (vs->vs_state == VDEV_STATE_HEALTHY) - state = "AVAIL"; - } - - (void) printf("\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, - name, state); - - if (!isspare) { - zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); - zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); - zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); - (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); - } - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - ¬present) == 0 || - vs->vs_state <= VDEV_STATE_CANT_OPEN) { - char *path; - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) - (void) printf(" was %s", path); - } else if (vs->vs_aux != 0) { - (void) printf(" "); - - switch (vs->vs_aux) { - case VDEV_AUX_OPEN_FAILED: - (void) printf(gettext("cannot open")); - break; - - case VDEV_AUX_BAD_GUID_SUM: - (void) printf(gettext("missing device")); - break; - - case VDEV_AUX_NO_REPLICAS: - (void) printf(gettext("insufficient replicas")); - break; - - case VDEV_AUX_VERSION_NEWER: - (void) printf(gettext("newer version")); - break; - - case VDEV_AUX_UNSUP_FEAT: - (void) printf(gettext("unsupported feature(s)")); - break; - - case VDEV_AUX_ASHIFT_TOO_BIG: - (void) printf(gettext("unsupported minimum blocksize")); - break; - - case VDEV_AUX_SPARED: - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &spare_cb.cb_guid) == 0); - if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { - if (strcmp(zpool_get_name(spare_cb.cb_zhp), - zpool_get_name(zhp)) == 0) - (void) printf(gettext("currently in " - "use")); - else - (void) printf(gettext("in use by " - "pool '%s'"), - zpool_get_name(spare_cb.cb_zhp)); - zpool_close(spare_cb.cb_zhp); - } else { - (void) printf(gettext("currently in use")); - } - break; - - case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); - break; - - case VDEV_AUX_IO_FAILURE: - (void) printf(gettext("experienced I/O failures")); - break; - - case 
VDEV_AUX_BAD_LOG: - (void) printf(gettext("bad intent log")); - break; - - case VDEV_AUX_EXTERNAL: - (void) printf(gettext("external device fault")); - break; - - case VDEV_AUX_SPLIT_POOL: - (void) printf(gettext("split into new pool")); - break; - - case VDEV_AUX_ACTIVE: - (void) printf(gettext("currently in use")); - break; - - case VDEV_AUX_CHILDREN_OFFLINE: - (void) printf(gettext("all children offline")); - break; - - default: - (void) printf(gettext("corrupted data")); - break; - } - } else if (children == 0 && !isspare && - VDEV_STAT_VALID(vs_physical_ashift, vsc) && - vs->vs_configured_ashift < vs->vs_physical_ashift) { - (void) printf( - gettext(" block size: %dB configured, %dB native"), - 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); - } - - (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, - (uint64_t **)&ps, &c); - - if (ps != NULL && ps->pss_state == DSS_SCANNING && - vs->vs_scan_processed != 0 && children == 0) { - (void) printf(gettext(" (%s)"), - (ps->pss_func == POOL_SCAN_RESILVER) ? 
- "resilvering" : "repairing"); - } - - if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || - vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && - !vs->vs_scan_removing) { - char zbuf[1024]; - char tbuf[256]; - struct tm zaction_ts; - - time_t t = vs->vs_initialize_action_time; - int initialize_pct = 100; - if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { - initialize_pct = (vs->vs_initialize_bytes_done * 100 / - (vs->vs_initialize_bytes_est + 1)); - } - - (void) localtime_r(&t, &zaction_ts); - (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); - - switch (vs->vs_initialize_state) { - case VDEV_INITIALIZE_SUSPENDED: - (void) snprintf(zbuf, sizeof (zbuf), - ", suspended, started at %s", tbuf); - break; - case VDEV_INITIALIZE_ACTIVE: - (void) snprintf(zbuf, sizeof (zbuf), - ", started at %s", tbuf); - break; - case VDEV_INITIALIZE_COMPLETE: - (void) snprintf(zbuf, sizeof (zbuf), - ", completed at %s", tbuf); - break; - } - - (void) printf(gettext(" (%d%% initialized%s)"), - initialize_pct, zbuf); - } - - (void) printf("\n"); - - for (c = 0; c < children; c++) { - uint64_t islog = B_FALSE, ishole = B_FALSE; - - /* Don't print logs or holes here */ - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &islog); - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, - &ishole); - if (islog || ishole) - continue; - /* Only print normal classes here */ - if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) - continue; - - vname = zpool_vdev_name(g_zfs, zhp, child[c], - cb->cb_name_flags | VDEV_NAME_TYPE_ID); - print_status_config(zhp, cb, vname, child[c], depth + 2, - isspare); - free(vname); - } -} - -/* - * Print the configuration of an exported pool. Iterate over all vdevs in the - * pool, printing out the name and status for each one. 
- */ -static void -print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, - int depth) -{ - nvlist_t **child; - uint_t c, children; - vdev_stat_t *vs; - char *type, *vname; - - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_MISSING) == 0 || - strcmp(type, VDEV_TYPE_HOLE) == 0) - return; - - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); - - (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); - (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); - - if (vs->vs_aux != 0) { - (void) printf(" "); - - switch (vs->vs_aux) { - case VDEV_AUX_OPEN_FAILED: - (void) printf(gettext("cannot open")); - break; - - case VDEV_AUX_BAD_GUID_SUM: - (void) printf(gettext("missing device")); - break; - - case VDEV_AUX_NO_REPLICAS: - (void) printf(gettext("insufficient replicas")); - break; - - case VDEV_AUX_VERSION_NEWER: - (void) printf(gettext("newer version")); - break; - - case VDEV_AUX_UNSUP_FEAT: - (void) printf(gettext("unsupported feature(s)")); - break; - - case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); - break; - - case VDEV_AUX_ACTIVE: - (void) printf(gettext("currently in use")); - break; - - case VDEV_AUX_CHILDREN_OFFLINE: - (void) printf(gettext("all children offline")); - break; - - default: - (void) printf(gettext("corrupted data")); - break; - } - } - (void) printf("\n"); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - return; - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if (is_log) - continue; - if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) - continue; - - vname = zpool_vdev_name(g_zfs, NULL, child[c], - cb->cb_name_flags | VDEV_NAME_TYPE_ID); - print_import_config(cb, vname, child[c], depth + 2); - free(vname); - } - - if 
(nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, - &child, &children) == 0) { - (void) printf(gettext("\tcache\n")); - for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c], - cb->cb_name_flags); - (void) printf("\t %s\n", vname); - free(vname); - } - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, - &child, &children) == 0) { - (void) printf(gettext("\tspares\n")); - for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c], - cb->cb_name_flags); - (void) printf("\t %s\n", vname); - free(vname); - } - } -} - -/* - * Print specialized class vdevs. - * - * These are recorded as top level vdevs in the main pool child array - * but with "is_log" set to 1 or an "alloc_bias" string. We use either - * print_status_config() or print_import_config() to print the top level - * class vdevs then any of their children (eg mirrored slogs) are printed - * recursively - which works because only the top level vdev is marked. - */ -static void -print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, - const char *class) -{ - uint_t c, children; - nvlist_t **child; - boolean_t printed = B_FALSE; - - assert(zhp != NULL || !cb->cb_verbose); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, - &children) != 0) - return; - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; - char *bias = NULL; - char *type = NULL; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - - if (is_log) { - bias = VDEV_ALLOC_CLASS_LOGS; - } else { - (void) nvlist_lookup_string(child[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); - (void) nvlist_lookup_string(child[c], - ZPOOL_CONFIG_TYPE, &type); - } - - if (bias == NULL || strcmp(bias, class) != 0) - continue; - if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) - continue; - - if (!printed) { - (void) printf("\t%s\t\n", gettext(class)); - printed = B_TRUE; - } - - char *name = zpool_vdev_name(g_zfs, zhp, child[c], - 
cb->cb_name_flags | VDEV_NAME_TYPE_ID); - if (cb->cb_print_status) - print_status_config(zhp, cb, name, child[c], 2, - B_FALSE); - else - print_import_config(cb, name, child[c], 2); - free(name); - } -} - -/* - * Display the status for the given pool. - */ -static void -show_import(nvlist_t *config) -{ - uint64_t pool_state; - vdev_stat_t *vs; - char *name; - uint64_t guid; - uint64_t hostid = 0; - char *msgid; - char *hostname = "unknown"; - nvlist_t *nvroot, *nvinfo; - int reason; - const char *health; - uint_t vsc; - char *comment; - status_cbdata_t cb = { 0 }; - - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &guid) == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &pool_state) == 0); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0); - health = zpool_state_to_name(vs->vs_state, vs->vs_aux); - - reason = zpool_import_status(config, &msgid); - - (void) printf(gettext(" pool: %s\n"), name); - (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); - (void) printf(gettext(" state: %s"), health); - if (pool_state == POOL_STATE_DESTROYED) - (void) printf(gettext(" (DESTROYED)")); - (void) printf("\n"); - - switch (reason) { - case ZPOOL_STATUS_MISSING_DEV_R: - case ZPOOL_STATUS_MISSING_DEV_NR: - case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext(" status: One or more devices are " - "missing from the system.\n")); - break; - - case ZPOOL_STATUS_CORRUPT_LABEL_R: - case ZPOOL_STATUS_CORRUPT_LABEL_NR: - (void) printf(gettext(" status: One or more devices contains " - "corrupted data.\n")); - break; - - case ZPOOL_STATUS_CORRUPT_DATA: - (void) printf( - gettext(" status: The pool data is corrupted.\n")); - break; - - case ZPOOL_STATUS_OFFLINE_DEV: - (void) printf(gettext(" status: One or more devices " - "are 
offlined.\n")); - break; - - case ZPOOL_STATUS_CORRUPT_POOL: - (void) printf(gettext(" status: The pool metadata is " - "corrupted.\n")); - break; - - case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext(" status: The pool is formatted using a " - "legacy on-disk version.\n")); - break; - - case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext(" status: The pool is formatted using an " - "incompatible version.\n")); - break; - - case ZPOOL_STATUS_FEAT_DISABLED: - (void) printf(gettext(" status: Some supported features are " - "not enabled on the pool.\n")); - break; - - case ZPOOL_STATUS_UNSUP_FEAT_READ: - (void) printf(gettext("status: The pool uses the following " - "feature(s) not supported on this system:\n")); - zpool_print_unsup_feat(config); - break; - - case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - (void) printf(gettext("status: The pool can only be accessed " - "in read-only mode on this system. It\n\tcannot be " - "accessed in read-write mode because it uses the " - "following\n\tfeature(s) not supported on this system:\n")); - zpool_print_unsup_feat(config); - break; - - case ZPOOL_STATUS_HOSTID_ACTIVE: - (void) printf(gettext(" status: The pool is currently " - "imported by another system.\n")); - break; - - case ZPOOL_STATUS_HOSTID_REQUIRED: - (void) printf(gettext(" status: The pool has the " - "multihost property on. 
It cannot\n\tbe safely imported " - "when the system hostid is not set.\n")); - break; - - case ZPOOL_STATUS_HOSTID_MISMATCH: - (void) printf(gettext(" status: The pool was last accessed by " - "another system.\n")); - break; - - case ZPOOL_STATUS_FAULTED_DEV_R: - case ZPOOL_STATUS_FAULTED_DEV_NR: - (void) printf(gettext(" status: One or more devices are " - "faulted.\n")); - break; - - case ZPOOL_STATUS_BAD_LOG: - (void) printf(gettext(" status: An intent log record cannot be " - "read.\n")); - break; - - case ZPOOL_STATUS_RESILVERING: - (void) printf(gettext(" status: One or more devices were being " - "resilvered.\n")); - break; - - case ZPOOL_STATUS_NON_NATIVE_ASHIFT: - (void) printf(gettext("status: One or more devices were " - "configured to use a non-native block size.\n" - "\tExpect reduced performance.\n")); - break; - - default: - /* - * No other status can be seen when importing pools. - */ - assert(reason == ZPOOL_STATUS_OK); - } - - /* - * Print out an action according to the overall state of the pool. - */ - if (vs->vs_state == VDEV_STATE_HEALTHY) { - if (reason == ZPOOL_STATUS_VERSION_OLDER || - reason == ZPOOL_STATUS_FEAT_DISABLED) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric identifier, " - "though\n\tsome features will not be available " - "without an explicit 'zpool upgrade'.\n")); - } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric " - "identifier and\n\tthe '-f' flag.\n")); - } else { - (void) printf(gettext(" action: The pool can be " - "imported using its name or numeric " - "identifier.\n")); - } - } else if (vs->vs_state == VDEV_STATE_DEGRADED) { - (void) printf(gettext(" action: The pool can be imported " - "despite missing or damaged devices. 
The\n\tfault " - "tolerance of the pool may be compromised if imported.\n")); - } else { - switch (reason) { - case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext(" action: The pool cannot be " - "imported. Access the pool on a system running " - "newer\n\tsoftware, or recreate the pool from " - "backup.\n")); - break; - case ZPOOL_STATUS_UNSUP_FEAT_READ: - (void) printf(gettext("action: The pool cannot be " - "imported. Access the pool on a system that " - "supports\n\tthe required feature(s), or recreate " - "the pool from backup.\n")); - break; - case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - (void) printf(gettext("action: The pool cannot be " - "imported in read-write mode. Import the pool " - "with\n" - "\t\"-o readonly=on\", access the pool on a system " - "that supports the\n\trequired feature(s), or " - "recreate the pool from backup.\n")); - break; - case ZPOOL_STATUS_MISSING_DEV_R: - case ZPOOL_STATUS_MISSING_DEV_NR: - case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext(" action: The pool cannot be " - "imported. Attach the missing\n\tdevices and try " - "again.\n")); - break; - case ZPOOL_STATUS_HOSTID_ACTIVE: - VERIFY0(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); - - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) - hostname = fnvlist_lookup_string(nvinfo, - ZPOOL_CONFIG_MMP_HOSTNAME); - - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) - hostid = fnvlist_lookup_uint64(nvinfo, - ZPOOL_CONFIG_MMP_HOSTID); - - (void) printf(gettext(" action: The pool must be " - "exported from %s (hostid=%lx)\n\tbefore it " - "can be safely imported.\n"), hostname, - (unsigned long) hostid); - break; - case ZPOOL_STATUS_HOSTID_REQUIRED: - (void) printf(gettext(" action: Check the SMF " - "svc:/system/hostid service.\n")); - break; - default: - (void) printf(gettext(" action: The pool cannot be " - "imported due to damaged devices or data.\n")); - } - } - - /* Print the comment attached to the pool. 
*/ - if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) - (void) printf(gettext("comment: %s\n"), comment); - - /* - * If the state is "closed" or "can't open", and the aux state - * is "corrupt data": - */ - if (((vs->vs_state == VDEV_STATE_CLOSED) || - (vs->vs_state == VDEV_STATE_CANT_OPEN)) && - (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { - if (pool_state == POOL_STATE_DESTROYED) - (void) printf(gettext("\tThe pool was destroyed, " - "but can be imported using the '-Df' flags.\n")); - else if (pool_state != POOL_STATE_EXPORTED) - (void) printf(gettext("\tThe pool may be active on " - "another system, but can be imported using\n\t" - "the '-f' flag.\n")); - } - - if (msgid != NULL) - (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), - msgid); - - (void) printf(gettext(" config:\n\n")); - - cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, 0); - if (cb.cb_namewidth < 10) - cb.cb_namewidth = 10; - - print_import_config(&cb, name, nvroot, 0); - - print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); - print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); - print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); - - if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { - (void) printf(gettext("\n\tAdditional devices are known to " - "be part of this pool, though their\n\texact " - "configuration cannot be determined.\n")); - } -} - -static boolean_t -zfs_force_import_required(nvlist_t *config) -{ - uint64_t state; - uint64_t hostid = 0; - nvlist_t *nvinfo; - - state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); - - if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) - return (B_TRUE); - - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { - mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, - ZPOOL_CONFIG_MMP_STATE); - - if (mmp_state != MMP_STATE_INACTIVE) - return 
(B_TRUE); - } - - return (B_FALSE); -} - -/* - * Perform the import for the given configuration. This passes the heavy - * lifting off to zpool_import_props(), and then mounts the datasets contained - * within the pool. - */ -static int -do_import(nvlist_t *config, const char *newname, const char *mntopts, - nvlist_t *props, int flags) -{ - zpool_handle_t *zhp; - char *name; - uint64_t version; - - name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); - version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); - - if (!SPA_VERSION_IS_SUPPORTED(version)) { - (void) fprintf(stderr, gettext("cannot import '%s': pool " - "is formatted using an unsupported ZFS version\n"), name); - return (1); - } else if (zfs_force_import_required(config) && - !(flags & ZFS_IMPORT_ANY_HOST)) { - mmp_state_t mmp_state = MMP_STATE_INACTIVE; - nvlist_t *nvinfo; - - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) - mmp_state = fnvlist_lookup_uint64(nvinfo, - ZPOOL_CONFIG_MMP_STATE); - - if (mmp_state == MMP_STATE_ACTIVE) { - char *hostname = ""; - uint64_t hostid = 0; - - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) - hostname = fnvlist_lookup_string(nvinfo, - ZPOOL_CONFIG_MMP_HOSTNAME); - - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) - hostid = fnvlist_lookup_uint64(nvinfo, - ZPOOL_CONFIG_MMP_HOSTID); - - (void) fprintf(stderr, gettext("cannot import '%s': " - "pool is imported on %s (hostid: " - "0x%lx)\nExport the pool on the other system, " - "then run 'zpool import'.\n"), - name, hostname, (unsigned long) hostid); - } else if (mmp_state == MMP_STATE_NO_HOSTID) { - (void) fprintf(stderr, gettext("Cannot import '%s': " - "pool has the multihost property on and the\n" - "system's hostid is not set.\n"), name); - } else { - char *hostname = ""; - uint64_t timestamp = 0; - uint64_t hostid = 0; - - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) - hostname = fnvlist_lookup_string(config, - 
ZPOOL_CONFIG_HOSTNAME); - - if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) - timestamp = fnvlist_lookup_uint64(config, - ZPOOL_CONFIG_TIMESTAMP); - - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) - hostid = fnvlist_lookup_uint64(config, - ZPOOL_CONFIG_HOSTID); - - (void) fprintf(stderr, gettext("cannot import '%s': " - "pool was previously in use from another system.\n" - "Last accessed by %s (hostid=%lx) at %s" - "The pool can be imported, use 'zpool import -f' " - "to import the pool.\n"), name, hostname, - (unsigned long)hostid, ctime((time_t *)×tamp)); - - } - - return (1); - } - - if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) - return (1); - - if (newname != NULL) - name = (char *)newname; - - if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) - return (1); - - if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && - !(flags & ZFS_IMPORT_ONLY) && - zpool_enable_datasets(zhp, mntopts, 0) != 0) { - zpool_close(zhp); - return (1); - } - - zpool_close(zhp); - return (0); -} - -/* - * zpool checkpoint - * checkpoint --discard - * - * -d Discard the checkpoint from a checkpointed - * --discard pool. - * - * Checkpoints the specified pool, by taking a "snapshot" of its - * current state. A pool can only have one checkpoint at a time. 
- */ -int -zpool_do_checkpoint(int argc, char **argv) -{ - boolean_t discard; - char *pool; - zpool_handle_t *zhp; - int c, err; - - struct option long_options[] = { - {"discard", no_argument, NULL, 'd'}, - {0, 0, 0, 0} - }; - - discard = B_FALSE; - while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) { - switch (c) { - case 'd': - discard = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool argument\n")); - usage(B_FALSE); - } - - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - pool = argv[0]; - - if ((zhp = zpool_open(g_zfs, pool)) == NULL) { - /* As a special case, check for use of '/' in the name */ - if (strchr(pool, '/') != NULL) - (void) fprintf(stderr, gettext("'zpool checkpoint' " - "doesn't work on datasets. To save the state " - "of a dataset from a specific point in time " - "please use 'zfs snapshot'\n")); - return (1); - } - - if (discard) - err = (zpool_discard_checkpoint(zhp) != 0); - else - err = (zpool_checkpoint(zhp) != 0); - - zpool_close(zhp); - - return (err); -} - -#define CHECKPOINT_OPT 1024 - -/* - * zpool import [-d dir] [-D] - * import [-o mntopts] [-o prop=value] ... [-R root] [-D] - * [-d dir | -c cachefile] [-f] -a - * import [-o mntopts] [-o prop=value] ... [-R root] [-D] - * [-d dir | -c cachefile] [-f] [-n] [-F] [-t] - * [newpool] - * - * -c Read pool information from a cachefile instead of searching - * devices. - * - * -d Scan in a specific directory, other than /dev/dsk. More than - * one directory can be specified using multiple '-d' options. - * - * -D Scan for previously destroyed pools or import all or only - * specified destroyed pools. - * - * -R Temporarily import the pool, with all mountpoints relative to - * the given root. 
The pool will remain exported when the machine - * is rebooted. - * - * -V Import even in the presence of faulted vdevs. This is an - * intentionally undocumented option for testing purposes, and - * treats the pool configuration as complete, leaving any bad - * vdevs in the FAULTED state. In other words, it does verbatim - * import. - * - * -f Force import, even if it appears that the pool is active. - * - * -F Attempt rewind if necessary. - * - * -n See if rewind would work, but don't actually rewind. - * - * -N Import the pool but don't mount datasets. - * - * -t Use newpool as a temporary pool name instead of renaming - * the pool. - * - * -T Specify a starting txg to use for import. This option is - * intentionally undocumented option for testing purposes. - * - * -a Import all pools found. - * - * -o Set property=value and/or temporary mount options (without '='). - * - * --rewind-to-checkpoint - * Import the pool and revert back to the checkpoint. - * - * The import command scans for pools to import, and import pools based on pool - * name and GUID. The pool can also be renamed as part of the import process. 
- */ -int -zpool_do_import(int argc, char **argv) -{ - char **searchdirs = NULL; - int nsearch = 0; - int c; - int err = 0; - nvlist_t *pools = NULL; - boolean_t do_all = B_FALSE; - boolean_t do_destroyed = B_FALSE; - char *mntopts = NULL; - nvpair_t *elem; - nvlist_t *config; - uint64_t searchguid = 0; - char *searchname = NULL; - char *propval; - nvlist_t *found_config; - nvlist_t *policy = NULL; - nvlist_t *props = NULL; - boolean_t first; - int flags = ZFS_IMPORT_NORMAL; - uint32_t rewind_policy = ZPOOL_NO_REWIND; - boolean_t dryrun = B_FALSE; - boolean_t do_rewind = B_FALSE; - boolean_t xtreme_rewind = B_FALSE; - uint64_t pool_state, txg = -1ULL; - char *cachefile = NULL; - importargs_t idata = { 0 }; - char *endptr; - - - struct option long_options[] = { - {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, - {0, 0, 0, 0} - }; - - /* check options */ - while ((c = getopt_long(argc, argv, ":aCc:d:DEfFmnNo:rR:tT:VX", - long_options, NULL)) != -1) { - switch (c) { - case 'a': - do_all = B_TRUE; - break; - case 'c': - cachefile = optarg; - break; - case 'd': - if (searchdirs == NULL) { - searchdirs = safe_malloc(sizeof (char *)); - } else { - char **tmp = safe_malloc((nsearch + 1) * - sizeof (char *)); - bcopy(searchdirs, tmp, nsearch * - sizeof (char *)); - free(searchdirs); - searchdirs = tmp; - } - searchdirs[nsearch++] = optarg; - break; - case 'D': - do_destroyed = B_TRUE; - break; - case 'f': - flags |= ZFS_IMPORT_ANY_HOST; - break; - case 'F': - do_rewind = B_TRUE; - break; - case 'm': - flags |= ZFS_IMPORT_MISSING_LOG; - break; - case 'n': - dryrun = B_TRUE; - break; - case 'N': - flags |= ZFS_IMPORT_ONLY; - break; - case 'o': - if ((propval = strchr(optarg, '=')) != NULL) { - *propval = '\0'; - propval++; - if (add_prop_list(optarg, propval, - &props, B_TRUE)) - goto error; - } else { - mntopts = optarg; - } - break; - case 'R': - if (add_prop_list(zpool_prop_to_name( - ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) - goto error; - if 
(add_prop_list_default(zpool_prop_to_name( - ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) - goto error; - break; - case 't': - flags |= ZFS_IMPORT_TEMP_NAME; - if (add_prop_list_default(zpool_prop_to_name( - ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) - goto error; - break; - case 'T': - errno = 0; - txg = strtoull(optarg, &endptr, 0); - if (errno != 0 || *endptr != '\0') { - (void) fprintf(stderr, - gettext("invalid txg value\n")); - usage(B_FALSE); - } - rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; - break; - case 'V': - flags |= ZFS_IMPORT_VERBATIM; - break; - case 'X': - xtreme_rewind = B_TRUE; - break; - case CHECKPOINT_OPT: - flags |= ZFS_IMPORT_CHECKPOINT; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (cachefile && nsearch != 0) { - (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); - usage(B_FALSE); - } - - if ((dryrun || xtreme_rewind) && !do_rewind) { - (void) fprintf(stderr, - gettext("-n or -X only meaningful with -F\n")); - usage(B_FALSE); - } - if (dryrun) - rewind_policy = ZPOOL_TRY_REWIND; - else if (do_rewind) - rewind_policy = ZPOOL_DO_REWIND; - if (xtreme_rewind) - rewind_policy |= ZPOOL_EXTREME_REWIND; - - /* In the future, we can capture further policy and include it here */ - if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || - nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || - nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, - rewind_policy) != 0) - goto error; - - if (searchdirs == NULL) { - searchdirs = safe_malloc(sizeof (char *)); - searchdirs[0] = "/dev"; - nsearch = 1; - } - - /* check argument count */ - if (do_all) { - if (argc != 0) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - } else { - if (argc > 2) { - 
(void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - /* - * Check for the SYS_CONFIG privilege. We do this explicitly - * here because otherwise any attempt to discover pools will - * silently fail. - */ - if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) { - (void) fprintf(stderr, gettext("cannot " - "discover pools: permission denied\n")); - free(searchdirs); - nvlist_free(policy); - return (1); - } - } - - /* - * Depending on the arguments given, we do one of the following: - * - * Iterate through all pools and display information about - * each one. - * - * -a Iterate through all pools and try to import each one. - * - * Find the pool that corresponds to the given GUID/pool - * name and import that one. - * - * -D Above options applies only to destroyed pools. - */ - if (argc != 0) { - char *endptr; - - errno = 0; - searchguid = strtoull(argv[0], &endptr, 10); - if (errno != 0 || *endptr != '\0') { - searchname = argv[0]; - searchguid = 0; - } - found_config = NULL; - - /* - * User specified a name or guid. Ensure it's unique. 
- */ - idata.unique = B_TRUE; - } - - - idata.path = searchdirs; - idata.paths = nsearch; - idata.poolname = searchname; - idata.guid = searchguid; - idata.cachefile = cachefile; - idata.policy = policy; - - pools = zpool_search_import(g_zfs, &idata); - - if (pools != NULL && idata.exists && - (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { - (void) fprintf(stderr, gettext("cannot import '%s': " - "a pool with that name already exists\n"), - argv[0]); - (void) fprintf(stderr, gettext("use the form 'zpool import " - "[-t] ' to give it a new temporary " - "or permanent name\n")); - err = 1; - } else if (pools == NULL && idata.exists) { - (void) fprintf(stderr, gettext("cannot import '%s': " - "a pool with that name is already created/imported,\n"), - argv[0]); - (void) fprintf(stderr, gettext("and no additional pools " - "with that name were found\n")); - err = 1; - } else if (pools == NULL) { - if (argc != 0) { - (void) fprintf(stderr, gettext("cannot import '%s': " - "no such pool available\n"), argv[0]); - } - err = 1; - } - - if (err == 1) { - free(searchdirs); - nvlist_free(policy); - return (1); - } - - /* - * At this point we have a list of import candidate configs. Even if - * we were searching by pool name or guid, we still need to - * post-process the list to deal with pool state and possible - * duplicate names. 
- */ - err = 0; - elem = NULL; - first = B_TRUE; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { - - verify(nvpair_value_nvlist(elem, &config) == 0); - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &pool_state) == 0); - if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) - continue; - if (do_destroyed && pool_state != POOL_STATE_DESTROYED) - continue; - - verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, - policy) == 0); - - if (argc == 0) { - if (first) - first = B_FALSE; - else if (!do_all) - (void) printf("\n"); - - if (do_all) { - err |= do_import(config, NULL, mntopts, - props, flags); - } else { - show_import(config); - } - } else if (searchname != NULL) { - char *name; - - /* - * We are searching for a pool based on name. - */ - verify(nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, &name) == 0); - - if (strcmp(name, searchname) == 0) { - if (found_config != NULL) { - (void) fprintf(stderr, gettext( - "cannot import '%s': more than " - "one matching pool\n"), searchname); - (void) fprintf(stderr, gettext( - "import by numeric ID instead\n")); - err = B_TRUE; - } - found_config = config; - } - } else { - uint64_t guid; - - /* - * Search for a pool by guid. - */ - verify(nvlist_lookup_uint64(config, - ZPOOL_CONFIG_POOL_GUID, &guid) == 0); - - if (guid == searchguid) - found_config = config; - } - } - - /* - * If we were searching for a specific pool, verify that we found a - * pool, and then do the import. - */ - if (argc != 0 && err == 0) { - if (found_config == NULL) { - (void) fprintf(stderr, gettext("cannot import '%s': " - "no such pool available\n"), argv[0]); - err = B_TRUE; - } else { - err |= do_import(found_config, argc == 1 ? NULL : - argv[1], mntopts, props, flags); - } - } - - /* - * If we were just looking for pools, report an error if none were - * found. 
- */ - if (argc == 0 && first) - (void) fprintf(stderr, - gettext("no pools available to import\n")); - -error: - nvlist_free(props); - nvlist_free(pools); - nvlist_free(policy); - free(searchdirs); - - return (err ? 1 : 0); -} - -/* - * zpool sync [-f] [pool] ... - * - * -f (undocumented) force uberblock (and config including zpool cache file) - * update. - * - * Sync the specified pool(s). - * Without arguments "zpool sync" will sync all pools. - * This command initiates TXG sync(s) and will return after the TXG(s) commit. - * - */ -static int -zpool_do_sync(int argc, char **argv) -{ - int ret; - boolean_t force = B_FALSE; - - /* check options */ - while ((ret = getopt(argc, argv, "f")) != -1) { - switch (ret) { - case 'f': - force = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* if argc == 0 we will execute zpool_sync_one on all pools */ - ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); - - return (ret); -} - -typedef struct iostat_cbdata { - boolean_t cb_verbose; - int cb_name_flags; - int cb_namewidth; - int cb_iteration; - boolean_t cb_scripted; - zpool_list_t *cb_list; -} iostat_cbdata_t; - -static void -print_iostat_separator(iostat_cbdata_t *cb) -{ - int i = 0; - - for (i = 0; i < cb->cb_namewidth; i++) - (void) printf("-"); - (void) printf(" ----- ----- ----- ----- ----- -----\n"); -} - -static void -print_iostat_header(iostat_cbdata_t *cb) -{ - (void) printf("%*s capacity operations bandwidth\n", - cb->cb_namewidth, ""); - (void) printf("%-*s alloc free read write read write\n", - cb->cb_namewidth, "pool"); - print_iostat_separator(cb); -} - -/* - * Display a single statistic. 
- */ -static void -print_one_stat(uint64_t value) -{ - char buf[64]; - - zfs_nicenum(value, buf, sizeof (buf)); - (void) printf(" %5s", buf); -} - -static const char *class_name[] = { - VDEV_ALLOC_BIAS_DEDUP, - VDEV_ALLOC_BIAS_SPECIAL, - VDEV_ALLOC_CLASS_LOGS -}; - -/* - * Print out all the statistics for the given vdev. This can either be the - * toplevel configuration, or called recursively. If 'name' is NULL, then this - * is a verbose output, and we don't want to display the toplevel pool stats. - * - * Returns the number of stat lines printed. - */ -static unsigned int -print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, - nvlist_t *newnv, iostat_cbdata_t *cb, int depth) -{ - nvlist_t **oldchild, **newchild; - uint_t c, children; - vdev_stat_t *oldvs, *newvs; - vdev_stat_t zerovs = { 0 }; - char *vname; - int ret = 0; - uint64_t tdelta; - double scale; - - if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) - return (ret); - - if (oldnv != NULL) { - verify(nvlist_lookup_uint64_array(oldnv, - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); - } else { - oldvs = &zerovs; - } - - verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&newvs, &c) == 0); - - if (strlen(name) + depth > cb->cb_namewidth) - (void) printf("%*s%s", depth, "", name); - else - (void) printf("%*s%s%*s", depth, "", name, - (int)(cb->cb_namewidth - strlen(name) - depth), ""); - - tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; - - if (tdelta == 0) - scale = 1.0; - else - scale = (double)NANOSEC / tdelta; - - /* only toplevel vdevs have capacity stats */ - if (newvs->vs_space == 0) { - (void) printf(" - -"); - } else { - print_one_stat(newvs->vs_alloc); - print_one_stat(newvs->vs_space - newvs->vs_alloc); - } - - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] - - oldvs->vs_ops[ZIO_TYPE_READ]))); - - print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] - - oldvs->vs_ops[ZIO_TYPE_WRITE]))); - - 
print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] - - oldvs->vs_bytes[ZIO_TYPE_READ]))); - - print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] - - oldvs->vs_bytes[ZIO_TYPE_WRITE]))); - - (void) printf("\n"); - - if (!cb->cb_verbose) - return (ret); - - if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, - &newchild, &children) != 0) - return (ret); - - if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, - &oldchild, &c) != 0) - return (ret); - - /* - * print normal top-level devices - */ - for (c = 0; c < children; c++) { - uint64_t ishole = B_FALSE, islog = B_FALSE; - - (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, - &ishole); - - (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, - &islog); - - if (ishole || islog) - continue; - - if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) - continue; - - vname = zpool_vdev_name(g_zfs, zhp, newchild[c], - cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, - newchild[c], cb, depth + 2); - free(vname); - } - - /* - * print all other top-level devices - */ - for (uint_t n = 0; n < 3; n++) { - for (c = 0; c < children; c++) { - uint64_t islog = B_FALSE; - char *bias = NULL; - char *type = NULL; - - (void) nvlist_lookup_uint64(newchild[c], - ZPOOL_CONFIG_IS_LOG, &islog); - if (islog) { - bias = VDEV_ALLOC_CLASS_LOGS; - } else { - (void) nvlist_lookup_string(newchild[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); - (void) nvlist_lookup_string(newchild[c], - ZPOOL_CONFIG_TYPE, &type); - } - if (bias == NULL || strcmp(bias, class_name[n]) != 0) - continue; - if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) - continue; - - vname = zpool_vdev_name(g_zfs, zhp, newchild[c], - cb->cb_name_flags); - ret += print_vdev_stats(zhp, vname, oldnv ? 
- oldchild[c] : NULL, newchild[c], cb, depth + 2); - free(vname); - } - - } - - /* - * Include level 2 ARC devices in iostat output - */ - if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, - &newchild, &children) != 0) - return (ret); - - if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, - &oldchild, &c) != 0) - return (ret); - - if (children > 0) { - (void) printf("%-*s - - - - - " - "-\n", cb->cb_namewidth, "cache"); - for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, newchild[c], - cb->cb_name_flags); - print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, - newchild[c], cb, depth + 2); - free(vname); - } - } - - return (ret); -} - -static int -refresh_iostat(zpool_handle_t *zhp, void *data) -{ - iostat_cbdata_t *cb = data; - boolean_t missing; - - /* - * If the pool has disappeared, remove it from the list and continue. - */ - if (zpool_refresh_stats(zhp, &missing) != 0) - return (-1); - - if (missing) - pool_list_remove(cb->cb_list, zhp); - - return (0); -} - -/* - * Callback to print out the iostats for the given pool. - */ -int -print_iostat(zpool_handle_t *zhp, void *data) -{ - iostat_cbdata_t *cb = data; - nvlist_t *oldconfig, *newconfig; - nvlist_t *oldnvroot, *newnvroot; - - newconfig = zpool_get_config(zhp, &oldconfig); - - if (cb->cb_iteration == 1) - oldconfig = NULL; - - verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, - &newnvroot) == 0); - - if (oldconfig == NULL) - oldnvroot = NULL; - else - verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, - &oldnvroot) == 0); - - /* - * Print out the statistics for the pool. 
- */ - print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); - - if (cb->cb_verbose) - print_iostat_separator(cb); - - return (0); -} - -int -get_namewidth(zpool_handle_t *zhp, void *data) -{ - iostat_cbdata_t *cb = data; - nvlist_t *config, *nvroot; - - if ((config = zpool_get_config(zhp, NULL)) != NULL) { - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (!cb->cb_verbose) - cb->cb_namewidth = strlen(zpool_get_name(zhp)); - else - cb->cb_namewidth = max_width(zhp, nvroot, 0, - cb->cb_namewidth, cb->cb_name_flags); - } - - /* - * The width must fall into the range [10,38]. The upper limit is the - * maximum we can have and still fit in 80 columns. - */ - if (cb->cb_namewidth < 10) - cb->cb_namewidth = 10; - if (cb->cb_namewidth > 38) - cb->cb_namewidth = 38; - - return (0); -} - -/* - * Parse the input string, get the 'interval' and 'count' value if there is one. - */ -static void -get_interval_count(int *argcp, char **argv, unsigned long *iv, - unsigned long *cnt) -{ - unsigned long interval = 0, count = 0; - int argc = *argcp, errno; - - /* - * Determine if the last argument is an integer or a pool name - */ - if (argc > 0 && isdigit(argv[argc - 1][0])) { - char *end; - - errno = 0; - interval = strtoul(argv[argc - 1], &end, 10); - - if (*end == '\0' && errno == 0) { - if (interval == 0) { - (void) fprintf(stderr, gettext("interval " - "cannot be zero\n")); - usage(B_FALSE); - } - /* - * Ignore the last parameter - */ - argc--; - } else { - /* - * If this is not a valid number, just plow on. The - * user will get a more informative error message later - * on. - */ - interval = 0; - } - } - - /* - * If the last argument is also an integer, then we have both a count - * and an interval. 
- */ - if (argc > 0 && isdigit(argv[argc - 1][0])) { - char *end; - - errno = 0; - count = interval; - interval = strtoul(argv[argc - 1], &end, 10); - - if (*end == '\0' && errno == 0) { - if (interval == 0) { - (void) fprintf(stderr, gettext("interval " - "cannot be zero\n")); - usage(B_FALSE); - } - - /* - * Ignore the last parameter - */ - argc--; - } else { - interval = 0; - } - } - - *iv = interval; - *cnt = count; - *argcp = argc; -} - -static void -get_timestamp_arg(char c) -{ - if (c == 'u') - timestamp_fmt = UDATE; - else if (c == 'd') - timestamp_fmt = DDATE; - else - usage(B_FALSE); -} - -/* - * zpool iostat [-gLPv] [-T d|u] [pool] ... [interval [count]] - * - * -g Display guid for individual vdev name. - * -L Follow links when resolving vdev path name. - * -P Display full path for vdev name. - * -v Display statistics for individual vdevs - * -T Display a timestamp in date(1) or Unix format - * - * This command can be tricky because we want to be able to deal with pool - * creation/destruction as well as vdev configuration changes. The bulk of this - * processing is handled by the pool_list_* routines in zpool_iter.c. We rely - * on pool_list_update() to detect the addition of new pools. Configuration - * changes are all handled within libzfs. 
- */ -int -zpool_do_iostat(int argc, char **argv) -{ - int c; - int ret; - int npools; - unsigned long interval = 0, count = 0; - zpool_list_t *list; - boolean_t verbose = B_FALSE; - boolean_t guid = B_FALSE; - boolean_t follow_links = B_FALSE; - boolean_t full_name = B_FALSE; - iostat_cbdata_t cb = { 0 }; - - /* check options */ - while ((c = getopt(argc, argv, "gLPT:v")) != -1) { - switch (c) { - case 'g': - guid = B_TRUE; - break; - case 'L': - follow_links = B_TRUE; - break; - case 'P': - full_name = B_TRUE; - break; - case 'T': - get_timestamp_arg(*optarg); - break; - case 'v': - verbose = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - get_interval_count(&argc, argv, &interval, &count); - - /* - * Construct the list of all interesting pools. - */ - ret = 0; - if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) - return (1); - - if (pool_list_count(list) == 0 && argc != 0) { - pool_list_free(list); - return (1); - } - - if (pool_list_count(list) == 0 && interval == 0) { - pool_list_free(list); - (void) fprintf(stderr, gettext("no pools available\n")); - return (1); - } - - /* - * Enter the main iostat loop. - */ - cb.cb_list = list; - cb.cb_verbose = verbose; - if (guid) - cb.cb_name_flags |= VDEV_NAME_GUID; - if (follow_links) - cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; - if (full_name) - cb.cb_name_flags |= VDEV_NAME_PATH; - cb.cb_iteration = 0; - cb.cb_namewidth = 0; - - for (;;) { - pool_list_update(list); - - if ((npools = pool_list_count(list)) == 0) - break; - - /* - * Refresh all statistics. This is done as an explicit step - * before calculating the maximum name width, so that any - * configuration changes are properly accounted for. - */ - (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); - - /* - * Iterate over all pools to determine the maximum width - * for the pool / device name column across all pools. 
- */ - cb.cb_namewidth = 0; - (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); - - if (timestamp_fmt != NODATE) - print_timestamp(timestamp_fmt); - - /* - * If it's the first time, or verbose mode, print the header. - */ - if (++cb.cb_iteration == 1 || verbose) - print_iostat_header(&cb); - - (void) pool_list_iter(list, B_FALSE, print_iostat, &cb); - - /* - * If there's more than one pool, and we're not in verbose mode - * (which prints a separator for us), then print a separator. - */ - if (npools > 1 && !verbose) - print_iostat_separator(&cb); - - if (verbose) - (void) printf("\n"); - - /* - * Flush the output so that redirection to a file isn't buffered - * indefinitely. - */ - (void) fflush(stdout); - - if (interval == 0) - break; - - if (count != 0 && --count == 0) - break; - - (void) sleep(interval); - } - - pool_list_free(list); - - return (ret); -} - -typedef struct list_cbdata { - boolean_t cb_verbose; - int cb_name_flags; - int cb_namewidth; - boolean_t cb_scripted; - zprop_list_t *cb_proplist; - boolean_t cb_literal; -} list_cbdata_t; - - -/* - * Given a list of columns to display, output appropriate headers for each one. - */ -static void -print_header(list_cbdata_t *cb) -{ - zprop_list_t *pl = cb->cb_proplist; - char headerbuf[ZPOOL_MAXPROPLEN]; - const char *header; - boolean_t first = B_TRUE; - boolean_t right_justify; - size_t width = 0; - - for (; pl != NULL; pl = pl->pl_next) { - width = pl->pl_width; - if (first && cb->cb_verbose) { - /* - * Reset the width to accommodate the verbose listing - * of devices. 
- */ - width = cb->cb_namewidth; - } - - if (!first) - (void) printf(" "); - else - first = B_FALSE; - - right_justify = B_FALSE; - if (pl->pl_prop != ZPROP_INVAL) { - header = zpool_prop_column_name(pl->pl_prop); - right_justify = zpool_prop_align_right(pl->pl_prop); - } else { - int i; - - for (i = 0; pl->pl_user_prop[i] != '\0'; i++) - headerbuf[i] = toupper(pl->pl_user_prop[i]); - headerbuf[i] = '\0'; - header = headerbuf; - } - - if (pl->pl_next == NULL && !right_justify) - (void) printf("%s", header); - else if (right_justify) - (void) printf("%*s", width, header); - else - (void) printf("%-*s", width, header); - - } - - (void) printf("\n"); -} - -/* - * Given a pool and a list of properties, print out all the properties according - * to the described layout. Used by zpool_do_list(). - */ -static void -print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) -{ - zprop_list_t *pl = cb->cb_proplist; - boolean_t first = B_TRUE; - char property[ZPOOL_MAXPROPLEN]; - char *propstr; - boolean_t right_justify; - size_t width; - - for (; pl != NULL; pl = pl->pl_next) { - - width = pl->pl_width; - if (first && cb->cb_verbose) { - /* - * Reset the width to accommodate the verbose listing - * of devices. 
- */ - width = cb->cb_namewidth; - } - - if (!first) { - if (cb->cb_scripted) - (void) printf("\t"); - else - (void) printf(" "); - } else { - first = B_FALSE; - } - - right_justify = B_FALSE; - if (pl->pl_prop != ZPROP_INVAL) { - if (zpool_get_prop(zhp, pl->pl_prop, property, - sizeof (property), NULL, cb->cb_literal) != 0) - propstr = "-"; - else - propstr = property; - - right_justify = zpool_prop_align_right(pl->pl_prop); - } else if ((zpool_prop_feature(pl->pl_user_prop) || - zpool_prop_unsupported(pl->pl_user_prop)) && - zpool_prop_get_feature(zhp, pl->pl_user_prop, property, - sizeof (property)) == 0) { - propstr = property; - } else { - propstr = "-"; - } - - - /* - * If this is being called in scripted mode, or if this is the - * last column and it is left-justified, don't include a width - * format specifier. - */ - if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) - (void) printf("%s", propstr); - else if (right_justify) - (void) printf("%*s", width, propstr); - else - (void) printf("%-*s", width, propstr); - } - - (void) printf("\n"); -} - -static void -print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, - boolean_t valid) -{ - char propval[64]; - boolean_t fixed; - size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); - - switch (prop) { - case ZPOOL_PROP_EXPANDSZ: - case ZPOOL_PROP_CHECKPOINT: - if (value == 0) - (void) strlcpy(propval, "-", sizeof (propval)); - else - zfs_nicenum(value, propval, sizeof (propval)); - break; - case ZPOOL_PROP_FRAGMENTATION: - if (value == ZFS_FRAG_INVALID) { - (void) strlcpy(propval, "-", sizeof (propval)); - } else { - (void) snprintf(propval, sizeof (propval), "%llu%%", - value); - } - break; - case ZPOOL_PROP_CAPACITY: - (void) snprintf(propval, sizeof (propval), - value < 1000 ? "%1.2f%%" : value < 10000 ? 
- "%2.1f%%" : "%3.0f%%", value / 100.0); - break; - default: - zfs_nicenum(value, propval, sizeof (propval)); - } - - if (!valid) - (void) strlcpy(propval, "-", sizeof (propval)); - - if (scripted) - (void) printf("\t%s", propval); - else - (void) printf(" %*s", width, propval); -} - -/* - * print static default line per vdev - */ -void -print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, - list_cbdata_t *cb, int depth) -{ - nvlist_t **child; - vdev_stat_t *vs; - uint_t c, children; - char *vname; - boolean_t scripted = cb->cb_scripted; - uint64_t islog = B_FALSE; - char *dashes = "%-*s - - - - - -\n"; - - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); - - if (name != NULL) { - boolean_t toplevel = (vs->vs_space != 0); - uint64_t cap; - - if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) - return; - - if (scripted) - (void) printf("\t%s", name); - else if (strlen(name) + depth > cb->cb_namewidth) - (void) printf("%*s%s", depth, "", name); - else - (void) printf("%*s%s%*s", depth, "", name, - (int)(cb->cb_namewidth - strlen(name) - depth), ""); - - /* - * Print the properties for the individual vdevs. Some - * properties are only applicable to toplevel vdevs. The - * 'toplevel' boolean value is passed to the print_one_column() - * to indicate that the value is valid. - */ - print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted, - toplevel); - print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted, - toplevel); - print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, - scripted, toplevel); - print_one_column(ZPOOL_PROP_CHECKPOINT, - vs->vs_checkpoint_space, scripted, toplevel); - print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted, - B_TRUE); - print_one_column(ZPOOL_PROP_FRAGMENTATION, - vs->vs_fragmentation, scripted, - (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel)); - cap = (vs->vs_space == 0) ? 
0 : - (vs->vs_alloc * 10000 / vs->vs_space); - print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel); - (void) printf("\n"); - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - return; - - /* list the normal vdevs first */ - for (c = 0; c < children; c++) { - uint64_t ishole = B_FALSE; - - if (nvlist_lookup_uint64(child[c], - ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) - continue; - - if (nvlist_lookup_uint64(child[c], - ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) - continue; - - if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) - continue; - - vname = zpool_vdev_name(g_zfs, zhp, child[c], - cb->cb_name_flags); - print_list_stats(zhp, vname, child[c], cb, depth + 2); - free(vname); - } - - /* list the classes: 'logs', 'dedup', and 'special' */ - for (uint_t n = 0; n < 3; n++) { - boolean_t printed = B_FALSE; - - for (c = 0; c < children; c++) { - char *bias = NULL; - char *type = NULL; - - if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &islog) == 0 && islog) { - bias = VDEV_ALLOC_CLASS_LOGS; - } else { - (void) nvlist_lookup_string(child[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); - (void) nvlist_lookup_string(child[c], - ZPOOL_CONFIG_TYPE, &type); - } - if (bias == NULL || strcmp(bias, class_name[n]) != 0) - continue; - if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) - continue; - - if (!printed) { - /* LINTED E_SEC_PRINTF_VAR_FMT */ - (void) printf(dashes, cb->cb_namewidth, - class_name[n]); - printed = B_TRUE; - } - vname = zpool_vdev_name(g_zfs, zhp, child[c], - cb->cb_name_flags); - print_list_stats(zhp, vname, child[c], cb, depth + 2); - free(vname); - } - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, - &child, &children) == 0 && children > 0) { - /* LINTED E_SEC_PRINTF_VAR_FMT */ - (void) printf(dashes, cb->cb_namewidth, "cache"); - for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, child[c], - cb->cb_name_flags); - print_list_stats(zhp, 
vname, child[c], cb, depth + 2); - free(vname); - } - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, - &children) == 0 && children > 0) { - /* LINTED E_SEC_PRINTF_VAR_FMT */ - (void) printf(dashes, cb->cb_namewidth, "spare"); - for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, child[c], - cb->cb_name_flags); - print_list_stats(zhp, vname, child[c], cb, depth + 2); - free(vname); - } - } -} - -/* - * Generic callback function to list a pool. - */ -int -list_callback(zpool_handle_t *zhp, void *data) -{ - list_cbdata_t *cbp = data; - nvlist_t *config; - nvlist_t *nvroot; - - config = zpool_get_config(zhp, NULL); - - if (cbp->cb_verbose) { - config = zpool_get_config(zhp, NULL); - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - } - - if (cbp->cb_verbose) - cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, - cbp->cb_name_flags); - - print_pool(zhp, cbp); - - if (cbp->cb_verbose) - print_list_stats(zhp, NULL, nvroot, cbp, 0); - - return (0); -} - -/* - * zpool list [-gHLP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] - * - * -g Display guid for individual vdev name. - * -H Scripted mode. Don't display headers, and separate properties - * by a single tab. - * -L Follow links when resolving vdev path name. - * -o List of properties to display. Defaults to - * "name,size,allocated,free,expandsize,fragmentation,capacity," - * "dedupratio,health,altroot" - * -p Diplay values in parsable (exact) format. - * -P Display full path for vdev name. - * -T Display a timestamp in date(1) or Unix format - * - * List all pools in the system, whether or not they're healthy. Output space - * statistics for each one, as well as health status summary. 
- */ -int -zpool_do_list(int argc, char **argv) -{ - int c; - int ret; - list_cbdata_t cb = { 0 }; - static char default_props[] = - "name,size,allocated,free,checkpoint,expandsize,fragmentation," - "capacity,dedupratio,health,altroot"; - char *props = default_props; - unsigned long interval = 0, count = 0; - zpool_list_t *list; - boolean_t first = B_TRUE; - - /* check options */ - while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { - switch (c) { - case 'g': - cb.cb_name_flags |= VDEV_NAME_GUID; - break; - case 'H': - cb.cb_scripted = B_TRUE; - break; - case 'L': - cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; - break; - case 'o': - props = optarg; - break; - case 'P': - cb.cb_name_flags |= VDEV_NAME_PATH; - break; - case 'p': - cb.cb_literal = B_TRUE; - break; - case 'T': - get_timestamp_arg(*optarg); - break; - case 'v': - cb.cb_verbose = B_TRUE; - cb.cb_namewidth = 8; /* 8 until precalc is avail */ - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - get_interval_count(&argc, argv, &interval, &count); - - if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) - usage(B_FALSE); - - for (;;) { - if ((list = pool_list_get(argc, argv, &cb.cb_proplist, - &ret)) == NULL) - return (1); - - if (pool_list_count(list) == 0) - break; - - cb.cb_namewidth = 0; - (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); - - if (timestamp_fmt != NODATE) - print_timestamp(timestamp_fmt); - - if (!cb.cb_scripted && (first || cb.cb_verbose)) { - print_header(&cb); - first = B_FALSE; - } - ret = pool_list_iter(list, B_TRUE, list_callback, &cb); - - if (interval == 0) - break; - - if (count != 0 && --count == 0) - break; - - pool_list_free(list); - (void) sleep(interval); - } - - if (argc == 0 && !cb.cb_scripted && pool_list_count(list) 
== 0) { - (void) printf(gettext("no pools available\n")); - ret = 0; - } - - pool_list_free(list); - zprop_free_list(cb.cb_proplist); - return (ret); -} - -static int -zpool_do_attach_or_replace(int argc, char **argv, int replacing) -{ - boolean_t force = B_FALSE; - int c; - nvlist_t *nvroot; - char *poolname, *old_disk, *new_disk; - zpool_handle_t *zhp; - zpool_boot_label_t boot_type; - uint64_t boot_size; - int ret; - - /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { - switch (c) { - case 'f': - force = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); - usage(B_FALSE); - } - - poolname = argv[0]; - - if (argc < 2) { - (void) fprintf(stderr, - gettext("missing specification\n")); - usage(B_FALSE); - } - - old_disk = argv[1]; - - if (argc < 3) { - if (!replacing) { - (void) fprintf(stderr, - gettext("missing specification\n")); - usage(B_FALSE); - } - new_disk = old_disk; - argc -= 1; - argv += 1; - } else { - new_disk = argv[2]; - argc -= 2; - argv += 2; - } - - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) - return (1); - - if (zpool_get_config(zhp, NULL) == NULL) { - (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), - poolname); - zpool_close(zhp); - return (1); - } - - if (zpool_is_bootable(zhp)) - boot_type = ZPOOL_COPY_BOOT_LABEL; - else - boot_type = ZPOOL_NO_BOOT_LABEL; - - boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL); - nvroot = make_root_vdev(zhp, force, B_FALSE, replacing, B_FALSE, - boot_type, boot_size, argc, argv); - if (nvroot == NULL) { - zpool_close(zhp); - return (1); - } - - ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); - - 
nvlist_free(nvroot); - zpool_close(zhp); - - return (ret); -} - -/* - * zpool replace [-f] - * - * -f Force attach, even if appears to be in use. - * - * Replace with . - */ -/* ARGSUSED */ -int -zpool_do_replace(int argc, char **argv) -{ - return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); -} - -/* - * zpool attach [-f] - * - * -f Force attach, even if appears to be in use. - * - * Attach to the mirror containing . If is not - * part of a mirror, then will be transformed into a mirror of - * and . In either case, will begin life - * with a DTL of [0, now], and will immediately begin to resilver itself. - */ -int -zpool_do_attach(int argc, char **argv) -{ - return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); -} - -/* - * zpool detach [-f] - * - * -f Force detach of , even if DTLs argue against it - * (not supported yet) - * - * Detach a device from a mirror. The operation will be refused if - * is the last device in the mirror, or if the DTLs indicate that this device - * has the only valid copy of some data. - */ -/* ARGSUSED */ -int -zpool_do_detach(int argc, char **argv) -{ - int c; - char *poolname, *path; - zpool_handle_t *zhp; - int ret; - - /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { - switch (c) { - case 'f': - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); - usage(B_FALSE); - } - - if (argc < 2) { - (void) fprintf(stderr, - gettext("missing specification\n")); - usage(B_FALSE); - } - - poolname = argv[0]; - path = argv[1]; - - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) - return (1); - - ret = zpool_vdev_detach(zhp, path); - - zpool_close(zhp); - - return (ret); -} - -/* - * zpool split [-gLnP] [-o prop=val] ... - * [-o mntopt] ... - * [-R altroot] [ ...] 
- * - * -g Display guid for individual vdev name. - * -L Follow links when resolving vdev path name. - * -n Do not split the pool, but display the resulting layout if - * it were to be split. - * -o Set property=value, or set mount options. - * -P Display full path for vdev name. - * -R Mount the split-off pool under an alternate root. - * - * Splits the named pool and gives it the new pool name. Devices to be split - * off may be listed, provided that no more than one device is specified - * per top-level vdev mirror. The newly split pool is left in an exported - * state unless -R is specified. - * - * Restrictions: the top-level of the pool pool must only be made up of - * mirrors; all devices in the pool must be healthy; no device may be - * undergoing a resilvering operation. - */ -int -zpool_do_split(int argc, char **argv) -{ - char *srcpool, *newpool, *propval; - char *mntopts = NULL; - splitflags_t flags; - int c, ret = 0; - zpool_handle_t *zhp; - nvlist_t *config, *props = NULL; - - flags.dryrun = B_FALSE; - flags.import = B_FALSE; - flags.name_flags = 0; - - /* check options */ - while ((c = getopt(argc, argv, ":gLR:no:P")) != -1) { - switch (c) { - case 'g': - flags.name_flags |= VDEV_NAME_GUID; - break; - case 'L': - flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; - break; - case 'R': - flags.import = B_TRUE; - if (add_prop_list( - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, - &props, B_TRUE) != 0) { - nvlist_free(props); - usage(B_FALSE); - } - break; - case 'n': - flags.dryrun = B_TRUE; - break; - case 'o': - if ((propval = strchr(optarg, '=')) != NULL) { - *propval = '\0'; - propval++; - if (add_prop_list(optarg, propval, - &props, B_TRUE) != 0) { - nvlist_free(props); - usage(B_FALSE); - } - } else { - mntopts = optarg; - } - break; - case 'P': - flags.name_flags |= VDEV_NAME_PATH; - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, 
gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - break; - } - } - - if (!flags.import && mntopts != NULL) { - (void) fprintf(stderr, gettext("setting mntopts is only " - "valid when importing the pool\n")); - usage(B_FALSE); - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("Missing pool name\n")); - usage(B_FALSE); - } - if (argc < 2) { - (void) fprintf(stderr, gettext("Missing new pool name\n")); - usage(B_FALSE); - } - - srcpool = argv[0]; - newpool = argv[1]; - - argc -= 2; - argv += 2; - - if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) - return (1); - - config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); - if (config == NULL) { - ret = 1; - } else { - if (flags.dryrun) { - (void) printf(gettext("would create '%s' with the " - "following layout:\n\n"), newpool); - print_vdev_tree(NULL, newpool, config, 0, "", - flags.name_flags); - } - nvlist_free(config); - } - - zpool_close(zhp); - - if (ret != 0 || flags.dryrun || !flags.import) - return (ret); - - /* - * The split was successful. Now we need to open the new - * pool and import it. - */ - if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) - return (1); - if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && - zpool_enable_datasets(zhp, mntopts, 0) != 0) { - ret = 1; - (void) fprintf(stderr, gettext("Split was successful, but " - "the datasets could not all be mounted\n")); - (void) fprintf(stderr, gettext("Try doing '%s' with a " - "different altroot\n"), "zpool import"); - } - zpool_close(zhp); - - return (ret); -} - - - -/* - * zpool online ... 
- */ -int -zpool_do_online(int argc, char **argv) -{ - int c, i; - char *poolname; - zpool_handle_t *zhp; - int ret = 0; - vdev_state_t newstate; - int flags = 0; - - /* check options */ - while ((c = getopt(argc, argv, "et")) != -1) { - switch (c) { - case 'e': - flags |= ZFS_ONLINE_EXPAND; - break; - case 't': - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name\n")); - usage(B_FALSE); - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing device name\n")); - usage(B_FALSE); - } - - poolname = argv[0]; - - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) - return (1); - - for (i = 1; i < argc; i++) { - if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { - if (newstate != VDEV_STATE_HEALTHY) { - (void) printf(gettext("warning: device '%s' " - "onlined, but remains in faulted state\n"), - argv[i]); - if (newstate == VDEV_STATE_FAULTED) - (void) printf(gettext("use 'zpool " - "clear' to restore a faulted " - "device\n")); - else - (void) printf(gettext("use 'zpool " - "replace' to replace devices " - "that are no longer present\n")); - } - } else { - ret = 1; - } - } - - zpool_close(zhp); - - return (ret); -} - -/* - * zpool offline [-ft] ... - * - * -f Force the device into the offline state, even if doing - * so would appear to compromise pool availability. - * (not supported yet) - * - * -t Only take the device off-line temporarily. The offline - * state will not be persistent across reboots. 
- */ -/* ARGSUSED */ -int -zpool_do_offline(int argc, char **argv) -{ - int c, i; - char *poolname; - zpool_handle_t *zhp; - int ret = 0; - boolean_t istmp = B_FALSE; - - /* check options */ - while ((c = getopt(argc, argv, "ft")) != -1) { - switch (c) { - case 't': - istmp = B_TRUE; - break; - case 'f': - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name\n")); - usage(B_FALSE); - } - if (argc < 2) { - (void) fprintf(stderr, gettext("missing device name\n")); - usage(B_FALSE); - } - - poolname = argv[0]; - - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) - return (1); - - for (i = 1; i < argc; i++) { - if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) - ret = 1; - } - - zpool_close(zhp); - - return (ret); -} - -/* - * zpool clear [device] - * - * Clear all errors associated with a pool or a particular device. 
- */ -int -zpool_do_clear(int argc, char **argv) -{ - int c; - int ret = 0; - boolean_t dryrun = B_FALSE; - boolean_t do_rewind = B_FALSE; - boolean_t xtreme_rewind = B_FALSE; - uint32_t rewind_policy = ZPOOL_NO_REWIND; - nvlist_t *policy = NULL; - zpool_handle_t *zhp; - char *pool, *device; - - /* check options */ - while ((c = getopt(argc, argv, "FnX")) != -1) { - switch (c) { - case 'F': - do_rewind = B_TRUE; - break; - case 'n': - dryrun = B_TRUE; - break; - case 'X': - xtreme_rewind = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name\n")); - usage(B_FALSE); - } - - if (argc > 2) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - if ((dryrun || xtreme_rewind) && !do_rewind) { - (void) fprintf(stderr, - gettext("-n or -X only meaningful with -F\n")); - usage(B_FALSE); - } - if (dryrun) - rewind_policy = ZPOOL_TRY_REWIND; - else if (do_rewind) - rewind_policy = ZPOOL_DO_REWIND; - if (xtreme_rewind) - rewind_policy |= ZPOOL_EXTREME_REWIND; - - /* In future, further rewind policy choices can be passed along here */ - if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || - nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, - rewind_policy) != 0) { - return (1); - } - - pool = argv[0]; - device = argc == 2 ? 
argv[1] : NULL; - - if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { - nvlist_free(policy); - return (1); - } - - if (zpool_clear(zhp, device, policy) != 0) - ret = 1; - - zpool_close(zhp); - - nvlist_free(policy); - - return (ret); -} - -/* - * zpool reguid - */ -int -zpool_do_reguid(int argc, char **argv) -{ - int c; - char *poolname; - zpool_handle_t *zhp; - int ret = 0; - - /* check options */ - while ((c = getopt(argc, argv, "")) != -1) { - switch (c) { - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - /* get pool name and check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name\n")); - usage(B_FALSE); - } - - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - poolname = argv[0]; - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) - return (1); - - ret = zpool_reguid(zhp); - - zpool_close(zhp); - return (ret); -} - - -/* - * zpool reopen - * - * Reopen the pool so that the kernel can update the sizes of all vdevs. 
- */ -int -zpool_do_reopen(int argc, char **argv) -{ - int c; - int ret = 0; - zpool_handle_t *zhp; - char *pool; - - /* check options */ - while ((c = getopt(argc, argv, "")) != -1) { - switch (c) { - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc--; - argv++; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name\n")); - usage(B_FALSE); - } - - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); - } - - pool = argv[0]; - if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) - return (1); - - ret = zpool_reopen(zhp); - zpool_close(zhp); - return (ret); -} - -typedef struct scrub_cbdata { - int cb_type; - int cb_argc; - char **cb_argv; - pool_scrub_cmd_t cb_scrub_cmd; -} scrub_cbdata_t; - -static boolean_t -zpool_has_checkpoint(zpool_handle_t *zhp) -{ - nvlist_t *config, *nvroot; - - config = zpool_get_config(zhp, NULL); - - if (config != NULL) { - pool_checkpoint_stat_t *pcs = NULL; - uint_t c; - - nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); - - if (pcs == NULL || pcs->pcs_state == CS_NONE) - return (B_FALSE); - - assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || - pcs->pcs_state == CS_CHECKPOINT_DISCARDING); - return (B_TRUE); - } - - return (B_FALSE); -} - -int -scrub_callback(zpool_handle_t *zhp, void *data) -{ - scrub_cbdata_t *cb = data; - int err; - - /* - * Ignore faulted pools. 
- */ - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - (void) fprintf(stderr, gettext("cannot scrub '%s': pool is " - "currently unavailable\n"), zpool_get_name(zhp)); - return (1); - } - - err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); - - if (err == 0 && zpool_has_checkpoint(zhp) && - cb->cb_type == POOL_SCAN_SCRUB) { - (void) printf(gettext("warning: will not scrub state that " - "belongs to the checkpoint of pool '%s'\n"), - zpool_get_name(zhp)); - } - - return (err != 0); -} - -/* - * zpool scrub [-s | -p] ... - * - * -s Stop. Stops any in-progress scrub. - * -p Pause. Pause in-progress scrub. - */ -int -zpool_do_scrub(int argc, char **argv) -{ - int c; - scrub_cbdata_t cb; - - cb.cb_type = POOL_SCAN_SCRUB; - cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; - - /* check options */ - while ((c = getopt(argc, argv, "sp")) != -1) { - switch (c) { - case 's': - cb.cb_type = POOL_SCAN_NONE; - break; - case 'p': - cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - if (cb.cb_type == POOL_SCAN_NONE && - cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { - (void) fprintf(stderr, gettext("invalid option combination: " - "-s and -p are mutually exclusive\n")); - usage(B_FALSE); - } - - cb.cb_argc = argc; - cb.cb_argv = argv; - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); - usage(B_FALSE); - } - - return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); -} - -static void -zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) -{ - uint_t children = 0; - nvlist_t **child; - uint_t i; - - (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &child, &children); - - if (children == 0) { - char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE); - fnvlist_add_boolean(res, path); - free(path); - return; - } - - for (i = 0; i < children; i++) { - 
zpool_collect_leaves(zhp, child[i], res); - } -} - -/* - * zpool initialize [-cs] [ ...] - * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool - * if none specified. - * - * -c Cancel. Ends active initializing. - * -s Suspend. Initializing can then be restarted with no flags. - */ -int -zpool_do_initialize(int argc, char **argv) -{ - int c; - char *poolname; - zpool_handle_t *zhp; - nvlist_t *vdevs; - int err = 0; - - struct option long_options[] = { - {"cancel", no_argument, NULL, 'c'}, - {"suspend", no_argument, NULL, 's'}, - {0, 0, 0, 0} - }; - - pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO; - while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) { - switch (c) { - case 'c': - if (cmd_type != POOL_INITIALIZE_DO) { - (void) fprintf(stderr, gettext("-c cannot be " - "combined with other options\n")); - usage(B_FALSE); - } - cmd_type = POOL_INITIALIZE_CANCEL; - break; - case 's': - if (cmd_type != POOL_INITIALIZE_DO) { - (void) fprintf(stderr, gettext("-s cannot be " - "combined with other options\n")); - usage(B_FALSE); - } - cmd_type = POOL_INITIALIZE_SUSPEND; - break; - case '?': - if (optopt != 0) { - (void) fprintf(stderr, - gettext("invalid option '%c'\n"), optopt); - } else { - (void) fprintf(stderr, - gettext("invalid option '%s'\n"), - argv[optind - 1]); - } - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); - usage(B_FALSE); - return (-1); - } - - poolname = argv[0]; - zhp = zpool_open(g_zfs, poolname); - if (zhp == NULL) - return (-1); - - vdevs = fnvlist_alloc(); - if (argc == 1) { - /* no individual leaf vdevs specified, so add them all */ - nvlist_t *config = zpool_get_config(zhp, NULL); - nvlist_t *nvroot = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE); - zpool_collect_leaves(zhp, nvroot, vdevs); - } else { - int i; - for (i = 1; i < argc; i++) { - fnvlist_add_boolean(vdevs, argv[i]); - } - 
} - - err = zpool_initialize(zhp, cmd_type, vdevs); - - fnvlist_free(vdevs); - zpool_close(zhp); - - return (err); -} - -/* - * Print out detailed scrub status. - */ -static void -print_scan_status(pool_scan_stat_t *ps) -{ - time_t start, end, pause; - uint64_t total_secs_left; - uint64_t elapsed, secs_left, mins_left, hours_left, days_left; - uint64_t pass_scanned, scanned, pass_issued, issued, total; - uint_t scan_rate, issue_rate; - double fraction_done; - char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; - char srate_buf[7], irate_buf[7]; - - (void) printf(gettext(" scan: ")); - - /* If there's never been a scan, there's not much to say. */ - if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || - ps->pss_func >= POOL_SCAN_FUNCS) { - (void) printf(gettext("none requested\n")); - return; - } - - start = ps->pss_start_time; - end = ps->pss_end_time; - pause = ps->pss_pass_scrub_pause; - - zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); - - assert(ps->pss_func == POOL_SCAN_SCRUB || - ps->pss_func == POOL_SCAN_RESILVER); - - /* Scan is finished or canceled. 
*/ - if (ps->pss_state == DSS_FINISHED) { - total_secs_left = end - start; - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); - - if (ps->pss_func == POOL_SCAN_SCRUB) { - (void) printf(gettext("scrub repaired %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); - } else if (ps->pss_func == POOL_SCAN_RESILVER) { - (void) printf(gettext("resilvered %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); - - } - - return; - } else if (ps->pss_state == DSS_CANCELED) { - if (ps->pss_func == POOL_SCAN_SCRUB) { - (void) printf(gettext("scrub canceled on %s"), - ctime(&end)); - } else if (ps->pss_func == POOL_SCAN_RESILVER) { - (void) printf(gettext("resilver canceled on %s"), - ctime(&end)); - } - return; - } - - assert(ps->pss_state == DSS_SCANNING); - - /* Scan is in progress. Resilvers can't be paused. 
*/ - if (ps->pss_func == POOL_SCAN_SCRUB) { - if (pause == 0) { - (void) printf(gettext("scrub in progress since %s"), - ctime(&start)); - } else { - (void) printf(gettext("scrub paused since %s"), - ctime(&pause)); - (void) printf(gettext("\tscrub started on %s"), - ctime(&start)); - } - } else if (ps->pss_func == POOL_SCAN_RESILVER) { - (void) printf(gettext("resilver in progress since %s"), - ctime(&start)); - } - - scanned = ps->pss_examined; - pass_scanned = ps->pss_pass_exam; - issued = ps->pss_issued; - pass_issued = ps->pss_pass_issued; - total = ps->pss_to_examine; - - /* we are only done with a block once we have issued the IO for it */ - fraction_done = (double)issued / total; - - /* elapsed time for this pass, rounding up to 1 if it's 0 */ - elapsed = time(NULL) - ps->pss_pass_start; - elapsed -= ps->pss_pass_scrub_spent_paused; - elapsed = (elapsed != 0) ? elapsed : 1; - - scan_rate = pass_scanned / elapsed; - issue_rate = pass_issued / elapsed; - total_secs_left = (issue_rate != 0) ? 
- ((total - issued) / issue_rate) : UINT64_MAX; - - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); - - /* format all of the numbers we will be reporting */ - zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf)); - zfs_nicenum(issued, issued_buf, sizeof (issued_buf)); - zfs_nicenum(total, total_buf, sizeof (total_buf)); - zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf)); - zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf)); - - /* doo not print estimated time if we have a paused scrub */ - if (pause == 0) { - (void) printf(gettext("\t%s scanned at %s/s, " - "%s issued at %s/s, %s total\n"), - scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); - } else { - (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), - scanned_buf, issued_buf, total_buf); - } - - if (ps->pss_func == POOL_SCAN_RESILVER) { - (void) printf(gettext("\t%s resilvered, %.2f%% done"), - processed_buf, 100 * fraction_done); - } else if (ps->pss_func == POOL_SCAN_SCRUB) { - (void) printf(gettext("\t%s repaired, %.2f%% done"), - processed_buf, 100 * fraction_done); - } - - if (pause == 0) { - if (issue_rate >= 10 * 1024 * 1024) { - (void) printf(gettext(", %llu days " - "%02llu:%02llu:%02llu to go\n"), - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left); - } else { - (void) printf(gettext(", no estimated " - "completion time\n")); - } - } else { - (void) printf(gettext("\n")); - } -} - -/* - * As we don't scrub checkpointed blocks, we want to warn the - * user that we skipped scanning some blocks if a checkpoint exists - * or existed at any time during the scan. 
- */ -static void -print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) -{ - if (ps == NULL || pcs == NULL) - return; - - if (pcs->pcs_state == CS_NONE || - pcs->pcs_state == CS_CHECKPOINT_DISCARDING) - return; - - assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); - - if (ps->pss_state == DSS_NONE) - return; - - if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && - ps->pss_end_time < pcs->pcs_start_time) - return; - - if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { - (void) printf(gettext(" scan warning: skipped blocks " - "that are only referenced by the checkpoint.\n")); - } else { - assert(ps->pss_state == DSS_SCANNING); - (void) printf(gettext(" scan warning: skipping blocks " - "that are only referenced by the checkpoint.\n")); - } -} - -/* - * Print out detailed removal status. - */ -static void -print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) -{ - char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; - time_t start, end; - nvlist_t *config, *nvroot; - nvlist_t **child; - uint_t children; - char *vdev_name; - - if (prs == NULL || prs->prs_state == DSS_NONE) - return; - - /* - * Determine name of vdev. - */ - config = zpool_get_config(zhp, NULL); - nvroot = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE); - verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); - assert(prs->prs_removing_vdev < children); - vdev_name = zpool_vdev_name(g_zfs, zhp, - child[prs->prs_removing_vdev], B_TRUE); - - (void) printf(gettext("remove: ")); - - start = prs->prs_start_time; - end = prs->prs_end_time; - zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); - - /* - * Removal is finished or canceled. 
- */ - if (prs->prs_state == DSS_FINISHED) { - uint64_t minutes_taken = (end - start) / 60; - - (void) printf(gettext("Removal of vdev %llu copied %s " - "in %lluh%um, completed on %s"), - (longlong_t)prs->prs_removing_vdev, - copied_buf, - (u_longlong_t)(minutes_taken / 60), - (uint_t)(minutes_taken % 60), - ctime((time_t *)&end)); - } else if (prs->prs_state == DSS_CANCELED) { - (void) printf(gettext("Removal of %s canceled on %s"), - vdev_name, ctime(&end)); - } else { - uint64_t copied, total, elapsed, mins_left, hours_left; - double fraction_done; - uint_t rate; - - assert(prs->prs_state == DSS_SCANNING); - - /* - * Removal is in progress. - */ - (void) printf(gettext( - "Evacuation of %s in progress since %s"), - vdev_name, ctime(&start)); - - copied = prs->prs_copied > 0 ? prs->prs_copied : 1; - total = prs->prs_to_copy; - fraction_done = (double)copied / total; - - /* elapsed time for this pass */ - elapsed = time(NULL) - prs->prs_start_time; - elapsed = elapsed > 0 ? elapsed : 1; - rate = copied / elapsed; - rate = rate > 0 ? 
rate : 1; - mins_left = ((total - copied) / rate) / 60; - hours_left = mins_left / 60; - - zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); - zfs_nicenum(total, total_buf, sizeof (total_buf)); - zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); - - /* - * do not print estimated time if hours_left is more than - * 30 days - */ - (void) printf(gettext(" %s copied out of %s at %s/s, " - "%.2f%% done"), - examined_buf, total_buf, rate_buf, 100 * fraction_done); - if (hours_left < (30 * 24)) { - (void) printf(gettext(", %lluh%um to go\n"), - (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); - } else { - (void) printf(gettext( - ", (copy is slow, no estimated time)\n")); - } - } - - if (prs->prs_mapping_memory > 0) { - char mem_buf[7]; - zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); - (void) printf(gettext(" %s memory used for " - "removed device mappings\n"), - mem_buf); - } -} - -static void -print_checkpoint_status(pool_checkpoint_stat_t *pcs) -{ - time_t start; - char space_buf[7]; - - if (pcs == NULL || pcs->pcs_state == CS_NONE) - return; - - (void) printf(gettext("checkpoint: ")); - - start = pcs->pcs_start_time; - zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); - - if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { - char *date = ctime(&start); - - /* - * ctime() adds a newline at the end of the generated - * string, thus the weird format specifier and the - * strlen() call used to chop it off from the output. 
- */ - (void) printf(gettext("created %.*s, consumes %s\n"), - strlen(date) - 1, date, space_buf); - return; - } - - assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); - - (void) printf(gettext("discarding, %s remaining.\n"), - space_buf); -} - -static void -print_error_log(zpool_handle_t *zhp) -{ - nvlist_t *nverrlist = NULL; - nvpair_t *elem; - char *pathname; - size_t len = MAXPATHLEN * 2; - - if (zpool_get_errlog(zhp, &nverrlist) != 0) { - (void) printf("errors: List of errors unavailable " - "(insufficient privileges)\n"); - return; - } - - (void) printf("errors: Permanent errors have been " - "detected in the following files:\n\n"); - - pathname = safe_malloc(len); - elem = NULL; - while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { - nvlist_t *nv; - uint64_t dsobj, obj; - - verify(nvpair_value_nvlist(elem, &nv) == 0); - verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, - &dsobj) == 0); - verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, - &obj) == 0); - zpool_obj_to_path(zhp, dsobj, obj, pathname, len); - (void) printf("%7s %s\n", "", pathname); - } - free(pathname); - nvlist_free(nverrlist); -} - -static void -print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, - uint_t nspares) -{ - uint_t i; - char *name; - - if (nspares == 0) - return; - - (void) printf(gettext("\tspares\n")); - - for (i = 0; i < nspares; i++) { - name = zpool_vdev_name(g_zfs, zhp, spares[i], - cb->cb_name_flags); - print_status_config(zhp, cb, name, spares[i], 2, B_TRUE); - free(name); - } -} - -static void -print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, - uint_t nl2cache) -{ - uint_t i; - char *name; - - if (nl2cache == 0) - return; - - (void) printf(gettext("\tcache\n")); - - for (i = 0; i < nl2cache; i++) { - name = zpool_vdev_name(g_zfs, zhp, l2cache[i], - cb->cb_name_flags); - print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE); - free(name); - } -} - -static void -print_dedup_stats(nvlist_t *config) -{ - 
ddt_histogram_t *ddh; - ddt_stat_t *dds; - ddt_object_t *ddo; - uint_t c; - - /* - * If the pool was faulted then we may not have been able to - * obtain the config. Otherwise, if we have anything in the dedup - * table continue processing the stats. - */ - if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, - (uint64_t **)&ddo, &c) != 0) - return; - - (void) printf("\n"); - (void) printf(gettext(" dedup: ")); - if (ddo->ddo_count == 0) { - (void) printf(gettext("no DDT entries\n")); - return; - } - - (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", - (u_longlong_t)ddo->ddo_count, - (u_longlong_t)ddo->ddo_dspace, - (u_longlong_t)ddo->ddo_mspace); - - verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, - (uint64_t **)&dds, &c) == 0); - verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, - (uint64_t **)&ddh, &c) == 0); - zpool_dump_ddt(dds, ddh); -} - -/* - * Display a summary of pool status. Displays a summary such as: - * - * pool: tank - * status: DEGRADED - * reason: One or more devices ... - * see: http://illumos.org/msg/ZFS-xxxx-01 - * config: - * mirror DEGRADED - * c1t0d0 OK - * c2t0d0 UNAVAIL - * - * When given the '-v' option, we print out the complete config. If the '-e' - * option is specified, then we print out error rate information as well. - */ -int -status_callback(zpool_handle_t *zhp, void *data) -{ - status_cbdata_t *cbp = data; - nvlist_t *config, *nvroot; - char *msgid; - int reason; - const char *health; - uint_t c; - vdev_stat_t *vs; - - config = zpool_get_config(zhp, NULL); - reason = zpool_get_status(zhp, &msgid); - - cbp->cb_count++; - - /* - * If we were given 'zpool status -x', only report those pools with - * problems. 
- */ - if (cbp->cb_explain && - (reason == ZPOOL_STATUS_OK || - reason == ZPOOL_STATUS_VERSION_OLDER || - reason == ZPOOL_STATUS_NON_NATIVE_ASHIFT || - reason == ZPOOL_STATUS_FEAT_DISABLED)) { - if (!cbp->cb_allpools) { - (void) printf(gettext("pool '%s' is healthy\n"), - zpool_get_name(zhp)); - if (cbp->cb_first) - cbp->cb_first = B_FALSE; - } - return (0); - } - - if (cbp->cb_first) - cbp->cb_first = B_FALSE; - else - (void) printf("\n"); - - nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); - health = zpool_state_to_name(vs->vs_state, vs->vs_aux); - - (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp)); - (void) printf(gettext(" state: %s\n"), health); - - switch (reason) { - case ZPOOL_STATUS_MISSING_DEV_R: - (void) printf(gettext("status: One or more devices could not " - "be opened. Sufficient replicas exist for\n\tthe pool to " - "continue functioning in a degraded state.\n")); - (void) printf(gettext("action: Attach the missing device and " - "online it using 'zpool online'.\n")); - break; - - case ZPOOL_STATUS_MISSING_DEV_NR: - (void) printf(gettext("status: One or more devices could not " - "be opened. There are insufficient\n\treplicas for the " - "pool to continue functioning.\n")); - (void) printf(gettext("action: Attach the missing device and " - "online it using 'zpool online'.\n")); - break; - - case ZPOOL_STATUS_CORRUPT_LABEL_R: - (void) printf(gettext("status: One or more devices could not " - "be used because the label is missing or\n\tinvalid. " - "Sufficient replicas exist for the pool to continue\n\t" - "functioning in a degraded state.\n")); - (void) printf(gettext("action: Replace the device using " - "'zpool replace'.\n")); - break; - - case ZPOOL_STATUS_CORRUPT_LABEL_NR: - (void) printf(gettext("status: One or more devices could not " - "be used because the label is missing \n\tor invalid. 
" - "There are insufficient replicas for the pool to " - "continue\n\tfunctioning.\n")); - zpool_explain_recover(zpool_get_handle(zhp), - zpool_get_name(zhp), reason, config); - break; - - case ZPOOL_STATUS_FAILING_DEV: - (void) printf(gettext("status: One or more devices has " - "experienced an unrecoverable error. An\n\tattempt was " - "made to correct the error. Applications are " - "unaffected.\n")); - (void) printf(gettext("action: Determine if the device needs " - "to be replaced, and clear the errors\n\tusing " - "'zpool clear' or replace the device with 'zpool " - "replace'.\n")); - break; - - case ZPOOL_STATUS_OFFLINE_DEV: - (void) printf(gettext("status: One or more devices has " - "been taken offline by the administrator.\n\tSufficient " - "replicas exist for the pool to continue functioning in " - "a\n\tdegraded state.\n")); - (void) printf(gettext("action: Online the device using " - "'zpool online' or replace the device with\n\t'zpool " - "replace'.\n")); - break; - - case ZPOOL_STATUS_REMOVED_DEV: - (void) printf(gettext("status: One or more devices has " - "been removed by the administrator.\n\tSufficient " - "replicas exist for the pool to continue functioning in " - "a\n\tdegraded state.\n")); - (void) printf(gettext("action: Online the device using " - "'zpool online' or replace the device with\n\t'zpool " - "replace'.\n")); - break; - - case ZPOOL_STATUS_RESILVERING: - (void) printf(gettext("status: One or more devices is " - "currently being resilvered. The pool will\n\tcontinue " - "to function, possibly in a degraded state.\n")); - (void) printf(gettext("action: Wait for the resilver to " - "complete.\n")); - break; - - case ZPOOL_STATUS_CORRUPT_DATA: - (void) printf(gettext("status: One or more devices has " - "experienced an error resulting in data\n\tcorruption. " - "Applications may be affected.\n")); - (void) printf(gettext("action: Restore the file in question " - "if possible. 
Otherwise restore the\n\tentire pool from " - "backup.\n")); - break; - - case ZPOOL_STATUS_CORRUPT_POOL: - (void) printf(gettext("status: The pool metadata is corrupted " - "and the pool cannot be opened.\n")); - zpool_explain_recover(zpool_get_handle(zhp), - zpool_get_name(zhp), reason, config); - break; - - case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext("status: The pool is formatted using a " - "legacy on-disk format. The pool can\n\tstill be used, " - "but some features are unavailable.\n")); - (void) printf(gettext("action: Upgrade the pool using 'zpool " - "upgrade'. Once this is done, the\n\tpool will no longer " - "be accessible on software that does not support feature\n" - "\tflags.\n")); - break; - - case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext("status: The pool has been upgraded to a " - "newer, incompatible on-disk version.\n\tThe pool cannot " - "be accessed on this system.\n")); - (void) printf(gettext("action: Access the pool from a system " - "running more recent software, or\n\trestore the pool from " - "backup.\n")); - break; - - case ZPOOL_STATUS_FEAT_DISABLED: - (void) printf(gettext("status: Some supported features are not " - "enabled on the pool. The pool can\n\tstill be used, but " - "some features are unavailable.\n")); - (void) printf(gettext("action: Enable all features using " - "'zpool upgrade'. Once this is done,\n\tthe pool may no " - "longer be accessible by software that does not support\n\t" - "the features. 
See zpool-features(7) for details.\n")); - break; - - case ZPOOL_STATUS_UNSUP_FEAT_READ: - (void) printf(gettext("status: The pool cannot be accessed on " - "this system because it uses the\n\tfollowing feature(s) " - "not supported on this system:\n")); - zpool_print_unsup_feat(config); - (void) printf("\n"); - (void) printf(gettext("action: Access the pool from a system " - "that supports the required feature(s),\n\tor restore the " - "pool from backup.\n")); - break; - - case ZPOOL_STATUS_UNSUP_FEAT_WRITE: - (void) printf(gettext("status: The pool can only be accessed " - "in read-only mode on this system. It\n\tcannot be " - "accessed in read-write mode because it uses the " - "following\n\tfeature(s) not supported on this system:\n")); - zpool_print_unsup_feat(config); - (void) printf("\n"); - (void) printf(gettext("action: The pool cannot be accessed in " - "read-write mode. Import the pool with\n" - "\t\"-o readonly=on\", access the pool from a system that " - "supports the\n\trequired feature(s), or restore the " - "pool from backup.\n")); - break; - - case ZPOOL_STATUS_FAULTED_DEV_R: - (void) printf(gettext("status: One or more devices are " - "faulted in response to persistent errors.\n\tSufficient " - "replicas exist for the pool to continue functioning " - "in a\n\tdegraded state.\n")); - (void) printf(gettext("action: Replace the faulted device, " - "or use 'zpool clear' to mark the device\n\trepaired.\n")); - break; - - case ZPOOL_STATUS_FAULTED_DEV_NR: - (void) printf(gettext("status: One or more devices are " - "faulted in response to persistent errors. There are " - "insufficient replicas for the pool to\n\tcontinue " - "functioning.\n")); - (void) printf(gettext("action: Destroy and re-create the pool " - "from a backup source. 
Manually marking the device\n" - "\trepaired using 'zpool clear' may allow some data " - "to be recovered.\n")); - break; - - case ZPOOL_STATUS_IO_FAILURE_MMP: - (void) printf(gettext("status: The pool is suspended because " - "multihost writes failed or were delayed;\n\tanother " - "system could import the pool undetected.\n")); - (void) printf(gettext("action: Make sure the pool's devices " - "are connected, then reboot your system and\n\timport the " - "pool.\n")); - break; - - case ZPOOL_STATUS_IO_FAILURE_WAIT: - case ZPOOL_STATUS_IO_FAILURE_CONTINUE: - (void) printf(gettext("status: One or more devices are " - "faulted in response to IO failures.\n")); - (void) printf(gettext("action: Make sure the affected devices " - "are connected, then run 'zpool clear'.\n")); - break; - - case ZPOOL_STATUS_BAD_LOG: - (void) printf(gettext("status: An intent log record " - "could not be read.\n" - "\tWaiting for adminstrator intervention to fix the " - "faulted pool.\n")); - (void) printf(gettext("action: Either restore the affected " - "device(s) and run 'zpool online',\n" - "\tor ignore the intent log records by running " - "'zpool clear'.\n")); - break; - - case ZPOOL_STATUS_NON_NATIVE_ASHIFT: - (void) printf(gettext("status: One or more devices are " - "configured to use a non-native block size.\n" - "\tExpect reduced performance.\n")); - (void) printf(gettext("action: Replace affected devices with " - "devices that support the\n\tconfigured block size, or " - "migrate data to a properly configured\n\tpool.\n")); - break; - - default: - /* - * The remaining errors can't actually be generated, yet. 
- */ - assert(reason == ZPOOL_STATUS_OK); - } - - if (msgid != NULL) - (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), - msgid); - - if (config != NULL) { - uint64_t nerr; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - pool_checkpoint_stat_t *pcs = NULL; - pool_scan_stat_t *ps = NULL; - pool_removal_stat_t *prs = NULL; - - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); - - print_scan_status(ps); - print_checkpoint_scan_warning(ps, pcs); - print_removal_status(zhp, prs); - print_checkpoint_status(pcs); - - cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, - cbp->cb_name_flags); - if (cbp->cb_namewidth < 10) - cbp->cb_namewidth = 10; - - (void) printf(gettext("config:\n\n")); - (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), - cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", - "CKSUM"); - - print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, - B_FALSE); - - print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); - print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); - print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) - print_l2cache(zhp, cbp, l2cache, nl2cache); - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) - print_spares(zhp, cbp, spares, nspares); - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, - &nerr) == 0) { - nvlist_t *nverrlist = NULL; - - /* - * If the approximate error count is small, get a - * precise count by fetching the entire log and - * uniquifying the results. 
- */ - if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && - zpool_get_errlog(zhp, &nverrlist) == 0) { - nvpair_t *elem; - - elem = NULL; - nerr = 0; - while ((elem = nvlist_next_nvpair(nverrlist, - elem)) != NULL) { - nerr++; - } - } - nvlist_free(nverrlist); - - (void) printf("\n"); - - if (nerr == 0) - (void) printf(gettext("errors: No known data " - "errors\n")); - else if (!cbp->cb_verbose) - (void) printf(gettext("errors: %llu data " - "errors, use '-v' for a list\n"), - (u_longlong_t)nerr); - else - print_error_log(zhp); - } - - if (cbp->cb_dedup_stats) - print_dedup_stats(config); - } else { - (void) printf(gettext("config: The configuration cannot be " - "determined.\n")); - } - - return (0); -} - -/* - * zpool status [-gLPvx] [-T d|u] [pool] ... [interval [count]] - * - * -g Display guid for individual vdev name. - * -L Follow links when resolving vdev path name. - * -P Display full path for vdev name. - * -v Display complete error logs - * -x Display only pools with potential problems - * -D Display dedup status (undocumented) - * -T Display a timestamp in date(1) or Unix format - * - * Describes the health status of all pools or some subset. 
- */ -int -zpool_do_status(int argc, char **argv) -{ - int c; - int ret; - unsigned long interval = 0, count = 0; - status_cbdata_t cb = { 0 }; - - /* check options */ - while ((c = getopt(argc, argv, "gLPvxDT:")) != -1) { - switch (c) { - case 'g': - cb.cb_name_flags |= VDEV_NAME_GUID; - break; - case 'L': - cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; - break; - case 'P': - cb.cb_name_flags |= VDEV_NAME_PATH; - break; - case 'v': - cb.cb_verbose = B_TRUE; - break; - case 'x': - cb.cb_explain = B_TRUE; - break; - case 'D': - cb.cb_dedup_stats = B_TRUE; - break; - case 'T': - get_timestamp_arg(*optarg); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - get_interval_count(&argc, argv, &interval, &count); - - if (argc == 0) - cb.cb_allpools = B_TRUE; - - cb.cb_first = B_TRUE; - cb.cb_print_status = B_TRUE; - - for (;;) { - if (timestamp_fmt != NODATE) - print_timestamp(timestamp_fmt); - - ret = for_each_pool(argc, argv, B_TRUE, NULL, - status_callback, &cb); - - if (argc == 0 && cb.cb_count == 0) - (void) printf(gettext("no pools available\n")); - else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) - (void) printf(gettext("all pools are healthy\n")); - - if (ret != 0) - return (ret); - - if (interval == 0) - break; - - if (count != 0 && --count == 0) - break; - - (void) sleep(interval); - } - - return (0); -} - -typedef struct upgrade_cbdata { - boolean_t cb_first; - boolean_t cb_unavail; - char cb_poolname[ZFS_MAX_DATASET_NAME_LEN]; - int cb_argc; - uint64_t cb_version; - char **cb_argv; -} upgrade_cbdata_t; - -#ifdef __FreeBSD__ -static int -is_root_pool(zpool_handle_t *zhp) -{ - static struct statfs sfs; - static char *poolname = NULL; - static boolean_t stated = B_FALSE; - char *slash; - - if (!stated) { - stated = B_TRUE; - if (statfs("/", &sfs) == -1) { - (void) fprintf(stderr, - "Unable to stat root file system: %s.\n", - strerror(errno)); - return 
(0); - } - if (strcmp(sfs.f_fstypename, "zfs") != 0) - return (0); - poolname = sfs.f_mntfromname; - if ((slash = strchr(poolname, '/')) != NULL) - *slash = '\0'; - } - return (poolname != NULL && strcmp(poolname, zpool_get_name(zhp)) == 0); -} - -static void -root_pool_upgrade_check(zpool_handle_t *zhp, char *poolname, int size) -{ - - if (poolname[0] == '\0' && is_root_pool(zhp)) - (void) strlcpy(poolname, zpool_get_name(zhp), size); -} -#endif /* FreeBSD */ - -static int -upgrade_version(zpool_handle_t *zhp, uint64_t version) -{ - int ret; - nvlist_t *config; - uint64_t oldversion; - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &oldversion) == 0); - - assert(SPA_VERSION_IS_SUPPORTED(oldversion)); - assert(oldversion < version); - - ret = zpool_upgrade(zhp, version); - if (ret != 0) - return (ret); - - if (version >= SPA_VERSION_FEATURES) { - (void) printf(gettext("Successfully upgraded " - "'%s' from version %llu to feature flags.\n"), - zpool_get_name(zhp), oldversion); - } else { - (void) printf(gettext("Successfully upgraded " - "'%s' from version %llu to version %llu.\n"), - zpool_get_name(zhp), oldversion, version); - } - - return (0); -} - -static int -upgrade_enable_all(zpool_handle_t *zhp, int *countp) -{ - int i, ret, count; - boolean_t firstff = B_TRUE; - nvlist_t *enabled = zpool_get_features(zhp); - - count = 0; - for (i = 0; i < SPA_FEATURES; i++) { - const char *fname = spa_feature_table[i].fi_uname; - const char *fguid = spa_feature_table[i].fi_guid; - if (!nvlist_exists(enabled, fguid)) { - char *propname; - verify(-1 != asprintf(&propname, "feature@%s", fname)); - ret = zpool_set_prop(zhp, propname, - ZFS_FEATURE_ENABLED); - if (ret != 0) { - free(propname); - return (ret); - } - count++; - - if (firstff) { - (void) printf(gettext("Enabled the " - "following features on '%s':\n"), - zpool_get_name(zhp)); - firstff = B_FALSE; - } - (void) printf(gettext(" %s\n"), fname); - free(propname); - 
} - } - - if (countp != NULL) - *countp = count; - return (0); -} - -static int -upgrade_cb(zpool_handle_t *zhp, void *arg) -{ - upgrade_cbdata_t *cbp = arg; - nvlist_t *config; - uint64_t version; - boolean_t printnl = B_FALSE; - int ret; - - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - (void) fprintf(stderr, gettext("cannot upgrade '%s': pool is " - "currently unavailable.\n\n"), zpool_get_name(zhp)); - cbp->cb_unavail = B_TRUE; - /* Allow iteration to continue. */ - return (0); - } - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &version) == 0); - - assert(SPA_VERSION_IS_SUPPORTED(version)); - - if (version < cbp->cb_version) { - cbp->cb_first = B_FALSE; - ret = upgrade_version(zhp, cbp->cb_version); - if (ret != 0) - return (ret); -#ifdef __FreeBSD__ - root_pool_upgrade_check(zhp, cbp->cb_poolname, - sizeof(cbp->cb_poolname)); -#endif /* __FreeBSD__ */ - printnl = B_TRUE; - -#ifdef illumos - /* - * If they did "zpool upgrade -a", then we could - * be doing ioctls to different pools. We need - * to log this history once to each pool, and bypass - * the normal history logging that happens in main(). - */ - (void) zpool_log_history(g_zfs, history_str); - log_history = B_FALSE; -#endif - } - - if (cbp->cb_version >= SPA_VERSION_FEATURES) { - int count; - ret = upgrade_enable_all(zhp, &count); - if (ret != 0) - return (ret); - - if (count > 0) { - cbp->cb_first = B_FALSE; - printnl = B_TRUE; -#ifdef __FreeBSD__ - root_pool_upgrade_check(zhp, cbp->cb_poolname, - sizeof(cbp->cb_poolname)); -#endif /* __FreeBSD__ */ - /* - * If they did "zpool upgrade -a", then we could - * be doing ioctls to different pools. We need - * to log this history once to each pool, and bypass - * the normal history logging that happens in main(). 
- */ - (void) zpool_log_history(g_zfs, history_str); - log_history = B_FALSE; - } - } - - if (printnl) { - (void) printf(gettext("\n")); - } - - return (0); -} - -static int -upgrade_list_unavail(zpool_handle_t *zhp, void *arg) -{ - upgrade_cbdata_t *cbp = arg; - - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - if (cbp->cb_first) { - (void) fprintf(stderr, gettext("The following pools " - "are unavailable and cannot be upgraded as this " - "time.\n\n")); - (void) fprintf(stderr, gettext("POOL\n")); - (void) fprintf(stderr, gettext("------------\n")); - cbp->cb_first = B_FALSE; - } - (void) printf(gettext("%s\n"), zpool_get_name(zhp)); - cbp->cb_unavail = B_TRUE; - } - return (0); -} - -static int -upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) -{ - upgrade_cbdata_t *cbp = arg; - nvlist_t *config; - uint64_t version; - - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - /* - * This will have been reported by upgrade_list_unavail so - * just allow iteration to continue. - */ - cbp->cb_unavail = B_TRUE; - return (0); - } - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &version) == 0); - - assert(SPA_VERSION_IS_SUPPORTED(version)); - - if (version < SPA_VERSION_FEATURES) { - if (cbp->cb_first) { - (void) printf(gettext("The following pools are " - "formatted with legacy version numbers and can\n" - "be upgraded to use feature flags. 
After " - "being upgraded, these pools\nwill no " - "longer be accessible by software that does not " - "support feature\nflags.\n\n")); - (void) printf(gettext("VER POOL\n")); - (void) printf(gettext("--- ------------\n")); - cbp->cb_first = B_FALSE; - } - - (void) printf("%2llu %s\n", (u_longlong_t)version, - zpool_get_name(zhp)); - } - - return (0); -} - -static int -upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) -{ - upgrade_cbdata_t *cbp = arg; - nvlist_t *config; - uint64_t version; - - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - /* - * This will have been reported by upgrade_list_unavail so - * just allow iteration to continue. - */ - cbp->cb_unavail = B_TRUE; - return (0); - } - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &version) == 0); - - if (version >= SPA_VERSION_FEATURES) { - int i; - boolean_t poolfirst = B_TRUE; - nvlist_t *enabled = zpool_get_features(zhp); - - for (i = 0; i < SPA_FEATURES; i++) { - const char *fguid = spa_feature_table[i].fi_guid; - const char *fname = spa_feature_table[i].fi_uname; - if (!nvlist_exists(enabled, fguid)) { - if (cbp->cb_first) { - (void) printf(gettext("\nSome " - "supported features are not " - "enabled on the following pools. " - "Once a\nfeature is enabled the " - "pool may become incompatible with " - "software\nthat does not support " - "the feature. 
See " - "zpool-features(7) for " - "details.\n\n")); - (void) printf(gettext("POOL " - "FEATURE\n")); - (void) printf(gettext("------" - "---------\n")); - cbp->cb_first = B_FALSE; - } - - if (poolfirst) { - (void) printf(gettext("%s\n"), - zpool_get_name(zhp)); - poolfirst = B_FALSE; - } - - (void) printf(gettext(" %s\n"), fname); - } - } - } - - return (0); -} - -/* ARGSUSED */ -static int -upgrade_one(zpool_handle_t *zhp, void *data) -{ - boolean_t printnl = B_FALSE; - upgrade_cbdata_t *cbp = data; - uint64_t cur_version; - int ret; - - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - (void) fprintf(stderr, gettext("cannot upgrade '%s': pool is " - "is currently unavailable.\n\n"), zpool_get_name(zhp)); - cbp->cb_unavail = B_TRUE; - return (1); - } - - if (strcmp("log", zpool_get_name(zhp)) == 0) { - (void) printf(gettext("'log' is now a reserved word\n" - "Pool 'log' must be renamed using export and import" - " to upgrade.\n\n")); - return (1); - } - - cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); - if (cur_version > cbp->cb_version) { - (void) printf(gettext("Pool '%s' is already formatted " - "using more current version '%llu'.\n\n"), - zpool_get_name(zhp), cur_version); - return (0); - } - - if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { - (void) printf(gettext("Pool '%s' is already formatted " - "using version %llu.\n\n"), zpool_get_name(zhp), - cbp->cb_version); - return (0); - } - - if (cur_version != cbp->cb_version) { - printnl = B_TRUE; - ret = upgrade_version(zhp, cbp->cb_version); - if (ret != 0) - return (ret); -#ifdef __FreeBSD__ - root_pool_upgrade_check(zhp, cbp->cb_poolname, - sizeof(cbp->cb_poolname)); -#endif /* __FreeBSD__ */ - } - - if (cbp->cb_version >= SPA_VERSION_FEATURES) { - int count = 0; - ret = upgrade_enable_all(zhp, &count); - if (ret != 0) - return (ret); - - if (count != 0) { - printnl = B_TRUE; -#ifdef __FreeBSD__ - root_pool_upgrade_check(zhp, cbp->cb_poolname, - 
sizeof(cbp->cb_poolname)); -#endif /* __FreeBSD __*/ - } else if (cur_version == SPA_VERSION) { - (void) printf(gettext("Pool '%s' already has all " - "supported features enabled.\n\n"), - zpool_get_name(zhp)); - } - } - - if (printnl) { - (void) printf(gettext("\n")); - } - - return (0); -} - -/* - * zpool upgrade - * zpool upgrade -v - * zpool upgrade [-V version] <-a | pool ...> - * - * With no arguments, display downrev'd ZFS pool available for upgrade. - * Individual pools can be upgraded by specifying the pool, and '-a' will - * upgrade all pools. - */ -int -zpool_do_upgrade(int argc, char **argv) -{ - int c; - upgrade_cbdata_t cb = { 0 }; - int ret = 0; - boolean_t showversions = B_FALSE; - boolean_t upgradeall = B_FALSE; - char *end; - - - /* check options */ - while ((c = getopt(argc, argv, ":avV:")) != -1) { - switch (c) { - case 'a': - upgradeall = B_TRUE; - break; - case 'v': - showversions = B_TRUE; - break; - case 'V': - cb.cb_version = strtoll(optarg, &end, 10); - if (*end != '\0' || - !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { - (void) fprintf(stderr, - gettext("invalid version '%s'\n"), optarg); - usage(B_FALSE); - } - break; - case ':': - (void) fprintf(stderr, gettext("missing argument for " - "'%c' option\n"), optopt); - usage(B_FALSE); - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - cb.cb_argc = argc; - cb.cb_argv = argv; - argc -= optind; - argv += optind; - - if (cb.cb_version == 0) { - cb.cb_version = SPA_VERSION; - } else if (!upgradeall && argc == 0) { - (void) fprintf(stderr, gettext("-V option is " - "incompatible with other arguments\n")); - usage(B_FALSE); - } - - if (showversions) { - if (upgradeall || argc != 0) { - (void) fprintf(stderr, gettext("-v option is " - "incompatible with other arguments\n")); - usage(B_FALSE); - } - } else if (upgradeall) { - if (argc != 0) { - (void) fprintf(stderr, gettext("-a option should not " - "be used along with a pool 
name\n")); - usage(B_FALSE); - } - } - - (void) printf(gettext("This system supports ZFS pool feature " - "flags.\n\n")); - if (showversions) { - int i; - - (void) printf(gettext("The following features are " - "supported:\n\n")); - (void) printf(gettext("FEAT DESCRIPTION\n")); - (void) printf("----------------------------------------------" - "---------------\n"); - for (i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t *fi = &spa_feature_table[i]; - const char *ro = - (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - " (read-only compatible)" : ""; - - (void) printf("%-37s%s\n", fi->fi_uname, ro); - (void) printf(" %s\n", fi->fi_desc); - } - (void) printf("\n"); - - (void) printf(gettext("The following legacy versions are also " - "supported:\n\n")); - (void) printf(gettext("VER DESCRIPTION\n")); - (void) printf("--- -----------------------------------------" - "---------------\n"); - (void) printf(gettext(" 1 Initial ZFS version\n")); - (void) printf(gettext(" 2 Ditto blocks " - "(replicated metadata)\n")); - (void) printf(gettext(" 3 Hot spares and double parity " - "RAID-Z\n")); - (void) printf(gettext(" 4 zpool history\n")); - (void) printf(gettext(" 5 Compression using the gzip " - "algorithm\n")); - (void) printf(gettext(" 6 bootfs pool property\n")); - (void) printf(gettext(" 7 Separate intent log devices\n")); - (void) printf(gettext(" 8 Delegated administration\n")); - (void) printf(gettext(" 9 refquota and refreservation " - "properties\n")); - (void) printf(gettext(" 10 Cache devices\n")); - (void) printf(gettext(" 11 Improved scrub performance\n")); - (void) printf(gettext(" 12 Snapshot properties\n")); - (void) printf(gettext(" 13 snapused property\n")); - (void) printf(gettext(" 14 passthrough-x aclinherit\n")); - (void) printf(gettext(" 15 user/group space accounting\n")); - (void) printf(gettext(" 16 stmf property support\n")); - (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); - (void) printf(gettext(" 18 Snapshot user holds\n")); - (void) 
printf(gettext(" 19 Log device removal\n")); - (void) printf(gettext(" 20 Compression using zle " - "(zero-length encoding)\n")); - (void) printf(gettext(" 21 Deduplication\n")); - (void) printf(gettext(" 22 Received properties\n")); - (void) printf(gettext(" 23 Slim ZIL\n")); - (void) printf(gettext(" 24 System attributes\n")); - (void) printf(gettext(" 25 Improved scrub stats\n")); - (void) printf(gettext(" 26 Improved snapshot deletion " - "performance\n")); - (void) printf(gettext(" 27 Improved snapshot creation " - "performance\n")); - (void) printf(gettext(" 28 Multiple vdev replacements\n")); - (void) printf(gettext("\nFor more information on a particular " - "version, including supported releases,\n")); - (void) printf(gettext("see the ZFS Administration Guide.\n\n")); - } else if (argc == 0 && upgradeall) { - cb.cb_first = B_TRUE; - ret = zpool_iter(g_zfs, upgrade_cb, &cb); - if (ret == 0 && cb.cb_first) { - if (cb.cb_version == SPA_VERSION) { - (void) printf(gettext("All %spools are already " - "formatted using feature flags.\n\n"), - cb.cb_unavail ? gettext("available ") : ""); - (void) printf(gettext("Every %sfeature flags " - "pool already has all supported features " - "enabled.\n"), - cb.cb_unavail ? gettext("available ") : ""); - } else { - (void) printf(gettext("All pools are already " - "formatted with version %llu or higher.\n"), - cb.cb_version); - } - } - } else if (argc == 0) { - cb.cb_first = B_TRUE; - ret = zpool_iter(g_zfs, upgrade_list_unavail, &cb); - assert(ret == 0); - - if (!cb.cb_first) { - (void) fprintf(stderr, "\n"); - } - - cb.cb_first = B_TRUE; - ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); - assert(ret == 0); - - if (cb.cb_first) { - (void) printf(gettext("All %spools are formatted using " - "feature flags.\n\n"), cb.cb_unavail ? 
- gettext("available ") : ""); - } else { - (void) printf(gettext("\nUse 'zpool upgrade -v' " - "for a list of available legacy versions.\n")); - } - - cb.cb_first = B_TRUE; - ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); - assert(ret == 0); - - if (cb.cb_first) { - (void) printf(gettext("Every %sfeature flags pool has " - "all supported features enabled.\n"), - cb.cb_unavail ? gettext("available ") : ""); - } else { - (void) printf(gettext("\n")); - } - } else { - ret = for_each_pool(argc, argv, B_TRUE, NULL, - upgrade_one, &cb); - } - - if (cb.cb_poolname[0] != '\0') { - (void) printf( - "If you boot from pool '%s', don't forget to update boot code.\n" - "Assuming you use GPT partitioning and da0 is your boot disk\n" - "the following command will do it:\n" - "\n" - "\tgpart bootcode -b /boot/pmbr -p /boot/gptzfsboot -i 1 da0\n\n", - cb.cb_poolname); - } - - return (ret); -} - -typedef struct hist_cbdata { - boolean_t first; - boolean_t longfmt; - boolean_t internal; -} hist_cbdata_t; - -static void -print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) -{ - nvlist_t **records; - uint_t numrecords; - int i; - - verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, - &records, &numrecords) == 0); - for (i = 0; i < numrecords; i++) { - nvlist_t *rec = records[i]; - char tbuf[30] = ""; - - if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { - time_t tsec; - struct tm t; - - tsec = fnvlist_lookup_uint64(records[i], - ZPOOL_HIST_TIME); - (void) localtime_r(&tsec, &t); - (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); - } - - if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { - (void) printf("%s %s", tbuf, - fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); - } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { - int ievent = - fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); - if (!cb->internal) - continue; - if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { - (void) printf("%s unrecognized record:\n", - tbuf); - dump_nvlist(rec, 4); - continue; - } - (void) printf("%s 
[internal %s txg:%lld] %s", tbuf, - zfs_history_event_names[ievent], - fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), - fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); - } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { - if (!cb->internal) - continue; - (void) printf("%s [txg:%lld] %s", tbuf, - fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), - fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); - if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { - (void) printf(" %s (%llu)", - fnvlist_lookup_string(rec, - ZPOOL_HIST_DSNAME), - fnvlist_lookup_uint64(rec, - ZPOOL_HIST_DSID)); - } - (void) printf(" %s", fnvlist_lookup_string(rec, - ZPOOL_HIST_INT_STR)); - } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { - if (!cb->internal) - continue; - (void) printf("%s ioctl %s\n", tbuf, - fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); - if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { - (void) printf(" input:\n"); - dump_nvlist(fnvlist_lookup_nvlist(rec, - ZPOOL_HIST_INPUT_NVL), 8); - } - if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { - (void) printf(" output:\n"); - dump_nvlist(fnvlist_lookup_nvlist(rec, - ZPOOL_HIST_OUTPUT_NVL), 8); - } - if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { - (void) printf(" errno: %lld\n", - fnvlist_lookup_int64(rec, - ZPOOL_HIST_ERRNO)); - } - } else { - if (!cb->internal) - continue; - (void) printf("%s unrecognized record:\n", tbuf); - dump_nvlist(rec, 4); - } - - if (!cb->longfmt) { - (void) printf("\n"); - continue; - } - (void) printf(" ["); - if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { - uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); - struct passwd *pwd = getpwuid(who); - (void) printf("user %d ", (int)who); - if (pwd != NULL) - (void) printf("(%s) ", pwd->pw_name); - } - if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { - (void) printf("on %s", - fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); - } - if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { - (void) printf(":%s", - fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); - } - (void) printf("]"); - (void) 
printf("\n"); - } -} - -/* - * Print out the command history for a specific pool. - */ -static int -get_history_one(zpool_handle_t *zhp, void *data) -{ - nvlist_t *nvhis; - int ret; - hist_cbdata_t *cb = (hist_cbdata_t *)data; - uint64_t off = 0; - boolean_t eof = B_FALSE; - - cb->first = B_FALSE; - - (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); - - while (!eof) { - if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) - return (ret); - - print_history_records(nvhis, cb); - nvlist_free(nvhis); - } - (void) printf("\n"); - - return (ret); -} - -/* - * zpool history - * - * Displays the history of commands that modified pools. - */ -int -zpool_do_history(int argc, char **argv) -{ - hist_cbdata_t cbdata = { 0 }; - int ret; - int c; - - cbdata.first = B_TRUE; - /* check options */ - while ((c = getopt(argc, argv, "li")) != -1) { - switch (c) { - case 'l': - cbdata.longfmt = B_TRUE; - break; - case 'i': - cbdata.internal = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - argc -= optind; - argv += optind; - - ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, - &cbdata); - - if (argc == 0 && cbdata.first == B_TRUE) { - (void) printf(gettext("no pools available\n")); - return (0); - } - - return (ret); -} - -static int -get_callback(zpool_handle_t *zhp, void *data) -{ - zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; - char value[MAXNAMELEN]; - zprop_source_t srctype; - zprop_list_t *pl; - - for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { - - /* - * Skip the special fake placeholder. This will also skip - * over the name property when 'all' is specified. 
- */ - if (pl->pl_prop == ZPOOL_PROP_NAME && - pl == cbp->cb_proplist) - continue; - - if (pl->pl_prop == ZPROP_INVAL && - (zpool_prop_feature(pl->pl_user_prop) || - zpool_prop_unsupported(pl->pl_user_prop))) { - srctype = ZPROP_SRC_LOCAL; - - if (zpool_prop_get_feature(zhp, pl->pl_user_prop, - value, sizeof (value)) == 0) { - zprop_print_one_property(zpool_get_name(zhp), - cbp, pl->pl_user_prop, value, srctype, - NULL, NULL); - } - } else { - if (zpool_get_prop(zhp, pl->pl_prop, value, - sizeof (value), &srctype, cbp->cb_literal) != 0) - continue; - - zprop_print_one_property(zpool_get_name(zhp), cbp, - zpool_prop_to_name(pl->pl_prop), value, srctype, - NULL, NULL); - } - } - return (0); -} - -/* - * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... - * - * -H Scripted mode. Don't display headers, and separate properties - * by a single tab. - * -o List of columns to display. Defaults to - * "name,property,value,source". - * -p Diplay values in parsable (exact) format. - * - * Get properties of pools in the system. Output space statistics - * for each one as well as other attributes. - */ -int -zpool_do_get(int argc, char **argv) -{ - zprop_get_cbdata_t cb = { 0 }; - zprop_list_t fake_name = { 0 }; - int ret; - int c, i; - char *value; - - cb.cb_first = B_TRUE; - - /* - * Set up default columns and sources. 
- */ - cb.cb_sources = ZPROP_SRC_ALL; - cb.cb_columns[0] = GET_COL_NAME; - cb.cb_columns[1] = GET_COL_PROPERTY; - cb.cb_columns[2] = GET_COL_VALUE; - cb.cb_columns[3] = GET_COL_SOURCE; - cb.cb_type = ZFS_TYPE_POOL; - - /* check options */ - while ((c = getopt(argc, argv, ":Hpo:")) != -1) { - switch (c) { - case 'p': - cb.cb_literal = B_TRUE; - break; - case 'H': - cb.cb_scripted = B_TRUE; - break; - case 'o': - bzero(&cb.cb_columns, sizeof (cb.cb_columns)); - i = 0; - while (*optarg != '\0') { - static char *col_subopts[] = - { "name", "property", "value", "source", - "all", NULL }; - - if (i == ZFS_GET_NCOLS) { - (void) fprintf(stderr, gettext("too " - "many fields given to -o " - "option\n")); - usage(B_FALSE); - } - - switch (getsubopt(&optarg, col_subopts, - &value)) { - case 0: - cb.cb_columns[i++] = GET_COL_NAME; - break; - case 1: - cb.cb_columns[i++] = GET_COL_PROPERTY; - break; - case 2: - cb.cb_columns[i++] = GET_COL_VALUE; - break; - case 3: - cb.cb_columns[i++] = GET_COL_SOURCE; - break; - case 4: - if (i > 0) { - (void) fprintf(stderr, - gettext("\"all\" conflicts " - "with specific fields " - "given to -o option\n")); - usage(B_FALSE); - } - cb.cb_columns[0] = GET_COL_NAME; - cb.cb_columns[1] = GET_COL_PROPERTY; - cb.cb_columns[2] = GET_COL_VALUE; - cb.cb_columns[3] = GET_COL_SOURCE; - i = ZFS_GET_NCOLS; - break; - default: - (void) fprintf(stderr, - gettext("invalid column name " - "'%s'\n"), suboptarg); - usage(B_FALSE); - } - } - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; - - if (argc < 1) { - (void) fprintf(stderr, gettext("missing property " - "argument\n")); - usage(B_FALSE); - } - - if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, - ZFS_TYPE_POOL) != 0) - usage(B_FALSE); - - argc--; - argv++; - - if (cb.cb_proplist != NULL) { - fake_name.pl_prop = ZPOOL_PROP_NAME; - fake_name.pl_width = strlen(gettext("NAME")); - fake_name.pl_next = 
cb.cb_proplist; - cb.cb_proplist = &fake_name; - } - - ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, - get_callback, &cb); - - if (cb.cb_proplist == &fake_name) - zprop_free_list(fake_name.pl_next); - else - zprop_free_list(cb.cb_proplist); - - return (ret); -} - -typedef struct set_cbdata { - char *cb_propname; - char *cb_value; - boolean_t cb_any_successful; -} set_cbdata_t; - -int -set_callback(zpool_handle_t *zhp, void *data) -{ - int error; - set_cbdata_t *cb = (set_cbdata_t *)data; - - error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); - - if (!error) - cb->cb_any_successful = B_TRUE; - - return (error); -} - -int -zpool_do_set(int argc, char **argv) -{ - set_cbdata_t cb = { 0 }; - int error; - - if (argc > 1 && argv[1][0] == '-') { - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - argv[1][1]); - usage(B_FALSE); - } - - if (argc < 2) { - (void) fprintf(stderr, gettext("missing property=value " - "argument\n")); - usage(B_FALSE); - } - - if (argc < 3) { - (void) fprintf(stderr, gettext("missing pool name\n")); - usage(B_FALSE); - } - - if (argc > 3) { - (void) fprintf(stderr, gettext("too many pool names\n")); - usage(B_FALSE); - } - - cb.cb_propname = argv[1]; - cb.cb_value = strchr(cb.cb_propname, '='); - if (cb.cb_value == NULL) { - (void) fprintf(stderr, gettext("missing value in " - "property=value argument\n")); - usage(B_FALSE); - } - - *(cb.cb_value) = '\0'; - cb.cb_value++; - - error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, - set_callback, &cb); - - return (error); -} - -static int -find_command_idx(char *command, int *idx) -{ - int i; - - for (i = 0; i < NCOMMAND; i++) { - if (command_table[i].name == NULL) - continue; - - if (strcmp(command, command_table[i].name) == 0) { - *idx = i; - return (0); - } - } - return (1); -} - -int -main(int argc, char **argv) -{ - int ret = 0; - int i; - char *cmdname; - - (void) setlocale(LC_ALL, ""); - (void) textdomain(TEXT_DOMAIN); - - if ((g_zfs = libzfs_init()) == NULL) { 
- (void) fprintf(stderr, gettext("internal error: failed to " - "initialize ZFS library\n")); - return (1); - } - - libzfs_print_on_error(g_zfs, B_TRUE); - - opterr = 0; - - /* - * Make sure the user has specified some command. - */ - if (argc < 2) { - (void) fprintf(stderr, gettext("missing command\n")); - usage(B_FALSE); - } - - cmdname = argv[1]; - - /* - * Special case '-?' - */ - if (strcmp(cmdname, "-?") == 0) - usage(B_TRUE); - - zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); - - /* - * Run the appropriate command. - */ - if (find_command_idx(cmdname, &i) == 0) { - current_command = &command_table[i]; - ret = command_table[i].func(argc - 1, argv + 1); - } else if (strchr(cmdname, '=')) { - verify(find_command_idx("set", &i) == 0); - current_command = &command_table[i]; - ret = command_table[i].func(argc, argv); - } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { - /* - * 'freeze' is a vile debugging abomination, so we treat - * it as such. - */ - zfs_cmd_t zc = { 0 }; - (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); - return (!!zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc)); - } else { - (void) fprintf(stderr, gettext("unrecognized " - "command '%s'\n"), cmdname); - usage(B_FALSE); - } - - if (ret == 0 && log_history) - (void) zpool_log_history(g_zfs, history_str); - - libzfs_fini(g_zfs); - - /* - * The 'ZFS_ABORT' environment variable causes us to dump core on exit - * for the purposes of running ::findleaks. 
- */ - if (getenv("ZFS_ABORT") != NULL) { - (void) printf("dumping core by request\n"); - abort(); - } - - return (ret); -} diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c deleted file mode 100644 index c7a002efb17c..000000000000 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include -#include - -#include "zpool_util.h" - -/* - * Utility function to guarantee malloc() success. - */ -void * -safe_malloc(size_t size) -{ - void *data; - - if ((data = calloc(1, size)) == NULL) { - (void) fprintf(stderr, "internal error: out of memory\n"); - exit(1); - } - - return (data); -} - -/* - * Display an out of memory error message and abort the current program. 
- */ -void -zpool_no_memory(void) -{ - assert(errno == ENOMEM); - (void) fprintf(stderr, - gettext("internal error: out of memory\n")); - exit(1); -} - -/* - * Return the number of logs in supplied nvlist - */ -uint_t -num_logs(nvlist_t *nv) -{ - uint_t nlogs = 0; - uint_t c, children; - nvlist_t **child; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - return (0); - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if (is_log) - nlogs++; - } - return (nlogs); -} diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h b/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h deleted file mode 100644 index 118029a22866..000000000000 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_util.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#ifndef ZPOOL_UTIL_H -#define ZPOOL_UTIL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Basic utility functions - */ -void *safe_malloc(size_t); -void zpool_no_memory(void); -uint_t num_logs(nvlist_t *nv); - -/* - * Virtual device functions - */ - -nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, - boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type, - uint64_t boot_size, int argc, char **argv); -nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, - nvlist_t *props, splitflags_t flags, int argc, char **argv); - -/* - * Pool list functions - */ -int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, - zpool_iter_f, void *); - -typedef struct zpool_list zpool_list_t; - -zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); -void pool_list_update(zpool_list_t *); -int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); -void pool_list_free(zpool_list_t *); -int pool_list_count(zpool_list_t *); -void pool_list_remove(zpool_list_t *, zpool_handle_t *); - -extern libzfs_handle_t *g_zfs; - -#ifdef __cplusplus -} -#endif - -#endif /* ZPOOL_UTIL_H */ diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c deleted file mode 100644 index 43d66d2263e0..000000000000 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_vdev.c +++ /dev/null @@ -1,1729 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2018 by Delphix. All rights reserved. - * Copyright (c) 2016, 2017 Intel Corporation. - * Copyright 2016 Igor Kozhukhov . - */ - -/* - * Functions to convert between a list of vdevs and an nvlist representing the - * configuration. Each entry in the list can be one of: - * - * Device vdevs - * disk=(path=..., devid=...) - * file=(path=...) - * - * Group vdevs - * raidz[1|2]=(...) - * mirror=(...) - * - * Hot spares - * - * While the underlying implementation supports it, group vdevs cannot contain - * other group vdevs. All userland verification of devices is contained within - * this file. If successful, the nvlist returned can be passed directly to the - * kernel; we've done as much verification as possible in userland. - * - * Hot spares are a special case, and passed down as an array of disk vdevs, at - * the same level as the root of the vdev tree. - * - * The only function exported by this file is 'make_root_vdev'. The - * function performs several passes: - * - * 1. Construct the vdev specification. Performs syntax validation and - * makes sure each device is valid. - * 2. Check for devices in use. Using libdiskmgt, makes sure that no - * devices are also in use. Some can be overridden using the 'force' - * flag, others cannot. - * 3. Check for replication errors if the 'force' flag is not specified. - * validates that the replication level is consistent across the - * entire pool. - * 4. Call libzfs to label any whole disks with an EFI label. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zpool_util.h" - -#define BACKUP_SLICE "s2" - -/* - * For any given vdev specification, we can have multiple errors. The - * vdev_error() function keeps track of whether we have seen an error yet, and - * prints out a header if its the first error we've seen. - */ -boolean_t error_seen; -boolean_t is_force; - -/*PRINTFLIKE1*/ -static void -vdev_error(const char *fmt, ...) -{ - va_list ap; - - if (!error_seen) { - (void) fprintf(stderr, gettext("invalid vdev specification\n")); - if (!is_force) - (void) fprintf(stderr, gettext("use '-f' to override " - "the following errors:\n")); - else - (void) fprintf(stderr, gettext("the following errors " - "must be manually repaired:\n")); - error_seen = B_TRUE; - } - - va_start(ap, fmt); - (void) vfprintf(stderr, fmt, ap); - va_end(ap); -} - -#ifdef illumos -static void -libdiskmgt_error(int error) -{ - /* - * ENXIO/ENODEV is a valid error message if the device doesn't live in - * /dev/dsk. Don't bother printing an error message in this case. - */ - if (error == ENXIO || error == ENODEV) - return; - - (void) fprintf(stderr, gettext("warning: device in use checking " - "failed: %s\n"), strerror(error)); -} - -/* - * Validate a device, passing the bulk of the work off to libdiskmgt. - */ -static int -check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) -{ - char *msg; - int error = 0; - dm_who_type_t who; - - if (force) - who = DM_WHO_ZPOOL_FORCE; - else if (isspare) - who = DM_WHO_ZPOOL_SPARE; - else - who = DM_WHO_ZPOOL; - - if (dm_inuse((char *)path, &msg, who, &error) || error) { - if (error != 0) { - libdiskmgt_error(error); - return (0); - } else { - vdev_error("%s", msg); - free(msg); - return (-1); - } - } - - /* - * If we're given a whole disk, ignore overlapping slices since we're - * about to label it anyway. 
- */ - error = 0; - if (!wholedisk && !force && - (dm_isoverlapping((char *)path, &msg, &error) || error)) { - if (error == 0) { - /* dm_isoverlapping returned -1 */ - vdev_error(gettext("%s overlaps with %s\n"), path, msg); - free(msg); - return (-1); - } else if (error != ENODEV) { - /* libdiskmgt's devcache only handles physical drives */ - libdiskmgt_error(error); - return (0); - } - } - - return (0); -} - - -/* - * Validate a whole disk. Iterate over all slices on the disk and make sure - * that none is in use by calling check_slice(). - */ -static int -check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) -{ - dm_descriptor_t *drive, *media, *slice; - int err = 0; - int i; - int ret; - - /* - * Get the drive associated with this disk. This should never fail, - * because we already have an alias handle open for the device. - */ - if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, - &err)) == NULL || *drive == NULL) { - if (err) - libdiskmgt_error(err); - return (0); - } - - if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, - &err)) == NULL) { - dm_free_descriptors(drive); - if (err) - libdiskmgt_error(err); - return (0); - } - - dm_free_descriptors(drive); - - /* - * It is possible that the user has specified a removable media drive, - * and the media is not present. - */ - if (*media == NULL) { - dm_free_descriptors(media); - vdev_error(gettext("'%s' has no media in drive\n"), name); - return (-1); - } - - if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, - &err)) == NULL) { - dm_free_descriptors(media); - if (err) - libdiskmgt_error(err); - return (0); - } - - dm_free_descriptors(media); - - ret = 0; - - /* - * Iterate over all slices and report any errors. We don't care about - * overlapping slices because we are using the whole disk. 
- */ - for (i = 0; slice[i] != NULL; i++) { - char *name = dm_get_name(slice[i], &err); - - if (check_slice(name, force, B_TRUE, isspare) != 0) - ret = -1; - - dm_free_name(name); - } - - dm_free_descriptors(slice); - return (ret); -} - -/* - * Validate a device. - */ -static int -check_device(const char *path, boolean_t force, boolean_t isspare) -{ - dm_descriptor_t desc; - int err; - char *dev; - - /* - * For whole disks, libdiskmgt does not include the leading dev path. - */ - dev = strrchr(path, '/'); - assert(dev != NULL); - dev++; - if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { - err = check_disk(path, desc, force, isspare); - dm_free_descriptor(desc); - return (err); - } - - return (check_slice(path, force, B_FALSE, isspare)); -} -#endif /* illumos */ - -/* - * Check that a file is valid. All we can do in this case is check that it's - * not in use by another pool, and not in use by swap. - */ -static int -check_file(const char *file, boolean_t force, boolean_t isspare) -{ - char *name; - int fd; - int ret = 0; - int err; - pool_state_t state; - boolean_t inuse; - -#ifdef illumos - if (dm_inuse_swap(file, &err)) { - if (err) - libdiskmgt_error(err); - else - vdev_error(gettext("%s is currently used by swap. " - "Please see swap(1M).\n"), file); - return (-1); - } -#endif - - if ((fd = open(file, O_RDONLY)) < 0) - return (0); - - if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { - const char *desc; - - switch (state) { - case POOL_STATE_ACTIVE: - desc = gettext("active"); - break; - - case POOL_STATE_EXPORTED: - desc = gettext("exported"); - break; - - case POOL_STATE_POTENTIALLY_ACTIVE: - desc = gettext("potentially active"); - break; - - default: - desc = gettext("unknown"); - break; - } - - /* - * Allow hot spares to be shared between pools. 
- */ - if (state == POOL_STATE_SPARE && isspare) - return (0); - - if (state == POOL_STATE_ACTIVE || - state == POOL_STATE_SPARE || !force) { - switch (state) { - case POOL_STATE_SPARE: - vdev_error(gettext("%s is reserved as a hot " - "spare for pool %s\n"), file, name); - break; - default: - vdev_error(gettext("%s is part of %s pool " - "'%s'\n"), file, desc, name); - break; - } - ret = -1; - } - - free(name); - } - - (void) close(fd); - return (ret); -} - -static int -check_device(const char *name, boolean_t force, boolean_t isspare) -{ - char path[MAXPATHLEN]; - - if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0) - snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name); - else - strlcpy(path, name, sizeof(path)); - - return (check_file(path, force, isspare)); -} - -/* - * By "whole disk" we mean an entire physical disk (something we can - * label, toggle the write cache on, etc.) as opposed to the full - * capacity of a pseudo-device such as lofi or did. We act as if we - * are labeling the disk, which should be a pretty good test of whether - * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if - * it isn't. - */ -static boolean_t -is_whole_disk(const char *arg) -{ -#ifdef illumos - struct dk_gpt *label; - int fd; - char path[MAXPATHLEN]; - - (void) snprintf(path, sizeof (path), "%s%s%s", - ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); - if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) - return (B_FALSE); - if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { - (void) close(fd); - return (B_FALSE); - } - efi_free(label); - (void) close(fd); - return (B_TRUE); -#else - int fd; - - fd = g_open(arg, 0); - if (fd >= 0) { - g_close(fd); - return (B_TRUE); - } - return (B_FALSE); -#endif -} - -/* - * Create a leaf vdev. Determine if this is a file or a device. If it's a - * device, fill in the device id to make a complete nvlist. 
Valid forms for a - * leaf vdev are: - * - * /dev/dsk/xxx Complete disk path - * /xxx Full path to file - * xxx Shorthand for /dev/dsk/xxx - */ -static nvlist_t * -make_leaf_vdev(const char *arg, uint64_t is_log) -{ - char path[MAXPATHLEN]; - struct stat64 statbuf; - nvlist_t *vdev = NULL; - char *type = NULL; - boolean_t wholedisk = B_FALSE; - - /* - * Determine what type of vdev this is, and put the full path into - * 'path'. We detect whether this is a device of file afterwards by - * checking the st_mode of the file. - */ - if (arg[0] == '/') { - /* - * Complete device or file path. Exact type is determined by - * examining the file descriptor afterwards. - */ - wholedisk = is_whole_disk(arg); - if (!wholedisk && (stat64(arg, &statbuf) != 0)) { - (void) fprintf(stderr, - gettext("cannot open '%s': %s\n"), - arg, strerror(errno)); - return (NULL); - } - - (void) strlcpy(path, arg, sizeof (path)); - } else { - /* - * This may be a short path for a device, or it could be total - * gibberish. Check to see if it's a known device in - * /dev/dsk/. As part of this check, see if we've been given a - * an entire disk (minus the slice number). - */ - if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) - strlcpy(path, arg, sizeof (path)); - else - snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg); - wholedisk = is_whole_disk(path); - if (!wholedisk && (stat64(path, &statbuf) != 0)) { - /* - * If we got ENOENT, then the user gave us - * gibberish, so try to direct them with a - * reasonable error message. Otherwise, - * regurgitate strerror() since it's the best we - * can do. 
- */ - if (errno == ENOENT) { - (void) fprintf(stderr, - gettext("cannot open '%s': no such " - "GEOM provider\n"), arg); - (void) fprintf(stderr, - gettext("must be a full path or " - "shorthand device name\n")); - return (NULL); - } else { - (void) fprintf(stderr, - gettext("cannot open '%s': %s\n"), - path, strerror(errno)); - return (NULL); - } - } - } - -#ifdef __FreeBSD__ - if (S_ISCHR(statbuf.st_mode)) { - statbuf.st_mode &= ~S_IFCHR; - statbuf.st_mode |= S_IFBLK; - wholedisk = B_FALSE; - } -#endif - - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "GEOM provider or regular file\n"), path); - return (NULL); - } - - /* - * Finally, we have the complete device or file, and we know that it is - * acceptable to use. Construct the nvlist to describe this vdev. All - * vdevs have a 'path' element, and devices also have a 'devid' element. - */ - verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); - verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_LOG) == 0); - if (strcmp(type, VDEV_TYPE_DISK) == 0) - verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, - (uint64_t)wholedisk) == 0); - -#ifdef have_devid - /* - * For a whole disk, defer getting its devid until after labeling it. - */ - if (S_ISBLK(statbuf.st_mode) && !wholedisk) { - /* - * Get the devid for the device. 
- */ - int fd; - ddi_devid_t devid; - char *minor = NULL, *devid_str = NULL; - - if ((fd = open(path, O_RDONLY)) < 0) { - (void) fprintf(stderr, gettext("cannot open '%s': " - "%s\n"), path, strerror(errno)); - nvlist_free(vdev); - return (NULL); - } - - if (devid_get(fd, &devid) == 0) { - if (devid_get_minor_name(fd, &minor) == 0 && - (devid_str = devid_str_encode(devid, minor)) != - NULL) { - verify(nvlist_add_string(vdev, - ZPOOL_CONFIG_DEVID, devid_str) == 0); - } - if (devid_str != NULL) - devid_str_free(devid_str); - if (minor != NULL) - devid_str_free(minor); - devid_free(devid); - } - - (void) close(fd); - } -#endif - - return (vdev); -} - -/* - * Go through and verify the replication level of the pool is consistent. - * Performs the following checks: - * - * For the new spec, verifies that devices in mirrors and raidz are the - * same size. - * - * If the current configuration already has inconsistent replication - * levels, ignore any other potential problems in the new spec. - * - * Otherwise, make sure that the current spec (if there is one) and the new - * spec have consistent replication levels. - * - * If there is no current spec (create), make sure new spec has at least - * one general purpose vdev. - */ -typedef struct replication_level { - char *zprl_type; - uint64_t zprl_children; - uint64_t zprl_parity; -} replication_level_t; - -#define ZPOOL_FUZZ (16 * 1024 * 1024) - -static boolean_t -is_raidz_mirror(replication_level_t *a, replication_level_t *b, - replication_level_t **raidz, replication_level_t **mirror) -{ - if (strcmp(a->zprl_type, "raidz") == 0 && - strcmp(b->zprl_type, "mirror") == 0) { - *raidz = a; - *mirror = b; - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Given a list of toplevel vdevs, return the current replication level. If - * the config is inconsistent, then NULL is returned. If 'fatal' is set, then - * an error message will be displayed for each self-inconsistent vdev. 
- */ -static replication_level_t * -get_replication(nvlist_t *nvroot, boolean_t fatal) -{ - nvlist_t **top; - uint_t t, toplevels; - nvlist_t **child; - uint_t c, children; - nvlist_t *nv; - char *type; - replication_level_t lastrep = {0}; - replication_level_t rep; - replication_level_t *ret; - replication_level_t *raidz, *mirror; - boolean_t dontreport; - - ret = safe_malloc(sizeof (replication_level_t)); - - verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &top, &toplevels) == 0); - - for (t = 0; t < toplevels; t++) { - uint64_t is_log = B_FALSE; - - nv = top[t]; - - /* - * For separate logs we ignore the top level vdev replication - * constraints. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); - if (is_log) - continue; - - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, - &type) == 0); - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - /* - * This is a 'file' or 'disk' vdev. - */ - rep.zprl_type = type; - rep.zprl_children = 1; - rep.zprl_parity = 0; - } else { - uint64_t vdev_size; - - /* - * This is a mirror or RAID-Z vdev. Go through and make - * sure the contents are all the same (files vs. disks), - * keeping track of the number of elements in the - * process. - * - * We also check that the size of each vdev (if it can - * be determined) is the same. - */ - rep.zprl_type = type; - rep.zprl_children = 0; - - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { - verify(nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_NPARITY, - &rep.zprl_parity) == 0); - assert(rep.zprl_parity != 0); - } else { - rep.zprl_parity = 0; - } - - /* - * The 'dontreport' variable indicates that we've - * already reported an error for this spec, so don't - * bother doing it again. 
- */ - type = NULL; - dontreport = 0; - vdev_size = -1ULL; - for (c = 0; c < children; c++) { - boolean_t is_replacing, is_spare; - nvlist_t *cnv = child[c]; - char *path; - struct stat64 statbuf; - uint64_t size = -1ULL; - char *childtype; - int fd, err; - - rep.zprl_children++; - - verify(nvlist_lookup_string(cnv, - ZPOOL_CONFIG_TYPE, &childtype) == 0); - - /* - * If this is a replacing or spare vdev, then - * get the real first child of the vdev. - */ - is_replacing = strcmp(childtype, - VDEV_TYPE_REPLACING) == 0; - is_spare = strcmp(childtype, - VDEV_TYPE_SPARE) == 0; - if (is_replacing || is_spare) { - nvlist_t **rchild; - uint_t rchildren; - - verify(nvlist_lookup_nvlist_array(cnv, - ZPOOL_CONFIG_CHILDREN, &rchild, - &rchildren) == 0); - assert((is_replacing && rchildren == 2) - || (is_spare && rchildren >= 2)); - cnv = rchild[0]; - - verify(nvlist_lookup_string(cnv, - ZPOOL_CONFIG_TYPE, - &childtype) == 0); - if (strcmp(childtype, - VDEV_TYPE_SPARE) == 0) { - /* We have a replacing vdev with - * a spare child. Get the first - * real child of the spare - */ - verify( - nvlist_lookup_nvlist_array( - cnv, - ZPOOL_CONFIG_CHILDREN, - &rchild, - &rchildren) == 0); - assert(rchildren >= 2); - cnv = rchild[0]; - } - } - - verify(nvlist_lookup_string(cnv, - ZPOOL_CONFIG_PATH, &path) == 0); - - /* - * If we have a raidz/mirror that combines disks - * with files, report it as an error. - */ - if (!dontreport && type != NULL && - strcmp(type, childtype) != 0) { - if (ret != NULL) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication " - "level: %s contains both " - "files and devices\n"), - rep.zprl_type); - else - return (NULL); - dontreport = B_TRUE; - } - - /* - * According to stat(2), the value of 'st_size' - * is undefined for block devices and character - * devices. But there is no effective way to - * determine the real size in userland. - * - * Instead, we'll take advantage of an - * implementation detail of spec_size(). 
If the - * device is currently open, then we (should) - * return a valid size. - * - * If we still don't get a valid size (indicated - * by a size of 0 or MAXOFFSET_T), then ignore - * this device altogether. - */ - if ((fd = open(path, O_RDONLY)) >= 0) { - err = fstat64(fd, &statbuf); - (void) close(fd); - } else { - err = stat64(path, &statbuf); - } - - if (err != 0 || - statbuf.st_size == 0 || - statbuf.st_size == MAXOFFSET_T) - continue; - - size = statbuf.st_size; - - /* - * Also make sure that devices and - * slices have a consistent size. If - * they differ by a significant amount - * (~16MB) then report an error. - */ - if (!dontreport && - (vdev_size != -1ULL && - (labs(size - vdev_size) > - ZPOOL_FUZZ))) { - if (ret != NULL) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "%s contains devices of " - "different sizes\n"), - rep.zprl_type); - else - return (NULL); - dontreport = B_TRUE; - } - - type = childtype; - vdev_size = size; - } - } - - /* - * At this point, we have the replication of the last toplevel - * vdev in 'rep'. Compare it to 'lastrep' to see if it is - * different. - */ - if (lastrep.zprl_type != NULL) { - if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || - is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { - /* - * Accepted raidz and mirror when they can - * handle the same number of disk failures. - */ - if (raidz->zprl_parity != - mirror->zprl_children - 1) { - if (ret != NULL) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication " - "level: " - "%s and %s vdevs with " - "different redundancy, " - "%llu vs. 
%llu (%llu-way) " - "are present\n"), - raidz->zprl_type, - mirror->zprl_type, - raidz->zprl_parity, - mirror->zprl_children - 1, - mirror->zprl_children); - else - return (NULL); - } - } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != - 0) { - if (ret != NULL) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication level: " - "both %s and %s vdevs are " - "present\n"), - lastrep.zprl_type, rep.zprl_type); - else - return (NULL); - } else if (lastrep.zprl_parity != rep.zprl_parity) { - if (ret) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication level: " - "both %llu and %llu device parity " - "%s vdevs are present\n"), - lastrep.zprl_parity, - rep.zprl_parity, - rep.zprl_type); - else - return (NULL); - } else if (lastrep.zprl_children != rep.zprl_children) { - if (ret) - free(ret); - ret = NULL; - if (fatal) - vdev_error(gettext( - "mismatched replication level: " - "both %llu-way and %llu-way %s " - "vdevs are present\n"), - lastrep.zprl_children, - rep.zprl_children, - rep.zprl_type); - else - return (NULL); - } - } - lastrep = rep; - } - - if (ret != NULL) - *ret = rep; - - return (ret); -} - -/* - * Check the replication level of the vdev spec against the current pool. Calls - * get_replication() to make sure the new spec is self-consistent. If the pool - * has a consistent replication level, then we ignore any errors. Otherwise, - * report any difference between the two. - */ -static int -check_replication(nvlist_t *config, nvlist_t *newroot) -{ - nvlist_t **child; - uint_t children; - replication_level_t *current = NULL, *new; - replication_level_t *raidz, *mirror; - int ret; - - /* - * If we have a current pool configuration, check to see if it's - * self-consistent. If not, simply return success. 
- */ - if (config != NULL) { - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if ((current = get_replication(nvroot, B_FALSE)) == NULL) - return (0); - } - /* - * for spares there may be no children, and therefore no - * replication level to check - */ - if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) || (children == 0)) { - free(current); - return (0); - } - - /* - * If all we have is logs then there's no replication level to check. - */ - if (num_logs(newroot) == children) { - free(current); - return (0); - } - - /* - * Get the replication level of the new vdev spec, reporting any - * inconsistencies found. - */ - if ((new = get_replication(newroot, B_TRUE)) == NULL) { - free(current); - return (-1); - } - - /* - * Check to see if the new vdev spec matches the replication level of - * the current pool. - */ - ret = 0; - if (current != NULL) { - if (is_raidz_mirror(current, new, &raidz, &mirror) || - is_raidz_mirror(new, current, &raidz, &mirror)) { - if (raidz->zprl_parity != mirror->zprl_children - 1) { - vdev_error(gettext( - "mismatched replication level: pool and " - "new vdev with different redundancy, %s " - "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), - raidz->zprl_type, - mirror->zprl_type, - raidz->zprl_parity, - mirror->zprl_children - 1, - mirror->zprl_children); - ret = -1; - } - } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { - vdev_error(gettext( - "mismatched replication level: pool uses %s " - "and new vdev is %s\n"), - current->zprl_type, new->zprl_type); - ret = -1; - } else if (current->zprl_parity != new->zprl_parity) { - vdev_error(gettext( - "mismatched replication level: pool uses %llu " - "device parity and new vdev uses %llu\n"), - current->zprl_parity, new->zprl_parity); - ret = -1; - } else if (current->zprl_children != new->zprl_children) { - vdev_error(gettext( - "mismatched replication level: pool uses %llu-way " - "%s and new vdev uses %llu-way %s\n"), - current->zprl_children, current->zprl_type, - new->zprl_children, new->zprl_type); - ret = -1; - } - } - - free(new); - if (current != NULL) - free(current); - - return (ret); -} - -#ifdef illumos -/* - * Go through and find any whole disks in the vdev specification, labelling them - * as appropriate. When constructing the vdev spec, we were unable to open this - * device in order to provide a devid. Now that we have labelled the disk and - * know the pool slice is valid, we can construct the devid now. - * - * If the disk was already labeled with an EFI label, we will have gotten the - * devid already (because we were able to open the whole disk). Otherwise, we - * need to get the devid after we label the disk. 
- */ -static int -make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type, - uint64_t boot_size) -{ - nvlist_t **child; - uint_t c, children; - char *type, *path, *diskname; - char buf[MAXPATHLEN]; - uint64_t wholedisk; - int fd; - int ret; - int slice; - ddi_devid_t devid; - char *minor = NULL, *devid_str = NULL; - - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - - if (strcmp(type, VDEV_TYPE_DISK) != 0) - return (0); - - /* - * We have a disk device. Get the path to the device - * and see if it's a whole disk by appending the backup - * slice and stat()ing the device. - */ - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - - diskname = strrchr(path, '/'); - assert(diskname != NULL); - diskname++; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk) != 0 || !wholedisk) { - /* - * This is not whole disk, return error if - * boot partition creation was requested - */ - if (boot_type == ZPOOL_CREATE_BOOT_LABEL) { - (void) fprintf(stderr, - gettext("creating boot partition is only " - "supported on whole disk vdevs: %s\n"), - diskname); - return (-1); - } - return (0); - } - - ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type, - boot_size, &slice); - if (ret == -1) - return (ret); - - /* - * Fill in the devid, now that we've labeled the disk. 
- */ - (void) snprintf(buf, sizeof (buf), "%ss%d", path, slice); - if ((fd = open(buf, O_RDONLY)) < 0) { - (void) fprintf(stderr, - gettext("cannot open '%s': %s\n"), - buf, strerror(errno)); - return (-1); - } - - if (devid_get(fd, &devid) == 0) { - if (devid_get_minor_name(fd, &minor) == 0 && - (devid_str = devid_str_encode(devid, minor)) != - NULL) { - verify(nvlist_add_string(nv, - ZPOOL_CONFIG_DEVID, devid_str) == 0); - } - if (devid_str != NULL) - devid_str_free(devid_str); - if (minor != NULL) - devid_str_free(minor); - devid_free(devid); - } - - /* - * Update the path to refer to the pool slice. The presence of - * the 'whole_disk' field indicates to the CLI that we should - * chop off the slice number when displaying the device in - * future output. - */ - verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); - - (void) close(fd); - - return (0); - } - - /* illumos kernel does not support booting from multi-vdev pools. */ - if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) { - if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) { - (void) fprintf(stderr, gettext("boot pool " - "can not have more than one vdev\n")); - return (-1); - } - } - - for (c = 0; c < children; c++) { - ret = make_disks(zhp, child[c], boot_type, boot_size); - if (ret != 0) - return (ret); - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, - &child, &children) == 0) - for (c = 0; c < children; c++) { - ret = make_disks(zhp, child[c], boot_type, boot_size); - if (ret != 0) - return (ret); - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, - &child, &children) == 0) - for (c = 0; c < children; c++) { - ret = make_disks(zhp, child[c], boot_type, boot_size); - if (ret != 0) - return (ret); - } - - return (0); -} -#endif /* illumos */ - -/* - * Determine if the given path is a hot spare within the given configuration. 
- */ -static boolean_t -is_spare(nvlist_t *config, const char *path) -{ - int fd; - pool_state_t state; - char *name = NULL; - nvlist_t *label; - uint64_t guid, spareguid; - nvlist_t *nvroot; - nvlist_t **spares; - uint_t i, nspares; - boolean_t inuse; - - if ((fd = open(path, O_RDONLY)) < 0) - return (B_FALSE); - - if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || - !inuse || - state != POOL_STATE_SPARE || - zpool_read_label(fd, &label) != 0) { - free(name); - (void) close(fd); - return (B_FALSE); - } - free(name); - (void) close(fd); - - verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); - nvlist_free(label); - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - verify(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &spareguid) == 0); - if (spareguid == guid) - return (B_TRUE); - } - } - - return (B_FALSE); -} - -/* - * Go through and find any devices that are in use. We rely on libdiskmgt for - * the majority of this task. - */ -static boolean_t -is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, - boolean_t replacing, boolean_t isspare) -{ - nvlist_t **child; - uint_t c, children; - char *type, *path; - int ret = 0; - char buf[MAXPATHLEN]; - uint64_t wholedisk; - boolean_t anyinuse = B_FALSE; - - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - - /* - * As a generic check, we look to see if this is a replace of a - * hot spare within the same pool. If so, we allow it - * regardless of what libdiskmgt or zpool_in_use() says. 
- */ - if (replacing) { -#ifdef illumos - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk) == 0 && wholedisk) - (void) snprintf(buf, sizeof (buf), "%ss0", - path); - else -#endif - (void) strlcpy(buf, path, sizeof (buf)); - - if (is_spare(config, buf)) - return (B_FALSE); - } - - if (strcmp(type, VDEV_TYPE_DISK) == 0) - ret = check_device(path, force, isspare); - else if (strcmp(type, VDEV_TYPE_FILE) == 0) - ret = check_file(path, force, isspare); - - return (ret != 0); - } - - for (c = 0; c < children; c++) - if (is_device_in_use(config, child[c], force, replacing, - B_FALSE)) - anyinuse = B_TRUE; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, - &child, &children) == 0) - for (c = 0; c < children; c++) - if (is_device_in_use(config, child[c], force, replacing, - B_TRUE)) - anyinuse = B_TRUE; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, - &child, &children) == 0) - for (c = 0; c < children; c++) - if (is_device_in_use(config, child[c], force, replacing, - B_FALSE)) - anyinuse = B_TRUE; - - return (anyinuse); -} - -static const char * -is_grouping(const char *type, int *mindev, int *maxdev) -{ - if (strncmp(type, "raidz", 5) == 0) { - const char *p = type + 5; - char *end; - long nparity; - - if (*p == '\0') { - nparity = 1; - } else if (*p == '0') { - return (NULL); /* no zero prefixes allowed */ - } else { - errno = 0; - nparity = strtol(p, &end, 10); - if (errno != 0 || nparity < 1 || nparity >= 255 || - *end != '\0') - return (NULL); - } - - if (mindev != NULL) - *mindev = nparity + 1; - if (maxdev != NULL) - *maxdev = 255; - return (VDEV_TYPE_RAIDZ); - } - - if (maxdev != NULL) - *maxdev = INT_MAX; - - if (strcmp(type, "mirror") == 0) { - if (mindev != NULL) - *mindev = 2; - return (VDEV_TYPE_MIRROR); - } - - if (strcmp(type, "spare") == 0) { - if (mindev != NULL) - *mindev = 1; - return (VDEV_TYPE_SPARE); - } - - if (strcmp(type, "log") == 0) { - if (mindev != NULL) - *mindev = 1; - return (VDEV_TYPE_LOG); - } - 
- if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || - strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { - if (mindev != NULL) - *mindev = 1; - return (type); - } - - if (strcmp(type, "cache") == 0) { - if (mindev != NULL) - *mindev = 1; - return (VDEV_TYPE_L2CACHE); - } - - return (NULL); -} - -/* - * Construct a syntactically valid vdev specification, - * and ensure that all devices and files exist and can be opened. - * Note: we don't bother freeing anything in the error paths - * because the program is just going to exit anyway. - */ -nvlist_t * -construct_spec(int argc, char **argv) -{ - nvlist_t *nvroot, *nv, **top, **spares, **l2cache; - int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; - const char *type; - uint64_t is_log, is_special, is_dedup; - boolean_t seen_logs; - - top = NULL; - toplevels = 0; - spares = NULL; - l2cache = NULL; - nspares = 0; - nlogs = 0; - nl2cache = 0; - is_log = is_special = is_dedup = B_FALSE; - seen_logs = B_FALSE; - - while (argc > 0) { - nv = NULL; - - /* - * If it's a mirror or raidz, the subsequent arguments are - * its leaves -- until we encounter the next mirror or raidz. - */ - if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { - nvlist_t **child = NULL; - int c, children = 0; - - if (strcmp(type, VDEV_TYPE_SPARE) == 0) { - if (spares != NULL) { - (void) fprintf(stderr, - gettext("invalid vdev " - "specification: 'spare' can be " - "specified only once\n")); - return (NULL); - } - is_log = is_special = is_dedup = B_FALSE; - } - - if (strcmp(type, VDEV_TYPE_LOG) == 0) { - if (seen_logs) { - (void) fprintf(stderr, - gettext("invalid vdev " - "specification: 'log' can be " - "specified only once\n")); - return (NULL); - } - seen_logs = B_TRUE; - is_log = B_TRUE; - is_special = B_FALSE; - is_dedup = B_FALSE; - argc--; - argv++; - /* - * A log is not a real grouping device. - * We just set is_log and continue. 
- */ - continue; - } - - if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { - is_special = B_TRUE; - is_log = B_FALSE; - is_dedup = B_FALSE; - argc--; - argv++; - continue; - } - - if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { - is_dedup = B_TRUE; - is_log = B_FALSE; - is_special = B_FALSE; - argc--; - argv++; - continue; - } - - if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { - if (l2cache != NULL) { - (void) fprintf(stderr, - gettext("invalid vdev " - "specification: 'cache' can be " - "specified only once\n")); - return (NULL); - } - is_log = is_special = is_dedup = B_FALSE; - } - - if (is_log || is_special || is_dedup) { - if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { - (void) fprintf(stderr, - gettext("invalid vdev " - "specification: unsupported '%s' " - "device: %s\n"), is_log ? "log" : - "special", type); - return (NULL); - } - nlogs++; - } - - for (c = 1; c < argc; c++) { - if (is_grouping(argv[c], NULL, NULL) != NULL) - break; - children++; - child = realloc(child, - children * sizeof (nvlist_t *)); - if (child == NULL) - zpool_no_memory(); - if ((nv = make_leaf_vdev(argv[c], B_FALSE)) - == NULL) - return (NULL); - child[children - 1] = nv; - } - - if (children < mindev) { - (void) fprintf(stderr, gettext("invalid vdev " - "specification: %s requires at least %d " - "devices\n"), argv[0], mindev); - return (NULL); - } - - if (children > maxdev) { - (void) fprintf(stderr, gettext("invalid vdev " - "specification: %s supports no more than " - "%d devices\n"), argv[0], maxdev); - return (NULL); - } - - argc -= c; - argv += c; - - if (strcmp(type, VDEV_TYPE_SPARE) == 0) { - spares = child; - nspares = children; - continue; - } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { - l2cache = child; - nl2cache = children; - continue; - } else { - /* create a top-level vdev with children */ - verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, - 0) == 0); - verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, - type) == 0); - verify(nvlist_add_uint64(nv, - ZPOOL_CONFIG_IS_LOG, is_log) == 
0); - if (is_log) - verify(nvlist_add_string(nv, - ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_LOG) == 0); - if (is_special) { - verify(nvlist_add_string(nv, - ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_SPECIAL) == 0); - } - if (is_dedup) { - verify(nvlist_add_string(nv, - ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_DEDUP) == 0); - } - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { - verify(nvlist_add_uint64(nv, - ZPOOL_CONFIG_NPARITY, - mindev - 1) == 0); - } - verify(nvlist_add_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, child, - children) == 0); - - for (c = 0; c < children; c++) - nvlist_free(child[c]); - free(child); - } - } else { - /* - * We have a device. Pass off to make_leaf_vdev() to - * construct the appropriate nvlist describing the vdev. - */ - if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) - return (NULL); - if (is_log) - nlogs++; - if (is_special) { - verify(nvlist_add_string(nv, - ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_SPECIAL) == 0); - } - if (is_dedup) { - verify(nvlist_add_string(nv, - ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_DEDUP) == 0); - } - argc--; - argv++; - } - - toplevels++; - top = realloc(top, toplevels * sizeof (nvlist_t *)); - if (top == NULL) - zpool_no_memory(); - top[toplevels - 1] = nv; - } - - if (toplevels == 0 && nspares == 0 && nl2cache == 0) { - (void) fprintf(stderr, gettext("invalid vdev " - "specification: at least one toplevel vdev must be " - "specified\n")); - return (NULL); - } - - if (seen_logs && nlogs == 0) { - (void) fprintf(stderr, gettext("invalid vdev specification: " - "log requires at least 1 device\n")); - return (NULL); - } - - /* - * Finally, create nvroot and add all top-level vdevs to it. 
- */ - verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); - verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - top, toplevels) == 0); - if (nspares != 0) - verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - spares, nspares) == 0); - if (nl2cache != 0) - verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - l2cache, nl2cache) == 0); - - for (t = 0; t < toplevels; t++) - nvlist_free(top[t]); - for (t = 0; t < nspares; t++) - nvlist_free(spares[t]); - for (t = 0; t < nl2cache; t++) - nvlist_free(l2cache[t]); - if (spares) - free(spares); - if (l2cache) - free(l2cache); - free(top); - - return (nvroot); -} - -nvlist_t * -split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, - splitflags_t flags, int argc, char **argv) -{ - nvlist_t *newroot = NULL, **child; - uint_t c, children; -#ifdef illumos - zpool_boot_label_t boot_type; -#endif - - if (argc > 0) { - if ((newroot = construct_spec(argc, argv)) == NULL) { - (void) fprintf(stderr, gettext("Unable to build a " - "pool from the specified devices\n")); - return (NULL); - } - -#ifdef illumos - if (zpool_is_bootable(zhp)) - boot_type = ZPOOL_COPY_BOOT_LABEL; - else - boot_type = ZPOOL_NO_BOOT_LABEL; - - if (!flags.dryrun && - make_disks(zhp, newroot, boot_type, 0) != 0) { - nvlist_free(newroot); - return (NULL); - } -#endif - - /* avoid any tricks in the spec */ - verify(nvlist_lookup_nvlist_array(newroot, - ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); - for (c = 0; c < children; c++) { - char *path; - const char *type; - int min, max; - - verify(nvlist_lookup_string(child[c], - ZPOOL_CONFIG_PATH, &path) == 0); - if ((type = is_grouping(path, &min, &max)) != NULL) { - (void) fprintf(stderr, gettext("Cannot use " - "'%s' as a device for splitting\n"), type); - nvlist_free(newroot); - return (NULL); - } - } - } - - if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { - 
nvlist_free(newroot); - return (NULL); - } - - return (newroot); -} - -static int -num_normal_vdevs(nvlist_t *nvroot) -{ - nvlist_t **top; - uint_t t, toplevels, normal = 0; - - verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &top, &toplevels) == 0); - - for (t = 0; t < toplevels; t++) { - uint64_t log = B_FALSE; - - (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); - if (log) - continue; - if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) - continue; - - normal++; - } - - return (normal); -} - -/* - * Get and validate the contents of the given vdev specification. This ensures - * that the nvlist returned is well-formed, that all the devices exist, and that - * they are not currently in use by any other known consumer. The 'poolconfig' - * parameter is the current configuration of the pool when adding devices - * existing pool, and is used to perform additional checks, such as changing the - * replication level of the pool. It can be 'NULL' to indicate that this is a - * new pool. The 'force' flag controls whether devices should be forcefully - * added, even if they appear in use. - */ -nvlist_t * -make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, - boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type, - uint64_t boot_size, int argc, char **argv) -{ - nvlist_t *newroot; - nvlist_t *poolconfig = NULL; - is_force = force; - - /* - * Construct the vdev specification. If this is successful, we know - * that we have a valid specification, and that all devices can be - * opened. - */ - if ((newroot = construct_spec(argc, argv)) == NULL) - return (NULL); - - if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) - return (NULL); - - /* - * Validate each device to make sure that its not shared with another - * subsystem. We do this even if 'force' is set, because there are some - * uses (such as a dedicated dump device) that even '-f' cannot - * override. 
- */ - if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { - nvlist_free(newroot); - return (NULL); - } - - /* - * Check the replication level of the given vdevs and report any errors - * found. We include the existing pool spec, if any, as we need to - * catch changes against the existing replication level. - */ - if (check_rep && check_replication(poolconfig, newroot) != 0) { - nvlist_free(newroot); - return (NULL); - } - -#ifdef illumos - /* - * On pool create the new vdev spec must have one normal vdev. - */ - if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { - vdev_error(gettext("at least one general top-level vdev must " - "be specified\n")); - nvlist_free(newroot); - return (NULL); - } - - /* - * Run through the vdev specification and label any whole disks found. - */ - if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) { - nvlist_free(newroot); - return (NULL); - } -#endif - - return (newroot); -} diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1 b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1 deleted file mode 100644 index 3e3050283313..000000000000 --- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.1 +++ /dev/null @@ -1,76 +0,0 @@ -'\" te -.\" Copyright (c) 2011, Martin Matuska . -.\" All Rights Reserved. -.\" -.\" The contents of this file are subject to the terms of the -.\" Common Development and Distribution License (the "License"). -.\" You may not use this file except in compliance with the License. -.\" -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -.\" or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions -.\" and limitations under the License. -.\" -.\" When distributing Covered Code, include this CDDL HEADER in each -.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-.\" If applicable, add the following below this CDDL HEADER, with the -.\" fields enclosed by brackets "[]" replaced with your own identifying -.\" information: Portions Copyright [yyyy] [name of copyright owner] -.\" -.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved. -.\" Copyright (c) 2013, Delphix. All Rights Reserved. -.\" -.\" $FreeBSD$ -.\" -.Dd February 25, 2020 -.Dt ZSTREAMDUMP 8 -.Os -.Sh NAME -.Nm zstreamdump -.Nd filter data in zfs send stream -.Sh SYNOPSIS -.Nm -.Op Fl C -.Op Fl d -.Op Fl v -.Sh DESCRIPTION -The -.Nm -utility reads from the output of the -.Qq Nm zfs Cm send -command, then displays headers and some statistics from that output. See -.Xr zfs 8 . -.Pp -The following options are supported: -.Bl -tag -width indent -.It Fl C -Suppress the validation of checksums. -.It Fl d -Dump contents of blocks modified, implies verbose. -.It Fl v -Verbose. Dump all headers, not only begin and end headers. -.El -.Sh SEE ALSO -.Xr zfs 8 -.Sh HISTORY -The -.Nm -utility first appeared in -.Fx 7.0 . -.Sh AUTHORS -This manual page is a -.Xr mdoc 7 -reimplementation of the -.Tn OpenSolaris -manual page -.Em zstreamdump(1M) , -modified and customized for -.Fx -and licensed under the -.Tn Common Development and Distribution License -.Pq Tn CDDL . -.Pp -The -.Xr mdoc 7 -implementation of this manual page was initially written by -.An Martin Matuska Aq mm@FreeBSD.org . diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c deleted file mode 100644 index 51c4c8e0e649..000000000000 --- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c +++ /dev/null @@ -1,644 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* - * If dump mode is enabled, the number of bytes to print per line - */ -#define BYTES_PER_LINE 16 -/* - * If dump mode is enabled, the number of bytes to group together, separated - * by newlines or spaces - */ -#define DUMP_GROUPING 4 - -uint64_t total_write_size = 0; -uint64_t total_stream_len = 0; -FILE *send_stream = 0; -boolean_t do_byteswap = B_FALSE; -boolean_t do_cksum = B_TRUE; - -static void -usage(void) -{ - (void) fprintf(stderr, "usage: zstreamdump [-v] [-C] [-d] < file\n"); - (void) fprintf(stderr, "\t -v -- verbose\n"); - (void) fprintf(stderr, "\t -C -- suppress checksum verification\n"); - (void) fprintf(stderr, "\t -d -- dump contents of blocks modified, " - "implies verbose\n"); - exit(1); -} - -static void * -safe_malloc(size_t size) -{ - void *rv = malloc(size); - if (rv == NULL) { - (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n", - size); - abort(); - } - return (rv); -} - -/* - * ssread - send stream read. 
- * - * Read while computing incremental checksum - */ -static size_t -ssread(void *buf, size_t len, zio_cksum_t *cksum) -{ - size_t outlen; - - if ((outlen = fread(buf, len, 1, send_stream)) == 0) - return (0); - - if (do_cksum) { - if (do_byteswap) - fletcher_4_incremental_byteswap(buf, len, cksum); - else - fletcher_4_incremental_native(buf, len, cksum); - } - total_stream_len += len; - return (outlen); -} - -static size_t -read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum) -{ - ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum); - if (r == 0) - return (0); - zio_cksum_t saved_cksum = *cksum; - r = ssread(&drr->drr_u.drr_checksum.drr_checksum, - sizeof (zio_cksum_t), cksum); - if (r == 0) - return (0); - if (!ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) && - !ZIO_CHECKSUM_EQUAL(saved_cksum, - drr->drr_u.drr_checksum.drr_checksum)) { - fprintf(stderr, "invalid checksum\n"); - (void) printf("Incorrect checksum in record header.\n"); - (void) printf("Expected checksum = %llx/%llx/%llx/%llx\n", - saved_cksum.zc_word[0], - saved_cksum.zc_word[1], - saved_cksum.zc_word[2], - saved_cksum.zc_word[3]); - return (0); - } - return (sizeof (*drr)); -} - -/* - * Print part of a block in ASCII characters - */ -static void -print_ascii_block(char *subbuf, int length) -{ - int i; - - for (i = 0; i < length; i++) { - char char_print = isprint(subbuf[i]) ? subbuf[i] : '.'; - if (i != 0 && i % DUMP_GROUPING == 0) { - (void) printf(" "); - } - (void) printf("%c", char_print); - } - (void) printf("\n"); -} - -/* - * print_block - Dump the contents of a modified block to STDOUT - * - * Assume that buf has capacity evenly divisible by BYTES_PER_LINE - */ -static void -print_block(char *buf, int length) -{ - int i; - /* - * Start printing ASCII characters at a constant offset, after - * the hex prints. 
Leave 3 characters per byte on a line (2 digit - * hex number plus 1 space) plus spaces between characters and - * groupings. - */ - int ascii_start = BYTES_PER_LINE * 3 + - BYTES_PER_LINE / DUMP_GROUPING + 2; - - for (i = 0; i < length; i += BYTES_PER_LINE) { - int j; - int this_line_length = MIN(BYTES_PER_LINE, length - i); - int print_offset = 0; - - for (j = 0; j < this_line_length; j++) { - int buf_offset = i + j; - - /* - * Separate every DUMP_GROUPING bytes by a space. - */ - if (buf_offset % DUMP_GROUPING == 0) { - print_offset += printf(" "); - } - - /* - * Print the two-digit hex value for this byte. - */ - unsigned char hex_print = buf[buf_offset]; - print_offset += printf("%02x ", hex_print); - } - - (void) printf("%*s", ascii_start - print_offset, " "); - - print_ascii_block(buf + i, this_line_length); - } -} - -int -main(int argc, char *argv[]) -{ - char *buf = safe_malloc(SPA_MAXBLOCKSIZE); - uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; - uint64_t total_records = 0; - dmu_replay_record_t thedrr; - dmu_replay_record_t *drr = &thedrr; - struct drr_begin *drrb = &thedrr.drr_u.drr_begin; - struct drr_end *drre = &thedrr.drr_u.drr_end; - struct drr_object *drro = &thedrr.drr_u.drr_object; - struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects; - struct drr_write *drrw = &thedrr.drr_u.drr_write; - struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref; - struct drr_free *drrf = &thedrr.drr_u.drr_free; - struct drr_spill *drrs = &thedrr.drr_u.drr_spill; - struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; - struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum; - char c; - boolean_t verbose = B_FALSE; - boolean_t very_verbose = B_FALSE; - boolean_t first = B_TRUE; - /* - * dump flag controls whether the contents of any modified data blocks - * are printed to the console during processing of the stream. Warning: - * for large streams, this can obviously lead to massive prints. 
- */ - boolean_t dump = B_FALSE; - int err; - zio_cksum_t zc = { 0 }; - zio_cksum_t pcksum = { 0 }; - - while ((c = getopt(argc, argv, ":vCd")) != -1) { - switch (c) { - case 'C': - do_cksum = B_FALSE; - break; - case 'v': - if (verbose) - very_verbose = B_TRUE; - verbose = B_TRUE; - break; - case 'd': - dump = B_TRUE; - verbose = B_TRUE; - very_verbose = B_TRUE; - break; - case ':': - (void) fprintf(stderr, - "missing argument for '%c' option\n", optopt); - usage(); - break; - case '?': - (void) fprintf(stderr, "invalid option '%c'\n", - optopt); - usage(); - break; - } - } - - if (isatty(STDIN_FILENO)) { - (void) fprintf(stderr, - "Error: Backup stream can not be read " - "from a terminal.\n" - "You must redirect standard input.\n"); - exit(1); - } - - send_stream = stdin; - pcksum = zc; - while (read_hdr(drr, &zc)) { - - /* - * If this is the first DMU record being processed, check for - * the magic bytes and figure out the endian-ness based on them. - */ - if (first) { - if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { - do_byteswap = B_TRUE; - if (do_cksum) { - ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); - /* - * recalculate header checksum now - * that we know it needs to be - * byteswapped. - */ - fletcher_4_incremental_byteswap(drr, - sizeof (dmu_replay_record_t), &zc); - } - } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) { - (void) fprintf(stderr, "Invalid stream " - "(bad magic number)\n"); - exit(1); - } - first = B_FALSE; - } - if (do_byteswap) { - drr->drr_type = BSWAP_32(drr->drr_type); - drr->drr_payloadlen = - BSWAP_32(drr->drr_payloadlen); - } - - /* - * At this point, the leading fields of the replay record - * (drr_type and drr_payloadlen) have been byte-swapped if - * necessary, but the rest of the data structure (the - * union of type-specific structures) is still in its - * original state. 
- */ - if (drr->drr_type >= DRR_NUMTYPES) { - (void) printf("INVALID record found: type 0x%x\n", - drr->drr_type); - (void) printf("Aborting.\n"); - exit(1); - } - - drr_record_count[drr->drr_type]++; - total_records++; - - switch (drr->drr_type) { - case DRR_BEGIN: - if (do_byteswap) { - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = - BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = - BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_flags = BSWAP_32(drrb->drr_flags); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = - BSWAP_64(drrb->drr_fromguid); - } - - (void) printf("BEGIN record\n"); - (void) printf("\thdrtype = %lld\n", - DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo)); - (void) printf("\tfeatures = %llx\n", - DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo)); - (void) printf("\tmagic = %llx\n", - (u_longlong_t)drrb->drr_magic); - (void) printf("\tcreation_time = %llx\n", - (u_longlong_t)drrb->drr_creation_time); - (void) printf("\ttype = %u\n", drrb->drr_type); - (void) printf("\tflags = 0x%x\n", drrb->drr_flags); - (void) printf("\ttoguid = %llx\n", - (u_longlong_t)drrb->drr_toguid); - (void) printf("\tfromguid = %llx\n", - (u_longlong_t)drrb->drr_fromguid); - (void) printf("\ttoname = %s\n", drrb->drr_toname); - if (verbose) - (void) printf("\n"); - - if (drr->drr_payloadlen != 0) { - nvlist_t *nv; - int sz = drr->drr_payloadlen; - - if (sz > SPA_MAXBLOCKSIZE) { - free(buf); - buf = safe_malloc(sz); - } - (void) ssread(buf, sz, &zc); - if (ferror(send_stream)) - perror("fread"); - err = nvlist_unpack(buf, sz, &nv, 0); - if (err) - perror(strerror(err)); - nvlist_print(stdout, nv); - nvlist_free(nv); - } - break; - - case DRR_END: - if (do_byteswap) { - drre->drr_checksum.zc_word[0] = - BSWAP_64(drre->drr_checksum.zc_word[0]); - drre->drr_checksum.zc_word[1] = - BSWAP_64(drre->drr_checksum.zc_word[1]); - drre->drr_checksum.zc_word[2] = - 
BSWAP_64(drre->drr_checksum.zc_word[2]); - drre->drr_checksum.zc_word[3] = - BSWAP_64(drre->drr_checksum.zc_word[3]); - } - /* - * We compare against the *previous* checksum - * value, because the stored checksum is of - * everything before the DRR_END record. - */ - if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum, - pcksum)) { - (void) printf("Expected checksum differs from " - "checksum in stream.\n"); - (void) printf("Expected checksum = " - "%llx/%llx/%llx/%llx\n", - pcksum.zc_word[0], - pcksum.zc_word[1], - pcksum.zc_word[2], - pcksum.zc_word[3]); - } - (void) printf("END checksum = %llx/%llx/%llx/%llx\n", - drre->drr_checksum.zc_word[0], - drre->drr_checksum.zc_word[1], - drre->drr_checksum.zc_word[2], - drre->drr_checksum.zc_word[3]); - - ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); - break; - - case DRR_OBJECT: - if (do_byteswap) { - drro->drr_object = BSWAP_64(drro->drr_object); - drro->drr_type = BSWAP_32(drro->drr_type); - drro->drr_bonustype = - BSWAP_32(drro->drr_bonustype); - drro->drr_blksz = BSWAP_32(drro->drr_blksz); - drro->drr_bonuslen = - BSWAP_32(drro->drr_bonuslen); - drro->drr_toguid = BSWAP_64(drro->drr_toguid); - } - if (verbose) { - (void) printf("OBJECT object = %" PRIu64 - " type = %u bonustype = %u blksz = %u" - " bonuslen = %u dn_slots = %u\n", - drro->drr_object, - drro->drr_type, - drro->drr_bonustype, - drro->drr_blksz, - drro->drr_bonuslen, - drro->drr_dn_slots); - } - if (drro->drr_bonuslen > 0) { - (void) ssread(buf, - P2ROUNDUP(drro->drr_bonuslen, 8), &zc); - if (dump) { - print_block(buf, - P2ROUNDUP(drro->drr_bonuslen, 8)); - } - } - break; - - case DRR_FREEOBJECTS: - if (do_byteswap) { - drrfo->drr_firstobj = - BSWAP_64(drrfo->drr_firstobj); - drrfo->drr_numobjs = - BSWAP_64(drrfo->drr_numobjs); - drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid); - } - if (verbose) { - (void) printf("FREEOBJECTS firstobj = %llu " - "numobjs = %llu\n", - (u_longlong_t)drrfo->drr_firstobj, - (u_longlong_t)drrfo->drr_numobjs); - } - break; - - case 
DRR_WRITE: - if (do_byteswap) { - drrw->drr_object = BSWAP_64(drrw->drr_object); - drrw->drr_type = BSWAP_32(drrw->drr_type); - drrw->drr_offset = BSWAP_64(drrw->drr_offset); - drrw->drr_logical_size = - BSWAP_64(drrw->drr_logical_size); - drrw->drr_toguid = BSWAP_64(drrw->drr_toguid); - drrw->drr_key.ddk_prop = - BSWAP_64(drrw->drr_key.ddk_prop); - drrw->drr_compressed_size = - BSWAP_64(drrw->drr_compressed_size); - } - - uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); - - /* - * If this is verbose and/or dump output, - * print info on the modified block - */ - if (verbose) { - (void) printf("WRITE object = %llu type = %u " - "checksum type = %u compression type = %u\n" - " offset = %llu logical_size = %llu " - "compressed_size = %llu " - "payload_size = %llu " - "props = %llx\n", - (u_longlong_t)drrw->drr_object, - drrw->drr_type, - drrw->drr_checksumtype, - drrw->drr_compressiontype, - (u_longlong_t)drrw->drr_offset, - (u_longlong_t)drrw->drr_logical_size, - (u_longlong_t)drrw->drr_compressed_size, - (u_longlong_t)payload_size, - (u_longlong_t)drrw->drr_key.ddk_prop); - } - - /* - * Read the contents of the block in from STDIN to buf - */ - (void) ssread(buf, payload_size, &zc); - /* - * If in dump mode - */ - if (dump) { - print_block(buf, payload_size); - } - total_write_size += payload_size; - break; - - case DRR_WRITE_BYREF: - if (do_byteswap) { - drrwbr->drr_object = - BSWAP_64(drrwbr->drr_object); - drrwbr->drr_offset = - BSWAP_64(drrwbr->drr_offset); - drrwbr->drr_length = - BSWAP_64(drrwbr->drr_length); - drrwbr->drr_toguid = - BSWAP_64(drrwbr->drr_toguid); - drrwbr->drr_refguid = - BSWAP_64(drrwbr->drr_refguid); - drrwbr->drr_refobject = - BSWAP_64(drrwbr->drr_refobject); - drrwbr->drr_refoffset = - BSWAP_64(drrwbr->drr_refoffset); - drrwbr->drr_key.ddk_prop = - BSWAP_64(drrwbr->drr_key.ddk_prop); - } - if (verbose) { - (void) printf("WRITE_BYREF object = %llu " - "checksum type = %u props = %llx\n" - " offset = %llu length = %llu\n" - "toguid = 
%llx refguid = %llx\n" - " refobject = %llu refoffset = %llu\n", - (u_longlong_t)drrwbr->drr_object, - drrwbr->drr_checksumtype, - (u_longlong_t)drrwbr->drr_key.ddk_prop, - (u_longlong_t)drrwbr->drr_offset, - (u_longlong_t)drrwbr->drr_length, - (u_longlong_t)drrwbr->drr_toguid, - (u_longlong_t)drrwbr->drr_refguid, - (u_longlong_t)drrwbr->drr_refobject, - (u_longlong_t)drrwbr->drr_refoffset); - } - break; - - case DRR_FREE: - if (do_byteswap) { - drrf->drr_object = BSWAP_64(drrf->drr_object); - drrf->drr_offset = BSWAP_64(drrf->drr_offset); - drrf->drr_length = BSWAP_64(drrf->drr_length); - } - if (verbose) { - (void) printf("FREE object = %llu " - "offset = %llu length = %lld\n", - (u_longlong_t)drrf->drr_object, - (u_longlong_t)drrf->drr_offset, - (longlong_t)drrf->drr_length); - } - break; - case DRR_SPILL: - if (do_byteswap) { - drrs->drr_object = BSWAP_64(drrs->drr_object); - drrs->drr_length = BSWAP_64(drrs->drr_length); - } - if (verbose) { - (void) printf("SPILL block for object = %llu " - "length = %llu\n", drrs->drr_object, - drrs->drr_length); - } - (void) ssread(buf, drrs->drr_length, &zc); - if (dump) { - print_block(buf, drrs->drr_length); - } - break; - case DRR_WRITE_EMBEDDED: - if (do_byteswap) { - drrwe->drr_object = - BSWAP_64(drrwe->drr_object); - drrwe->drr_offset = - BSWAP_64(drrwe->drr_offset); - drrwe->drr_length = - BSWAP_64(drrwe->drr_length); - drrwe->drr_toguid = - BSWAP_64(drrwe->drr_toguid); - drrwe->drr_lsize = - BSWAP_32(drrwe->drr_lsize); - drrwe->drr_psize = - BSWAP_32(drrwe->drr_psize); - } - if (verbose) { - (void) printf("WRITE_EMBEDDED object = %llu " - "offset = %llu length = %llu\n" - " toguid = %llx comp = %u etype = %u " - "lsize = %u psize = %u\n", - (u_longlong_t)drrwe->drr_object, - (u_longlong_t)drrwe->drr_offset, - (u_longlong_t)drrwe->drr_length, - (u_longlong_t)drrwe->drr_toguid, - drrwe->drr_compression, - drrwe->drr_etype, - drrwe->drr_lsize, - drrwe->drr_psize); - } - (void) ssread(buf, - 
P2ROUNDUP(drrwe->drr_psize, 8), &zc); - break; - } - if (drr->drr_type != DRR_BEGIN && very_verbose) { - (void) printf(" checksum = %llx/%llx/%llx/%llx\n", - (longlong_t)drrc->drr_checksum.zc_word[0], - (longlong_t)drrc->drr_checksum.zc_word[1], - (longlong_t)drrc->drr_checksum.zc_word[2], - (longlong_t)drrc->drr_checksum.zc_word[3]); - } - pcksum = zc; - } - free(buf); - - /* Print final summary */ - - (void) printf("SUMMARY:\n"); - (void) printf("\tTotal DRR_BEGIN records = %lld\n", - (u_longlong_t)drr_record_count[DRR_BEGIN]); - (void) printf("\tTotal DRR_END records = %lld\n", - (u_longlong_t)drr_record_count[DRR_END]); - (void) printf("\tTotal DRR_OBJECT records = %lld\n", - (u_longlong_t)drr_record_count[DRR_OBJECT]); - (void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n", - (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]); - (void) printf("\tTotal DRR_WRITE records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE]); - (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]); - (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n", - (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]); - (void) printf("\tTotal DRR_FREE records = %lld\n", - (u_longlong_t)drr_record_count[DRR_FREE]); - (void) printf("\tTotal DRR_SPILL records = %lld\n", - (u_longlong_t)drr_record_count[DRR_SPILL]); - (void) printf("\tTotal records = %lld\n", - (u_longlong_t)total_records); - (void) printf("\tTotal write size = %lld (0x%llx)\n", - (u_longlong_t)total_write_size, (u_longlong_t)total_write_size); - (void) printf("\tTotal stream length = %lld (0x%llx)\n", - (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len); - return (0); -} diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c deleted file mode 100644 index 65a4858b95d9..000000000000 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ /dev/null @@ -1,7135 +0,0 @@ -/* - * CDDL HEADER START - * - * The 
contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012 Martin Matuska . All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017, Intel Corporation. - * Copyright 2017 RackTop Systems. - */ - -/* - * The objective of this program is to provide a DMU/ZAP/SPA stress test - * that runs entirely in userland, is easy to use, and easy to extend. - * - * The overall design of the ztest program is as follows: - * - * (1) For each major functional area (e.g. adding vdevs to a pool, - * creating and destroying datasets, reading and writing objects, etc) - * we have a simple routine to test that functionality. These - * individual routines do not have to do anything "stressful". - * - * (2) We turn these simple functionality tests into a stress test by - * running them all in parallel, with as many threads as desired, - * and spread across as many datasets, objects, and vdevs as desired. 
- * - * (3) While all this is happening, we inject faults into the pool to - * verify that self-healing data really works. - * - * (4) Every time we open a dataset, we change its checksum and compression - * functions. Thus even individual objects vary from block to block - * in which checksum they use and whether they're compressed. - * - * (5) To verify that we never lose on-disk consistency after a crash, - * we run the entire test in a child of the main process. - * At random times, the child self-immolates with a SIGKILL. - * This is the software equivalent of pulling the power cord. - * The parent then runs the test again, using the existing - * storage pool, as many times as desired. If backwards compatibility - * testing is enabled ztest will sometimes run the "older" version - * of ztest after a SIGKILL. - * - * (6) To verify that we don't have future leaks or temporal incursions, - * many of the functional tests record the transaction group number - * as part of their data. When reading old data, they verify that - * the transaction group number is less than the current, open txg. - * If you add a new test, please do this if applicable. - * - * When run with no arguments, ztest runs for about five minutes and - * produces no output if successful. To get a little bit of information, - * specify -V. To get more information, specify -VV, and so on. - * - * To turn this into an overnight stress test, use -T to specify run time. - * - * You can ask more more vdevs [-v], datasets [-d], or threads [-t] - * to increase the pool capacity, fanout, and overall stress level. - * - * Use the -k option to set the desired frequency of kills. - * - * When ztest invokes itself it passes all relevant information through a - * temporary file which is mmap-ed in the child process. This allows shared - * memory to survive the exec syscall. 
The ztest_shared_hdr_t struct is always - * stored at offset 0 of this file and contains information on the size and - * number of shared structures in the file. The information stored in this file - * must remain backwards compatible with older versions of ztest so that - * ztest can invoke them during backwards compatibility testing (-B). - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int ztest_fd_data = -1; -static int ztest_fd_rand = -1; - -typedef struct ztest_shared_hdr { - uint64_t zh_hdr_size; - uint64_t zh_opts_size; - uint64_t zh_size; - uint64_t zh_stats_size; - uint64_t zh_stats_count; - uint64_t zh_ds_size; - uint64_t zh_ds_count; -} ztest_shared_hdr_t; - -static ztest_shared_hdr_t *ztest_shared_hdr; - -enum ztest_class_state { - ZTEST_VDEV_CLASS_OFF, - ZTEST_VDEV_CLASS_ON, - ZTEST_VDEV_CLASS_RND -}; - -typedef struct ztest_shared_opts { - char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; - char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; - char zo_alt_ztest[MAXNAMELEN]; - char zo_alt_libpath[MAXNAMELEN]; - uint64_t zo_vdevs; - uint64_t zo_vdevtime; - size_t zo_vdev_size; - int zo_ashift; - int zo_mirrors; - int zo_raidz; - int zo_raidz_parity; - int zo_datasets; - int zo_threads; - uint64_t zo_passtime; - uint64_t zo_killrate; - int zo_verbose; - int zo_init; - uint64_t zo_time; - uint64_t zo_maxloops; - uint64_t zo_metaslab_force_ganging; - int zo_mmp_test; - int zo_special_vdevs; -} ztest_shared_opts_t; - -static const ztest_shared_opts_t ztest_opts_defaults = { - .zo_pool = { 'z', 't', 'e', 's', 't', '\0' }, - .zo_dir = { '/', 't', 'm', 'p', '\0' }, - .zo_alt_ztest 
= { '\0' }, - .zo_alt_libpath = { '\0' }, - .zo_vdevs = 5, - .zo_ashift = SPA_MINBLOCKSHIFT, - .zo_mirrors = 2, - .zo_raidz = 4, - .zo_raidz_parity = 1, - .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ - .zo_datasets = 7, - .zo_threads = 23, - .zo_passtime = 60, /* 60 seconds */ - .zo_killrate = 70, /* 70% kill rate */ - .zo_verbose = 0, - .zo_mmp_test = 0, - .zo_init = 1, - .zo_time = 300, /* 5 minutes */ - .zo_maxloops = 50, /* max loops during spa_freeze() */ - .zo_metaslab_force_ganging = 32 << 10, - .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, -}; - -extern uint64_t metaslab_force_ganging; -extern uint64_t metaslab_df_alloc_threshold; -extern uint64_t zfs_deadman_synctime_ms; -extern int metaslab_preload_limit; -extern boolean_t zfs_compressed_arc_enabled; -extern boolean_t zfs_abd_scatter_enabled; -extern int dmu_object_alloc_chunk_shift; -extern boolean_t zfs_force_some_double_word_sm_entries; -extern unsigned long zfs_reconstruct_indirect_damage_fraction; - -static ztest_shared_opts_t *ztest_shared_opts; -static ztest_shared_opts_t ztest_opts; - -typedef struct ztest_shared_ds { - uint64_t zd_seq; -} ztest_shared_ds_t; - -static ztest_shared_ds_t *ztest_shared_ds; -#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) - -#define BT_MAGIC 0x123456789abcdefULL -#define MAXFAULTS() \ - (MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) - -enum ztest_io_type { - ZTEST_IO_WRITE_TAG, - ZTEST_IO_WRITE_PATTERN, - ZTEST_IO_WRITE_ZEROES, - ZTEST_IO_TRUNCATE, - ZTEST_IO_SETATTR, - ZTEST_IO_REWRITE, - ZTEST_IO_TYPES -}; - -typedef struct ztest_block_tag { - uint64_t bt_magic; - uint64_t bt_objset; - uint64_t bt_object; - uint64_t bt_dnodesize; - uint64_t bt_offset; - uint64_t bt_gen; - uint64_t bt_txg; - uint64_t bt_crtxg; -} ztest_block_tag_t; - -typedef struct bufwad { - uint64_t bw_index; - uint64_t bw_txg; - uint64_t bw_data; -} bufwad_t; - -/* - * It would be better to use a rangelock_t per object. 
Unfortunately - * the rangelock_t is not a drop-in replacement for rl_t, because we - * still need to map from object ID to rangelock_t. - */ -typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND -} rl_type_t; - -typedef struct rll { - void *rll_writer; - int rll_readers; - kmutex_t rll_lock; - kcondvar_t rll_cv; -} rll_t; - -typedef struct rl { - uint64_t rl_object; - uint64_t rl_offset; - uint64_t rl_size; - rll_t *rl_lock; -} rl_t; - -#define ZTEST_RANGE_LOCKS 64 -#define ZTEST_OBJECT_LOCKS 64 - -/* - * Object descriptor. Used as a template for object lookup/create/remove. - */ -typedef struct ztest_od { - uint64_t od_dir; - uint64_t od_object; - dmu_object_type_t od_type; - dmu_object_type_t od_crtype; - uint64_t od_blocksize; - uint64_t od_crblocksize; - uint64_t od_crdnodesize; - uint64_t od_gen; - uint64_t od_crgen; - char od_name[ZFS_MAX_DATASET_NAME_LEN]; -} ztest_od_t; - -/* - * Per-dataset state. - */ -typedef struct ztest_ds { - ztest_shared_ds_t *zd_shared; - objset_t *zd_os; - krwlock_t zd_zilog_lock; - zilog_t *zd_zilog; - ztest_od_t *zd_od; /* debugging aid */ - char zd_name[ZFS_MAX_DATASET_NAME_LEN]; - kmutex_t zd_dirobj_lock; - rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; - rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; -} ztest_ds_t; - -/* - * Per-iteration state. - */ -typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); - -typedef struct ztest_info { - ztest_func_t *zi_func; /* test function */ - uint64_t zi_iters; /* iterations per execution */ - uint64_t *zi_interval; /* execute every seconds */ -} ztest_info_t; - -typedef struct ztest_shared_callstate { - uint64_t zc_count; /* per-pass count */ - uint64_t zc_time; /* per-pass time */ - uint64_t zc_next; /* next time to call this function */ -} ztest_shared_callstate_t; - -static ztest_shared_callstate_t *ztest_shared_callstate; -#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) - -/* - * Note: these aren't static because we want dladdr() to work. 
- */ -ztest_func_t ztest_dmu_read_write; -ztest_func_t ztest_dmu_write_parallel; -ztest_func_t ztest_dmu_object_alloc_free; -ztest_func_t ztest_dmu_object_next_chunk; -ztest_func_t ztest_dmu_commit_callbacks; -ztest_func_t ztest_zap; -ztest_func_t ztest_zap_parallel; -ztest_func_t ztest_zil_commit; -ztest_func_t ztest_zil_remount; -ztest_func_t ztest_dmu_read_write_zcopy; -ztest_func_t ztest_dmu_objset_create_destroy; -ztest_func_t ztest_dmu_prealloc; -ztest_func_t ztest_fzap; -ztest_func_t ztest_dmu_snapshot_create_destroy; -ztest_func_t ztest_dsl_prop_get_set; -ztest_func_t ztest_spa_prop_get_set; -ztest_func_t ztest_spa_create_destroy; -ztest_func_t ztest_fault_inject; -ztest_func_t ztest_ddt_repair; -ztest_func_t ztest_dmu_snapshot_hold; -ztest_func_t ztest_mmp_enable_disable; -ztest_func_t ztest_scrub; -ztest_func_t ztest_dsl_dataset_promote_busy; -ztest_func_t ztest_vdev_attach_detach; -ztest_func_t ztest_vdev_LUN_growth; -ztest_func_t ztest_vdev_add_remove; -ztest_func_t ztest_vdev_class_add; -ztest_func_t ztest_vdev_aux_add_remove; -ztest_func_t ztest_split_pool; -ztest_func_t ztest_reguid; -ztest_func_t ztest_spa_upgrade; -ztest_func_t ztest_device_removal; -ztest_func_t ztest_remap_blocks; -ztest_func_t ztest_spa_checkpoint_create_discard; -ztest_func_t ztest_initialize; -ztest_func_t ztest_verify_dnode_bt; - -uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ -uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ -uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ -uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ -uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ - -ztest_info_t ztest_info[] = { - { ztest_dmu_read_write, 1, &zopt_always }, - { ztest_dmu_write_parallel, 10, &zopt_always }, - { ztest_dmu_object_alloc_free, 1, &zopt_always }, - { ztest_dmu_object_next_chunk, 1, &zopt_sometimes }, - { ztest_dmu_commit_callbacks, 1, &zopt_always }, - { ztest_zap, 30, &zopt_always }, - { 
ztest_zap_parallel, 100, &zopt_always }, - { ztest_split_pool, 1, &zopt_always }, - { ztest_zil_commit, 1, &zopt_incessant }, - { ztest_zil_remount, 1, &zopt_sometimes }, - { ztest_dmu_read_write_zcopy, 1, &zopt_often }, - { ztest_dmu_objset_create_destroy, 1, &zopt_often }, - { ztest_dsl_prop_get_set, 1, &zopt_often }, - { ztest_spa_prop_get_set, 1, &zopt_sometimes }, -#if 0 - { ztest_dmu_prealloc, 1, &zopt_sometimes }, -#endif - { ztest_fzap, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, - { ztest_spa_create_destroy, 1, &zopt_sometimes }, - { ztest_fault_inject, 1, &zopt_incessant }, - { ztest_ddt_repair, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, - { ztest_mmp_enable_disable, 1, &zopt_sometimes }, - { ztest_reguid, 1, &zopt_rarely }, - { ztest_scrub, 1, &zopt_often }, - { ztest_spa_upgrade, 1, &zopt_rarely }, - { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_incessant }, - { ztest_vdev_LUN_growth, 1, &zopt_rarely }, - { ztest_vdev_add_remove, 1, - &ztest_opts.zo_vdevtime }, - { ztest_vdev_class_add, 1, - &ztest_opts.zo_vdevtime }, - { ztest_vdev_aux_add_remove, 1, - &ztest_opts.zo_vdevtime }, - { ztest_device_removal, 1, &zopt_sometimes }, - { ztest_remap_blocks, 1, &zopt_sometimes }, - { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }, - { ztest_initialize, 1, &zopt_sometimes }, - { ztest_verify_dnode_bt, 1, &zopt_sometimes } -}; - -#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) - -/* - * The following struct is used to hold a list of uncalled commit callbacks. - * The callbacks are ordered by txg number. - */ -typedef struct ztest_cb_list { - kmutex_t zcl_callbacks_lock; - list_t zcl_callbacks; -} ztest_cb_list_t; - -/* - * Stuff we need to share writably between parent and child. 
- */ -typedef struct ztest_shared { - boolean_t zs_do_init; - hrtime_t zs_proc_start; - hrtime_t zs_proc_stop; - hrtime_t zs_thread_start; - hrtime_t zs_thread_stop; - hrtime_t zs_thread_kill; - uint64_t zs_enospc_count; - uint64_t zs_vdev_next_leaf; - uint64_t zs_vdev_aux; - uint64_t zs_alloc; - uint64_t zs_space; - uint64_t zs_splits; - uint64_t zs_mirrors; - uint64_t zs_metaslab_sz; - uint64_t zs_metaslab_df_alloc_threshold; - uint64_t zs_guid; -} ztest_shared_t; - -#define ID_PARALLEL -1ULL - -static char ztest_dev_template[] = "%s/%s.%llua"; -static char ztest_aux_template[] = "%s/%s.%s.%llu"; -ztest_shared_t *ztest_shared; - -static spa_t *ztest_spa = NULL; -static ztest_ds_t *ztest_ds; - -static kmutex_t ztest_vdev_lock; -static boolean_t ztest_device_removal_active = B_FALSE; -static kmutex_t ztest_checkpoint_lock; - -/* - * The ztest_name_lock protects the pool and dataset namespace used by - * the individual tests. To modify the namespace, consumers must grab - * this lock as writer. Grabbing the lock as reader will ensure that the - * namespace does not change while the lock is held. - */ -static krwlock_t ztest_name_lock; - -static boolean_t ztest_dump_core = B_TRUE; -static boolean_t ztest_exiting; - -/* Global commit callback list */ -static ztest_cb_list_t zcl; - -enum ztest_object { - ZTEST_META_DNODE = 0, - ZTEST_DIROBJ, - ZTEST_OBJECTS -}; - -static void usage(boolean_t) __NORETURN; - -/* - * These libumem hooks provide a reasonable set of defaults for the allocator's - * debugging facilities. - */ -const char * -_umem_debug_init() -{ - return ("default,verbose"); /* $UMEM_DEBUG setting */ -} - -const char * -_umem_logging_init(void) -{ - return ("fail,contents"); /* $UMEM_LOGGING setting */ -} - -#define FATAL_MSG_SZ 1024 - -char *fatal_msg; - -static void -fatal(int do_perror, char *message, ...) 
-{ - va_list args; - int save_errno = errno; - char buf[FATAL_MSG_SZ]; - - (void) fflush(stdout); - - va_start(args, message); - (void) sprintf(buf, "ztest: "); - /* LINTED */ - (void) vsprintf(buf + strlen(buf), message, args); - va_end(args); - if (do_perror) { - (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), - ": %s", strerror(save_errno)); - } - (void) fprintf(stderr, "%s\n", buf); - fatal_msg = buf; /* to ease debugging */ - if (ztest_dump_core) - abort(); - exit(3); -} - -static int -str2shift(const char *buf) -{ - const char *ends = "BKMGTPEZ"; - int i; - - if (buf[0] == '\0') - return (0); - for (i = 0; i < strlen(ends); i++) { - if (toupper(buf[0]) == ends[i]) - break; - } - if (i == strlen(ends)) { - (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", - buf); - usage(B_FALSE); - } - if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { - return (10*i); - } - (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); - usage(B_FALSE); - /* NOTREACHED */ -} - -static uint64_t -nicenumtoull(const char *buf) -{ - char *end; - uint64_t val; - - val = strtoull(buf, &end, 0); - if (end == buf) { - (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); - usage(B_FALSE); - } else if (end[0] == '.') { - double fval = strtod(buf, &end); - fval *= pow(2, str2shift(end)); - if (fval > UINT64_MAX) { - (void) fprintf(stderr, "ztest: value too large: %s\n", - buf); - usage(B_FALSE); - } - val = (uint64_t)fval; - } else { - int shift = str2shift(end); - if (shift >= 64 || (val << shift) >> shift != val) { - (void) fprintf(stderr, "ztest: value too large: %s\n", - buf); - usage(B_FALSE); - } - val <<= shift; - } - return (val); -} - -static void -usage(boolean_t requested) -{ - const ztest_shared_opts_t *zo = &ztest_opts_defaults; - - char nice_vdev_size[NN_NUMBUF_SZ]; - char nice_force_ganging[NN_NUMBUF_SZ]; - FILE *fp = requested ? 
stdout : stderr; - - nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); - nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, - sizeof (nice_force_ganging)); - - (void) fprintf(fp, "Usage: %s\n" - "\t[-v vdevs (default: %llu)]\n" - "\t[-s size_of_each_vdev (default: %s)]\n" - "\t[-a alignment_shift (default: %d)] use 0 for random\n" - "\t[-m mirror_copies (default: %d)]\n" - "\t[-r raidz_disks (default: %d)]\n" - "\t[-R raidz_parity (default: %d)]\n" - "\t[-d datasets (default: %d)]\n" - "\t[-t threads (default: %d)]\n" - "\t[-g gang_block_threshold (default: %s)]\n" - "\t[-i init_count (default: %d)] initialize pool i times\n" - "\t[-k kill_percentage (default: %llu%%)]\n" - "\t[-p pool_name (default: %s)]\n" - "\t[-f dir (default: %s)] file directory for vdev files\n" - "\t[-M] Multi-host simulate pool imported on remote host\n" - "\t[-V] verbose (use multiple times for ever more blather)\n" - "\t[-E] use existing pool instead of creating new one\n" - "\t[-T time (default: %llu sec)] total run time\n" - "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" - "\t[-P passtime (default: %llu sec)] time per pass\n" - "\t[-B alt_ztest (default: )] alternate ztest path\n" - "\t[-C vdev class state (default: random)] special=on|off|random\n" - "\t[-o variable=value] ... set global variable to an unsigned\n" - "\t 32-bit integer value\n" - "\t[-h] (print help)\n" - "", - zo->zo_pool, - (u_longlong_t)zo->zo_vdevs, /* -v */ - nice_vdev_size, /* -s */ - zo->zo_ashift, /* -a */ - zo->zo_mirrors, /* -m */ - zo->zo_raidz, /* -r */ - zo->zo_raidz_parity, /* -R */ - zo->zo_datasets, /* -d */ - zo->zo_threads, /* -t */ - nice_force_ganging, /* -g */ - zo->zo_init, /* -i */ - (u_longlong_t)zo->zo_killrate, /* -k */ - zo->zo_pool, /* -p */ - zo->zo_dir, /* -f */ - (u_longlong_t)zo->zo_time, /* -T */ - (u_longlong_t)zo->zo_maxloops, /* -F */ - (u_longlong_t)zo->zo_passtime); - exit(requested ? 
0 : 1); -} - - -static void -ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) -{ - char name[32]; - char *value; - int state = ZTEST_VDEV_CLASS_RND; - - (void) strlcpy(name, input, sizeof (name)); - - value = strchr(name, '='); - if (value == NULL) { - (void) fprintf(stderr, "missing value in property=value " - "'-C' argument (%s)\n", input); - usage(B_FALSE); - } - *(value) = '\0'; - value++; - - if (strcmp(value, "on") == 0) { - state = ZTEST_VDEV_CLASS_ON; - } else if (strcmp(value, "off") == 0) { - state = ZTEST_VDEV_CLASS_OFF; - } else if (strcmp(value, "random") == 0) { - state = ZTEST_VDEV_CLASS_RND; - } else { - (void) fprintf(stderr, "invalid property value '%s'\n", value); - usage(B_FALSE); - } - - if (strcmp(name, "special") == 0) { - zo->zo_special_vdevs = state; - } else { - (void) fprintf(stderr, "invalid property name '%s'\n", name); - usage(B_FALSE); - } - if (zo->zo_verbose >= 3) - (void) printf("%s vdev state is '%s'\n", name, value); -} - -static void -process_options(int argc, char **argv) -{ - char *path; - ztest_shared_opts_t *zo = &ztest_opts; - - int opt; - uint64_t value; - char altdir[MAXNAMELEN] = { 0 }; - - bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); - - while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:")) != EOF) { - value = 0; - switch (opt) { - case 'v': - case 's': - case 'a': - case 'm': - case 'r': - case 'R': - case 'd': - case 't': - case 'g': - case 'i': - case 'k': - case 'T': - case 'P': - case 'F': - value = nicenumtoull(optarg); - } - switch (opt) { - case 'v': - zo->zo_vdevs = value; - break; - case 's': - zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); - break; - case 'a': - zo->zo_ashift = value; - break; - case 'm': - zo->zo_mirrors = value; - break; - case 'r': - zo->zo_raidz = MAX(1, value); - break; - case 'R': - zo->zo_raidz_parity = MIN(MAX(value, 1), 3); - break; - case 'd': - zo->zo_datasets = MAX(1, value); - break; - case 't': - zo->zo_threads = MAX(1, value); - 
break; - case 'g': - zo->zo_metaslab_force_ganging = - MAX(SPA_MINBLOCKSIZE << 1, value); - break; - case 'i': - zo->zo_init = value; - break; - case 'k': - zo->zo_killrate = value; - break; - case 'p': - (void) strlcpy(zo->zo_pool, optarg, - sizeof (zo->zo_pool)); - break; - case 'f': - path = realpath(optarg, NULL); - if (path == NULL) { - (void) fprintf(stderr, "error: %s: %s\n", - optarg, strerror(errno)); - usage(B_FALSE); - } else { - (void) strlcpy(zo->zo_dir, path, - sizeof (zo->zo_dir)); - } - break; - case 'M': - zo->zo_mmp_test = 1; - break; - case 'V': - zo->zo_verbose++; - break; - case 'E': - zo->zo_init = 0; - break; - case 'T': - zo->zo_time = value; - break; - case 'P': - zo->zo_passtime = MAX(1, value); - break; - case 'F': - zo->zo_maxloops = MAX(1, value); - break; - case 'B': - (void) strlcpy(altdir, optarg, sizeof (altdir)); - break; - case 'C': - ztest_parse_name_value(optarg, zo); - break; - case 'o': - if (set_global_var(optarg) != 0) - usage(B_FALSE); - break; - case 'h': - usage(B_TRUE); - break; - case '?': - default: - usage(B_FALSE); - break; - } - } - - zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); - - zo->zo_vdevtime = - (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : - UINT64_MAX >> 2); - - if (strlen(altdir) > 0) { - char *cmd; - char *realaltdir; - char *bin; - char *ztest; - char *isa; - int isalen; - - cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); - realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); - - VERIFY(NULL != realpath(getexecname(), cmd)); - if (0 != access(altdir, F_OK)) { - ztest_dump_core = B_FALSE; - fatal(B_TRUE, "invalid alternate ztest path: %s", - altdir); - } - VERIFY(NULL != realpath(altdir, realaltdir)); - - /* - * 'cmd' should be of the form "/usr/bin//ztest". - * We want to extract to determine if we should use - * 32 or 64 bit binaries. 
- */ - bin = strstr(cmd, "/usr/bin/"); - ztest = strstr(bin, "/ztest"); - isa = bin + 9; - isalen = ztest - isa; - (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), - "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); - (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), - "%s/usr/lib/%.*s", realaltdir, isalen, isa); - - if (0 != access(zo->zo_alt_ztest, X_OK)) { - ztest_dump_core = B_FALSE; - fatal(B_TRUE, "invalid alternate ztest: %s", - zo->zo_alt_ztest); - } else if (0 != access(zo->zo_alt_libpath, X_OK)) { - ztest_dump_core = B_FALSE; - fatal(B_TRUE, "invalid alternate lib directory %s", - zo->zo_alt_libpath); - } - - umem_free(cmd, MAXPATHLEN); - umem_free(realaltdir, MAXPATHLEN); - } -} - -static void -ztest_kill(ztest_shared_t *zs) -{ - zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); - zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); - - /* - * Before we kill off ztest, make sure that the config is updated. - * See comment above spa_write_cachefile(). 
- */ - mutex_enter(&spa_namespace_lock); - spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); - mutex_exit(&spa_namespace_lock); - - zfs_dbgmsg_print(FTAG); - (void) kill(getpid(), SIGKILL); -} - -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - ASSERT3S(ztest_fd_rand, >=, 0); - - if (range == 0) - return (0); - - if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - -/* ARGSUSED */ -static void -ztest_record_enospc(const char *s) -{ - ztest_shared->zs_enospc_count++; -} - -static uint64_t -ztest_get_ashift(void) -{ - if (ztest_opts.zo_ashift == 0) - return (SPA_MINBLOCKSHIFT + ztest_random(5)); - return (ztest_opts.zo_ashift); -} - -static nvlist_t * -make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) -{ - char pathbuf[MAXPATHLEN]; - uint64_t vdev; - nvlist_t *file; - - if (ashift == 0) - ashift = ztest_get_ashift(); - - if (path == NULL) { - path = pathbuf; - - if (aux != NULL) { - vdev = ztest_shared->zs_vdev_aux; - (void) snprintf(path, sizeof (pathbuf), - ztest_aux_template, ztest_opts.zo_dir, - pool == NULL ? ztest_opts.zo_pool : pool, - aux, vdev); - } else { - vdev = ztest_shared->zs_vdev_next_leaf++; - (void) snprintf(path, sizeof (pathbuf), - ztest_dev_template, ztest_opts.zo_dir, - pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); - } - } - - if (size != 0) { - int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); - if (fd == -1) - fatal(1, "can't open %s", path); - if (ftruncate(fd, size) != 0) - fatal(1, "can't ftruncate %s", path); - (void) close(fd); - } - - VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); - VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); - - return (file); -} - -static nvlist_t * -make_vdev_raidz(char *path, char *aux, char *pool, size_t size, - uint64_t ashift, int r) -{ - nvlist_t *raidz, **child; - int c; - - if (r < 2) - return (make_vdev_file(path, aux, pool, size, ashift)); - child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); - - for (c = 0; c < r; c++) - child[c] = make_vdev_file(path, aux, pool, size, ashift); - - VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_RAIDZ) == 0); - VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - ztest_opts.zo_raidz_parity) == 0); - VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, - child, r) == 0); - - for (c = 0; c < r; c++) - nvlist_free(child[c]); - - umem_free(child, r * sizeof (nvlist_t *)); - - return (raidz); -} - -static nvlist_t * -make_vdev_mirror(char *path, char *aux, char *pool, size_t size, - uint64_t ashift, int r, int m) -{ - nvlist_t *mirror, **child; - int c; - - if (m < 1) - return (make_vdev_raidz(path, aux, pool, size, ashift, r)); - - child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); - - for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); - - VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_MIRROR) == 0); - VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, - child, m) == 0); - - for (c = 0; c < 
m; c++) - nvlist_free(child[c]); - - umem_free(child, m * sizeof (nvlist_t *)); - - return (mirror); -} - -static nvlist_t * -make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, - const char *class, int r, int m, int t) -{ - nvlist_t *root, **child; - int c; - boolean_t log; - - ASSERT(t > 0); - - log = (class != NULL && strcmp(class, "log") == 0); - - child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); - - for (c = 0; c < t; c++) { - child[c] = make_vdev_mirror(path, aux, pool, size, ashift, - r, m); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - log) == 0); - - if (class != NULL && class[0] != '\0') { - ASSERT(m > 1 || log); /* expecting a mirror */ - VERIFY(nvlist_add_string(child[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); - } - } - - VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, - child, t) == 0); - - for (c = 0; c < t; c++) - nvlist_free(child[c]); - - umem_free(child, t * sizeof (nvlist_t *)); - - return (root); -} - -/* - * Find a random spa version. Returns back a random spa version in the - * range [initial_version, SPA_VERSION_FEATURES]. - */ -static uint64_t -ztest_random_spa_version(uint64_t initial_version) -{ - uint64_t version = initial_version; - - if (version <= SPA_VERSION_BEFORE_FEATURES) { - version = version + - ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); - } - - if (version > SPA_VERSION_BEFORE_FEATURES) - version = SPA_VERSION_FEATURES; - - ASSERT(SPA_VERSION_IS_SUPPORTED(version)); - return (version); -} - -static int -ztest_random_blocksize(void) -{ - uint64_t block_shift; - - ASSERT(ztest_spa->spa_max_ashift != 0); - - /* - * Choose a block size >= the ashift. - * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 
- */ - int maxbs = SPA_OLD_MAXBLOCKSHIFT; - if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) - maxbs = 20; - block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); - return (1 << (SPA_MINBLOCKSHIFT + block_shift)); -} - -static int -ztest_random_dnodesize(void) -{ - int slots; - int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; - - if (max_slots == DNODE_MIN_SLOTS) - return (DNODE_MIN_SIZE); - - /* - * Weight the random distribution more heavily toward smaller - * dnode sizes since that is more likely to reflect real-world - * usage. - */ - ASSERT3U(max_slots, >, 4); - switch (ztest_random(10)) { - case 0: - slots = 5 + ztest_random(max_slots - 4); - break; - case 1 ... 4: - slots = 2 + ztest_random(3); - break; - default: - slots = 1; - break; - } - - return (slots << DNODE_SHIFT); -} - -static int -ztest_random_ibshift(void) -{ - return (DN_MIN_INDBLKSHIFT + - ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); -} - -static uint64_t -ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) -{ - uint64_t top; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *tvd; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - do { - top = ztest_random(rvd->vdev_children); - tvd = rvd->vdev_child[top]; - } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || - tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); - - return (top); -} - -static uint64_t -ztest_random_dsl_prop(zfs_prop_t prop) -{ - uint64_t value; - - do { - value = zfs_prop_random_value(prop, ztest_random(-1ULL)); - } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); - - return (value); -} - -static int -ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, - boolean_t inherit) -{ - const char *propname = zfs_prop_to_name(prop); - const char *valname; - char setpoint[MAXPATHLEN]; - uint64_t curval; - int error; - - error = dsl_prop_set_int(osname, propname, - (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); - - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - return (error); - } - ASSERT0(error); - - VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); - - if (ztest_opts.zo_verbose >= 6) { - VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); - (void) printf("%s %s = %s at '%s'\n", - osname, propname, valname, setpoint); - } - - return (error); -} - -static int -ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) -{ - spa_t *spa = ztest_spa; - nvlist_t *props = NULL; - int error; - - VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); - - error = spa_prop_set(spa, props); - - nvlist_free(props); - - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - return (error); - } - ASSERT0(error); - - return (error); -} - -static void -ztest_rll_init(rll_t *rll) -{ - rll->rll_writer = NULL; - rll->rll_readers = 0; - mutex_init(&rll->rll_lock, NULL, USYNC_THREAD, NULL); - cv_init(&rll->rll_cv, NULL, USYNC_THREAD, NULL); -} - -static void -ztest_rll_destroy(rll_t *rll) -{ - ASSERT(rll->rll_writer == NULL); - ASSERT(rll->rll_readers == 0); - mutex_destroy(&rll->rll_lock); - cv_destroy(&rll->rll_cv); -} - -static void -ztest_rll_lock(rll_t *rll, rl_type_t type) -{ - mutex_enter(&rll->rll_lock); - - if (type == RL_READER) { - while (rll->rll_writer != NULL) - cv_wait(&rll->rll_cv, &rll->rll_lock); - rll->rll_readers++; - } else { - while (rll->rll_writer != NULL || rll->rll_readers) - cv_wait(&rll->rll_cv, &rll->rll_lock); - rll->rll_writer = curthread; - } - - mutex_exit(&rll->rll_lock); -} - -static void -ztest_rll_unlock(rll_t *rll) -{ - mutex_enter(&rll->rll_lock); - - if (rll->rll_writer) { - ASSERT(rll->rll_readers == 0); - rll->rll_writer = NULL; - } else { - ASSERT(rll->rll_readers != 0); - ASSERT(rll->rll_writer == NULL); - rll->rll_readers--; - } - - if (rll->rll_writer == NULL && rll->rll_readers == 0) - 
cv_broadcast(&rll->rll_cv); - - mutex_exit(&rll->rll_lock); -} - -static void -ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) -{ - rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - - ztest_rll_lock(rll, type); -} - -static void -ztest_object_unlock(ztest_ds_t *zd, uint64_t object) -{ - rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - - ztest_rll_unlock(rll); -} - -static rl_t * -ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, - uint64_t size, rl_type_t type) -{ - uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); - rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; - rl_t *rl; - - rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); - rl->rl_object = object; - rl->rl_offset = offset; - rl->rl_size = size; - rl->rl_lock = rll; - - ztest_rll_lock(rll, type); - - return (rl); -} - -static void -ztest_range_unlock(rl_t *rl) -{ - rll_t *rll = rl->rl_lock; - - ztest_rll_unlock(rll); - - umem_free(rl, sizeof (*rl)); -} - -static void -ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) -{ - zd->zd_os = os; - zd->zd_zilog = dmu_objset_zil(os); - zd->zd_shared = szd; - dmu_objset_name(os, zd->zd_name); - - if (zd->zd_shared != NULL) - zd->zd_shared->zd_seq = 0; - - rw_init(&zd->zd_zilog_lock, NULL, USYNC_THREAD, NULL); - mutex_init(&zd->zd_dirobj_lock, NULL, USYNC_THREAD, NULL); - - for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) - ztest_rll_init(&zd->zd_object_lock[l]); - - for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) - ztest_rll_init(&zd->zd_range_lock[l]); -} - -static void -ztest_zd_fini(ztest_ds_t *zd) -{ - mutex_destroy(&zd->zd_dirobj_lock); - - for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) - ztest_rll_destroy(&zd->zd_object_lock[l]); - - for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) - ztest_rll_destroy(&zd->zd_range_lock[l]); -} - -#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? 
TXG_NOWAIT : TXG_WAIT) - -static uint64_t -ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) -{ - uint64_t txg; - int error; - - /* - * Attempt to assign tx to some transaction group. - */ - error = dmu_tx_assign(tx, txg_how); - if (error) { - if (error == ERESTART) { - ASSERT(txg_how == TXG_NOWAIT); - dmu_tx_wait(tx); - } else { - ASSERT3U(error, ==, ENOSPC); - ztest_record_enospc(tag); - } - dmu_tx_abort(tx); - return (0); - } - txg = dmu_tx_get_txg(tx); - ASSERT(txg != 0); - return (txg); -} - -static void -ztest_pattern_set(void *buf, uint64_t size, uint64_t value) -{ - uint64_t *ip = buf; - uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); - - while (ip < ip_end) - *ip++ = value; -} - -static boolean_t -ztest_pattern_match(void *buf, uint64_t size, uint64_t value) -{ - uint64_t *ip = buf; - uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); - uint64_t diff = 0; - - while (ip < ip_end) - diff |= (value - *ip++); - - return (diff == 0); -} - -static void -ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, - uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, - uint64_t crtxg) -{ - bt->bt_magic = BT_MAGIC; - bt->bt_objset = dmu_objset_id(os); - bt->bt_object = object; - bt->bt_dnodesize = dnodesize; - bt->bt_offset = offset; - bt->bt_gen = gen; - bt->bt_txg = txg; - bt->bt_crtxg = crtxg; -} - -static void -ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, - uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, - uint64_t crtxg) -{ - ASSERT3U(bt->bt_magic, ==, BT_MAGIC); - ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); - ASSERT3U(bt->bt_object, ==, object); - ASSERT3U(bt->bt_dnodesize, ==, dnodesize); - ASSERT3U(bt->bt_offset, ==, offset); - ASSERT3U(bt->bt_gen, <=, gen); - ASSERT3U(bt->bt_txg, <=, txg); - ASSERT3U(bt->bt_crtxg, ==, crtxg); -} - -static ztest_block_tag_t * -ztest_bt_bonus(dmu_buf_t *db) -{ - dmu_object_info_t doi; - ztest_block_tag_t 
*bt; - - dmu_object_info_from_db(db, &doi); - ASSERT3U(doi.doi_bonus_size, <=, db->db_size); - ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); - bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); - - return (bt); -} - -/* - * Generate a token to fill up unused bonus buffer space. Try to make - * it unique to the object, generation, and offset to verify that data - * is not getting overwritten by data from other dnodes. - */ -#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ - (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) - -/* - * Fill up the unused bonus buffer region before the block tag with a - * verifiable pattern. Filling the whole bonus area with non-zero data - * helps ensure that all dnode traversal code properly skips the - * interior regions of large dnodes. - */ -void -ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, - objset_t *os, uint64_t gen) -{ - uint64_t *bonusp; - - ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); - - for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { - uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), - gen, bonusp - (uint64_t *)db->db_data); - *bonusp = token; - } -} - -/* - * Verify that the unused area of a bonus buffer is filled with the - * expected tokens. 
- */ -void -ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, - objset_t *os, uint64_t gen) -{ - uint64_t *bonusp; - - for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { - uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), - gen, bonusp - (uint64_t *)db->db_data); - VERIFY3U(*bonusp, ==, token); - } -} - -/* - * ZIL logging ops - */ - -#define lrz_type lr_mode -#define lrz_blocksize lr_uid -#define lrz_ibshift lr_gid -#define lrz_bonustype lr_rdev -#define lrz_dnodesize lr_crtime[1] - -static void -ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) -{ - char *name = (void *)(lr + 1); /* name follows lr */ - size_t namesize = strlen(name) + 1; - itx_t *itx; - - if (zil_replaying(zd->zd_zilog, tx)) - return; - - itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); - bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, - sizeof (*lr) + namesize - sizeof (lr_t)); - - zil_itx_assign(zd->zd_zilog, itx, tx); -} - -static void -ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) -{ - char *name = (void *)(lr + 1); /* name follows lr */ - size_t namesize = strlen(name) + 1; - itx_t *itx; - - if (zil_replaying(zd->zd_zilog, tx)) - return; - - itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); - bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, - sizeof (*lr) + namesize - sizeof (lr_t)); - - itx->itx_oid = object; - zil_itx_assign(zd->zd_zilog, itx, tx); -} - -static void -ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) -{ - itx_t *itx; - itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); - - if (zil_replaying(zd->zd_zilog, tx)) - return; - - if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) - write_state = WR_INDIRECT; - - itx = zil_itx_create(TX_WRITE, - sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); - - if (write_state == WR_COPIED && - dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, - ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - write_state = WR_NEED_COPY; - } - itx->itx_private = zd; - itx->itx_wr_state = write_state; - itx->itx_sync = (ztest_random(8) == 0); - - bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, - sizeof (*lr) - sizeof (lr_t)); - - zil_itx_assign(zd->zd_zilog, itx, tx); -} - -static void -ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) -{ - itx_t *itx; - - if (zil_replaying(zd->zd_zilog, tx)) - return; - - itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); - bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, - sizeof (*lr) - sizeof (lr_t)); - - itx->itx_sync = B_FALSE; - zil_itx_assign(zd->zd_zilog, itx, tx); -} - -static void -ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) -{ - itx_t *itx; - - if (zil_replaying(zd->zd_zilog, tx)) - return; - - itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); - bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, - sizeof (*lr) - sizeof (lr_t)); - - itx->itx_sync = B_FALSE; - zil_itx_assign(zd->zd_zilog, itx, tx); -} - -/* - * ZIL replay ops - */ -static int -ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) -{ - ztest_ds_t *zd = arg1; - lr_create_t *lr = arg2; - char *name = (void *)(lr + 1); /* name follows lr */ - objset_t *os = zd->zd_os; - ztest_block_tag_t *bbt; - dmu_buf_t *db; - dmu_tx_t *tx; - uint64_t txg; - int error = 0; - int bonuslen; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - ASSERT(lr->lr_doid == ZTEST_DIROBJ); - ASSERT(name[0] != '\0'); - - tx = dmu_tx_create(os); - - dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); - - if (lr->lrz_type == DMU_OT_ZAP_OTHER) { - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - } else { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - } - - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - if 
(txg == 0) - return (ENOSPC); - - ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); - bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); - - if (lr->lrz_type == DMU_OT_ZAP_OTHER) { - if (lr->lr_foid == 0) { - lr->lr_foid = zap_create_dnsize(os, - lr->lrz_type, lr->lrz_bonustype, - bonuslen, lr->lrz_dnodesize, tx); - } else { - error = zap_create_claim_dnsize(os, lr->lr_foid, - lr->lrz_type, lr->lrz_bonustype, - bonuslen, lr->lrz_dnodesize, tx); - } - } else { - if (lr->lr_foid == 0) { - lr->lr_foid = dmu_object_alloc_dnsize(os, - lr->lrz_type, 0, lr->lrz_bonustype, - bonuslen, lr->lrz_dnodesize, tx); - } else { - error = dmu_object_claim_dnsize(os, lr->lr_foid, - lr->lrz_type, 0, lr->lrz_bonustype, - bonuslen, lr->lrz_dnodesize, tx); - } - } - - if (error) { - ASSERT3U(error, ==, EEXIST); - ASSERT(zd->zd_zilog->zl_replay); - dmu_tx_commit(tx); - return (error); - } - - ASSERT(lr->lr_foid != 0); - - if (lr->lrz_type != DMU_OT_ZAP_OTHER) - VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, - lr->lrz_blocksize, lr->lrz_ibshift, tx)); - - VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); - bbt = ztest_bt_bonus(db); - dmu_buf_will_dirty(db, tx); - ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, - lr->lr_gen, txg, txg); - ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); - dmu_buf_rele(db, FTAG); - - VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, - &lr->lr_foid, tx)); - - (void) ztest_log_create(zd, tx, lr); - - dmu_tx_commit(tx); - - return (0); -} - -static int -ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) -{ - ztest_ds_t *zd = arg1; - lr_remove_t *lr = arg2; - char *name = (void *)(lr + 1); /* name follows lr */ - objset_t *os = zd->zd_os; - dmu_object_info_t doi; - dmu_tx_t *tx; - uint64_t object, txg; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - ASSERT(lr->lr_doid == ZTEST_DIROBJ); - ASSERT(name[0] != '\0'); - - VERIFY3U(0, ==, - zap_lookup(os, lr->lr_doid, 
name, sizeof (object), 1, &object)); - ASSERT(object != 0); - - ztest_object_lock(zd, object, RL_WRITER); - - VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); - - tx = dmu_tx_create(os); - - dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - if (txg == 0) { - ztest_object_unlock(zd, object); - return (ENOSPC); - } - - if (doi.doi_type == DMU_OT_ZAP_OTHER) { - VERIFY3U(0, ==, zap_destroy(os, object, tx)); - } else { - VERIFY3U(0, ==, dmu_object_free(os, object, tx)); - } - - VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); - - (void) ztest_log_remove(zd, tx, lr, object); - - dmu_tx_commit(tx); - - ztest_object_unlock(zd, object); - - return (0); -} - -static int -ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) -{ - ztest_ds_t *zd = arg1; - lr_write_t *lr = arg2; - objset_t *os = zd->zd_os; - void *data = lr + 1; /* data follows lr */ - uint64_t offset, length; - ztest_block_tag_t *bt = data; - ztest_block_tag_t *bbt; - uint64_t gen, txg, lrtxg, crtxg; - dmu_object_info_t doi; - dmu_tx_t *tx; - dmu_buf_t *db; - arc_buf_t *abuf = NULL; - rl_t *rl; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - offset = lr->lr_offset; - length = lr->lr_length; - - /* If it's a dmu_sync() block, write the whole block */ - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); - if (length < blocksize) { - offset -= offset % blocksize; - length = blocksize; - } - } - - if (bt->bt_magic == BSWAP_64(BT_MAGIC)) - byteswap_uint64_array(bt, sizeof (*bt)); - - if (bt->bt_magic != BT_MAGIC) - bt = NULL; - - ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); - - VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); - - dmu_object_info_from_db(db, &doi); - - bbt = ztest_bt_bonus(db); - ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); - gen = 
bbt->bt_gen; - crtxg = bbt->bt_crtxg; - lrtxg = lr->lr_common.lrc_txg; - - tx = dmu_tx_create(os); - - dmu_tx_hold_write(tx, lr->lr_foid, offset, length); - - if (ztest_random(8) == 0 && length == doi.doi_data_block_size && - P2PHASE(offset, length) == 0) - abuf = dmu_request_arcbuf(db, length); - - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - if (txg == 0) { - if (abuf != NULL) - dmu_return_arcbuf(abuf); - dmu_buf_rele(db, FTAG); - ztest_range_unlock(rl); - ztest_object_unlock(zd, lr->lr_foid); - return (ENOSPC); - } - - if (bt != NULL) { - /* - * Usually, verify the old data before writing new data -- - * but not always, because we also want to verify correct - * behavior when the data was not recently read into cache. - */ - ASSERT(offset % doi.doi_data_block_size == 0); - if (ztest_random(4) != 0) { - int prefetch = ztest_random(2) ? - DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; - ztest_block_tag_t rbt; - - VERIFY(dmu_read(os, lr->lr_foid, offset, - sizeof (rbt), &rbt, prefetch) == 0); - if (rbt.bt_magic == BT_MAGIC) { - ztest_bt_verify(&rbt, os, lr->lr_foid, 0, - offset, gen, txg, crtxg); - } - } - - /* - * Writes can appear to be newer than the bonus buffer because - * the ztest_get_data() callback does a dmu_read() of the - * open-context data, which may be different than the data - * as it was when the write was generated. - */ - if (zd->zd_zilog->zl_replay) { - ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, - MAX(gen, bt->bt_gen), MAX(txg, lrtxg), - bt->bt_crtxg); - } - - /* - * Set the bt's gen/txg to the bonus buffer's gen/txg - * so that all of the usual ASSERTs will work. 
- */ - ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, - crtxg); - } - - if (abuf == NULL) { - dmu_write(os, lr->lr_foid, offset, length, data, tx); - } else { - bcopy(data, abuf->b_data, length); - dmu_assign_arcbuf(db, offset, abuf, tx); - } - - (void) ztest_log_write(zd, tx, lr); - - dmu_buf_rele(db, FTAG); - - dmu_tx_commit(tx); - - ztest_range_unlock(rl); - ztest_object_unlock(zd, lr->lr_foid); - - return (0); -} - -static int -ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) -{ - ztest_ds_t *zd = arg1; - lr_truncate_t *lr = arg2; - objset_t *os = zd->zd_os; - dmu_tx_t *tx; - uint64_t txg; - rl_t *rl; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - ztest_object_lock(zd, lr->lr_foid, RL_READER); - rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, - RL_WRITER); - - tx = dmu_tx_create(os); - - dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); - - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - if (txg == 0) { - ztest_range_unlock(rl); - ztest_object_unlock(zd, lr->lr_foid); - return (ENOSPC); - } - - VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, - lr->lr_length, tx) == 0); - - (void) ztest_log_truncate(zd, tx, lr); - - dmu_tx_commit(tx); - - ztest_range_unlock(rl); - ztest_object_unlock(zd, lr->lr_foid); - - return (0); -} - -static int -ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) -{ - ztest_ds_t *zd = arg1; - lr_setattr_t *lr = arg2; - objset_t *os = zd->zd_os; - dmu_tx_t *tx; - dmu_buf_t *db; - ztest_block_tag_t *bbt; - uint64_t txg, lrtxg, crtxg, dnodesize; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - ztest_object_lock(zd, lr->lr_foid, RL_WRITER); - - VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, lr->lr_foid); - - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - if (txg == 0) { - dmu_buf_rele(db, FTAG); - ztest_object_unlock(zd, lr->lr_foid); - return (ENOSPC); - } - - bbt = 
ztest_bt_bonus(db); - ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); - crtxg = bbt->bt_crtxg; - lrtxg = lr->lr_common.lrc_txg; - dnodesize = bbt->bt_dnodesize; - - if (zd->zd_zilog->zl_replay) { - ASSERT(lr->lr_size != 0); - ASSERT(lr->lr_mode != 0); - ASSERT(lrtxg != 0); - } else { - /* - * Randomly change the size and increment the generation. - */ - lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * - sizeof (*bbt); - lr->lr_mode = bbt->bt_gen + 1; - ASSERT(lrtxg == 0); - } - - /* - * Verify that the current bonus buffer is not newer than our txg. - */ - ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, - MAX(txg, lrtxg), crtxg); - - dmu_buf_will_dirty(db, tx); - - ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); - ASSERT3U(lr->lr_size, <=, db->db_size); - VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); - bbt = ztest_bt_bonus(db); - - ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, - txg, crtxg); - ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); - - dmu_buf_rele(db, FTAG); - - (void) ztest_log_setattr(zd, tx, lr); - - dmu_tx_commit(tx); - - ztest_object_unlock(zd, lr->lr_foid); - - return (0); -} - -zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { - NULL, /* 0 no such transaction type */ - ztest_replay_create, /* TX_CREATE */ - NULL, /* TX_MKDIR */ - NULL, /* TX_MKXATTR */ - NULL, /* TX_SYMLINK */ - ztest_replay_remove, /* TX_REMOVE */ - NULL, /* TX_RMDIR */ - NULL, /* TX_LINK */ - NULL, /* TX_RENAME */ - ztest_replay_write, /* TX_WRITE */ - ztest_replay_truncate, /* TX_TRUNCATE */ - ztest_replay_setattr, /* TX_SETATTR */ - NULL, /* TX_ACL */ - NULL, /* TX_CREATE_ACL */ - NULL, /* TX_CREATE_ATTR */ - NULL, /* TX_CREATE_ACL_ATTR */ - NULL, /* TX_MKDIR_ACL */ - NULL, /* TX_MKDIR_ATTR */ - NULL, /* TX_MKDIR_ACL_ATTR */ - NULL, /* TX_WRITE2 */ -}; - -/* - * ZIL get_data callbacks - */ - -/* ARGSUSED */ -static void -ztest_get_done(zgd_t *zgd, int error) -{ - ztest_ds_t *zd = zgd->zgd_private; - uint64_t 
object = ((rl_t *)zgd->zgd_lr)->rl_object; - - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - ztest_range_unlock((rl_t *)zgd->zgd_lr); - ztest_object_unlock(zd, object); - - umem_free(zgd, sizeof (*zgd)); -} - -static int -ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, - zio_t *zio) -{ - ztest_ds_t *zd = arg; - objset_t *os = zd->zd_os; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - uint64_t txg = lr->lr_common.lrc_txg; - uint64_t crtxg; - dmu_object_info_t doi; - dmu_buf_t *db; - zgd_t *zgd; - int error; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - ztest_object_lock(zd, object, RL_READER); - error = dmu_bonus_hold(os, object, FTAG, &db); - if (error) { - ztest_object_unlock(zd, object); - return (error); - } - - crtxg = ztest_bt_bonus(db)->bt_crtxg; - - if (crtxg == 0 || crtxg > txg) { - dmu_buf_rele(db, FTAG); - ztest_object_unlock(zd, object); - return (ENOENT); - } - - dmu_object_info_from_db(db, &doi); - dmu_buf_rele(db, FTAG); - db = NULL; - - zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zd; - - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); - - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - ASSERT(error == 0); - } else { - size = doi.doi_data_block_size; - if (ISP2(size)) { - offset = P2ALIGN(offset, size); - } else { - ASSERT(offset < size); - offset = 0; - } - - zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, - object, offset, size, RL_READER); - - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - ztest_get_done, zgd); - - 
if (error == 0) - return (0); - } - } - - ztest_get_done(zgd, error); - - return (error); -} - -static void * -ztest_lr_alloc(size_t lrsize, char *name) -{ - char *lr; - size_t namesize = name ? strlen(name) + 1 : 0; - - lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); - - if (name) - bcopy(name, lr + lrsize, namesize); - - return (lr); -} - -void -ztest_lr_free(void *lr, size_t lrsize, char *name) -{ - size_t namesize = name ? strlen(name) + 1 : 0; - - umem_free(lr, lrsize + namesize); -} - -/* - * Lookup a bunch of objects. Returns the number of objects not found. - */ -static int -ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) -{ - int missing = 0; - int error; - - ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); - - for (int i = 0; i < count; i++, od++) { - od->od_object = 0; - error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, - sizeof (uint64_t), 1, &od->od_object); - if (error) { - ASSERT(error == ENOENT); - ASSERT(od->od_object == 0); - missing++; - } else { - dmu_buf_t *db; - ztest_block_tag_t *bbt; - dmu_object_info_t doi; - - ASSERT(od->od_object != 0); - ASSERT(missing == 0); /* there should be no gaps */ - - ztest_object_lock(zd, od->od_object, RL_READER); - VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, - od->od_object, FTAG, &db)); - dmu_object_info_from_db(db, &doi); - bbt = ztest_bt_bonus(db); - ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); - od->od_type = doi.doi_type; - od->od_blocksize = doi.doi_data_block_size; - od->od_gen = bbt->bt_gen; - dmu_buf_rele(db, FTAG); - ztest_object_unlock(zd, od->od_object); - } - } - - return (missing); -} - -static int -ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) -{ - int missing = 0; - - ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); - - for (int i = 0; i < count; i++, od++) { - if (missing) { - od->od_object = 0; - missing++; - continue; - } - - lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); - - lr->lr_doid = od->od_dir; - lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ - lr->lrz_type = 
od->od_crtype; - lr->lrz_blocksize = od->od_crblocksize; - lr->lrz_ibshift = ztest_random_ibshift(); - lr->lrz_bonustype = DMU_OT_UINT64_OTHER; - lr->lrz_dnodesize = od->od_crdnodesize; - lr->lr_gen = od->od_crgen; - lr->lr_crtime[0] = time(NULL); - - if (ztest_replay_create(zd, lr, B_FALSE) != 0) { - ASSERT(missing == 0); - od->od_object = 0; - missing++; - } else { - od->od_object = lr->lr_foid; - od->od_type = od->od_crtype; - od->od_blocksize = od->od_crblocksize; - od->od_gen = od->od_crgen; - ASSERT(od->od_object != 0); - } - - ztest_lr_free(lr, sizeof (*lr), od->od_name); - } - - return (missing); -} - -static int -ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) -{ - int missing = 0; - int error; - - ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); - - od += count - 1; - - for (int i = count - 1; i >= 0; i--, od--) { - if (missing) { - missing++; - continue; - } - - /* - * No object was found. - */ - if (od->od_object == 0) - continue; - - lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); - - lr->lr_doid = od->od_dir; - - if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { - ASSERT3U(error, ==, ENOSPC); - missing++; - } else { - od->od_object = 0; - } - ztest_lr_free(lr, sizeof (*lr), od->od_name); - } - - return (missing); -} - -static int -ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, - void *data) -{ - lr_write_t *lr; - int error; - - lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); - - lr->lr_foid = object; - lr->lr_offset = offset; - lr->lr_length = size; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - - bcopy(data, lr + 1, size); - - error = ztest_replay_write(zd, lr, B_FALSE); - - ztest_lr_free(lr, sizeof (*lr) + size, NULL); - - return (error); -} - -static int -ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) -{ - lr_truncate_t *lr; - int error; - - lr = ztest_lr_alloc(sizeof (*lr), NULL); - - lr->lr_foid = object; - lr->lr_offset = offset; - lr->lr_length = size; - - 
error = ztest_replay_truncate(zd, lr, B_FALSE); - - ztest_lr_free(lr, sizeof (*lr), NULL); - - return (error); -} - -static int -ztest_setattr(ztest_ds_t *zd, uint64_t object) -{ - lr_setattr_t *lr; - int error; - - lr = ztest_lr_alloc(sizeof (*lr), NULL); - - lr->lr_foid = object; - lr->lr_size = 0; - lr->lr_mode = 0; - - error = ztest_replay_setattr(zd, lr, B_FALSE); - - ztest_lr_free(lr, sizeof (*lr), NULL); - - return (error); -} - -static void -ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) -{ - objset_t *os = zd->zd_os; - dmu_tx_t *tx; - uint64_t txg; - rl_t *rl; - - txg_wait_synced(dmu_objset_pool(os), 0); - - ztest_object_lock(zd, object, RL_READER); - rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); - - tx = dmu_tx_create(os); - - dmu_tx_hold_write(tx, object, offset, size); - - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - - if (txg != 0) { - dmu_prealloc(os, object, offset, size, tx); - dmu_tx_commit(tx); - txg_wait_synced(dmu_objset_pool(os), txg); - } else { - (void) dmu_free_long_range(os, object, offset, size); - } - - ztest_range_unlock(rl); - ztest_object_unlock(zd, object); -} - -static void -ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) -{ - int err; - ztest_block_tag_t wbt; - dmu_object_info_t doi; - enum ztest_io_type io_type; - uint64_t blocksize; - void *data; - - VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); - blocksize = doi.doi_data_block_size; - data = umem_alloc(blocksize, UMEM_NOFAIL); - - /* - * Pick an i/o type at random, biased toward writing block tags. 
- */ - io_type = ztest_random(ZTEST_IO_TYPES); - if (ztest_random(2) == 0) - io_type = ZTEST_IO_WRITE_TAG; - - rw_enter(&zd->zd_zilog_lock, RW_READER); - - switch (io_type) { - - case ZTEST_IO_WRITE_TAG: - ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, - offset, 0, 0, 0); - (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); - break; - - case ZTEST_IO_WRITE_PATTERN: - (void) memset(data, 'a' + (object + offset) % 5, blocksize); - if (ztest_random(2) == 0) { - /* - * Induce fletcher2 collisions to ensure that - * zio_ddt_collision() detects and resolves them - * when using fletcher2-verify for deduplication. - */ - ((uint64_t *)data)[0] ^= 1ULL << 63; - ((uint64_t *)data)[4] ^= 1ULL << 63; - } - (void) ztest_write(zd, object, offset, blocksize, data); - break; - - case ZTEST_IO_WRITE_ZEROES: - bzero(data, blocksize); - (void) ztest_write(zd, object, offset, blocksize, data); - break; - - case ZTEST_IO_TRUNCATE: - (void) ztest_truncate(zd, object, offset, blocksize); - break; - - case ZTEST_IO_SETATTR: - (void) ztest_setattr(zd, object); - break; - - case ZTEST_IO_REWRITE: - rw_enter(&ztest_name_lock, RW_READER); - err = ztest_dsl_prop_set_uint64(zd->zd_name, - ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), - B_FALSE); - VERIFY(err == 0 || err == ENOSPC); - err = ztest_dsl_prop_set_uint64(zd->zd_name, - ZFS_PROP_COMPRESSION, - ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), - B_FALSE); - VERIFY(err == 0 || err == ENOSPC); - rw_exit(&ztest_name_lock); - - VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, - DMU_READ_NO_PREFETCH)); - - (void) ztest_write(zd, object, offset, blocksize, data); - break; - } - - rw_exit(&zd->zd_zilog_lock); - - umem_free(data, blocksize); -} - -/* - * Initialize an object description template. 
- */ -static void -ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, - dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, - uint64_t gen) -{ - od->od_dir = ZTEST_DIROBJ; - od->od_object = 0; - - od->od_crtype = type; - od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); - od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); - od->od_crgen = gen; - - od->od_type = DMU_OT_NONE; - od->od_blocksize = 0; - od->od_gen = 0; - - (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", - tag, (int64_t)id, index); -} - -/* - * Lookup or create the objects for a test using the od template. - * If the objects do not all exist, or if 'remove' is specified, - * remove any existing objects and create new ones. Otherwise, - * use the existing objects. - */ -static int -ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) -{ - int count = size / sizeof (*od); - int rv = 0; - - mutex_enter(&zd->zd_dirobj_lock); - if ((ztest_lookup(zd, od, count) != 0 || remove) && - (ztest_remove(zd, od, count) != 0 || - ztest_create(zd, od, count) != 0)) - rv = -1; - zd->zd_od = od; - mutex_exit(&zd->zd_dirobj_lock); - - return (rv); -} - -/* ARGSUSED */ -void -ztest_zil_commit(ztest_ds_t *zd, uint64_t id) -{ - zilog_t *zilog = zd->zd_zilog; - - rw_enter(&zd->zd_zilog_lock, RW_READER); - - zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); - - /* - * Remember the committed values in zd, which is in parent/child - * shared memory. If we die, the next iteration of ztest_run() - * will verify that the log really does contain this record. - */ - mutex_enter(&zilog->zl_lock); - ASSERT(zd->zd_shared != NULL); - ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); - zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; - mutex_exit(&zilog->zl_lock); - - rw_exit(&zd->zd_zilog_lock); -} - -/* - * This function is designed to simulate the operations that occur during a - * mount/unmount operation. 
We hold the dataset across these operations in an - * attempt to expose any implicit assumptions about ZIL management. - */ -/* ARGSUSED */ -void -ztest_zil_remount(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - - /* - * We grab the zd_dirobj_lock to ensure that no other thread is - * updating the zil (i.e. adding in-memory log records) and the - * zd_zilog_lock to block any I/O. - */ - mutex_enter(&zd->zd_dirobj_lock); - rw_enter(&zd->zd_zilog_lock, RW_WRITER); - - /* zfsvfs_teardown() */ - zil_close(zd->zd_zilog); - - /* zfsvfs_setup() */ - VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); - zil_replay(os, zd, ztest_replay_vector); - - rw_exit(&zd->zd_zilog_lock); - mutex_exit(&zd->zd_dirobj_lock); -} - -/* - * Verify that we can't destroy an active pool, create an existing pool, - * or create a pool with a bad vdev spec. - */ -/* ARGSUSED */ -void -ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_opts_t *zo = &ztest_opts; - spa_t *spa; - nvlist_t *nvroot; - - if (zo->zo_mmp_test) - return; - - /* - * Attempt to create using a bad file. - */ - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); - VERIFY3U(ENOENT, ==, - spa_create("ztest_bad_file", nvroot, NULL, NULL)); - nvlist_free(nvroot); - - /* - * Attempt to create using a bad mirror. - */ - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); - VERIFY3U(ENOENT, ==, - spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); - nvlist_free(nvroot); - - /* - * Attempt to create an existing pool. It shouldn't matter - * what's in the nvroot; we should fail with EEXIST. 
- */ - rw_enter(&ztest_name_lock, RW_READER); - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); - VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); - nvlist_free(nvroot); - VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); - VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); - spa_close(spa, FTAG); - - rw_exit(&ztest_name_lock); -} - -/* - * Start and then stop the MMP threads to ensure the startup and shutdown code - * works properly. Actual protection and property-related code tested via ZTS. - */ -/* ARGSUSED */ -void -ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_opts_t *zo = &ztest_opts; - spa_t *spa = ztest_spa; - - if (zo->zo_mmp_test) - return; - - /* - * Since enabling MMP involves setting a property, it could not be done - * while the pool is suspended. - */ - if (spa_suspended(spa)) - return; - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - mutex_enter(&spa->spa_props_lock); - - zfs_multihost_fail_intervals = 0; - - if (!spa_multihost(spa)) { - spa->spa_multihost = B_TRUE; - mmp_thread_start(spa); - } - - mutex_exit(&spa->spa_props_lock); - spa_config_exit(spa, SCL_CONFIG, FTAG); - - txg_wait_synced(spa_get_dsl(spa), 0); - mmp_signal_all_threads(); - txg_wait_synced(spa_get_dsl(spa), 0); - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - mutex_enter(&spa->spa_props_lock); - - if (spa_multihost(spa)) { - mmp_thread_stop(spa); - spa->spa_multihost = B_FALSE; - } - - mutex_exit(&spa->spa_props_lock); - spa_config_exit(spa, SCL_CONFIG, FTAG); -} - -/* ARGSUSED */ -void -ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) -{ - spa_t *spa; - uint64_t initial_version = SPA_VERSION_INITIAL; - uint64_t version, newversion; - nvlist_t *nvroot, *props; - char *name; - - if (ztest_opts.zo_mmp_test) - return; - - mutex_enter(&ztest_vdev_lock); - name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); - - /* - * Clean up from previous runs. 
- */ - (void) spa_destroy(name); - - nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); - - /* - * If we're configuring a RAIDZ device then make sure that the - * the initial version is capable of supporting that feature. - */ - switch (ztest_opts.zo_raidz_parity) { - case 0: - case 1: - initial_version = SPA_VERSION_INITIAL; - break; - case 2: - initial_version = SPA_VERSION_RAIDZ2; - break; - case 3: - initial_version = SPA_VERSION_RAIDZ3; - break; - } - - /* - * Create a pool with a spa version that can be upgraded. Pick - * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. - */ - do { - version = ztest_random_spa_version(initial_version); - } while (version > SPA_VERSION_BEFORE_FEATURES); - - props = fnvlist_alloc(); - fnvlist_add_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_VERSION), version); - VERIFY0(spa_create(name, nvroot, props, NULL)); - fnvlist_free(nvroot); - fnvlist_free(props); - - VERIFY0(spa_open(name, &spa, FTAG)); - VERIFY3U(spa_version(spa), ==, version); - newversion = ztest_random_spa_version(version + 1); - - if (ztest_opts.zo_verbose >= 4) { - (void) printf("upgrading spa version from %llu to %llu\n", - (u_longlong_t)version, (u_longlong_t)newversion); - } - - spa_upgrade(spa, newversion); - VERIFY3U(spa_version(spa), >, version); - VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, - zpool_prop_to_name(ZPOOL_PROP_VERSION))); - spa_close(spa, FTAG); - - strfree(name); - mutex_exit(&ztest_vdev_lock); -} - -static void -ztest_spa_checkpoint(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); - - int error = spa_checkpoint(spa->spa_name); - - switch (error) { - case 0: - case ZFS_ERR_DEVRM_IN_PROGRESS: - case ZFS_ERR_DISCARDING_CHECKPOINT: - case ZFS_ERR_CHECKPOINT_EXISTS: - break; - case ENOSPC: - ztest_record_enospc(FTAG); - break; - default: - fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); - } -} - -static void 
-ztest_spa_discard_checkpoint(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); - - int error = spa_checkpoint_discard(spa->spa_name); - - switch (error) { - case 0: - case ZFS_ERR_DISCARDING_CHECKPOINT: - case ZFS_ERR_NO_CHECKPOINT: - break; - default: - fatal(0, "spa_discard_checkpoint(%s) = %d", - spa->spa_name, error); - } - -} - -/* ARGSUSED */ -void -ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) -{ - spa_t *spa = ztest_spa; - - mutex_enter(&ztest_checkpoint_lock); - if (ztest_random(2) == 0) { - ztest_spa_checkpoint(spa); - } else { - ztest_spa_discard_checkpoint(spa); - } - mutex_exit(&ztest_checkpoint_lock); -} - - -static vdev_t * -vdev_lookup_by_path(vdev_t *vd, const char *path) -{ - vdev_t *mvd; - - if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) - return (vd); - - for (int c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != - NULL) - return (mvd); - - return (NULL); -} - -/* - * Find the first available hole which can be used as a top-level. - */ -int -find_vdev_hole(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - int c; - - ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); - - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - - if (cvd->vdev_ishole) - break; - } - return (c); -} - -/* - * Verify that vdev_add() works as expected. - */ -/* ARGSUSED */ -void -ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - uint64_t leaves; - uint64_t guid; - nvlist_t *nvroot; - int error; - - if (ztest_opts.zo_mmp_test) - return; - - mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; - - /* - * If we have slogs then remove them 1/4 of the time. 
- */ - if (spa_has_slogs(spa) && ztest_random(4) == 0) { - metaslab_group_t *mg; - - /* - * find the first real slog in log allocation class - */ - mg = spa_log_class(spa)->mc_rotor; - while (!mg->mg_vd->vdev_islog) - mg = mg->mg_next; - - guid = mg->mg_vd->vdev_guid; - - spa_config_exit(spa, SCL_VDEV, FTAG); - - /* - * We have to grab the zs_name_lock as writer to - * prevent a race between removing a slog (dmu_objset_find) - * and destroying a dataset. Removing the slog will - * grab a reference on the dataset which may cause - * dmu_objset_destroy() to fail with EBUSY thus - * leaving the dataset in an inconsistent state. - */ - rw_enter(&ztest_name_lock, RW_WRITER); - error = spa_vdev_remove(spa, guid, B_FALSE); - rw_exit(&ztest_name_lock); - - switch (error) { - case 0: - case EEXIST: - case ZFS_ERR_CHECKPOINT_EXISTS: - case ZFS_ERR_DISCARDING_CHECKPOINT: - break; - default: - fatal(0, "spa_vdev_remove() = %d", error); - } - } else { - spa_config_exit(spa, SCL_VDEV, FTAG); - - /* - * Make 1/4 of the devices be log devices - */ - nvroot = make_vdev_root(NULL, NULL, NULL, - ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? - "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); - - error = spa_vdev_add(spa, nvroot); - nvlist_free(nvroot); - - switch (error) { - case 0: - break; - case ENOSPC: - ztest_record_enospc("spa_vdev_add"); - break; - default: - fatal(0, "spa_vdev_add() = %d", error); - } - } - - mutex_exit(&ztest_vdev_lock); -} - -/* ARGSUSED */ -void -ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - uint64_t leaves; - nvlist_t *nvroot; - const char *class = (ztest_random(2) == 0) ? 
- VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; - int error; - - /* - * By default add a special vdev 50% of the time - */ - if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || - (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && - ztest_random(2) == 0)) { - return; - } - - mutex_enter(&ztest_vdev_lock); - - /* Only test with mirrors */ - if (zs->zs_mirrors < 2) { - mutex_exit(&ztest_vdev_lock); - return; - } - - /* requires feature@allocation_classes */ - if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { - mutex_exit(&ztest_vdev_lock); - return; - } - - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; - spa_config_exit(spa, SCL_VDEV, FTAG); - - nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); - - error = spa_vdev_add(spa, nvroot); - nvlist_free(nvroot); - - if (error == ENOSPC) - ztest_record_enospc("spa_vdev_add"); - else if (error != 0) - fatal(0, "spa_vdev_add() = %d", error); - - /* - * 50% of the time allow small blocks in the special class - */ - if (error == 0 && - spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { - if (ztest_opts.zo_verbose >= 3) - (void) printf("Enabling special VDEV small blocks\n"); - (void) ztest_dsl_prop_set_uint64(zd->zd_name, - ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); - } - - mutex_exit(&ztest_vdev_lock); - - if (ztest_opts.zo_verbose >= 3) { - metaslab_class_t *mc; - - if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) - mc = spa_special_class(spa); - else - mc = spa_dedup_class(spa); - (void) printf("Added a %s mirrored vdev (of %d)\n", - class, (int)mc->mc_groups); - } -} - -/* - * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 
- */ -/* ARGSUSED */ -void -ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - vdev_t *rvd = spa->spa_root_vdev; - spa_aux_vdev_t *sav; - char *aux; - uint64_t guid = 0; - int error; - - if (ztest_opts.zo_mmp_test) - return; - - if (ztest_random(2) == 0) { - sav = &spa->spa_spares; - aux = ZPOOL_CONFIG_SPARES; - } else { - sav = &spa->spa_l2cache; - aux = ZPOOL_CONFIG_L2CACHE; - } - - mutex_enter(&ztest_vdev_lock); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - if (sav->sav_count != 0 && ztest_random(4) == 0) { - /* - * Pick a random device to remove. - */ - guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; - } else { - /* - * Find an unused device we can add. - */ - zs->zs_vdev_aux = 0; - for (;;) { - char path[MAXPATHLEN]; - int c; - (void) snprintf(path, sizeof (path), ztest_aux_template, - ztest_opts.zo_dir, ztest_opts.zo_pool, aux, - zs->zs_vdev_aux); - for (c = 0; c < sav->sav_count; c++) - if (strcmp(sav->sav_vdevs[c]->vdev_path, - path) == 0) - break; - if (c == sav->sav_count && - vdev_lookup_by_path(rvd, path) == NULL) - break; - zs->zs_vdev_aux++; - } - } - - spa_config_exit(spa, SCL_VDEV, FTAG); - - if (guid == 0) { - /* - * Add a new device. - */ - nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, - (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); - error = spa_vdev_add(spa, nvroot); - - switch (error) { - case 0: - break; - default: - fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); - } - nvlist_free(nvroot); - } else { - /* - * Remove an existing device. Sometimes, dirty its - * vdev state first to make sure we handle removal - * of devices that have pending state changes. 
- */ - if (ztest_random(2) == 0) - (void) vdev_online(spa, guid, 0, NULL); - - error = spa_vdev_remove(spa, guid, B_FALSE); - - switch (error) { - case 0: - case EBUSY: - case ZFS_ERR_CHECKPOINT_EXISTS: - case ZFS_ERR_DISCARDING_CHECKPOINT: - break; - default: - fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); - } - } - - mutex_exit(&ztest_vdev_lock); -} - -/* - * split a pool if it has mirror tlvdevs - */ -/* ARGSUSED */ -void -ztest_split_pool(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - vdev_t *rvd = spa->spa_root_vdev; - nvlist_t *tree, **child, *config, *split, **schild; - uint_t c, children, schildren = 0, lastlogid = 0; - int error = 0; - - if (ztest_opts.zo_mmp_test) - return; - - mutex_enter(&ztest_vdev_lock); - - /* ensure we have a useable config; mirrors of raidz aren't supported */ - if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { - mutex_exit(&ztest_vdev_lock); - return; - } - - /* clean up the old pool, if any */ - (void) spa_destroy("splitp"); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - /* generate a config from the existing config */ - mutex_enter(&spa->spa_props_lock); - VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, - &tree) == 0); - mutex_exit(&spa->spa_props_lock); - - VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0); - - schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); - for (c = 0; c < children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - nvlist_t **mchild; - uint_t mchildren; - - if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { - VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, - 0) == 0); - VERIFY(nvlist_add_string(schild[schildren], - ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); - VERIFY(nvlist_add_uint64(schild[schildren], - ZPOOL_CONFIG_IS_HOLE, 1) == 0); - if (lastlogid == 0) - lastlogid = schildren; - ++schildren; - continue; - } - lastlogid = 0; - 
VERIFY(nvlist_lookup_nvlist_array(child[c], - ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); - VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); - } - - /* OK, create a config that can be used to split */ - VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, - lastlogid != 0 ? lastlogid : schildren) == 0); - - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); - - for (c = 0; c < schildren; c++) - nvlist_free(schild[c]); - free(schild); - nvlist_free(split); - - spa_config_exit(spa, SCL_VDEV, FTAG); - - rw_enter(&ztest_name_lock, RW_WRITER); - error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); - rw_exit(&ztest_name_lock); - - nvlist_free(config); - - if (error == 0) { - (void) printf("successful split - results:\n"); - mutex_enter(&spa_namespace_lock); - show_pool_stats(spa); - show_pool_stats(spa_lookup("splitp")); - mutex_exit(&spa_namespace_lock); - ++zs->zs_splits; - --zs->zs_mirrors; - } - mutex_exit(&ztest_vdev_lock); -} - -/* - * Verify that we can attach and detach devices. 
- */ -/* ARGSUSED */ -void -ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - spa_aux_vdev_t *sav = &spa->spa_spares; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *oldvd, *newvd, *pvd; - nvlist_t *root; - uint64_t leaves; - uint64_t leaf, top; - uint64_t ashift = ztest_get_ashift(); - uint64_t oldguid, pguid; - uint64_t oldsize, newsize; - char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; - int replacing; - int oldvd_has_siblings = B_FALSE; - int newvd_is_spare = B_FALSE; - int oldvd_is_log; - int error, expected_error; - - if (ztest_opts.zo_mmp_test) - return; - - mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - /* - * If a vdev is in the process of being removed, its removal may - * finish while we are in progress, leading to an unexpected error - * value. Don't bother trying to attach while we are in the middle - * of removal. - */ - if (ztest_device_removal_active) { - spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&ztest_vdev_lock); - return; - } - - /* - * Decide whether to do an attach or a replace. - */ - replacing = ztest_random(2); - - /* - * Pick a random top-level vdev. - */ - top = ztest_random_vdev_top(spa, B_TRUE); - - /* - * Pick a random leaf within it. - */ - leaf = ztest_random(leaves); - - /* - * Locate this vdev. 
- */ - oldvd = rvd->vdev_child[top]; - - /* pick a child from the mirror */ - if (zs->zs_mirrors >= 1) { - ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); - ASSERT(oldvd->vdev_children >= zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; - } - - /* pick a child out of the raidz group */ - if (ztest_opts.zo_raidz > 1) { - ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); - ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; - } - - /* - * If we're already doing an attach or replace, oldvd may be a - * mirror vdev -- in which case, pick a random child. - */ - while (oldvd->vdev_children != 0) { - oldvd_has_siblings = B_TRUE; - ASSERT(oldvd->vdev_children >= 2); - oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; - } - - oldguid = oldvd->vdev_guid; - oldsize = vdev_get_min_asize(oldvd); - oldvd_is_log = oldvd->vdev_top->vdev_islog; - (void) strcpy(oldpath, oldvd->vdev_path); - pvd = oldvd->vdev_parent; - pguid = pvd->vdev_guid; - - /* - * If oldvd has siblings, then half of the time, detach it. - */ - if (oldvd_has_siblings && ztest_random(2) == 0) { - spa_config_exit(spa, SCL_ALL, FTAG); - error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); - if (error != 0 && error != ENODEV && error != EBUSY && - error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && - error != ZFS_ERR_DISCARDING_CHECKPOINT) - fatal(0, "detach (%s) returned %d", oldpath, error); - mutex_exit(&ztest_vdev_lock); - return; - } - - /* - * For the new vdev, choose with equal probability between the two - * standard paths (ending in either 'a' or 'b') or a random hot spare. 
- */ - if (sav->sav_count != 0 && ztest_random(3) == 0) { - newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; - newvd_is_spare = B_TRUE; - (void) strcpy(newpath, newvd->vdev_path); - } else { - (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, - ztest_opts.zo_dir, ztest_opts.zo_pool, - top * leaves + leaf); - if (ztest_random(2) == 0) - newpath[strlen(newpath) - 1] = 'b'; - newvd = vdev_lookup_by_path(rvd, newpath); - } - - if (newvd) { - /* - * Reopen to ensure the vdev's asize field isn't stale. - */ - vdev_reopen(newvd); - newsize = vdev_get_min_asize(newvd); - } else { - /* - * Make newsize a little bigger or smaller than oldsize. - * If it's smaller, the attach should fail. - * If it's larger, and we're doing a replace, - * we should get dynamic LUN growth when we're done. - */ - newsize = 10 * oldsize / (9 + ztest_random(3)); - } - - /* - * If pvd is not a mirror or root, the attach should fail with ENOTSUP, - * unless it's a replace; in that case any non-replacing parent is OK. - * - * If newvd is already part of the pool, it should fail with EBUSY. - * - * If newvd is too small, it should fail with EOVERFLOW. - */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops && (!replacing || - pvd->vdev_ops == &vdev_replacing_ops || - pvd->vdev_ops == &vdev_spare_ops)) - expected_error = ENOTSUP; - else if (newvd_is_spare && (!replacing || oldvd_is_log)) - expected_error = ENOTSUP; - else if (newvd == oldvd) - expected_error = replacing ? 0 : EBUSY; - else if (vdev_lookup_by_path(rvd, newpath) != NULL) - expected_error = EBUSY; - else if (newsize < oldsize) - expected_error = EOVERFLOW; - else if (ashift > oldvd->vdev_top->vdev_ashift) - expected_error = EDOM; - else - expected_error = 0; - - spa_config_exit(spa, SCL_ALL, FTAG); - - /* - * Build the nvlist describing newpath. - */ - root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? 
newsize : 0, - ashift, NULL, 0, 0, 1); - - error = spa_vdev_attach(spa, oldguid, root, replacing); - - nvlist_free(root); - - /* - * If our parent was the replacing vdev, but the replace completed, - * then instead of failing with ENOTSUP we may either succeed, - * fail with ENODEV, or fail with EOVERFLOW. - */ - if (expected_error == ENOTSUP && - (error == 0 || error == ENODEV || error == EOVERFLOW)) - expected_error = error; - - /* - * If someone grew the LUN, the replacement may be too small. - */ - if (error == EOVERFLOW || error == EBUSY) - expected_error = error; - - if (error == ZFS_ERR_CHECKPOINT_EXISTS || - error == ZFS_ERR_DISCARDING_CHECKPOINT) - expected_error = error; - - /* XXX workaround 6690467 */ - if (error != expected_error && expected_error != EBUSY) { - fatal(0, "attach (%s %llu, %s %llu, %d) " - "returned %d, expected %d", - oldpath, oldsize, newpath, - newsize, replacing, error, expected_error); - } - - mutex_exit(&ztest_vdev_lock); -} - -/* ARGSUSED */ -void -ztest_device_removal(ztest_ds_t *zd, uint64_t id) -{ - spa_t *spa = ztest_spa; - vdev_t *vd; - uint64_t guid; - int error; - - mutex_enter(&ztest_vdev_lock); - - if (ztest_device_removal_active) { - mutex_exit(&ztest_vdev_lock); - return; - } - - /* - * Remove a random top-level vdev and wait for removal to finish. - */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); - guid = vd->vdev_guid; - spa_config_exit(spa, SCL_VDEV, FTAG); - - error = spa_vdev_remove(spa, guid, B_FALSE); - if (error == 0) { - ztest_device_removal_active = B_TRUE; - mutex_exit(&ztest_vdev_lock); - - while (spa->spa_vdev_removal != NULL) - txg_wait_synced(spa_get_dsl(spa), 0); - } else { - mutex_exit(&ztest_vdev_lock); - return; - } - - /* - * The pool needs to be scrubbed after completing device removal. 
- * Failure to do so may result in checksum errors due to the - * strategy employed by ztest_fault_inject() when selecting which - * offset are redundant and can be damaged. - */ - error = spa_scan(spa, POOL_SCAN_SCRUB); - if (error == 0) { - while (dsl_scan_scrubbing(spa_get_dsl(spa))) - txg_wait_synced(spa_get_dsl(spa), 0); - } - - mutex_enter(&ztest_vdev_lock); - ztest_device_removal_active = B_FALSE; - mutex_exit(&ztest_vdev_lock); -} - -/* - * Callback function which expands the physical size of the vdev. - */ -vdev_t * -grow_vdev(vdev_t *vd, void *arg) -{ - spa_t *spa = vd->vdev_spa; - size_t *newsize = arg; - size_t fsize; - int fd; - - ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); - ASSERT(vd->vdev_ops->vdev_op_leaf); - - if ((fd = open(vd->vdev_path, O_RDWR)) == -1) - return (vd); - - fsize = lseek(fd, 0, SEEK_END); - (void) ftruncate(fd, *newsize); - - if (ztest_opts.zo_verbose >= 6) { - (void) printf("%s grew from %lu to %lu bytes\n", - vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); - } - (void) close(fd); - return (NULL); -} - -/* - * Callback function which expands a given vdev by calling vdev_online(). - */ -/* ARGSUSED */ -vdev_t * -online_vdev(vdev_t *vd, void *arg) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *tvd = vd->vdev_top; - uint64_t guid = vd->vdev_guid; - uint64_t generation = spa->spa_config_generation + 1; - vdev_state_t newstate = VDEV_STATE_UNKNOWN; - int error; - - ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); - ASSERT(vd->vdev_ops->vdev_op_leaf); - - /* Calling vdev_online will initialize the new metaslabs */ - spa_config_exit(spa, SCL_STATE, spa); - error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); - spa_config_enter(spa, SCL_STATE, spa, RW_READER); - - /* - * If vdev_online returned an error or the underlying vdev_open - * failed then we abort the expand. The only way to know that - * vdev_open fails is by checking the returned newstate. 
- */ - if (error || newstate != VDEV_STATE_HEALTHY) { - if (ztest_opts.zo_verbose >= 5) { - (void) printf("Unable to expand vdev, state %llu, " - "error %d\n", (u_longlong_t)newstate, error); - } - return (vd); - } - ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); - - /* - * Since we dropped the lock we need to ensure that we're - * still talking to the original vdev. It's possible this - * vdev may have been detached/replaced while we were - * trying to online it. - */ - if (generation != spa->spa_config_generation) { - if (ztest_opts.zo_verbose >= 5) { - (void) printf("vdev configuration has changed, " - "guid %llu, state %llu, expected gen %llu, " - "got gen %llu\n", - (u_longlong_t)guid, - (u_longlong_t)tvd->vdev_state, - (u_longlong_t)generation, - (u_longlong_t)spa->spa_config_generation); - } - return (vd); - } - return (NULL); -} - -/* - * Traverse the vdev tree calling the supplied function. - * We continue to walk the tree until we either have walked all - * children or we receive a non-NULL return from the callback. - * If a NULL callback is passed, then we just return back the first - * leaf vdev we encounter. - */ -vdev_t * -vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) -{ - if (vd->vdev_ops->vdev_op_leaf) { - if (func == NULL) - return (vd); - else - return (func(vd, arg)); - } - - for (uint_t c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) - return (cvd); - } - return (NULL); -} - -/* - * Verify that dynamic LUN growth works as expected. 
- */ -/* ARGSUSED */ -void -ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) -{ - spa_t *spa = ztest_spa; - vdev_t *vd, *tvd; - metaslab_class_t *mc; - metaslab_group_t *mg; - size_t psize, newsize; - uint64_t top; - uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; - - mutex_enter(&ztest_checkpoint_lock); - mutex_enter(&ztest_vdev_lock); - spa_config_enter(spa, SCL_STATE, spa, RW_READER); - - /* - * If there is a vdev removal in progress, it could complete while - * we are running, in which case we would not be able to verify - * that the metaslab_class space increased (because it decreases - * when the device removal completes). - */ - if (ztest_device_removal_active) { - spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&ztest_vdev_lock); - mutex_exit(&ztest_checkpoint_lock); - return; - } - - top = ztest_random_vdev_top(spa, B_TRUE); - - tvd = spa->spa_root_vdev->vdev_child[top]; - mg = tvd->vdev_mg; - mc = mg->mg_class; - old_ms_count = tvd->vdev_ms_count; - old_class_space = metaslab_class_get_space(mc); - - /* - * Determine the size of the first leaf vdev associated with - * our top-level device. - */ - vd = vdev_walk_tree(tvd, NULL, NULL); - ASSERT3P(vd, !=, NULL); - ASSERT(vd->vdev_ops->vdev_op_leaf); - - psize = vd->vdev_psize; - - /* - * We only try to expand the vdev if it's healthy, less than 4x its - * original size, and it has a valid psize. - */ - if (tvd->vdev_state != VDEV_STATE_HEALTHY || - psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { - spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&ztest_vdev_lock); - mutex_exit(&ztest_checkpoint_lock); - return; - } - ASSERT(psize > 0); - newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); - ASSERT3U(newsize, >, psize); - - if (ztest_opts.zo_verbose >= 6) { - (void) printf("Expanding LUN %s from %lu to %lu\n", - vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); - } - - /* - * Growing the vdev is a two step process: - * 1). expand the physical size (i.e. relabel) - * 2). 
online the vdev to create the new metaslabs - */ - if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || - vdev_walk_tree(tvd, online_vdev, NULL) != NULL || - tvd->vdev_state != VDEV_STATE_HEALTHY) { - if (ztest_opts.zo_verbose >= 5) { - (void) printf("Could not expand LUN because " - "the vdev configuration changed.\n"); - } - spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&ztest_vdev_lock); - mutex_exit(&ztest_checkpoint_lock); - return; - } - - spa_config_exit(spa, SCL_STATE, spa); - - /* - * Expanding the LUN will update the config asynchronously, - * thus we must wait for the async thread to complete any - * pending tasks before proceeding. - */ - for (;;) { - boolean_t done; - mutex_enter(&spa->spa_async_lock); - done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); - mutex_exit(&spa->spa_async_lock); - if (done) - break; - txg_wait_synced(spa_get_dsl(spa), 0); - (void) poll(NULL, 0, 100); - } - - spa_config_enter(spa, SCL_STATE, spa, RW_READER); - - tvd = spa->spa_root_vdev->vdev_child[top]; - new_ms_count = tvd->vdev_ms_count; - new_class_space = metaslab_class_get_space(mc); - - if (tvd->vdev_mg != mg || mg->mg_class != mc) { - if (ztest_opts.zo_verbose >= 5) { - (void) printf("Could not verify LUN expansion due to " - "intervening vdev offline or remove.\n"); - } - spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&ztest_vdev_lock); - mutex_exit(&ztest_checkpoint_lock); - return; - } - - /* - * Make sure we were able to grow the vdev. - */ - if (new_ms_count <= old_ms_count) { - fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", - old_ms_count, new_ms_count); - } - - /* - * Make sure we were able to grow the pool. 
- */ - if (new_class_space <= old_class_space) { - fatal(0, "LUN expansion failed: class_space %llu < %llu\n", - old_class_space, new_class_space); - } - - if (ztest_opts.zo_verbose >= 5) { - char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; - - nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); - nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); - (void) printf("%s grew from %s to %s\n", - spa->spa_name, oldnumbuf, newnumbuf); - } - - spa_config_exit(spa, SCL_STATE, spa); - mutex_exit(&ztest_vdev_lock); - mutex_exit(&ztest_checkpoint_lock); -} - -/* - * Verify that dmu_objset_{create,destroy,open,close} work as expected. - */ -/* ARGSUSED */ -static void -ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) -{ - /* - * Create the objects common to all ztest datasets. - */ - VERIFY(zap_create_claim(os, ZTEST_DIROBJ, - DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); -} - -static int -ztest_dataset_create(char *dsname) -{ - uint64_t zilset = ztest_random(100); - int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, - ztest_objset_create_cb, NULL); - - if (err || zilset < 80) - return (err); - - if (ztest_opts.zo_verbose >= 6) - (void) printf("Setting dataset %s to sync always\n", dsname); - return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, - ZFS_SYNC_ALWAYS, B_FALSE)); -} - -/* ARGSUSED */ -static int -ztest_objset_destroy_cb(const char *name, void *arg) -{ - objset_t *os; - dmu_object_info_t doi; - int error; - - /* - * Verify that the dataset contains a directory object. - */ - VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); - error = dmu_object_info(os, ZTEST_DIROBJ, &doi); - if (error != ENOENT) { - /* We could have crashed in the middle of destroying it */ - ASSERT0(error); - ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); - ASSERT3S(doi.doi_physical_blocks_512, >=, 0); - } - dmu_objset_disown(os, FTAG); - - /* - * Destroy the dataset. 
- */ - if (strchr(name, '@') != NULL) { - VERIFY0(dsl_destroy_snapshot(name, B_FALSE)); - } else { - VERIFY0(dsl_destroy_head(name)); - } - return (0); -} - -static boolean_t -ztest_snapshot_create(char *osname, uint64_t id) -{ - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - - (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); - - error = dmu_objset_snapshot_one(osname, snapname); - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - return (B_FALSE); - } - if (error != 0 && error != EEXIST) { - fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, - snapname, error); - } - return (B_TRUE); -} - -static boolean_t -ztest_snapshot_destroy(char *osname, uint64_t id) -{ - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - - (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, - (u_longlong_t)id); - - error = dsl_destroy_snapshot(snapname, B_FALSE); - if (error != 0 && error != ENOENT) - fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); - return (B_TRUE); -} - -/* ARGSUSED */ -void -ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) -{ - ztest_ds_t zdtmp; - int iters; - int error; - objset_t *os, *os2; - char name[ZFS_MAX_DATASET_NAME_LEN]; - zilog_t *zilog; - - rw_enter(&ztest_name_lock, RW_READER); - - (void) snprintf(name, sizeof (name), "%s/temp_%llu", - ztest_opts.zo_pool, (u_longlong_t)id); - - /* - * If this dataset exists from a previous run, process its replay log - * half of the time. If we don't replay it, then dmu_objset_destroy() - * (invoked from ztest_objset_destroy_cb()) should just throw it away. - */ - if (ztest_random(2) == 0 && - dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { - ztest_zd_init(&zdtmp, NULL, os); - zil_replay(os, &zdtmp, ztest_replay_vector); - ztest_zd_fini(&zdtmp); - dmu_objset_disown(os, FTAG); - } - - /* - * There may be an old instance of the dataset we're about to - * create lying around from a previous run. 
If so, destroy it - * and all of its snapshots. - */ - (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, - DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - - /* - * Verify that the destroyed dataset is no longer in the namespace. - */ - VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, - FTAG, &os)); - - /* - * Verify that we can create a new dataset. - */ - error = ztest_dataset_create(name); - if (error) { - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - rw_exit(&ztest_name_lock); - return; - } - fatal(0, "dmu_objset_create(%s) = %d", name, error); - } - - VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); - - ztest_zd_init(&zdtmp, NULL, os); - - /* - * Open the intent log for it. - */ - zilog = zil_open(os, ztest_get_data); - - /* - * Put some objects in there, do a little I/O to them, - * and randomly take a couple of snapshots along the way. - */ - iters = ztest_random(5); - for (int i = 0; i < iters; i++) { - ztest_dmu_object_alloc_free(&zdtmp, id); - if (ztest_random(iters) == 0) - (void) ztest_snapshot_create(name, i); - } - - /* - * Verify that we cannot create an existing dataset. - */ - VERIFY3U(EEXIST, ==, - dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); - - /* - * Verify that we can hold an objset that is also owned. - */ - VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); - dmu_objset_rele(os2, FTAG); - - /* - * Verify that we cannot own an objset that is already owned. - */ - VERIFY3U(EBUSY, ==, - dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); - - zil_close(zilog); - dmu_objset_disown(os, FTAG); - ztest_zd_fini(&zdtmp); - - rw_exit(&ztest_name_lock); -} - -/* - * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 
- */ -void -ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) -{ - rw_enter(&ztest_name_lock, RW_READER); - (void) ztest_snapshot_destroy(zd->zd_name, id); - (void) ztest_snapshot_create(zd->zd_name, id); - rw_exit(&ztest_name_lock); -} - -/* - * Cleanup non-standard snapshots and clones. - */ -void -ztest_dsl_dataset_cleanup(char *osname, uint64_t id) -{ - char snap1name[ZFS_MAX_DATASET_NAME_LEN]; - char clone1name[ZFS_MAX_DATASET_NAME_LEN]; - char snap2name[ZFS_MAX_DATASET_NAME_LEN]; - char clone2name[ZFS_MAX_DATASET_NAME_LEN]; - char snap3name[ZFS_MAX_DATASET_NAME_LEN]; - int error; - - (void) snprintf(snap1name, sizeof (snap1name), - "%s@s1_%llu", osname, id); - (void) snprintf(clone1name, sizeof (clone1name), - "%s/c1_%llu", osname, id); - (void) snprintf(snap2name, sizeof (snap2name), - "%s@s2_%llu", clone1name, id); - (void) snprintf(clone2name, sizeof (clone2name), - "%s/c2_%llu", osname, id); - (void) snprintf(snap3name, sizeof (snap3name), - "%s@s3_%llu", clone1name, id); - - error = dsl_destroy_head(clone2name); - if (error && error != ENOENT) - fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); - error = dsl_destroy_snapshot(snap3name, B_FALSE); - if (error && error != ENOENT) - fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); - error = dsl_destroy_snapshot(snap2name, B_FALSE); - if (error && error != ENOENT) - fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); - error = dsl_destroy_head(clone1name); - if (error && error != ENOENT) - fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); - error = dsl_destroy_snapshot(snap1name, B_FALSE); - if (error && error != ENOENT) - fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); -} - -/* - * Verify dsl_dataset_promote handles EBUSY - */ -void -ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os; - char snap1name[ZFS_MAX_DATASET_NAME_LEN]; - char clone1name[ZFS_MAX_DATASET_NAME_LEN]; - char snap2name[ZFS_MAX_DATASET_NAME_LEN]; - 
char clone2name[ZFS_MAX_DATASET_NAME_LEN]; - char snap3name[ZFS_MAX_DATASET_NAME_LEN]; - char *osname = zd->zd_name; - int error; - - rw_enter(&ztest_name_lock, RW_READER); - - ztest_dsl_dataset_cleanup(osname, id); - - (void) snprintf(snap1name, sizeof (snap1name), - "%s@s1_%llu", osname, id); - (void) snprintf(clone1name, sizeof (clone1name), - "%s/c1_%llu", osname, id); - (void) snprintf(snap2name, sizeof (snap2name), - "%s@s2_%llu", clone1name, id); - (void) snprintf(clone2name, sizeof (clone2name), - "%s/c2_%llu", osname, id); - (void) snprintf(snap3name, sizeof (snap3name), - "%s@s3_%llu", clone1name, id); - - error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); - if (error && error != EEXIST) { - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - goto out; - } - fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); - } - - error = dmu_objset_clone(clone1name, snap1name); - if (error) { - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - goto out; - } - fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); - } - - error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); - if (error && error != EEXIST) { - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - goto out; - } - fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); - } - - error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); - if (error && error != EEXIST) { - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - goto out; - } - fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); - } - - error = dmu_objset_clone(clone2name, snap3name); - if (error) { - if (error == ENOSPC) { - ztest_record_enospc(FTAG); - goto out; - } - fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); - } - - error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os); - if (error) - fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); - error = dsl_dataset_promote(clone2name, NULL); - if (error == ENOSPC) { - dmu_objset_disown(os, 
FTAG); - ztest_record_enospc(FTAG); - goto out; - } - if (error != EBUSY) - fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, - error); - dmu_objset_disown(os, FTAG); - -out: - ztest_dsl_dataset_cleanup(osname, id); - - rw_exit(&ztest_name_lock); -} - -/* - * Verify that dmu_object_{alloc,free} work as expected. - */ -void -ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) -{ - ztest_od_t od[4]; - int batchsize = sizeof (od) / sizeof (od[0]); - - for (int b = 0; b < batchsize; b++) { - ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, - 0, 0, 0); - } - - /* - * Destroy the previous batch of objects, create a new batch, - * and do some I/O on the new objects. - */ - if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) - return; - - while (ztest_random(4 * batchsize) != 0) - ztest_io(zd, od[ztest_random(batchsize)].od_object, - ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); -} - -/* - * Rewind the global allocator to verify object allocation backfilling. - */ -void -ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; - uint64_t object; - - /* - * Rewind the global allocator randomly back to a lower object number - * to force backfilling and reclamation of recently freed dnodes. - */ - mutex_enter(&os->os_obj_lock); - object = ztest_random(os->os_obj_next_chunk); - os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); - mutex_exit(&os->os_obj_lock); -} - -/* - * Verify that dmu_{read,write} work as expected. 
- */ -void -ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - ztest_od_t od[2]; - dmu_tx_t *tx; - int i, freeit, error; - uint64_t n, s, txg; - bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; - uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; - uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); - uint64_t regions = 997; - uint64_t stride = 123456789ULL; - uint64_t width = 40; - int free_percent = 5; - - /* - * This test uses two objects, packobj and bigobj, that are always - * updated together (i.e. in the same tx) so that their contents are - * in sync and can be compared. Their contents relate to each other - * in a simple way: packobj is a dense array of 'bufwad' structures, - * while bigobj is a sparse array of the same bufwads. Specifically, - * for any index n, there are three bufwads that should be identical: - * - * packobj, at offset n * sizeof (bufwad_t) - * bigobj, at the head of the nth chunk - * bigobj, at the tail of the nth chunk - * - * The chunk size is arbitrary. It doesn't have to be a power of two, - * and it doesn't have any relation to the object blocksize. - * The only requirement is that it can hold at least two bufwads. - * - * Normally, we write the bufwad to each of these locations. - * However, free_percent of the time we instead write zeroes to - * packobj and perform a dmu_free_range() on bigobj. By comparing - * bigobj to packobj, we can verify that the DMU is correctly - * tracking which parts of an object are allocated and free, - * and that the contents of the allocated blocks are correct. - */ - - /* - * Read the directory info. If it's the first time, set things up. 
- */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, - chunksize); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, - chunksize); - - if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) - return; - - bigobj = od[0].od_object; - packobj = od[1].od_object; - chunksize = od[0].od_gen; - ASSERT(chunksize == od[1].od_gen); - - /* - * Prefetch a random chunk of the big object. - * Our aim here is to get some async reads in flight - * for blocks that we may free below; the DMU should - * handle this race correctly. - */ - n = ztest_random(regions) * stride + ztest_random(width); - s = 1 + ztest_random(2 * width - 1); - dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, - ZIO_PRIORITY_SYNC_READ); - - /* - * Pick a random index and compute the offsets into packobj and bigobj. - */ - n = ztest_random(regions) * stride + ztest_random(width); - s = 1 + ztest_random(width - 1); - - packoff = n * sizeof (bufwad_t); - packsize = s * sizeof (bufwad_t); - - bigoff = n * chunksize; - bigsize = s * chunksize; - - packbuf = umem_alloc(packsize, UMEM_NOFAIL); - bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); - - /* - * free_percent of the time, free a range of bigobj rather than - * overwriting it. - */ - freeit = (ztest_random(100) < free_percent); - - /* - * Read the current contents of our objects. - */ - error = dmu_read(os, packobj, packoff, packsize, packbuf, - DMU_READ_PREFETCH); - ASSERT0(error); - error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, - DMU_READ_PREFETCH); - ASSERT0(error); - - /* - * Get a tx for the mods to both packobj and bigobj. - */ - tx = dmu_tx_create(os); - - dmu_tx_hold_write(tx, packobj, packoff, packsize); - - if (freeit) - dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); - else - dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); - - /* This accounts for setting the checksum/compression. 
*/ - dmu_tx_hold_bonus(tx, bigobj); - - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); - if (txg == 0) { - umem_free(packbuf, packsize); - umem_free(bigbuf, bigsize); - return; - } - - enum zio_checksum cksum; - do { - cksum = (enum zio_checksum) - ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); - } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); - dmu_object_set_checksum(os, bigobj, cksum, tx); - - enum zio_compress comp; - do { - comp = (enum zio_compress) - ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); - } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); - dmu_object_set_compress(os, bigobj, comp, tx); - - /* - * For each index from n to n + s, verify that the existing bufwad - * in packobj matches the bufwads at the head and tail of the - * corresponding chunk in bigobj. Then update all three bufwads - * with the new values we want to write out. - */ - for (i = 0; i < s; i++) { - /* LINTED */ - pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); - /* LINTED */ - bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); - /* LINTED */ - bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; - - ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); - ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); - - if (pack->bw_txg > txg) - fatal(0, "future leak: got %llx, open txg is %llx", - pack->bw_txg, txg); - - if (pack->bw_data != 0 && pack->bw_index != n + i) - fatal(0, "wrong index: got %llx, wanted %llx+%llx", - pack->bw_index, n, i); - - if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); - - if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); - - if (freeit) { - bzero(pack, sizeof (bufwad_t)); - } else { - pack->bw_index = n + i; - pack->bw_txg = txg; - pack->bw_data = 1 + ztest_random(-2ULL); - } - *bigH = *pack; - *bigT = *pack; - } - - /* - * We've verified all the old bufwads, and made new ones. - * Now write them out. 
- */ - dmu_write(os, packobj, packoff, packsize, packbuf, tx); - - if (freeit) { - if (ztest_opts.zo_verbose >= 7) { - (void) printf("freeing offset %llx size %llx" - " txg %llx\n", - (u_longlong_t)bigoff, - (u_longlong_t)bigsize, - (u_longlong_t)txg); - } - VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); - } else { - if (ztest_opts.zo_verbose >= 7) { - (void) printf("writing offset %llx size %llx" - " txg %llx\n", - (u_longlong_t)bigoff, - (u_longlong_t)bigsize, - (u_longlong_t)txg); - } - dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); - } - - dmu_tx_commit(tx); - - /* - * Sanity check the stuff we just wrote. - */ - { - void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); - void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - - VERIFY(0 == dmu_read(os, packobj, packoff, - packsize, packcheck, DMU_READ_PREFETCH)); - VERIFY(0 == dmu_read(os, bigobj, bigoff, - bigsize, bigcheck, DMU_READ_PREFETCH)); - - ASSERT(bcmp(packbuf, packcheck, packsize) == 0); - ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); - - umem_free(packcheck, packsize); - umem_free(bigcheck, bigsize); - } - - umem_free(packbuf, packsize); - umem_free(bigbuf, bigsize); -} - -void -compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, - uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) -{ - uint64_t i; - bufwad_t *pack; - bufwad_t *bigH; - bufwad_t *bigT; - - /* - * For each index from n to n + s, verify that the existing bufwad - * in packobj matches the bufwads at the head and tail of the - * corresponding chunk in bigobj. Then update all three bufwads - * with the new values we want to write out. 
- */ - for (i = 0; i < s; i++) { - /* LINTED */ - pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); - /* LINTED */ - bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); - /* LINTED */ - bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; - - ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); - ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); - - if (pack->bw_txg > txg) - fatal(0, "future leak: got %llx, open txg is %llx", - pack->bw_txg, txg); - - if (pack->bw_data != 0 && pack->bw_index != n + i) - fatal(0, "wrong index: got %llx, wanted %llx+%llx", - pack->bw_index, n, i); - - if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); - - if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) - fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); - - pack->bw_index = n + i; - pack->bw_txg = txg; - pack->bw_data = 1 + ztest_random(-2ULL); - - *bigH = *pack; - *bigT = *pack; - } -} - -void -ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - ztest_od_t od[2]; - dmu_tx_t *tx; - uint64_t i; - int error; - uint64_t n, s, txg; - bufwad_t *packbuf, *bigbuf; - uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; - uint64_t blocksize = ztest_random_blocksize(); - uint64_t chunksize = blocksize; - uint64_t regions = 997; - uint64_t stride = 123456789ULL; - uint64_t width = 9; - dmu_buf_t *bonus_db; - arc_buf_t **bigbuf_arcbufs; - dmu_object_info_t doi; - - /* - * This test uses two objects, packobj and bigobj, that are always - * updated together (i.e. in the same tx) so that their contents are - * in sync and can be compared. Their contents relate to each other - * in a simple way: packobj is a dense array of 'bufwad' structures, - * while bigobj is a sparse array of the same bufwads. 
Specifically, - * for any index n, there are three bufwads that should be identical: - * - * packobj, at offset n * sizeof (bufwad_t) - * bigobj, at the head of the nth chunk - * bigobj, at the tail of the nth chunk - * - * The chunk size is set equal to bigobj block size so that - * dmu_assign_arcbuf() can be tested for object updates. - */ - - /* - * Read the directory info. If it's the first time, set things up. - */ - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, - 0, 0); - ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, - chunksize); - - if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) - return; - - bigobj = od[0].od_object; - packobj = od[1].od_object; - blocksize = od[0].od_blocksize; - chunksize = blocksize; - ASSERT(chunksize == od[1].od_gen); - - VERIFY(dmu_object_info(os, bigobj, &doi) == 0); - VERIFY(ISP2(doi.doi_data_block_size)); - VERIFY(chunksize == doi.doi_data_block_size); - VERIFY(chunksize >= 2 * sizeof (bufwad_t)); - - /* - * Pick a random index and compute the offsets into packobj and bigobj. - */ - n = ztest_random(regions) * stride + ztest_random(width); - s = 1 + ztest_random(width - 1); - - packoff = n * sizeof (bufwad_t); - packsize = s * sizeof (bufwad_t); - - bigoff = n * chunksize; - bigsize = s * chunksize; - - packbuf = umem_zalloc(packsize, UMEM_NOFAIL); - bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); - - VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); - - bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); - - /* - * Iteration 0 test zcopy for DB_UNCACHED dbufs. - * Iteration 1 test zcopy to already referenced dbufs. - * Iteration 2 test zcopy to dirty dbuf in the same txg. - * Iteration 3 test zcopy to dbuf dirty in previous txg. - * Iteration 4 test zcopy when dbuf is no longer dirty. - * Iteration 5 test zcopy when it can't be done. - * Iteration 6 one more zcopy write. 
- */ - for (i = 0; i < 7; i++) { - uint64_t j; - uint64_t off; - - /* - * In iteration 5 (i == 5) use arcbufs - * that don't match bigobj blksz to test - * dmu_assign_arcbuf() when it can't directly - * assign an arcbuf to a dbuf. - */ - for (j = 0; j < s; j++) { - if (i != 5) { - bigbuf_arcbufs[j] = - dmu_request_arcbuf(bonus_db, chunksize); - } else { - bigbuf_arcbufs[2 * j] = - dmu_request_arcbuf(bonus_db, chunksize / 2); - bigbuf_arcbufs[2 * j + 1] = - dmu_request_arcbuf(bonus_db, chunksize / 2); - } - } - - /* - * Get a tx for the mods to both packobj and bigobj. - */ - tx = dmu_tx_create(os); - - dmu_tx_hold_write(tx, packobj, packoff, packsize); - dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); - - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); - if (txg == 0) { - umem_free(packbuf, packsize); - umem_free(bigbuf, bigsize); - for (j = 0; j < s; j++) { - if (i != 5) { - dmu_return_arcbuf(bigbuf_arcbufs[j]); - } else { - dmu_return_arcbuf( - bigbuf_arcbufs[2 * j]); - dmu_return_arcbuf( - bigbuf_arcbufs[2 * j + 1]); - } - } - umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); - dmu_buf_rele(bonus_db, FTAG); - return; - } - - /* - * 50% of the time don't read objects in the 1st iteration to - * test dmu_assign_arcbuf() for the case when there're no - * existing dbufs for the specified offsets. - */ - if (i != 0 || ztest_random(2) != 0) { - error = dmu_read(os, packobj, packoff, - packsize, packbuf, DMU_READ_PREFETCH); - ASSERT0(error); - error = dmu_read(os, bigobj, bigoff, bigsize, - bigbuf, DMU_READ_PREFETCH); - ASSERT0(error); - } - compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, - n, chunksize, txg); - - /* - * We've verified all the old bufwads, and made new ones. - * Now write them out. 
- */ - dmu_write(os, packobj, packoff, packsize, packbuf, tx); - if (ztest_opts.zo_verbose >= 7) { - (void) printf("writing offset %llx size %llx" - " txg %llx\n", - (u_longlong_t)bigoff, - (u_longlong_t)bigsize, - (u_longlong_t)txg); - } - for (off = bigoff, j = 0; j < s; j++, off += chunksize) { - dmu_buf_t *dbt; - if (i != 5) { - bcopy((caddr_t)bigbuf + (off - bigoff), - bigbuf_arcbufs[j]->b_data, chunksize); - } else { - bcopy((caddr_t)bigbuf + (off - bigoff), - bigbuf_arcbufs[2 * j]->b_data, - chunksize / 2); - bcopy((caddr_t)bigbuf + (off - bigoff) + - chunksize / 2, - bigbuf_arcbufs[2 * j + 1]->b_data, - chunksize / 2); - } - - if (i == 1) { - VERIFY(dmu_buf_hold(os, bigobj, off, - FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); - } - if (i != 5) { - dmu_assign_arcbuf(bonus_db, off, - bigbuf_arcbufs[j], tx); - } else { - dmu_assign_arcbuf(bonus_db, off, - bigbuf_arcbufs[2 * j], tx); - dmu_assign_arcbuf(bonus_db, - off + chunksize / 2, - bigbuf_arcbufs[2 * j + 1], tx); - } - if (i == 1) { - dmu_buf_rele(dbt, FTAG); - } - } - dmu_tx_commit(tx); - - /* - * Sanity check the stuff we just wrote. 
- */ - { - void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); - void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - - VERIFY(0 == dmu_read(os, packobj, packoff, - packsize, packcheck, DMU_READ_PREFETCH)); - VERIFY(0 == dmu_read(os, bigobj, bigoff, - bigsize, bigcheck, DMU_READ_PREFETCH)); - - ASSERT(bcmp(packbuf, packcheck, packsize) == 0); - ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); - - umem_free(packcheck, packsize); - umem_free(bigcheck, bigsize); - } - if (i == 2) { - txg_wait_open(dmu_objset_pool(os), 0); - } else if (i == 3) { - txg_wait_synced(dmu_objset_pool(os), 0); - } - } - - dmu_buf_rele(bonus_db, FTAG); - umem_free(packbuf, packsize); - umem_free(bigbuf, bigsize); - umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); -} - -/* ARGSUSED */ -void -ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) -{ - ztest_od_t od[1]; - uint64_t offset = (1ULL << (ztest_random(20) + 43)) + - (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); - - /* - * Have multiple threads write to large offsets in an object - * to verify that parallel writes to an object -- even to the - * same blocks within the object -- doesn't cause any trouble. 
- */ - ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, - 0, 0, 0); - - if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) - return; - - while (ztest_random(10) != 0) - ztest_io(zd, od[0].od_object, offset); -} - -void -ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) -{ - ztest_od_t od[1]; - uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + - (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); - uint64_t count = ztest_random(20) + 1; - uint64_t blocksize = ztest_random_blocksize(); - void *data; - - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, - 0, 0); - - if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) - return; - - if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) - return; - - ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); - - data = umem_zalloc(blocksize, UMEM_NOFAIL); - - while (ztest_random(count) != 0) { - uint64_t randoff = offset + (ztest_random(count) * blocksize); - if (ztest_write(zd, od[0].od_object, randoff, blocksize, - data) != 0) - break; - while (ztest_random(4) != 0) - ztest_io(zd, od[0].od_object, randoff); - } - - umem_free(data, blocksize); -} - -/* - * Verify that zap_{create,destroy,add,remove,update} work as expected. 
- */ -#define ZTEST_ZAP_MIN_INTS 1 -#define ZTEST_ZAP_MAX_INTS 4 -#define ZTEST_ZAP_MAX_PROPS 1000 - -void -ztest_zap(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - ztest_od_t od[1]; - uint64_t object; - uint64_t txg, last_txg; - uint64_t value[ZTEST_ZAP_MAX_INTS]; - uint64_t zl_ints, zl_intsize, prop; - int i, ints; - dmu_tx_t *tx; - char propname[100], txgname[100]; - int error; - char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; - - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); - - if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) - return; - - object = od[0].od_object; - - /* - * Generate a known hash collision, and verify that - * we can lookup and remove both entries. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, B_TRUE, NULL); - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); - if (txg == 0) - return; - for (i = 0; i < 2; i++) { - value[i] = i; - VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), - 1, &value[i], tx)); - } - for (i = 0; i < 2; i++) { - VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], - sizeof (uint64_t), 1, &value[i], tx)); - VERIFY3U(0, ==, - zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); - ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); - ASSERT3U(zl_ints, ==, 1); - } - for (i = 0; i < 2; i++) { - VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); - } - dmu_tx_commit(tx); - - /* - * Generate a buch of random entries. - */ - ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); - - prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); - (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); - bzero(value, sizeof (value)); - last_txg = 0; - - /* - * If these zap entries already exist, validate their contents. 
- */ - error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); - if (error == 0) { - ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); - ASSERT3U(zl_ints, ==, 1); - - VERIFY(zap_lookup(os, object, txgname, zl_intsize, - zl_ints, &last_txg) == 0); - - VERIFY(zap_length(os, object, propname, &zl_intsize, - &zl_ints) == 0); - - ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); - ASSERT3U(zl_ints, ==, ints); - - VERIFY(zap_lookup(os, object, propname, zl_intsize, - zl_ints, value) == 0); - - for (i = 0; i < ints; i++) { - ASSERT3U(value[i], ==, last_txg + object + i); - } - } else { - ASSERT3U(error, ==, ENOENT); - } - - /* - * Atomically update two entries in our zap object. - * The first is named txg_%llu, and contains the txg - * in which the property was last updated. The second - * is named prop_%llu, and the nth element of its value - * should be txg + object + n. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, B_TRUE, NULL); - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); - if (txg == 0) - return; - - if (last_txg > txg) - fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); - - for (i = 0; i < ints; i++) - value[i] = txg + object + i; - - VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), - 1, &txg, tx)); - VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), - ints, value, tx)); - - dmu_tx_commit(tx); - - /* - * Remove a random pair of entries. 
- */ - prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); - (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); - - error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); - - if (error == ENOENT) - return; - - ASSERT0(error); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, B_TRUE, NULL); - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); - if (txg == 0) - return; - VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); - VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); - dmu_tx_commit(tx); -} - -/* - * Testcase to test the upgrading of a microzap to fatzap. - */ -void -ztest_fzap(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - ztest_od_t od[1]; - uint64_t object, txg; - - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); - - if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) - return; - - object = od[0].od_object; - - /* - * Add entries to this ZAP and make sure it spills over - * and gets upgraded to a fatzap. Also, since we are adding - * 2050 entries we should see ptrtbl growth and leaf-block split. 
- */ - for (int i = 0; i < 2050; i++) { - char name[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t value = i; - dmu_tx_t *tx; - int error; - - (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", - id, value); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, B_TRUE, name); - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); - if (txg == 0) - return; - error = zap_add(os, object, name, sizeof (uint64_t), 1, - &value, tx); - ASSERT(error == 0 || error == EEXIST); - dmu_tx_commit(tx); - } -} - -/* ARGSUSED */ -void -ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - ztest_od_t od[1]; - uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; - dmu_tx_t *tx; - int i, namelen, error; - int micro = ztest_random(2); - char name[20], string_value[20]; - void *data; - - ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, - 0, 0, 0); - - if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) - return; - - object = od[0].od_object; - - /* - * Generate a random name of the form 'xxx.....' where each - * x is a random printable character and the dots are dots. - * There are 94 such characters, and the name length goes from - * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. - */ - namelen = ztest_random(sizeof (name) - 5) + 5 + 1; - - for (i = 0; i < 3; i++) - name[i] = '!' + ztest_random('~' - '!' + 1); - for (; i < namelen - 1; i++) - name[i] = '.'; - name[i] = '\0'; - - if ((namelen & 1) || micro) { - wsize = sizeof (txg); - wc = 1; - data = &txg; - } else { - wsize = 1; - wc = namelen; - data = string_value; - } - - count = -1ULL; - VERIFY0(zap_count(os, object, &count)); - ASSERT(count != -1ULL); - - /* - * Select an operation: length, lookup, add, update, remove. 
- */ - i = ztest_random(5); - - if (i >= 2) { - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, B_TRUE, NULL); - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); - if (txg == 0) - return; - bcopy(name, string_value, namelen); - } else { - tx = NULL; - txg = 0; - bzero(string_value, namelen); - } - - switch (i) { - - case 0: - error = zap_length(os, object, name, &zl_wsize, &zl_wc); - if (error == 0) { - ASSERT3U(wsize, ==, zl_wsize); - ASSERT3U(wc, ==, zl_wc); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; - - case 1: - error = zap_lookup(os, object, name, wsize, wc, data); - if (error == 0) { - if (data == string_value && - bcmp(name, data, namelen) != 0) - fatal(0, "name '%s' != val '%s' len %d", - name, data, namelen); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; - - case 2: - error = zap_add(os, object, name, wsize, wc, data, tx); - ASSERT(error == 0 || error == EEXIST); - break; - - case 3: - VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); - break; - - case 4: - error = zap_remove(os, object, name, tx); - ASSERT(error == 0 || error == ENOENT); - break; - } - - if (tx != NULL) - dmu_tx_commit(tx); -} - -/* - * Commit callback data. 
- */ -typedef struct ztest_cb_data { - list_node_t zcd_node; - uint64_t zcd_txg; - int zcd_expected_err; - boolean_t zcd_added; - boolean_t zcd_called; - spa_t *zcd_spa; -} ztest_cb_data_t; - -/* This is the actual commit callback function */ -static void -ztest_commit_callback(void *arg, int error) -{ - ztest_cb_data_t *data = arg; - uint64_t synced_txg; - - VERIFY(data != NULL); - VERIFY3S(data->zcd_expected_err, ==, error); - VERIFY(!data->zcd_called); - - synced_txg = spa_last_synced_txg(data->zcd_spa); - if (data->zcd_txg > synced_txg) - fatal(0, "commit callback of txg %" PRIu64 " called prematurely" - ", last synced txg = %" PRIu64 "\n", data->zcd_txg, - synced_txg); - - data->zcd_called = B_TRUE; - - if (error == ECANCELED) { - ASSERT0(data->zcd_txg); - ASSERT(!data->zcd_added); - - /* - * The private callback data should be destroyed here, but - * since we are going to check the zcd_called field after - * dmu_tx_abort(), we will destroy it there. - */ - return; - } - - /* Was this callback added to the global callback list? */ - if (!data->zcd_added) - goto out; - - ASSERT3U(data->zcd_txg, !=, 0); - - /* Remove our callback from the list */ - mutex_enter(&zcl.zcl_callbacks_lock); - list_remove(&zcl.zcl_callbacks, data); - mutex_exit(&zcl.zcl_callbacks_lock); - -out: - umem_free(data, sizeof (ztest_cb_data_t)); -} - -/* Allocate and initialize callback data structure */ -static ztest_cb_data_t * -ztest_create_cb_data(objset_t *os, uint64_t txg) -{ - ztest_cb_data_t *cb_data; - - cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); - - cb_data->zcd_txg = txg; - cb_data->zcd_spa = dmu_objset_spa(os); - - return (cb_data); -} - -/* - * If a number of txgs equal to this threshold have been created after a commit - * callback has been registered but not called, then we assume there is an - * implementation bug. - */ -#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) - -/* - * Commit callback test. 
- */ -void -ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - ztest_od_t od[1]; - dmu_tx_t *tx; - ztest_cb_data_t *cb_data[3], *tmp_cb; - uint64_t old_txg, txg; - int i, error; - - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); - - if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) - return; - - tx = dmu_tx_create(os); - - cb_data[0] = ztest_create_cb_data(os, 0); - dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); - - dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); - - /* Every once in a while, abort the transaction on purpose */ - if (ztest_random(100) == 0) - error = -1; - - if (!error) - error = dmu_tx_assign(tx, TXG_NOWAIT); - - txg = error ? 0 : dmu_tx_get_txg(tx); - - cb_data[0]->zcd_txg = txg; - cb_data[1] = ztest_create_cb_data(os, txg); - dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); - - if (error) { - /* - * It's not a strict requirement to call the registered - * callbacks from inside dmu_tx_abort(), but that's what - * it's supposed to happen in the current implementation - * so we will check for that. - */ - for (i = 0; i < 2; i++) { - cb_data[i]->zcd_expected_err = ECANCELED; - VERIFY(!cb_data[i]->zcd_called); - } - - dmu_tx_abort(tx); - - for (i = 0; i < 2; i++) { - VERIFY(cb_data[i]->zcd_called); - umem_free(cb_data[i], sizeof (ztest_cb_data_t)); - } - - return; - } - - cb_data[2] = ztest_create_cb_data(os, txg); - dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); - - /* - * Read existing data to make sure there isn't a future leak. 
- */ - VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), - &old_txg, DMU_READ_PREFETCH)); - - if (old_txg > txg) - fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, - old_txg, txg); - - dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); - - mutex_enter(&zcl.zcl_callbacks_lock); - - /* - * Since commit callbacks don't have any ordering requirement and since - * it is theoretically possible for a commit callback to be called - * after an arbitrary amount of time has elapsed since its txg has been - * synced, it is difficult to reliably determine whether a commit - * callback hasn't been called due to high load or due to a flawed - * implementation. - * - * In practice, we will assume that if after a certain number of txgs a - * commit callback hasn't been called, then most likely there's an - * implementation bug.. - */ - tmp_cb = list_head(&zcl.zcl_callbacks); - if (tmp_cb != NULL && - (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) { - fatal(0, "Commit callback threshold exceeded, oldest txg: %" - PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); - } - - /* - * Let's find the place to insert our callbacks. - * - * Even though the list is ordered by txg, it is possible for the - * insertion point to not be the end because our txg may already be - * quiescing at this point and other callbacks in the open txg - * (from other objsets) may have sneaked in. 
- */ - tmp_cb = list_tail(&zcl.zcl_callbacks); - while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) - tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); - - /* Add the 3 callbacks to the list */ - for (i = 0; i < 3; i++) { - if (tmp_cb == NULL) - list_insert_head(&zcl.zcl_callbacks, cb_data[i]); - else - list_insert_after(&zcl.zcl_callbacks, tmp_cb, - cb_data[i]); - - cb_data[i]->zcd_added = B_TRUE; - VERIFY(!cb_data[i]->zcd_called); - - tmp_cb = cb_data[i]; - } - - mutex_exit(&zcl.zcl_callbacks_lock); - - dmu_tx_commit(tx); -} - -/* - * Visit each object in the dataset. Verify that its properties - * are consistent what was stored in the block tag when it was created, - * and that its unused bonus buffer space has not been overwritten. - */ -void -ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) -{ - objset_t *os = zd->zd_os; - uint64_t obj; - int err = 0; - - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { - ztest_block_tag_t *bt = NULL; - dmu_object_info_t doi; - dmu_buf_t *db; - - if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) - continue; - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_size >= sizeof (*bt)) - bt = ztest_bt_bonus(db); - - if (bt && bt->bt_magic == BT_MAGIC) { - ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, - bt->bt_offset, bt->bt_gen, bt->bt_txg, - bt->bt_crtxg); - ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); - } - - dmu_buf_rele(db, FTAG); - } -} - -/* ARGSUSED */ -void -ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) -{ - zfs_prop_t proplist[] = { - ZFS_PROP_CHECKSUM, - ZFS_PROP_COMPRESSION, - ZFS_PROP_COPIES, - ZFS_PROP_DEDUP - }; - - rw_enter(&ztest_name_lock, RW_READER); - - for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) - (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], - ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); - - rw_exit(&ztest_name_lock); -} - -/* ARGSUSED */ -void -ztest_remap_blocks(ztest_ds_t *zd, uint64_t id) -{ - rw_enter(&ztest_name_lock, 
RW_READER); - - int error = dmu_objset_remap_indirects(zd->zd_name); - if (error == ENOSPC) - error = 0; - ASSERT0(error); - - rw_exit(&ztest_name_lock); -} - -/* ARGSUSED */ -void -ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) -{ - nvlist_t *props = NULL; - - rw_enter(&ztest_name_lock, RW_READER); - - (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, - ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); - - VERIFY0(spa_prop_get(ztest_spa, &props)); - - if (ztest_opts.zo_verbose >= 6) - dump_nvlist(props, 4); - - nvlist_free(props); - - rw_exit(&ztest_name_lock); -} - -static int -user_release_one(const char *snapname, const char *holdname) -{ - nvlist_t *snaps, *holds; - int error; - - snaps = fnvlist_alloc(); - holds = fnvlist_alloc(); - fnvlist_add_boolean(holds, holdname); - fnvlist_add_nvlist(snaps, snapname, holds); - fnvlist_free(holds); - error = dsl_dataset_user_release(snaps, NULL); - fnvlist_free(snaps); - return (error); -} - -/* - * Test snapshot hold/release and deferred destroy. - */ -void -ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) -{ - int error; - objset_t *os = zd->zd_os; - objset_t *origin; - char snapname[100]; - char fullname[100]; - char clonename[100]; - char tag[100]; - char osname[ZFS_MAX_DATASET_NAME_LEN]; - nvlist_t *holds; - - rw_enter(&ztest_name_lock, RW_READER); - - dmu_objset_name(os, osname); - - (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id); - (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); - (void) snprintf(clonename, sizeof (clonename), - "%s/ch1_%llu", osname, id); - (void) snprintf(tag, sizeof (tag), "tag_%llu", id); - - /* - * Clean up from any previous run. 
- */ - error = dsl_destroy_head(clonename); - if (error != ENOENT) - ASSERT0(error); - error = user_release_one(fullname, tag); - if (error != ESRCH && error != ENOENT) - ASSERT0(error); - error = dsl_destroy_snapshot(fullname, B_FALSE); - if (error != ENOENT) - ASSERT0(error); - - /* - * Create snapshot, clone it, mark snap for deferred destroy, - * destroy clone, verify snap was also destroyed. - */ - error = dmu_objset_snapshot_one(osname, snapname); - if (error) { - if (error == ENOSPC) { - ztest_record_enospc("dmu_objset_snapshot"); - goto out; - } - fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); - } - - error = dmu_objset_clone(clonename, fullname); - if (error) { - if (error == ENOSPC) { - ztest_record_enospc("dmu_objset_clone"); - goto out; - } - fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); - } - - error = dsl_destroy_snapshot(fullname, B_TRUE); - if (error) { - fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", - fullname, error); - } - - error = dsl_destroy_head(clonename); - if (error) - fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); - - error = dmu_objset_hold(fullname, FTAG, &origin); - if (error != ENOENT) - fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); - - /* - * Create snapshot, add temporary hold, verify that we can't - * destroy a held snapshot, mark for deferred destroy, - * release hold, verify snapshot was destroyed. 
- */ - error = dmu_objset_snapshot_one(osname, snapname); - if (error) { - if (error == ENOSPC) { - ztest_record_enospc("dmu_objset_snapshot"); - goto out; - } - fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); - } - - holds = fnvlist_alloc(); - fnvlist_add_string(holds, fullname, tag); - error = dsl_dataset_user_hold(holds, 0, NULL); - fnvlist_free(holds); - - if (error == ENOSPC) { - ztest_record_enospc("dsl_dataset_user_hold"); - goto out; - } else if (error) { - fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", - fullname, tag, error); - } - - error = dsl_destroy_snapshot(fullname, B_FALSE); - if (error != EBUSY) { - fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", - fullname, error); - } - - error = dsl_destroy_snapshot(fullname, B_TRUE); - if (error) { - fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", - fullname, error); - } - - error = user_release_one(fullname, tag); - if (error) - fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); - - VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); - -out: - rw_exit(&ztest_name_lock); -} - -/* - * Inject random faults into the on-disk data. - */ -/* ARGSUSED */ -void -ztest_fault_inject(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - int fd; - uint64_t offset; - uint64_t leaves; - uint64_t bad = 0x1990c0ffeedecadeULL; - uint64_t top, leaf; - char path0[MAXPATHLEN]; - char pathrand[MAXPATHLEN]; - size_t fsize; - int bshift = SPA_MAXBLOCKSHIFT + 2; - int iters = 1000; - int maxfaults; - int mirror_save; - vdev_t *vd0 = NULL; - uint64_t guid0 = 0; - boolean_t islog = B_FALSE; - - mutex_enter(&ztest_vdev_lock); - - /* - * Device removal is in progress, fault injection must be disabled - * until it completes and the pool is scrubbed. The fault injection - * strategy for damaging blocks does not take in to account evacuated - * blocks which may have already been damaged. 
- */ - if (ztest_device_removal_active) { - mutex_exit(&ztest_vdev_lock); - return; - } - - maxfaults = MAXFAULTS(); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; - mirror_save = zs->zs_mirrors; - mutex_exit(&ztest_vdev_lock); - - ASSERT(leaves >= 1); - - /* - * Grab the name lock as reader. There are some operations - * which don't like to have their vdevs changed while - * they are in progress (i.e. spa_change_guid). Those - * operations will have grabbed the name lock as writer. - */ - rw_enter(&ztest_name_lock, RW_READER); - - /* - * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. - */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - if (ztest_random(2) == 0) { - /* - * Inject errors on a normal data device or slog device. - */ - top = ztest_random_vdev_top(spa, B_TRUE); - leaf = ztest_random(leaves) + zs->zs_splits; - - /* - * Generate paths to the first leaf in this top-level vdev, - * and to the random leaf we selected. We'll induce transient - * write failures and random online/offline activity on leaf 0, - * and we'll write random garbage to the randomly chosen leaf. - */ - (void) snprintf(path0, sizeof (path0), ztest_dev_template, - ztest_opts.zo_dir, ztest_opts.zo_pool, - top * leaves + zs->zs_splits); - (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, - ztest_opts.zo_dir, ztest_opts.zo_pool, - top * leaves + leaf); - - vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); - if (vd0 != NULL && vd0->vdev_top->vdev_islog) - islog = B_TRUE; - - /* - * If the top-level vdev needs to be resilvered - * then we only allow faults on the device that is - * resilvering. - */ - if (vd0 != NULL && maxfaults != 1 && - (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || - vd0->vdev_resilver_txg != 0)) { - /* - * Make vd0 explicitly claim to be unreadable, - * or unwriteable, or reach behind its back - * and close the underlying fd. 
We can do this if - * maxfaults == 0 because we'll fail and reexecute, - * and we can do it if maxfaults >= 2 because we'll - * have enough redundancy. If maxfaults == 1, the - * combination of this with injection of random data - * corruption below exceeds the pool's fault tolerance. - */ - vdev_file_t *vf = vd0->vdev_tsd; - - zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", - (long long)vd0->vdev_id, (int)maxfaults); - - if (vf != NULL && ztest_random(3) == 0) { - (void) close(vf->vf_vnode->v_fd); - vf->vf_vnode->v_fd = -1; - } else if (ztest_random(2) == 0) { - vd0->vdev_cant_read = B_TRUE; - } else { - vd0->vdev_cant_write = B_TRUE; - } - guid0 = vd0->vdev_guid; - } - } else { - /* - * Inject errors on an l2cache device. - */ - spa_aux_vdev_t *sav = &spa->spa_l2cache; - - if (sav->sav_count == 0) { - spa_config_exit(spa, SCL_STATE, FTAG); - rw_exit(&ztest_name_lock); - return; - } - vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; - guid0 = vd0->vdev_guid; - (void) strcpy(path0, vd0->vdev_path); - (void) strcpy(pathrand, vd0->vdev_path); - - leaf = 0; - leaves = 1; - maxfaults = INT_MAX; /* no limit on cache devices */ - } - - spa_config_exit(spa, SCL_STATE, FTAG); - rw_exit(&ztest_name_lock); - - /* - * If we can tolerate two or more faults, or we're dealing - * with a slog, randomly online/offline vd0. - */ - if ((maxfaults >= 2 || islog) && guid0 != 0) { - if (ztest_random(10) < 6) { - int flags = (ztest_random(2) == 0 ? - ZFS_OFFLINE_TEMPORARY : 0); - - /* - * We have to grab the zs_name_lock as writer to - * prevent a race between offlining a slog and - * destroying a dataset. Offlining the slog will - * grab a reference on the dataset which may cause - * dmu_objset_destroy() to fail with EBUSY thus - * leaving the dataset in an inconsistent state. 
- */ - if (islog) - rw_enter(&ztest_name_lock, RW_WRITER); - - VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); - - if (islog) - rw_exit(&ztest_name_lock); - } else { - /* - * Ideally we would like to be able to randomly - * call vdev_[on|off]line without holding locks - * to force unpredictable failures but the side - * effects of vdev_[on|off]line prevent us from - * doing so. We grab the ztest_vdev_lock here to - * prevent a race between injection testing and - * aux_vdev removal. - */ - mutex_enter(&ztest_vdev_lock); - (void) vdev_online(spa, guid0, 0, NULL); - mutex_exit(&ztest_vdev_lock); - } - } - - if (maxfaults == 0) - return; - - /* - * We have at least single-fault tolerance, so inject data corruption. - */ - fd = open(pathrand, O_RDWR); - - if (fd == -1) /* we hit a gap in the device namespace */ - return; - - fsize = lseek(fd, 0, SEEK_END); - - while (--iters != 0) { - /* - * The offset must be chosen carefully to ensure that - * we do not inject a given logical block with errors - * on two different leaf devices, because ZFS can not - * tolerate that (if maxfaults==1). - * - * We divide each leaf into chunks of size - * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk - * there is a series of ranges to which we can inject errors. - * Each range can accept errors on only a single leaf vdev. - * The error injection ranges are separated by ranges - * which we will not inject errors on any device (DMZs). - * Each DMZ must be large enough such that a single block - * can not straddle it, so that a single block can not be - * a target in two different injection ranges (on different - * leaf vdevs). 
- * - * For example, with 3 leaves, each chunk looks like: - * 0 to 32M: injection range for leaf 0 - * 32M to 64M: DMZ - no injection allowed - * 64M to 96M: injection range for leaf 1 - * 96M to 128M: DMZ - no injection allowed - * 128M to 160M: injection range for leaf 2 - * 160M to 192M: DMZ - no injection allowed - */ - offset = ztest_random(fsize / (leaves << bshift)) * - (leaves << bshift) + (leaf << bshift) + - (ztest_random(1ULL << (bshift - 1)) & -8ULL); - - /* - * Only allow damage to the labels at one end of the vdev. - * - * If all labels are damaged, the device will be totally - * inaccessible, which will result in loss of data, - * because we also damage (parts of) the other side of - * the mirror/raidz. - * - * Additionally, we will always have both an even and an - * odd label, so that we can handle crashes in the - * middle of vdev_config_sync(). - */ - if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) - continue; - - /* - * The two end labels are stored at the "end" of the disk, but - * the end of the disk (vdev_psize) is aligned to - * sizeof (vdev_label_t). - */ - uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); - if ((leaf & 1) == 1 && - offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) - continue; - - mutex_enter(&ztest_vdev_lock); - if (mirror_save != zs->zs_mirrors) { - mutex_exit(&ztest_vdev_lock); - (void) close(fd); - return; - } - - if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) - fatal(1, "can't inject bad word at 0x%llx in %s", - offset, pathrand); - - mutex_exit(&ztest_vdev_lock); - - if (ztest_opts.zo_verbose >= 7) - (void) printf("injected bad word into %s," - " offset 0x%llx\n", pathrand, (u_longlong_t)offset); - } - - (void) close(fd); -} - -/* - * Verify that DDT repair works as expected. 
- */ -void -ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) -{ - ztest_shared_t *zs = ztest_shared; - spa_t *spa = ztest_spa; - objset_t *os = zd->zd_os; - ztest_od_t od[1]; - uint64_t object, blocksize, txg, pattern, psize; - enum zio_checksum checksum = spa_dedup_checksum(spa); - dmu_buf_t *db; - dmu_tx_t *tx; - abd_t *abd; - blkptr_t blk; - int copies = 2 * ZIO_DEDUPDITTO_MIN; - - blocksize = ztest_random_blocksize(); - blocksize = MIN(blocksize, 2048); /* because we write so many */ - - ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, - 0, 0); - - if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) - return; - - /* - * Take the name lock as writer to prevent anyone else from changing - * the pool and dataset properies we need to maintain during this test. - */ - rw_enter(&ztest_name_lock, RW_WRITER); - - if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, - B_FALSE) != 0 || - ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, - B_FALSE) != 0) { - rw_exit(&ztest_name_lock); - return; - } - - dmu_objset_stats_t dds; - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - dmu_objset_fast_stat(os, &dds); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - - object = od[0].od_object; - blocksize = od[0].od_blocksize; - pattern = zs->zs_guid ^ dds.dds_guid; - - ASSERT(object != 0); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, object, 0, copies * blocksize); - txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); - if (txg == 0) { - rw_exit(&ztest_name_lock); - return; - } - - /* - * Write all the copies of our block. 
- */ - for (int i = 0; i < copies; i++) { - uint64_t offset = i * blocksize; - int error = dmu_buf_hold(os, object, offset, FTAG, &db, - DMU_READ_NO_PREFETCH); - if (error != 0) { - fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", - os, (long long)object, (long long) offset, error); - } - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == blocksize); - ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || - ztest_pattern_match(db->db_data, db->db_size, 0ULL)); - dmu_buf_will_fill(db, tx); - ztest_pattern_set(db->db_data, db->db_size, pattern); - dmu_buf_rele(db, FTAG); - } - - dmu_tx_commit(tx); - txg_wait_synced(spa_get_dsl(spa), txg); - - /* - * Find out what block we got. - */ - VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, - DMU_READ_NO_PREFETCH)); - blk = *((dmu_buf_impl_t *)db)->db_blkptr; - dmu_buf_rele(db, FTAG); - - /* - * Damage the block. Dedup-ditto will save us when we read it later. - */ - psize = BP_GET_PSIZE(&blk); - abd = abd_alloc_linear(psize, B_TRUE); - ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); - - (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, - abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - - abd_free(abd); - - rw_exit(&ztest_name_lock); -} - -/* - * Scrub the pool. - */ -/* ARGSUSED */ -void -ztest_scrub(ztest_ds_t *zd, uint64_t id) -{ - spa_t *spa = ztest_spa; - - /* - * Scrub in progress by device removal. - */ - if (ztest_device_removal_active) - return; - - (void) spa_scan(spa, POOL_SCAN_SCRUB); - (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ - (void) spa_scan(spa, POOL_SCAN_SCRUB); -} - -/* - * Change the guid for the pool. 
- */ -/* ARGSUSED */ -void -ztest_reguid(ztest_ds_t *zd, uint64_t id) -{ - spa_t *spa = ztest_spa; - uint64_t orig, load; - int error; - - if (ztest_opts.zo_mmp_test) - return; - - orig = spa_guid(spa); - load = spa_load_guid(spa); - - rw_enter(&ztest_name_lock, RW_WRITER); - error = spa_change_guid(spa); - rw_exit(&ztest_name_lock); - - if (error != 0) - return; - - if (ztest_opts.zo_verbose >= 4) { - (void) printf("Changed guid old %llu -> %llu\n", - (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); - } - - VERIFY3U(orig, !=, spa_guid(spa)); - VERIFY3U(load, ==, spa_load_guid(spa)); -} - -static vdev_t * -ztest_random_concrete_vdev_leaf(vdev_t *vd) -{ - if (vd == NULL) - return (NULL); - - if (vd->vdev_children == 0) - return (vd); - - vdev_t *eligible[vd->vdev_children]; - int eligible_idx = 0, i; - for (i = 0; i < vd->vdev_children; i++) { - vdev_t *cvd = vd->vdev_child[i]; - if (cvd->vdev_top->vdev_removing) - continue; - if (cvd->vdev_children > 0 || - (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { - eligible[eligible_idx++] = cvd; - } - } - VERIFY(eligible_idx > 0); - - uint64_t child_no = ztest_random(eligible_idx); - return (ztest_random_concrete_vdev_leaf(eligible[child_no])); -} - -/* ARGSUSED */ -void -ztest_initialize(ztest_ds_t *zd, uint64_t id) -{ - spa_t *spa = ztest_spa; - int error = 0; - - mutex_enter(&ztest_vdev_lock); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - /* Random leaf vdev */ - vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); - if (rand_vd == NULL) { - spa_config_exit(spa, SCL_VDEV, FTAG); - mutex_exit(&ztest_vdev_lock); - return; - } - - /* - * The random vdev we've selected may change as soon as we - * drop the spa_config_lock. We create local copies of things - * we're interested in. 
- */ - uint64_t guid = rand_vd->vdev_guid; - char *path = strdup(rand_vd->vdev_path); - boolean_t active = rand_vd->vdev_initialize_thread != NULL; - - zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); - spa_config_exit(spa, SCL_VDEV, FTAG); - - uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); - error = spa_vdev_initialize(spa, guid, cmd); - switch (cmd) { - case POOL_INITIALIZE_CANCEL: - if (ztest_opts.zo_verbose >= 4) { - (void) printf("Cancel initialize %s", path); - if (!active) - (void) printf(" failed (no initialize active)"); - (void) printf("\n"); - } - break; - case POOL_INITIALIZE_DO: - if (ztest_opts.zo_verbose >= 4) { - (void) printf("Start initialize %s", path); - if (active && error == 0) - (void) printf(" failed (already active)"); - else if (error != 0) - (void) printf(" failed (error %d)", error); - (void) printf("\n"); - } - break; - case POOL_INITIALIZE_SUSPEND: - if (ztest_opts.zo_verbose >= 4) { - (void) printf("Suspend initialize %s", path); - if (!active) - (void) printf(" failed (no initialize active)"); - (void) printf("\n"); - } - break; - } - free(path); - mutex_exit(&ztest_vdev_lock); -} - -/* - * Verify pool integrity by running zdb. - */ -static void -ztest_run_zdb(char *pool) -{ - int status; - char zdb[MAXPATHLEN + MAXNAMELEN + 20]; - char zbuf[1024]; - char *bin; - char *ztest; - char *isa; - int isalen; - FILE *fp; - - strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb)); - - /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ - bin = strstr(zdb, "/usr/bin/"); - ztest = strstr(bin, "/ztest"); - isa = bin + 8; - isalen = ztest - isa; - isa = strdup(isa); - /* LINTED */ - (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s " - "-o zfs_reconstruct_indirect_combinations_max=65536 %s", - isalen, - isa, - ztest_opts.zo_verbose >= 3 ? "s" : "", - ztest_opts.zo_verbose >= 4 ? 
"v" : "", - spa_config_path, - pool); - free(isa); - - if (ztest_opts.zo_verbose >= 5) - (void) printf("Executing %s\n", strstr(zdb, "zdb ")); - - fp = popen(zdb, "r"); - assert(fp != NULL); - - while (fgets(zbuf, sizeof (zbuf), fp) != NULL) - if (ztest_opts.zo_verbose >= 3) - (void) printf("%s", zbuf); - - status = pclose(fp); - - if (status == 0) - return; - - ztest_dump_core = 0; - if (WIFEXITED(status)) - fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); - else - fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); -} - -static void -ztest_walk_pool_directory(char *header) -{ - spa_t *spa = NULL; - - if (ztest_opts.zo_verbose >= 6) - (void) printf("%s\n", header); - - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) - if (ztest_opts.zo_verbose >= 6) - (void) printf("\t%s\n", spa_name(spa)); - mutex_exit(&spa_namespace_lock); -} - -static void -ztest_spa_import_export(char *oldname, char *newname) -{ - nvlist_t *config, *newconfig; - uint64_t pool_guid; - spa_t *spa; - int error; - - if (ztest_opts.zo_verbose >= 4) { - (void) printf("import/export: old = %s, new = %s\n", - oldname, newname); - } - - /* - * Clean up from previous runs. - */ - (void) spa_destroy(newname); - - /* - * Get the pool's configuration and guid. - */ - VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); - - /* - * Kick off a scrub to tickle scrub/export races. - */ - if (ztest_random(2) == 0) - (void) spa_scan(spa, POOL_SCAN_SCRUB); - - pool_guid = spa_guid(spa); - spa_close(spa, FTAG); - - ztest_walk_pool_directory("pools before export"); - - /* - * Export it. - */ - VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); - - ztest_walk_pool_directory("pools after export"); - - /* - * Try to import it. - */ - newconfig = spa_tryimport(config); - ASSERT(newconfig != NULL); - nvlist_free(newconfig); - - /* - * Import it under the new name. 
- */ - error = spa_import(newname, config, NULL, 0); - if (error != 0) { - dump_nvlist(config, 0); - fatal(B_FALSE, "couldn't import pool %s as %s: error %u", - oldname, newname, error); - } - - ztest_walk_pool_directory("pools after import"); - - /* - * Try to import it again -- should fail with EEXIST. - */ - VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); - - /* - * Try to import it under a different name -- should fail with EEXIST. - */ - VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); - - /* - * Verify that the pool is no longer visible under the old name. - */ - VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); - - /* - * Verify that we can open and close the pool using the new name. - */ - VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); - ASSERT(pool_guid == spa_guid(spa)); - spa_close(spa, FTAG); - - nvlist_free(config); -} - -static void -ztest_resume(spa_t *spa) -{ - if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) - (void) printf("resuming from suspended state\n"); - spa_vdev_state_enter(spa, SCL_NONE); - vdev_clear(spa, NULL); - (void) spa_vdev_state_exit(spa, NULL, 0); - (void) zio_resume(spa); -} - -static void * -ztest_resume_thread(void *arg) -{ - spa_t *spa = arg; - - while (!ztest_exiting) { - if (spa_suspended(spa)) - ztest_resume(spa); - (void) poll(NULL, 0, 100); - - /* - * Periodically change the zfs_compressed_arc_enabled setting. - */ - if (ztest_random(10) == 0) - zfs_compressed_arc_enabled = ztest_random(2); - - /* - * Periodically change the zfs_abd_scatter_enabled setting. 
- */ - if (ztest_random(10) == 0) - zfs_abd_scatter_enabled = ztest_random(2); - } - return (NULL); -} - -static void * -ztest_deadman_thread(void *arg) -{ - ztest_shared_t *zs = arg; - spa_t *spa = ztest_spa; - hrtime_t delta, total = 0; - - for (;;) { - delta = zs->zs_thread_stop - zs->zs_thread_start + - MSEC2NSEC(zfs_deadman_synctime_ms); - - (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); - - /* - * If the pool is suspended then fail immediately. Otherwise, - * check to see if the pool is making any progress. If - * vdev_deadman() discovers that there hasn't been any recent - * I/Os then it will end up aborting the tests. - */ - if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { - fatal(0, "aborting test after %llu seconds because " - "pool has transitioned to a suspended state.", - zfs_deadman_synctime_ms / 1000); - return (NULL); - } - vdev_deadman(spa->spa_root_vdev); - - total += zfs_deadman_synctime_ms/1000; - (void) printf("ztest has been running for %lld seconds\n", - total); - } -} - -static void -ztest_execute(int test, ztest_info_t *zi, uint64_t id) -{ - ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; - ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); - hrtime_t functime = gethrtime(); - - for (int i = 0; i < zi->zi_iters; i++) - zi->zi_func(zd, id); - - functime = gethrtime() - functime; - - atomic_add_64(&zc->zc_count, 1); - atomic_add_64(&zc->zc_time, functime); - - if (ztest_opts.zo_verbose >= 4) { - Dl_info dli; - (void) dladdr((void *)zi->zi_func, &dli); - (void) printf("%6.2f sec in %s\n", - (double)functime / NANOSEC, dli.dli_sname); - } -} - -static void * -ztest_thread(void *arg) -{ - int rand; - uint64_t id = (uintptr_t)arg; - ztest_shared_t *zs = ztest_shared; - uint64_t call_next; - hrtime_t now; - ztest_info_t *zi; - ztest_shared_callstate_t *zc; - - while ((now = gethrtime()) < zs->zs_thread_stop) { - /* - * See if it's time to force a crash. 
- */ - if (now > zs->zs_thread_kill) - ztest_kill(zs); - - /* - * If we're getting ENOSPC with some regularity, stop. - */ - if (zs->zs_enospc_count > 10) - break; - - /* - * Pick a random function to execute. - */ - rand = ztest_random(ZTEST_FUNCS); - zi = &ztest_info[rand]; - zc = ZTEST_GET_SHARED_CALLSTATE(rand); - call_next = zc->zc_next; - - if (now >= call_next && - atomic_cas_64(&zc->zc_next, call_next, call_next + - ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { - ztest_execute(rand, zi, id); - } - } - - return (NULL); -} - -static void -ztest_dataset_name(char *dsname, char *pool, int d) -{ - (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); -} - -static void -ztest_dataset_destroy(int d) -{ - char name[ZFS_MAX_DATASET_NAME_LEN]; - - ztest_dataset_name(name, ztest_opts.zo_pool, d); - - if (ztest_opts.zo_verbose >= 3) - (void) printf("Destroying %s to free up space\n", name); - - /* - * Cleanup any non-standard clones and snapshots. In general, - * ztest thread t operates on dataset (t % zopt_datasets), - * so there may be more than one thing to clean up. - */ - for (int t = d; t < ztest_opts.zo_threads; - t += ztest_opts.zo_datasets) { - ztest_dsl_dataset_cleanup(name, t); - } - - (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, - DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); -} - -static void -ztest_dataset_dirobj_verify(ztest_ds_t *zd) -{ - uint64_t usedobjs, dirobjs, scratch; - - /* - * ZTEST_DIROBJ is the object directory for the entire dataset. - * Therefore, the number of objects in use should equal the - * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. - * If not, we have an object leak. - * - * Note that we can only check this in ztest_dataset_open(), - * when the open-context and syncing-context values agree. - * That's because zap_count() returns the open-context value, - * while dmu_objset_space() returns the rootbp fill count. 
- */ - VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); - dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); - ASSERT3U(dirobjs + 1, ==, usedobjs); -} - -static int -ztest_dataset_open(int d) -{ - ztest_ds_t *zd = &ztest_ds[d]; - uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; - objset_t *os; - zilog_t *zilog; - char name[ZFS_MAX_DATASET_NAME_LEN]; - int error; - - ztest_dataset_name(name, ztest_opts.zo_pool, d); - - rw_enter(&ztest_name_lock, RW_READER); - - error = ztest_dataset_create(name); - if (error == ENOSPC) { - rw_exit(&ztest_name_lock); - ztest_record_enospc(FTAG); - return (error); - } - ASSERT(error == 0 || error == EEXIST); - - VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os)); - rw_exit(&ztest_name_lock); - - ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); - - zilog = zd->zd_zilog; - - if (zilog->zl_header->zh_claim_lr_seq != 0 && - zilog->zl_header->zh_claim_lr_seq < committed_seq) - fatal(0, "missing log records: claimed %llu < committed %llu", - zilog->zl_header->zh_claim_lr_seq, committed_seq); - - ztest_dataset_dirobj_verify(zd); - - zil_replay(os, zd, ztest_replay_vector); - - ztest_dataset_dirobj_verify(zd); - - if (ztest_opts.zo_verbose >= 6) - (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", - zd->zd_name, - (u_longlong_t)zilog->zl_parse_blk_count, - (u_longlong_t)zilog->zl_parse_lr_count, - (u_longlong_t)zilog->zl_replaying_seq); - - zilog = zil_open(os, ztest_get_data); - - if (zilog->zl_replaying_seq != 0 && - zilog->zl_replaying_seq < committed_seq) - fatal(0, "missing log records: replayed %llu < committed %llu", - zilog->zl_replaying_seq, committed_seq); - - return (0); -} - -static void -ztest_dataset_close(int d) -{ - ztest_ds_t *zd = &ztest_ds[d]; - - zil_close(zd->zd_zilog); - dmu_objset_disown(zd->zd_os, zd); - - ztest_zd_fini(zd); -} - -/* - * Kick off threads to run tests on all datasets in parallel. 
- */ -static void -ztest_run(ztest_shared_t *zs) -{ - thread_t *tid; - spa_t *spa; - objset_t *os; - thread_t resume_tid; - int error; - - ztest_exiting = B_FALSE; - - /* - * Initialize parent/child shared state. - */ - mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); - mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); - rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); - - zs->zs_thread_start = gethrtime(); - zs->zs_thread_stop = - zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; - zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); - zs->zs_thread_kill = zs->zs_thread_stop; - if (ztest_random(100) < ztest_opts.zo_killrate) { - zs->zs_thread_kill -= - ztest_random(ztest_opts.zo_passtime * NANOSEC); - } - - mutex_init(&zcl.zcl_callbacks_lock, NULL, USYNC_THREAD, NULL); - - list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), - offsetof(ztest_cb_data_t, zcd_node)); - - /* - * Open our pool. - */ - kernel_init(FREAD | FWRITE); - VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); - metaslab_preload_limit = ztest_random(20) + 1; - ztest_spa = spa; - - dmu_objset_stats_t dds; - VERIFY0(dmu_objset_own(ztest_opts.zo_pool, - DMU_OST_ANY, B_TRUE, FTAG, &os)); - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - dmu_objset_fast_stat(os, &dds); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - zs->zs_guid = dds.dds_guid; - dmu_objset_disown(os, FTAG); - - spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; - - /* - * We don't expect the pool to suspend unless maxfaults == 0, - * in which case ztest_fault_inject() temporarily takes away - * the only valid replica. - */ - if (MAXFAULTS() == 0) - spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; - else - spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; - - /* - * Create a thread to periodically resume suspended I/O. - */ - VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, - &resume_tid) == 0); - - /* - * Create a deadman thread to abort() if we hang. 
- */ - VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, - NULL) == 0); - - /* - * Verify that we can safely inquire about any object, - * whether it's allocated or not. To make it interesting, - * we probe a 5-wide window around each power of two. - * This hits all edge cases, including zero and the max. - */ - for (int t = 0; t < 64; t++) { - for (int d = -5; d <= 5; d++) { - error = dmu_object_info(spa->spa_meta_objset, - (1ULL << t) + d, NULL); - ASSERT(error == 0 || error == ENOENT || - error == EINVAL); - } - } - - /* - * If we got any ENOSPC errors on the previous run, destroy something. - */ - if (zs->zs_enospc_count != 0) { - int d = ztest_random(ztest_opts.zo_datasets); - ztest_dataset_destroy(d); - } - zs->zs_enospc_count = 0; - - tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t), - UMEM_NOFAIL); - - if (ztest_opts.zo_verbose >= 4) - (void) printf("starting main threads...\n"); - - /* - * Kick off all the tests that run in parallel. - */ - for (int t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets && - ztest_dataset_open(t) != 0) - return; - VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, - THR_BOUND, &tid[t]) == 0); - } - - /* - * Wait for all of the tests to complete. We go in reverse order - * so we don't close datasets while threads are still using them. 
- */ - for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) { - VERIFY(thr_join(tid[t], NULL, NULL) == 0); - if (t < ztest_opts.zo_datasets) - ztest_dataset_close(t); - } - - txg_wait_synced(spa_get_dsl(spa), 0); - - zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); - zfs_dbgmsg_print(FTAG); - - umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t)); - - /* Kill the resume thread */ - ztest_exiting = B_TRUE; - VERIFY(thr_join(resume_tid, NULL, NULL) == 0); - ztest_resume(spa); - - /* - * Right before closing the pool, kick off a bunch of async I/O; - * spa_close() should wait for it to complete. - */ - for (uint64_t object = 1; object < 50; object++) { - dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, - ZIO_PRIORITY_SYNC_READ); - } - - spa_close(spa, FTAG); - - /* - * Verify that we can loop over all pools. - */ - mutex_enter(&spa_namespace_lock); - for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) - if (ztest_opts.zo_verbose > 3) - (void) printf("spa_next: found %s\n", spa_name(spa)); - mutex_exit(&spa_namespace_lock); - - /* - * Verify that we can export the pool and reimport it under a - * different name. 
- */ - if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { - char name[ZFS_MAX_DATASET_NAME_LEN]; - (void) snprintf(name, sizeof (name), "%s_import", - ztest_opts.zo_pool); - ztest_spa_import_export(ztest_opts.zo_pool, name); - ztest_spa_import_export(name, ztest_opts.zo_pool); - } - - kernel_fini(); - - list_destroy(&zcl.zcl_callbacks); - - mutex_destroy(&zcl.zcl_callbacks_lock); - - rw_destroy(&ztest_name_lock); - mutex_destroy(&ztest_vdev_lock); - mutex_destroy(&ztest_checkpoint_lock); -} - -static void -ztest_freeze(void) -{ - ztest_ds_t *zd = &ztest_ds[0]; - spa_t *spa; - int numloops = 0; - - if (ztest_opts.zo_verbose >= 3) - (void) printf("testing spa_freeze()...\n"); - - kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); - VERIFY3U(0, ==, ztest_dataset_open(0)); - ztest_spa = spa; - - /* - * Force the first log block to be transactionally allocated. - * We have to do this before we freeze the pool -- otherwise - * the log chain won't be anchored. - */ - while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { - ztest_dmu_object_alloc_free(zd, 0); - zil_commit(zd->zd_zilog, 0); - } - - txg_wait_synced(spa_get_dsl(spa), 0); - - /* - * Freeze the pool. This stops spa_sync() from doing anything, - * so that the only way to record changes from now on is the ZIL. - */ - spa_freeze(spa); - - /* - * Because it is hard to predict how much space a write will actually - * require beforehand, we leave ourselves some fudge space to write over - * capacity. - */ - uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; - - /* - * Run tests that generate log records but don't alter the pool config - * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). - * We do a txg_wait_synced() after each iteration to force the txg - * to increase well beyond the last synced value in the uberblock. - * The ZIL should be OK with that. 
- * - * Run a random number of times less than zo_maxloops and ensure we do - * not run out of space on the pool. - */ - while (ztest_random(10) != 0 && - numloops++ < ztest_opts.zo_maxloops && - metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { - ztest_od_t od; - ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); - VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); - ztest_io(zd, od.od_object, - ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); - txg_wait_synced(spa_get_dsl(spa), 0); - } - - /* - * Commit all of the changes we just generated. - */ - zil_commit(zd->zd_zilog, 0); - txg_wait_synced(spa_get_dsl(spa), 0); - - /* - * Close our dataset and close the pool. - */ - ztest_dataset_close(0); - spa_close(spa, FTAG); - kernel_fini(); - - /* - * Open and close the pool and dataset to induce log replay. - */ - kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); - ASSERT(spa_freeze_txg(spa) == UINT64_MAX); - VERIFY3U(0, ==, ztest_dataset_open(0)); - ztest_dataset_close(0); - - ztest_spa = spa; - txg_wait_synced(spa_get_dsl(spa), 0); - ztest_reguid(NULL, 0); - - spa_close(spa, FTAG); - kernel_fini(); -} - -void -print_time(hrtime_t t, char *timebuf) -{ - hrtime_t s = t / NANOSEC; - hrtime_t m = s / 60; - hrtime_t h = m / 60; - hrtime_t d = h / 24; - - s -= m * 60; - m -= h * 60; - h -= d * 24; - - timebuf[0] = '\0'; - - if (d) - (void) sprintf(timebuf, - "%llud%02lluh%02llum%02llus", d, h, m, s); - else if (h) - (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); - else if (m) - (void) sprintf(timebuf, "%llum%02llus", m, s); - else - (void) sprintf(timebuf, "%llus", s); -} - -static nvlist_t * -make_random_props() -{ - nvlist_t *props; - - VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); - - if (ztest_random(2) == 0) - return (props); - VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); - - return (props); -} - -/* - * Import a storage pool with the given name. 
- */ -static void -ztest_import(ztest_shared_t *zs) -{ - libzfs_handle_t *hdl; - importargs_t args = { 0 }; - spa_t *spa; - nvlist_t *cfg = NULL; - int nsearch = 1; - char *searchdirs[nsearch]; - char *name = ztest_opts.zo_pool; - int flags = ZFS_IMPORT_MISSING_LOG; - int error; - - mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); - - kernel_init(FREAD | FWRITE); - hdl = libzfs_init(); - - searchdirs[0] = ztest_opts.zo_dir; - args.paths = nsearch; - args.path = searchdirs; - args.can_be_active = B_FALSE; - - error = zpool_tryimport(hdl, name, &cfg, &args); - if (error) - (void) fatal(0, "No pools found\n"); - - VERIFY0(spa_import(name, cfg, NULL, flags)); - VERIFY0(spa_open(name, &spa, FTAG)); - zs->zs_metaslab_sz = - 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; - spa_close(spa, FTAG); - - libzfs_fini(hdl); - kernel_fini(); - - if (!ztest_opts.zo_mmp_test) { - ztest_run_zdb(ztest_opts.zo_pool); - ztest_freeze(); - ztest_run_zdb(ztest_opts.zo_pool); - } - - rw_destroy(&ztest_name_lock); - mutex_destroy(&ztest_vdev_lock); -} - -/* - * Create a storage pool with the given name and initial vdev size. - * Then test spa_freeze() functionality. - */ -static void -ztest_init(ztest_shared_t *zs) -{ - spa_t *spa; - nvlist_t *nvroot, *props; - - mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); - mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); - rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); - - kernel_init(FREAD | FWRITE); - - /* - * Create the storage pool. 
- */ - (void) spa_destroy(ztest_opts.zo_pool); - ztest_shared->zs_vdev_next_leaf = 0; - zs->zs_splits = 0; - zs->zs_mirrors = ztest_opts.zo_mirrors; - nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); - props = make_random_props(); - for (int i = 0; i < SPA_FEATURES; i++) { - char buf[1024]; - (void) snprintf(buf, sizeof (buf), "feature@%s", - spa_feature_table[i].fi_uname); - VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); - } - VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); - nvlist_free(nvroot); - nvlist_free(props); - - VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); - zs->zs_metaslab_sz = - 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; - - spa_close(spa, FTAG); - - kernel_fini(); - - if (!ztest_opts.zo_mmp_test) { - ztest_run_zdb(ztest_opts.zo_pool); - ztest_freeze(); - ztest_run_zdb(ztest_opts.zo_pool); - } - - rw_destroy(&ztest_name_lock); - mutex_destroy(&ztest_vdev_lock); - mutex_destroy(&ztest_checkpoint_lock); -} - -static void -setup_data_fd(void) -{ - static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; - - ztest_fd_data = mkstemp(ztest_name_data); - ASSERT3S(ztest_fd_data, >=, 0); - (void) unlink(ztest_name_data); -} - - -static int -shared_data_size(ztest_shared_hdr_t *hdr) -{ - int size; - - size = hdr->zh_hdr_size; - size += hdr->zh_opts_size; - size += hdr->zh_size; - size += hdr->zh_stats_size * hdr->zh_stats_count; - size += hdr->zh_ds_size * hdr->zh_ds_count; - - return (size); -} - -static void -setup_hdr(void) -{ - int size; - ztest_shared_hdr_t *hdr; - - hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); - ASSERT(hdr != MAP_FAILED); - - VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); - - hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); - hdr->zh_opts_size = sizeof (ztest_shared_opts_t); - hdr->zh_size = sizeof (ztest_shared_t); - 
hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); - hdr->zh_stats_count = ZTEST_FUNCS; - hdr->zh_ds_size = sizeof (ztest_shared_ds_t); - hdr->zh_ds_count = ztest_opts.zo_datasets; - - size = shared_data_size(hdr); - VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); - - (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); -} - -static void -setup_data(void) -{ - int size, offset; - ztest_shared_hdr_t *hdr; - uint8_t *buf; - - hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), - PROT_READ, MAP_SHARED, ztest_fd_data, 0); - ASSERT(hdr != MAP_FAILED); - - size = shared_data_size(hdr); - - (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); - hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); - ASSERT(hdr != MAP_FAILED); - buf = (uint8_t *)hdr; - - offset = hdr->zh_hdr_size; - ztest_shared_opts = (void *)&buf[offset]; - offset += hdr->zh_opts_size; - ztest_shared = (void *)&buf[offset]; - offset += hdr->zh_size; - ztest_shared_callstate = (void *)&buf[offset]; - offset += hdr->zh_stats_size * hdr->zh_stats_count; - ztest_shared_ds = (void *)&buf[offset]; -} - -static boolean_t -exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) -{ - pid_t pid; - int status; - char *cmdbuf = NULL; - - pid = fork(); - - if (cmd == NULL) { - cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); - (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); - cmd = cmdbuf; - } - - if (pid == -1) - fatal(1, "fork failed"); - - if (pid == 0) { /* child */ - char *emptyargv[2] = { cmd, NULL }; - char fd_data_str[12]; - - struct rlimit rl = { 1024, 1024 }; - (void) setrlimit(RLIMIT_NOFILE, &rl); - - (void) close(ztest_fd_rand); - VERIFY3U(11, >=, - snprintf(fd_data_str, 12, "%d", ztest_fd_data)); - VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); - - (void) enable_extended_FILE_stdio(-1, -1); - if (libpath != NULL) - VERIFY(0 == setenv("LD_LIBRARY_PATH", 
libpath, 1)); -#ifdef illumos - (void) execv(cmd, emptyargv); -#else - (void) execvp(cmd, emptyargv); -#endif - ztest_dump_core = B_FALSE; - fatal(B_TRUE, "exec failed: %s", cmd); - } - - if (cmdbuf != NULL) { - umem_free(cmdbuf, MAXPATHLEN); - cmd = NULL; - } - - while (waitpid(pid, &status, 0) != pid) - continue; - if (statusp != NULL) - *statusp = status; - - if (WIFEXITED(status)) { - if (WEXITSTATUS(status) != 0) { - (void) fprintf(stderr, "child exited with code %d\n", - WEXITSTATUS(status)); - exit(2); - } - return (B_FALSE); - } else if (WIFSIGNALED(status)) { - if (!ignorekill || WTERMSIG(status) != SIGKILL) { - (void) fprintf(stderr, "child died with signal %d\n", - WTERMSIG(status)); - exit(3); - } - return (B_TRUE); - } else { - (void) fprintf(stderr, "something strange happened to child\n"); - exit(4); - /* NOTREACHED */ - } -} - -static void -ztest_run_init(void) -{ - ztest_shared_t *zs = ztest_shared; - - /* - * Blow away any existing copy of zpool.cache - */ - (void) remove(spa_config_path); - - if (ztest_opts.zo_init == 0) { - if (ztest_opts.zo_verbose >= 1) - (void) printf("Importing pool %s\n", - ztest_opts.zo_pool); - ztest_import(zs); - return; - } - - /* - * Create and initialize our storage pool. 
- */ - for (int i = 1; i <= ztest_opts.zo_init; i++) { - bzero(zs, sizeof (ztest_shared_t)); - if (ztest_opts.zo_verbose >= 3 && - ztest_opts.zo_init != 1) { - (void) printf("ztest_init(), pass %d\n", i); - } - ztest_init(zs); - } -} - -int -main(int argc, char **argv) -{ - int kills = 0; - int iters = 0; - int older = 0; - int newer = 0; - ztest_shared_t *zs; - ztest_info_t *zi; - ztest_shared_callstate_t *zc; - char timebuf[100]; - char numbuf[NN_NUMBUF_SZ]; - char *cmd; - boolean_t hasalt; - char *fd_data_str = getenv("ZTEST_FD_DATA"); - - (void) setvbuf(stdout, NULL, _IOLBF, 0); - - dprintf_setup(&argc, argv); - zfs_deadman_synctime_ms = 300000; - /* - * As two-word space map entries may not come up often (especially - * if pool and vdev sizes are small) we want to force at least some - * of them so the feature get tested. - */ - zfs_force_some_double_word_sm_entries = B_TRUE; - - /* - * Verify that even extensively damaged split blocks with many - * segments can be reconstructed in a reasonable amount of time - * when reconstruction is known to be possible. 
- */ - zfs_reconstruct_indirect_damage_fraction = 4; - - ztest_fd_rand = open("/dev/urandom", O_RDONLY); - ASSERT3S(ztest_fd_rand, >=, 0); - - if (!fd_data_str) { - process_options(argc, argv); - - setup_data_fd(); - setup_hdr(); - setup_data(); - bcopy(&ztest_opts, ztest_shared_opts, - sizeof (*ztest_shared_opts)); - } else { - ztest_fd_data = atoi(fd_data_str); - setup_data(); - bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); - } - ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); - - /* Override location of zpool.cache */ - VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache", - ztest_opts.zo_dir), !=, -1); - - ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), - UMEM_NOFAIL); - zs = ztest_shared; - - if (fd_data_str) { - metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; - metaslab_df_alloc_threshold = - zs->zs_metaslab_df_alloc_threshold; - - if (zs->zs_do_init) - ztest_run_init(); - else - ztest_run(zs); - exit(0); - } - - hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); - - if (ztest_opts.zo_verbose >= 1) { - (void) printf("%llu vdevs, %d datasets, %d threads," - " %llu seconds...\n", - (u_longlong_t)ztest_opts.zo_vdevs, - ztest_opts.zo_datasets, - ztest_opts.zo_threads, - (u_longlong_t)ztest_opts.zo_time); - } - - cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); - (void) strlcpy(cmd, getexecname(), MAXNAMELEN); - - zs->zs_do_init = B_TRUE; - if (strlen(ztest_opts.zo_alt_ztest) != 0) { - if (ztest_opts.zo_verbose >= 1) { - (void) printf("Executing older ztest for " - "initialization: %s\n", ztest_opts.zo_alt_ztest); - } - VERIFY(!exec_child(ztest_opts.zo_alt_ztest, - ztest_opts.zo_alt_libpath, B_FALSE, NULL)); - } else { - VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); - } - zs->zs_do_init = B_FALSE; - - zs->zs_proc_start = gethrtime(); - zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; - - for (int f = 0; f < ZTEST_FUNCS; f++) { - zi = &ztest_info[f]; - zc = 
ZTEST_GET_SHARED_CALLSTATE(f); - if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) - zc->zc_next = UINT64_MAX; - else - zc->zc_next = zs->zs_proc_start + - ztest_random(2 * zi->zi_interval[0] + 1); - } - - /* - * Run the tests in a loop. These tests include fault injection - * to verify that self-healing data works, and forced crashes - * to verify that we never lose on-disk consistency. - */ - while (gethrtime() < zs->zs_proc_stop) { - int status; - boolean_t killed; - - /* - * Initialize the workload counters for each function. - */ - for (int f = 0; f < ZTEST_FUNCS; f++) { - zc = ZTEST_GET_SHARED_CALLSTATE(f); - zc->zc_count = 0; - zc->zc_time = 0; - } - - /* Set the allocation switch size */ - zs->zs_metaslab_df_alloc_threshold = - ztest_random(zs->zs_metaslab_sz / 4) + 1; - - if (!hasalt || ztest_random(2) == 0) { - if (hasalt && ztest_opts.zo_verbose >= 1) { - (void) printf("Executing newer ztest: %s\n", - cmd); - } - newer++; - killed = exec_child(cmd, NULL, B_TRUE, &status); - } else { - if (hasalt && ztest_opts.zo_verbose >= 1) { - (void) printf("Executing older ztest: %s\n", - ztest_opts.zo_alt_ztest); - } - older++; - killed = exec_child(ztest_opts.zo_alt_ztest, - ztest_opts.zo_alt_libpath, B_TRUE, &status); - } - - if (killed) - kills++; - iters++; - - if (ztest_opts.zo_verbose >= 1) { - hrtime_t now = gethrtime(); - - now = MIN(now, zs->zs_proc_stop); - print_time(zs->zs_proc_stop - now, timebuf); - nicenum(zs->zs_space, numbuf, sizeof (numbuf)); - - (void) printf("Pass %3d, %8s, %3llu ENOSPC, " - "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", - iters, - WIFEXITED(status) ? 
"Complete" : "SIGKILL", - (u_longlong_t)zs->zs_enospc_count, - 100.0 * zs->zs_alloc / zs->zs_space, - numbuf, - 100.0 * (now - zs->zs_proc_start) / - (ztest_opts.zo_time * NANOSEC), timebuf); - } - - if (ztest_opts.zo_verbose >= 2) { - (void) printf("\nWorkload summary:\n\n"); - (void) printf("%7s %9s %s\n", - "Calls", "Time", "Function"); - (void) printf("%7s %9s %s\n", - "-----", "----", "--------"); - for (int f = 0; f < ZTEST_FUNCS; f++) { - Dl_info dli; - - zi = &ztest_info[f]; - zc = ZTEST_GET_SHARED_CALLSTATE(f); - print_time(zc->zc_time, timebuf); - (void) dladdr((void *)zi->zi_func, &dli); - (void) printf("%7llu %9s %s\n", - (u_longlong_t)zc->zc_count, timebuf, - dli.dli_sname); - } - (void) printf("\n"); - } - - if (!ztest_opts.zo_mmp_test) - ztest_run_zdb(ztest_opts.zo_pool); - } - - if (ztest_opts.zo_verbose >= 1) { - if (hasalt) { - (void) printf("%d runs of older ztest: %s\n", older, - ztest_opts.zo_alt_ztest); - (void) printf("%d runs of newer ztest: %s\n", newer, - cmd); - } - (void) printf("%d killed, %d completed, %.0f%% kill rate\n", - kills, iters - kills, (100.0 * kills) / MAX(1, iters)); - } - - umem_free(cmd, MAXNAMELEN); - - return (0); -} diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/drti.c b/cddl/contrib/opensolaris/lib/libdtrace/common/drti.c index 836eeccb8274..a66661fd9ab8 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/drti.c +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/drti.c @@ -24,6 +24,7 @@ * Use is subject to license terms. 
*/ +#include #include #include #include diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c index 05f2785e6600..8f32890057f0 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c index 0a3a10a76954..97da0c3a5ac2 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c @@ -77,7 +77,6 @@ #include #include #include -#include #include #include diff --git a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c index f7b4684b01d0..57a7db4ad0fd 100644 --- a/cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c +++ b/cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c @@ -44,12 +44,19 @@ #include #include #include -#include - +#include #include #include #include +#ifndef NS_IN6ADDRSZ +#define NS_IN6ADDRSZ 16 +#endif + +#ifndef NS_INADDRSZ +#define NS_INADDRSZ 4 +#endif + /*ARGSUSED*/ static int pfcheck_addr(dt_pfargv_t *pfv, dt_pfargd_t *pfd, dt_node_t *dnp) diff --git a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c b/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c deleted file mode 100644 index c6fbfe97a9af..000000000000 --- a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c +++ /dev/null @@ -1,1286 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include "libnvpair.h" - -/* - * libnvpair - A tools library for manipulating pairs. - * - * This library provides routines packing an unpacking nv pairs - * for transporting data across process boundaries, transporting - * between kernel and userland, and possibly saving onto disk files. - */ - -/* - * Print control structure. 
- */ - -#define DEFINEOP(opname, vtype) \ - struct { \ - int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ - const char *, vtype); \ - void *arg; \ - } opname - -#define DEFINEARROP(opname, vtype) \ - struct { \ - int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ - const char *, vtype, uint_t); \ - void *arg; \ - } opname - -struct nvlist_printops { - DEFINEOP(print_boolean, int); - DEFINEOP(print_boolean_value, boolean_t); - DEFINEOP(print_byte, uchar_t); - DEFINEOP(print_int8, int8_t); - DEFINEOP(print_uint8, uint8_t); - DEFINEOP(print_int16, int16_t); - DEFINEOP(print_uint16, uint16_t); - DEFINEOP(print_int32, int32_t); - DEFINEOP(print_uint32, uint32_t); - DEFINEOP(print_int64, int64_t); - DEFINEOP(print_uint64, uint64_t); - DEFINEOP(print_double, double); - DEFINEOP(print_string, char *); - DEFINEOP(print_hrtime, hrtime_t); - DEFINEOP(print_nvlist, nvlist_t *); - DEFINEARROP(print_boolean_array, boolean_t *); - DEFINEARROP(print_byte_array, uchar_t *); - DEFINEARROP(print_int8_array, int8_t *); - DEFINEARROP(print_uint8_array, uint8_t *); - DEFINEARROP(print_int16_array, int16_t *); - DEFINEARROP(print_uint16_array, uint16_t *); - DEFINEARROP(print_int32_array, int32_t *); - DEFINEARROP(print_uint32_array, uint32_t *); - DEFINEARROP(print_int64_array, int64_t *); - DEFINEARROP(print_uint64_array, uint64_t *); - DEFINEARROP(print_string_array, char **); - DEFINEARROP(print_nvlist_array, nvlist_t **); -}; - -struct nvlist_prtctl { - FILE *nvprt_fp; /* output destination */ - enum nvlist_indent_mode nvprt_indent_mode; /* see above */ - int nvprt_indent; /* absolute indent, or tab depth */ - int nvprt_indentinc; /* indent or tab increment */ - const char *nvprt_nmfmt; /* member name format, max one %s */ - const char *nvprt_eomfmt; /* after member format, e.g. "\n" */ - const char *nvprt_btwnarrfmt; /* between array members */ - int nvprt_btwnarrfmt_nl; /* nvprt_eoamfmt includes newline? 
*/ - struct nvlist_printops *nvprt_dfltops; - struct nvlist_printops *nvprt_custops; -}; - -#define DFLTPRTOP(pctl, type) \ - ((pctl)->nvprt_dfltops->print_##type.op) - -#define DFLTPRTOPARG(pctl, type) \ - ((pctl)->nvprt_dfltops->print_##type.arg) - -#define CUSTPRTOP(pctl, type) \ - ((pctl)->nvprt_custops->print_##type.op) - -#define CUSTPRTOPARG(pctl, type) \ - ((pctl)->nvprt_custops->print_##type.arg) - -#define RENDER(pctl, type, nvl, name, val) \ - { \ - int done = 0; \ - if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ - done = CUSTPRTOP(pctl, type)(pctl, \ - CUSTPRTOPARG(pctl, type), nvl, name, val); \ - } \ - if (!done) { \ - (void) DFLTPRTOP(pctl, type)(pctl, \ - DFLTPRTOPARG(pctl, type), nvl, name, val); \ - } \ - (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \ - } - -#define ARENDER(pctl, type, nvl, name, arrp, count) \ - { \ - int done = 0; \ - if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ - done = CUSTPRTOP(pctl, type)(pctl, \ - CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \ - } \ - if (!done) { \ - (void) DFLTPRTOP(pctl, type)(pctl, \ - DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \ - } \ - (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \ - } - -static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t); - -/* - * ====================================================================== - * | | - * | Indentation | - * | | - * ====================================================================== - */ - -static void -indent(nvlist_prtctl_t pctl, int onemore) -{ - int depth; - - switch (pctl->nvprt_indent_mode) { - case NVLIST_INDENT_ABS: - (void) fprintf(pctl->nvprt_fp, "%*s", - pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, ""); - break; - - case NVLIST_INDENT_TABBED: - depth = pctl->nvprt_indent + onemore; - while (depth-- > 0) - (void) fprintf(pctl->nvprt_fp, "\t"); - } -} - -/* - * ====================================================================== - * | | - * | Default nvlist member rendering 
functions. | - * | | - * ====================================================================== - */ - -/* - * Generate functions to print single-valued nvlist members. - * - * type_and_variant - suffix to form function name - * vtype - C type for the member value - * ptype - C type to cast value to for printing - * vfmt - format string for pair value, e.g "%d" or "0x%llx" - */ - -#define NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \ -static int \ -nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ - nvlist_t *nvl, const char *name, vtype value) \ -{ \ - FILE *fp = pctl->nvprt_fp; \ - NOTE(ARGUNUSED(private)) \ - NOTE(ARGUNUSED(nvl)) \ - indent(pctl, 1); \ - (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ - (void) fprintf(fp, vfmt, (ptype)value); \ - return (1); \ -} - -NVLIST_PRTFUNC(boolean, int, int, "%d") -NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d") -NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x") -NVLIST_PRTFUNC(int8, int8_t, int, "%d") -NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x") -NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d") -NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x") -NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d") -NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x") -NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld") -NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx") -NVLIST_PRTFUNC(double, double, double, "0x%f") -NVLIST_PRTFUNC(string, char *, char *, "%s") -NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx") - -/* - * Generate functions to print array-valued nvlist members. 
- */ - -#define NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \ -static int \ -nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ - nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \ -{ \ - FILE *fp = pctl->nvprt_fp; \ - uint_t i; \ - NOTE(ARGUNUSED(private)) \ - NOTE(ARGUNUSED(nvl)) \ - for (i = 0; i < count; i++) { \ - if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \ - indent(pctl, 1); \ - (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ - if (pctl->nvprt_btwnarrfmt_nl) \ - (void) fprintf(fp, "[%d]: ", i); \ - } \ - if (i != 0) \ - (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \ - (void) fprintf(fp, vfmt, (ptype)valuep[i]); \ - } \ - return (1); \ -} - -NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d") -NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x") -NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d") -NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x") -NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d") -NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x") -NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d") -NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x") -NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld") -NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx") -NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s") - -/*ARGSUSED*/ -static int -nvprint_nvlist(nvlist_prtctl_t pctl, void *private, - nvlist_t *nvl, const char *name, nvlist_t *value) -{ - FILE *fp = pctl->nvprt_fp; - - indent(pctl, 1); - (void) fprintf(fp, "%s = (embedded nvlist)\n", name); - - pctl->nvprt_indent += pctl->nvprt_indentinc; - nvlist_print_with_indent(value, pctl); - pctl->nvprt_indent -= pctl->nvprt_indentinc; - - indent(pctl, 1); - (void) fprintf(fp, "(end %s)\n", name); - - return (1); -} - -/*ARGSUSED*/ -static int -nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private, - nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count) -{ - FILE *fp = pctl->nvprt_fp; - 
uint_t i; - - indent(pctl, 1); - (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name); - - for (i = 0; i < count; i++) { - indent(pctl, 1); - (void) fprintf(fp, "(start %s[%d])\n", name, i); - - pctl->nvprt_indent += pctl->nvprt_indentinc; - nvlist_print_with_indent(valuep[i], pctl); - pctl->nvprt_indent -= pctl->nvprt_indentinc; - - indent(pctl, 1); - (void) fprintf(fp, "(end %s[%d])\n", name, i); - } - - return (1); -} - -/* - * ====================================================================== - * | | - * | Interfaces that allow control over formatting. | - * | | - * ====================================================================== - */ - -void -nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp) -{ - pctl->nvprt_fp = fp; -} - -FILE * -nvlist_prtctl_getdest(nvlist_prtctl_t pctl) -{ - return (pctl->nvprt_fp); -} - - -void -nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode, - int start, int inc) -{ - if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED) - mode = NVLIST_INDENT_TABBED; - - if (start < 0) - start = 0; - - if (inc < 0) - inc = 1; - - pctl->nvprt_indent_mode = mode; - pctl->nvprt_indent = start; - pctl->nvprt_indentinc = inc; -} - -void -nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore) -{ - indent(pctl, onemore); -} - - -void -nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, - const char *fmt) -{ - switch (which) { - case NVLIST_FMT_MEMBER_NAME: - if (fmt == NULL) - fmt = "%s = "; - pctl->nvprt_nmfmt = fmt; - break; - - case NVLIST_FMT_MEMBER_POSTAMBLE: - if (fmt == NULL) - fmt = "\n"; - pctl->nvprt_eomfmt = fmt; - break; - - case NVLIST_FMT_BTWN_ARRAY: - if (fmt == NULL) { - pctl->nvprt_btwnarrfmt = " "; - pctl->nvprt_btwnarrfmt_nl = 0; - } else { - pctl->nvprt_btwnarrfmt = fmt; - pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL); - } - break; - - default: - break; - } -} - - -void -nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...) 
-{ - FILE *fp = pctl->nvprt_fp; - va_list ap; - char *name; - - va_start(ap, which); - - switch (which) { - case NVLIST_FMT_MEMBER_NAME: - name = va_arg(ap, char *); - (void) fprintf(fp, pctl->nvprt_nmfmt, name); - break; - - case NVLIST_FMT_MEMBER_POSTAMBLE: - (void) fprintf(fp, pctl->nvprt_eomfmt); - break; - - case NVLIST_FMT_BTWN_ARRAY: - (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \ - break; - - default: - break; - } - - va_end(ap); -} - -/* - * ====================================================================== - * | | - * | Interfaces to allow appointment of replacement rendering functions.| - * | | - * ====================================================================== - */ - -#define NVLIST_PRINTCTL_REPLACE(type, vtype) \ -void \ -nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ - int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \ - void *private) \ -{ \ - CUSTPRTOP(pctl, type) = func; \ - CUSTPRTOPARG(pctl, type) = private; \ -} - -NVLIST_PRINTCTL_REPLACE(boolean, int) -NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t) -NVLIST_PRINTCTL_REPLACE(byte, uchar_t) -NVLIST_PRINTCTL_REPLACE(int8, int8_t) -NVLIST_PRINTCTL_REPLACE(uint8, uint8_t) -NVLIST_PRINTCTL_REPLACE(int16, int16_t) -NVLIST_PRINTCTL_REPLACE(uint16, uint16_t) -NVLIST_PRINTCTL_REPLACE(int32, int32_t) -NVLIST_PRINTCTL_REPLACE(uint32, uint32_t) -NVLIST_PRINTCTL_REPLACE(int64, int64_t) -NVLIST_PRINTCTL_REPLACE(uint64, uint64_t) -NVLIST_PRINTCTL_REPLACE(double, double) -NVLIST_PRINTCTL_REPLACE(string, char *) -NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t) -NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *) - -#define NVLIST_PRINTCTL_AREPLACE(type, vtype) \ -void \ -nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ - int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \ - uint_t), void *private) \ -{ \ - CUSTPRTOP(pctl, type) = func; \ - CUSTPRTOPARG(pctl, type) = private; \ -} - -NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *) -NVLIST_PRINTCTL_AREPLACE(byte_array, 
uchar_t *) -NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *) -NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *) -NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *) -NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *) -NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *) -NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *) -NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *) -NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *) -NVLIST_PRINTCTL_AREPLACE(string_array, char **) -NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **) - -/* - * ====================================================================== - * | | - * | Interfaces to manage nvlist_prtctl_t cookies. | - * | | - * ====================================================================== - */ - - -static const struct nvlist_printops defprtops = { - { nvprint_boolean, NULL }, - { nvprint_boolean_value, NULL }, - { nvprint_byte, NULL }, - { nvprint_int8, NULL }, - { nvprint_uint8, NULL }, - { nvprint_int16, NULL }, - { nvprint_uint16, NULL }, - { nvprint_int32, NULL }, - { nvprint_uint32, NULL }, - { nvprint_int64, NULL }, - { nvprint_uint64, NULL }, - { nvprint_double, NULL }, - { nvprint_string, NULL }, - { nvprint_hrtime, NULL }, - { nvprint_nvlist, NULL }, - { nvaprint_boolean_array, NULL }, - { nvaprint_byte_array, NULL }, - { nvaprint_int8_array, NULL }, - { nvaprint_uint8_array, NULL }, - { nvaprint_int16_array, NULL }, - { nvaprint_uint16_array, NULL }, - { nvaprint_int32_array, NULL }, - { nvaprint_uint32_array, NULL }, - { nvaprint_int64_array, NULL }, - { nvaprint_uint64_array, NULL }, - { nvaprint_string_array, NULL }, - { nvaprint_nvlist_array, NULL }, -}; - -static void -prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl, - struct nvlist_printops *ops) -{ - pctl->nvprt_fp = fp; - pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED; - pctl->nvprt_indent = 0; - pctl->nvprt_indentinc = 1; - pctl->nvprt_nmfmt = "%s = "; - pctl->nvprt_eomfmt = "\n"; - pctl->nvprt_btwnarrfmt = " "; - pctl->nvprt_btwnarrfmt_nl = 0; - 
- pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops; - pctl->nvprt_custops = ops; -} - -nvlist_prtctl_t -nvlist_prtctl_alloc(void) -{ - struct nvlist_prtctl *pctl; - struct nvlist_printops *ops; - - if ((pctl = malloc(sizeof (*pctl))) == NULL) - return (NULL); - - if ((ops = calloc(1, sizeof (*ops))) == NULL) { - free(pctl); - return (NULL); - } - - prtctl_defaults(stdout, pctl, ops); - - return (pctl); -} - -void -nvlist_prtctl_free(nvlist_prtctl_t pctl) -{ - if (pctl != NULL) { - free(pctl->nvprt_custops); - free(pctl); - } -} - -/* - * ====================================================================== - * | | - * | Top-level print request interfaces. | - * | | - * ====================================================================== - */ - -/* - * nvlist_print - Prints elements in an event buffer - */ -static void -nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl) -{ - FILE *fp = pctl->nvprt_fp; - char *name; - uint_t nelem; - nvpair_t *nvp; - - if (nvl == NULL) - return; - - indent(pctl, 0); - (void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl)); - - nvp = nvlist_next_nvpair(nvl, NULL); - - while (nvp) { - data_type_t type = nvpair_type(nvp); - - name = nvpair_name(nvp); - nelem = 0; - - switch (type) { - case DATA_TYPE_BOOLEAN: { - RENDER(pctl, boolean, nvl, name, 1); - break; - } - case DATA_TYPE_BOOLEAN_VALUE: { - boolean_t val; - (void) nvpair_value_boolean_value(nvp, &val); - RENDER(pctl, boolean_value, nvl, name, val); - break; - } - case DATA_TYPE_BYTE: { - uchar_t val; - (void) nvpair_value_byte(nvp, &val); - RENDER(pctl, byte, nvl, name, val); - break; - } - case DATA_TYPE_INT8: { - int8_t val; - (void) nvpair_value_int8(nvp, &val); - RENDER(pctl, int8, nvl, name, val); - break; - } - case DATA_TYPE_UINT8: { - uint8_t val; - (void) nvpair_value_uint8(nvp, &val); - RENDER(pctl, uint8, nvl, name, val); - break; - } - case DATA_TYPE_INT16: { - int16_t val; - (void) nvpair_value_int16(nvp, &val); - RENDER(pctl, int16, nvl, 
name, val); - break; - } - case DATA_TYPE_UINT16: { - uint16_t val; - (void) nvpair_value_uint16(nvp, &val); - RENDER(pctl, uint16, nvl, name, val); - break; - } - case DATA_TYPE_INT32: { - int32_t val; - (void) nvpair_value_int32(nvp, &val); - RENDER(pctl, int32, nvl, name, val); - break; - } - case DATA_TYPE_UINT32: { - uint32_t val; - (void) nvpair_value_uint32(nvp, &val); - RENDER(pctl, uint32, nvl, name, val); - break; - } - case DATA_TYPE_INT64: { - int64_t val; - (void) nvpair_value_int64(nvp, &val); - RENDER(pctl, int64, nvl, name, val); - break; - } - case DATA_TYPE_UINT64: { - uint64_t val; - (void) nvpair_value_uint64(nvp, &val); - RENDER(pctl, uint64, nvl, name, val); - break; - } - case DATA_TYPE_DOUBLE: { - double val; - (void) nvpair_value_double(nvp, &val); - RENDER(pctl, double, nvl, name, val); - break; - } - case DATA_TYPE_STRING: { - char *val; - (void) nvpair_value_string(nvp, &val); - RENDER(pctl, string, nvl, name, val); - break; - } - case DATA_TYPE_BOOLEAN_ARRAY: { - boolean_t *val; - (void) nvpair_value_boolean_array(nvp, &val, &nelem); - ARENDER(pctl, boolean_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_BYTE_ARRAY: { - uchar_t *val; - (void) nvpair_value_byte_array(nvp, &val, &nelem); - ARENDER(pctl, byte_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_INT8_ARRAY: { - int8_t *val; - (void) nvpair_value_int8_array(nvp, &val, &nelem); - ARENDER(pctl, int8_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_UINT8_ARRAY: { - uint8_t *val; - (void) nvpair_value_uint8_array(nvp, &val, &nelem); - ARENDER(pctl, uint8_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_INT16_ARRAY: { - int16_t *val; - (void) nvpair_value_int16_array(nvp, &val, &nelem); - ARENDER(pctl, int16_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_UINT16_ARRAY: { - uint16_t *val; - (void) nvpair_value_uint16_array(nvp, &val, &nelem); - ARENDER(pctl, uint16_array, nvl, name, val, nelem); - break; - } - case 
DATA_TYPE_INT32_ARRAY: { - int32_t *val; - (void) nvpair_value_int32_array(nvp, &val, &nelem); - ARENDER(pctl, int32_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_UINT32_ARRAY: { - uint32_t *val; - (void) nvpair_value_uint32_array(nvp, &val, &nelem); - ARENDER(pctl, uint32_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_INT64_ARRAY: { - int64_t *val; - (void) nvpair_value_int64_array(nvp, &val, &nelem); - ARENDER(pctl, int64_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_UINT64_ARRAY: { - uint64_t *val; - (void) nvpair_value_uint64_array(nvp, &val, &nelem); - ARENDER(pctl, uint64_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_STRING_ARRAY: { - char **val; - (void) nvpair_value_string_array(nvp, &val, &nelem); - ARENDER(pctl, string_array, nvl, name, val, nelem); - break; - } - case DATA_TYPE_HRTIME: { - hrtime_t val; - (void) nvpair_value_hrtime(nvp, &val); - RENDER(pctl, hrtime, nvl, name, val); - break; - } - case DATA_TYPE_NVLIST: { - nvlist_t *val; - (void) nvpair_value_nvlist(nvp, &val); - RENDER(pctl, nvlist, nvl, name, val); - break; - } - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **val; - (void) nvpair_value_nvlist_array(nvp, &val, &nelem); - ARENDER(pctl, nvlist_array, nvl, name, val, nelem); - break; - } - default: - (void) fprintf(fp, " unknown data type (%d)", type); - break; - } - nvp = nvlist_next_nvpair(nvl, nvp); - } -} - -void -nvlist_print(FILE *fp, nvlist_t *nvl) -{ - struct nvlist_prtctl pc; - - prtctl_defaults(fp, &pc, NULL); - nvlist_print_with_indent(nvl, &pc); -} - -void -nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl) -{ - nvlist_print_with_indent(nvl, pctl); -} - -#define NVP(elem, type, vtype, ptype, format) { \ - vtype value; \ -\ - (void) nvpair_value_##type(elem, &value); \ - (void) printf("%*s%s: " format "\n", indent, "", \ - nvpair_name(elem), (ptype)value); \ -} - -#define NVPA(elem, type, vtype, ptype, format) { \ - uint_t i, count; \ - vtype *value; \ -\ - (void) 
nvpair_value_##type(elem, &value, &count); \ - for (i = 0; i < count; i++) { \ - (void) printf("%*s%s[%d]: " format "\n", indent, "", \ - nvpair_name(elem), i, (ptype)value[i]); \ - } \ -} - -/* - * Similar to nvlist_print() but handles arrays slightly differently. - */ -void -dump_nvlist(nvlist_t *list, int indent) -{ - nvpair_t *elem = NULL; - boolean_t bool_value; - boolean_t *bool_array_value; - nvlist_t *nvlist_value; - nvlist_t **nvlist_array_value; - uint_t i, count; - - if (list == NULL) { - return; - } - - while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { - switch (nvpair_type(elem)) { - case DATA_TYPE_BOOLEAN: - (void) printf("%*s%s\n", indent, "", nvpair_name(elem)); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - (void) nvpair_value_boolean_value(elem, &bool_value); - (void) printf("%*s%s: %s\n", indent, "", - nvpair_name(elem), bool_value ? "true" : "false"); - break; - - case DATA_TYPE_BYTE: - NVP(elem, byte, uchar_t, int, "%u"); - break; - - case DATA_TYPE_INT8: - NVP(elem, int8, int8_t, int, "%d"); - break; - - case DATA_TYPE_UINT8: - NVP(elem, uint8, uint8_t, int, "%u"); - break; - - case DATA_TYPE_INT16: - NVP(elem, int16, int16_t, int, "%d"); - break; - - case DATA_TYPE_UINT16: - NVP(elem, uint16, uint16_t, int, "%u"); - break; - - case DATA_TYPE_INT32: - NVP(elem, int32, int32_t, long, "%ld"); - break; - - case DATA_TYPE_UINT32: - NVP(elem, uint32, uint32_t, ulong_t, "%lu"); - break; - - case DATA_TYPE_INT64: - NVP(elem, int64, int64_t, longlong_t, "%lld"); - break; - - case DATA_TYPE_UINT64: - NVP(elem, uint64, uint64_t, u_longlong_t, "%llu"); - break; - - case DATA_TYPE_STRING: - NVP(elem, string, char *, char *, "'%s'"); - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - (void) nvpair_value_boolean_array(elem, - &bool_array_value, &count); - for (i = 0; i < count; i++) { - (void) printf("%*s%s[%d]: %s\n", indent, "", - nvpair_name(elem), i, - bool_array_value[i] ? 
"true" : "false"); - } - break; - - case DATA_TYPE_BYTE_ARRAY: - NVPA(elem, byte_array, uchar_t, int, "%u"); - break; - - case DATA_TYPE_INT8_ARRAY: - NVPA(elem, int8_array, int8_t, int, "%d"); - break; - - case DATA_TYPE_UINT8_ARRAY: - NVPA(elem, uint8_array, uint8_t, int, "%u"); - break; - - case DATA_TYPE_INT16_ARRAY: - NVPA(elem, int16_array, int16_t, int, "%d"); - break; - - case DATA_TYPE_UINT16_ARRAY: - NVPA(elem, uint16_array, uint16_t, int, "%u"); - break; - - case DATA_TYPE_INT32_ARRAY: - NVPA(elem, int32_array, int32_t, long, "%ld"); - break; - - case DATA_TYPE_UINT32_ARRAY: - NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu"); - break; - - case DATA_TYPE_INT64_ARRAY: - NVPA(elem, int64_array, int64_t, longlong_t, "%lld"); - break; - - case DATA_TYPE_UINT64_ARRAY: - NVPA(elem, uint64_array, uint64_t, u_longlong_t, - "%llu"); - break; - - case DATA_TYPE_STRING_ARRAY: - NVPA(elem, string_array, char *, char *, "'%s'"); - break; - - case DATA_TYPE_NVLIST: - (void) nvpair_value_nvlist(elem, &nvlist_value); - (void) printf("%*s%s:\n", indent, "", - nvpair_name(elem)); - dump_nvlist(nvlist_value, indent + 4); - break; - - case DATA_TYPE_NVLIST_ARRAY: - (void) nvpair_value_nvlist_array(elem, - &nvlist_array_value, &count); - for (i = 0; i < count; i++) { - (void) printf("%*s%s[%u]:\n", indent, "", - nvpair_name(elem), i); - dump_nvlist(nvlist_array_value[i], indent + 4); - } - break; - - default: - (void) printf(dgettext(TEXT_DOMAIN, "bad config type " - "%d for %s\n"), nvpair_type(elem), - nvpair_name(elem)); - } - } -} - -/* - * ====================================================================== - * | | - * | Misc private interface. | - * | | - * ====================================================================== - */ - -/* - * Determine if string 'value' matches 'nvp' value. The 'value' string is - * converted, depending on the type of 'nvp', prior to match. For numeric - * types, a radix independent sscanf conversion of 'value' is used. 
If 'nvp' - * is an array type, 'ai' is the index into the array against which we are - * checking for match. If nvp is of DATA_TYPE_STRING*, the caller can pass - * in a regex_t compilation of value in 'value_regex' to trigger regular - * expression string match instead of simple strcmp(). - * - * Return 1 on match, 0 on no-match, and -1 on error. If the error is - * related to value syntax error and 'ep' is non-NULL, *ep will point into - * the 'value' string at the location where the error exists. - * - * NOTE: It may be possible to move the non-regex_t version of this into - * common code used by library/kernel/boot. - */ -int -nvpair_value_match_regex(nvpair_t *nvp, int ai, - char *value, regex_t *value_regex, char **ep) -{ - char *evalue; - uint_t a_len; - int sr; - - if (ep) - *ep = NULL; - - if ((nvp == NULL) || (value == NULL)) - return (-1); /* error fail match - invalid args */ - - /* make sure array and index combination make sense */ - if ((nvpair_type_is_array(nvp) && (ai < 0)) || - (!nvpair_type_is_array(nvp) && (ai >= 0))) - return (-1); /* error fail match - bad index */ - - /* non-string values should be single 'chunk' */ - if ((nvpair_type(nvp) != DATA_TYPE_STRING) && - (nvpair_type(nvp) != DATA_TYPE_STRING_ARRAY)) { - value += strspn(value, " \t"); - evalue = value + strcspn(value, " \t"); - if (*evalue) { - if (ep) - *ep = evalue; - return (-1); /* error fail match - syntax */ - } - } - - sr = EOF; - switch (nvpair_type(nvp)) { - case DATA_TYPE_STRING: { - char *val; - - /* check string value for match */ - if (nvpair_value_string(nvp, &val) == 0) { - if (value_regex) { - if (regexec(value_regex, val, - (size_t)0, NULL, 0) == 0) - return (1); /* match */ - } else { - if (strcmp(value, val) == 0) - return (1); /* match */ - } - } - break; - } - case DATA_TYPE_STRING_ARRAY: { - char **val_array; - - /* check indexed string value of array for match */ - if ((nvpair_value_string_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len)) { - if 
(value_regex) { - if (regexec(value_regex, val_array[ai], - (size_t)0, NULL, 0) == 0) - return (1); - } else { - if (strcmp(value, val_array[ai]) == 0) - return (1); - } - } - break; - } - case DATA_TYPE_BYTE: { - uchar_t val, val_arg; - - /* scanf uchar_t from value and check for match */ - sr = sscanf(value, "%c", &val_arg); - if ((sr == 1) && (nvpair_value_byte(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_BYTE_ARRAY: { - uchar_t *val_array, val_arg; - - - /* check indexed value of array for match */ - sr = sscanf(value, "%c", &val_arg); - if ((sr == 1) && - (nvpair_value_byte_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT8: { - int8_t val, val_arg; - - /* scanf int8_t from value and check for match */ - sr = sscanf(value, "%"SCNi8, &val_arg); - if ((sr == 1) && - (nvpair_value_int8(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT8_ARRAY: { - int8_t *val_array, val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi8, &val_arg); - if ((sr == 1) && - (nvpair_value_int8_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT8: { - uint8_t val, val_arg; - - /* scanf uint8_t from value and check for match */ - sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint8(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT8_ARRAY: { - uint8_t *val_array, val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint8_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT16: { - int16_t val, val_arg; - - /* scanf int16_t from value and check for match */ - sr = 
sscanf(value, "%"SCNi16, &val_arg); - if ((sr == 1) && - (nvpair_value_int16(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT16_ARRAY: { - int16_t *val_array, val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi16, &val_arg); - if ((sr == 1) && - (nvpair_value_int16_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT16: { - uint16_t val, val_arg; - - /* scanf uint16_t from value and check for match */ - sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint16(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT16_ARRAY: { - uint16_t *val_array, val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint16_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT32: { - int32_t val, val_arg; - - /* scanf int32_t from value and check for match */ - sr = sscanf(value, "%"SCNi32, &val_arg); - if ((sr == 1) && - (nvpair_value_int32(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT32_ARRAY: { - int32_t *val_array, val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi32, &val_arg); - if ((sr == 1) && - (nvpair_value_int32_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT32: { - uint32_t val, val_arg; - - /* scanf uint32_t from value and check for match */ - sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint32(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT32_ARRAY: { - uint32_t *val_array, val_arg; - - /* check indexed value of array for match 
*/ - sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint32_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT64: { - int64_t val, val_arg; - - /* scanf int64_t from value and check for match */ - sr = sscanf(value, "%"SCNi64, &val_arg); - if ((sr == 1) && - (nvpair_value_int64(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_INT64_ARRAY: { - int64_t *val_array, val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi64, &val_arg); - if ((sr == 1) && - (nvpair_value_int64_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT64: { - uint64_t val_arg, val; - - /* scanf uint64_t from value and check for match */ - sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint64(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_UINT64_ARRAY: { - uint64_t *val_array, val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg); - if ((sr == 1) && - (nvpair_value_uint64_array(nvp, &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case DATA_TYPE_BOOLEAN_VALUE: { - int32_t val_arg; - boolean_t val; - - /* scanf boolean_t from value and check for match */ - sr = sscanf(value, "%"SCNi32, &val_arg); - if ((sr == 1) && - (nvpair_value_boolean_value(nvp, &val) == 0) && - (val == val_arg)) - return (1); - break; - } - case DATA_TYPE_BOOLEAN_ARRAY: { - boolean_t *val_array; - int32_t val_arg; - - /* check indexed value of array for match */ - sr = sscanf(value, "%"SCNi32, &val_arg); - if ((sr == 1) && - (nvpair_value_boolean_array(nvp, - &val_array, &a_len) == 0) && - (ai < a_len) && - (val_array[ai] == val_arg)) - return (1); - break; - } - case 
DATA_TYPE_HRTIME: - case DATA_TYPE_NVLIST: - case DATA_TYPE_NVLIST_ARRAY: - case DATA_TYPE_BOOLEAN: - case DATA_TYPE_DOUBLE: - case DATA_TYPE_UNKNOWN: - default: - /* - * unknown/unsupported data type - */ - return (-1); /* error fail match */ - } - - /* - * check to see if sscanf failed conversion, return approximate - * pointer to problem - */ - if (sr != 1) { - if (ep) - *ep = value; - return (-1); /* error fail match - syntax */ - } - - return (0); /* fail match */ -} - -int -nvpair_value_match(nvpair_t *nvp, int ai, char *value, char **ep) -{ - return (nvpair_value_match_regex(nvp, ai, value, NULL, ep)); -} diff --git a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h b/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h deleted file mode 100644 index b05669e506ba..000000000000 --- a/cddl/contrib/opensolaris/lib/libnvpair/libnvpair.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
- */ - -#ifndef _LIBNVPAIR_H -#define _LIBNVPAIR_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * All interfaces described in this file are private to Solaris, and - * are subject to change at any time and without notice. The public - * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR, - * are all imported from included above. - */ - -extern int nvpair_value_match(nvpair_t *, int, char *, char **); -extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, - char **); - -extern void nvlist_print(FILE *, nvlist_t *); -extern int nvlist_print_json(FILE *, nvlist_t *); -extern void dump_nvlist(nvlist_t *, int); - -/* - * Private nvlist printing interface that allows the caller some control - * over output rendering (as opposed to nvlist_print and dump_nvlist). - * - * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc - * (NULL on failure); on return the cookie is set up for default formatting - * and rendering. Quote the cookie in subsequent customisation functions and - * then pass the cookie to nvlist_prt to render the nvlist. Finally, - * use nvlist_prtctl_free to release the cookie. - * - * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions - * we have a corresponding brace of functions that appoint replacement - * rendering functions: - * - * extern void nvlist_prtctl_xxx(nvlist_prtctl_t, - * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, - * xxxtype value)) - * - * and - * - * extern void nvlist_prtctl_xxx_array(nvlist_prtctl_t, - * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, - * xxxtype value, uint_t count)) - * - * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8" - * and char * for "string". 
The function that is appointed to render the - * specified datatype receives as arguments the cookie, the nvlist - * member name, the value of that member (or a pointer for array function), - * and (for array rendering functions) a count of the number of elements. - */ - -typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */ - -enum nvlist_indent_mode { - NVLIST_INDENT_ABS, /* Absolute indentation */ - NVLIST_INDENT_TABBED /* Indent with tabstops */ -}; - -extern nvlist_prtctl_t nvlist_prtctl_alloc(void); -extern void nvlist_prtctl_free(nvlist_prtctl_t); -extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t); - -/* Output stream */ -extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *); -extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t); - -/* Indentation mode, start indent, indent increment; default tabbed/0/1 */ -extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode, - int, int); -extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int); - -enum nvlist_prtctl_fmt { - NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */ - NVLIST_FMT_MEMBER_POSTAMBLE, /* after nvlist member; default "\n" */ - NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */ -}; - -extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, - const char *); -extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...); - -/* - * Function prototypes for interfaces that appoint a new rendering function - * for single-valued nvlist members. - * - * A replacement function receives arguments as follows: - * - * nvlist_prtctl_t Print control structure; do not change preferences - * for this object from a print callback function. - * - * void * The function-private cookie argument registered - * when the replacement function was appointed. - * - * nvlist_t * The full nvlist that is being processed. 
The - * rendering function is called to render a single - * member (name and value passed as below) but it may - * want to reference or incorporate other aspects of - * the full nvlist. - * - * const char * Member name to render - * - * valtype Value of the member to render - * - * The function must return non-zero if it has rendered output for this - * member, or 0 if it wants to default to standard rendering for this - * one member. - */ - -#define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \ - extern void funcname(nvlist_prtctl_t, \ - int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \ - void *) - -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t); -NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *); - -#undef NVLIST_PRINTCTL_SVDECL /* was just for "clarity" above */ - -/* - * Function prototypes for interfaces that appoint a new rendering function - * for array-valued nvlist members. - * - * One additional argument is taken: uint_t for the number of array elements - * - * Return values as above. 
- */ -#define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \ - extern void funcname(nvlist_prtctl_t, \ - int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \ - void *) - -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **); -NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **); - -#undef NVLIST_PRINTCTL_AVDECL /* was just for "clarity" above */ - -#ifdef __cplusplus -} -#endif - -#endif /* _LIBNVPAIR_H */ diff --git a/cddl/contrib/opensolaris/lib/libnvpair/nvpair_alloc_system.c b/cddl/contrib/opensolaris/lib/libnvpair/nvpair_alloc_system.c deleted file mode 100644 index 1aefc1004daf..000000000000 --- a/cddl/contrib/opensolaris/lib/libnvpair/nvpair_alloc_system.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include - -/*ARGSUSED*/ -static void * -nv_alloc_sys(nv_alloc_t *nva, size_t size) -{ - return (malloc(size)); -} - -/*ARGSUSED*/ -static void -nv_free_sys(nv_alloc_t *nva, void *buf, size_t size) -{ - free(buf); -} - -const nv_alloc_ops_t system_ops_def = { - NULL, /* nv_ao_init() */ - NULL, /* nv_ao_fini() */ - nv_alloc_sys, /* nv_ao_alloc() */ - nv_free_sys, /* nv_ao_free() */ - NULL /* nv_ao_reset() */ -}; - -nv_alloc_t nv_alloc_nosleep_def = { - &system_ops_def, - NULL -}; - -nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; diff --git a/cddl/contrib/opensolaris/lib/libnvpair/nvpair_json.c b/cddl/contrib/opensolaris/lib/libnvpair/nvpair_json.c deleted file mode 100644 index b687a2f5761a..000000000000 --- a/cddl/contrib/opensolaris/lib/libnvpair/nvpair_json.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ -/* - * Copyright (c) 2014, Joyent, Inc. - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include - -#include "libnvpair.h" - -#define FPRINTF(fp, ...) 
\ - do { \ - if (fprintf(fp, __VA_ARGS__) < 0) \ - return (-1); \ - } while (0) - -/* - * When formatting a string for JSON output we must escape certain characters, - * as described in RFC4627. This applies to both member names and - * DATA_TYPE_STRING values. - * - * This function will only operate correctly if the following conditions are - * met: - * - * 1. The input String is encoded in the current locale. - * - * 2. The current locale includes the Basic Multilingual Plane (plane 0) - * as defined in the Unicode standard. - * - * The output will be entirely 7-bit ASCII (as a subset of UTF-8) with all - * representable Unicode characters included in their escaped numeric form. - */ -static int -nvlist_print_json_string(FILE *fp, const char *input) -{ - mbstate_t mbr; - wchar_t c; - size_t sz; - - bzero(&mbr, sizeof (mbr)); - - FPRINTF(fp, "\""); - while ((sz = mbrtowc(&c, input, MB_CUR_MAX, &mbr)) > 0) { - switch (c) { - case '"': - FPRINTF(fp, "\\\""); - break; - case '\n': - FPRINTF(fp, "\\n"); - break; - case '\r': - FPRINTF(fp, "\\r"); - break; - case '\\': - FPRINTF(fp, "\\\\"); - break; - case '\f': - FPRINTF(fp, "\\f"); - break; - case '\t': - FPRINTF(fp, "\\t"); - break; - case '\b': - FPRINTF(fp, "\\b"); - break; - default: - if ((c >= 0x00 && c <= 0x1f) || - (c > 0x7f && c <= 0xffff)) { - /* - * Render both Control Characters and Unicode - * characters in the Basic Multilingual Plane - * as JSON-escaped multibyte characters. - */ - FPRINTF(fp, "\\u%04x", (int)(0xffff & c)); - } else if (c >= 0x20 && c <= 0x7f) { - /* - * Render other 7-bit ASCII characters directly - * and drop other, unrepresentable characters. - */ - FPRINTF(fp, "%c", (int)(0xff & c)); - } - break; - } - input += sz; - } - - if (sz == (size_t)-1 || sz == (size_t)-2) { - /* - * We last read an invalid multibyte character sequence, - * so return an error. 
- */ - return (-1); - } - - FPRINTF(fp, "\""); - return (0); -} - -/* - * Dump a JSON-formatted representation of an nvlist to the provided FILE *. - * This routine does not output any new-lines or additional whitespace other - * than that contained in strings, nor does it call fflush(3C). - */ -int -nvlist_print_json(FILE *fp, nvlist_t *nvl) -{ - nvpair_t *curr; - boolean_t first = B_TRUE; - - FPRINTF(fp, "{"); - - for (curr = nvlist_next_nvpair(nvl, NULL); curr; - curr = nvlist_next_nvpair(nvl, curr)) { - data_type_t type = nvpair_type(curr); - - if (!first) - FPRINTF(fp, ","); - else - first = B_FALSE; - - if (nvlist_print_json_string(fp, nvpair_name(curr)) == -1) - return (-1); - FPRINTF(fp, ":"); - - switch (type) { - case DATA_TYPE_STRING: { - char *string = fnvpair_value_string(curr); - if (nvlist_print_json_string(fp, string) == -1) - return (-1); - break; - } - - case DATA_TYPE_BOOLEAN: { - FPRINTF(fp, "true"); - break; - } - - case DATA_TYPE_BOOLEAN_VALUE: { - FPRINTF(fp, "%s", fnvpair_value_boolean_value(curr) == - B_TRUE ? 
"true" : "false"); - break; - } - - case DATA_TYPE_BYTE: { - FPRINTF(fp, "%hhu", fnvpair_value_byte(curr)); - break; - } - - case DATA_TYPE_INT8: { - FPRINTF(fp, "%hhd", fnvpair_value_int8(curr)); - break; - } - - case DATA_TYPE_UINT8: { - FPRINTF(fp, "%hhu", fnvpair_value_uint8_t(curr)); - break; - } - - case DATA_TYPE_INT16: { - FPRINTF(fp, "%hd", fnvpair_value_int16(curr)); - break; - } - - case DATA_TYPE_UINT16: { - FPRINTF(fp, "%hu", fnvpair_value_uint16(curr)); - break; - } - - case DATA_TYPE_INT32: { - FPRINTF(fp, "%d", fnvpair_value_int32(curr)); - break; - } - - case DATA_TYPE_UINT32: { - FPRINTF(fp, "%u", fnvpair_value_uint32(curr)); - break; - } - - case DATA_TYPE_INT64: { - FPRINTF(fp, "%lld", - (long long)fnvpair_value_int64(curr)); - break; - } - - case DATA_TYPE_UINT64: { - FPRINTF(fp, "%llu", - (unsigned long long)fnvpair_value_uint64(curr)); - break; - } - - case DATA_TYPE_HRTIME: { - hrtime_t val; - VERIFY0(nvpair_value_hrtime(curr, &val)); - FPRINTF(fp, "%llu", (unsigned long long)val); - break; - } - - case DATA_TYPE_DOUBLE: { - double val; - VERIFY0(nvpair_value_double(curr, &val)); - FPRINTF(fp, "%f", val); - break; - } - - case DATA_TYPE_NVLIST: { - if (nvlist_print_json(fp, - fnvpair_value_nvlist(curr)) == -1) - return (-1); - break; - } - - case DATA_TYPE_STRING_ARRAY: { - char **val; - uint_t valsz, i; - VERIFY0(nvpair_value_string_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - if (nvlist_print_json_string(fp, val[i]) == -1) - return (-1); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **val; - uint_t valsz, i; - VERIFY0(nvpair_value_nvlist_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - if (nvlist_print_json(fp, val[i]) == -1) - return (-1); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_BOOLEAN_ARRAY: { - boolean_t *val; - uint_t valsz, i; - 
VERIFY0(nvpair_value_boolean_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, val[i] == B_TRUE ? - "true" : "false"); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_BYTE_ARRAY: { - uchar_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_byte_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%hhu", val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_UINT8_ARRAY: { - uint8_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_uint8_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%hhu", val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_INT8_ARRAY: { - int8_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_int8_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%hhd", val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_UINT16_ARRAY: { - uint16_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_uint16_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%hu", val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_INT16_ARRAY: { - int16_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_int16_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%hd", val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_UINT32_ARRAY: { - uint32_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_uint32_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%u", val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_INT32_ARRAY: { - int32_t *val; - uint_t valsz, i; - 
VERIFY0(nvpair_value_int32_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%d", val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_UINT64_ARRAY: { - uint64_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_uint64_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%llu", - (unsigned long long)val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_INT64_ARRAY: { - int64_t *val; - uint_t valsz, i; - VERIFY0(nvpair_value_int64_array(curr, &val, &valsz)); - FPRINTF(fp, "["); - for (i = 0; i < valsz; i++) { - if (i > 0) - FPRINTF(fp, ","); - FPRINTF(fp, "%lld", (long long)val[i]); - } - FPRINTF(fp, "]"); - break; - } - - case DATA_TYPE_UNKNOWN: - case DATA_TYPE_DONTCARE: - return (-1); - } - - } - - FPRINTF(fp, "}"); - return (0); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h b/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h deleted file mode 100644 index 7a5f8a8570c6..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil.h +++ /dev/null @@ -1,391 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _LIBUUTIL_H -#define _LIBUUTIL_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Standard flags codes. - */ -#define UU_DEFAULT 0 - -/* - * Standard error codes. - */ -#define UU_ERROR_NONE 0 /* no error */ -#define UU_ERROR_INVALID_ARGUMENT 1 /* invalid argument */ -#define UU_ERROR_UNKNOWN_FLAG 2 /* passed flag invalid */ -#define UU_ERROR_NO_MEMORY 3 /* out of memory */ -#define UU_ERROR_CALLBACK_FAILED 4 /* callback-initiated error */ -#define UU_ERROR_NOT_SUPPORTED 5 /* operation not supported */ -#define UU_ERROR_EMPTY 6 /* no value provided */ -#define UU_ERROR_UNDERFLOW 7 /* value is too small */ -#define UU_ERROR_OVERFLOW 8 /* value is too value */ -#define UU_ERROR_INVALID_CHAR 9 /* value contains unexpected char */ -#define UU_ERROR_INVALID_DIGIT 10 /* value contains digit not in base */ - -#define UU_ERROR_SYSTEM 99 /* underlying system error */ -#define UU_ERROR_UNKNOWN 100 /* error status not known */ - -/* - * Standard program exit codes. - */ -#define UU_EXIT_OK (*(uu_exit_ok())) -#define UU_EXIT_FATAL (*(uu_exit_fatal())) -#define UU_EXIT_USAGE (*(uu_exit_usage())) - -/* - * Exit status profiles. - */ -#define UU_PROFILE_DEFAULT 0 -#define UU_PROFILE_LAUNCHER 1 - -/* - * Error reporting functions. - */ -uint32_t uu_error(void); -const char *uu_strerror(uint32_t); - -/* - * Program notification functions. 
- */ -extern void uu_alt_exit(int); -extern const char *uu_setpname(char *); -extern const char *uu_getpname(void); -/*PRINTFLIKE1*/ -extern void uu_warn(const char *, ...); -extern void uu_vwarn(const char *, va_list); -/*PRINTFLIKE1*/ -extern void uu_die(const char *, ...) __NORETURN; -extern void uu_vdie(const char *, va_list) __NORETURN; -/*PRINTFLIKE2*/ -extern void uu_xdie(int, const char *, ...) __NORETURN; -extern void uu_vxdie(int, const char *, va_list) __NORETURN; - -/* - * Exit status functions (not to be used directly) - */ -extern int *uu_exit_ok(void); -extern int *uu_exit_fatal(void); -extern int *uu_exit_usage(void); - -/* - * string->number conversions - */ -extern int uu_strtoint(const char *, void *, size_t, int, int64_t, int64_t); -extern int uu_strtouint(const char *, void *, size_t, int, uint64_t, uint64_t); - -/* - * Debug print facility functions. - */ -typedef struct uu_dprintf uu_dprintf_t; - -typedef enum { - UU_DPRINTF_SILENT, - UU_DPRINTF_FATAL, - UU_DPRINTF_WARNING, - UU_DPRINTF_NOTICE, - UU_DPRINTF_INFO, - UU_DPRINTF_DEBUG -} uu_dprintf_severity_t; - -extern uu_dprintf_t *uu_dprintf_create(const char *, uu_dprintf_severity_t, - uint_t); -/*PRINTFLIKE3*/ -extern void uu_dprintf(uu_dprintf_t *, uu_dprintf_severity_t, - const char *, ...); -extern void uu_dprintf_destroy(uu_dprintf_t *); -extern const char *uu_dprintf_getname(uu_dprintf_t *); - -/* - * Identifier test flags and function. - */ -#define UU_NAME_DOMAIN 0x1 /* allow SUNW, or com.sun, prefix */ -#define UU_NAME_PATH 0x2 /* allow '/'-delimited paths */ - -int uu_check_name(const char *, uint_t); - -/* - * File creation functions. - */ -extern int uu_open_tmp(const char *dir, uint_t uflags); - -/* - * Convenience functions. 
- */ -#define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0])) - -/*PRINTFLIKE1*/ -extern char *uu_msprintf(const char *format, ...); -extern void *uu_zalloc(size_t); -extern char *uu_strdup(const char *); -extern void uu_free(void *); - -extern boolean_t uu_strcaseeq(const char *a, const char *b); -extern boolean_t uu_streq(const char *a, const char *b); -extern char *uu_strndup(const char *s, size_t n); -extern boolean_t uu_strbw(const char *a, const char *b); -extern void *uu_memdup(const void *buf, size_t sz); -extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len); - -/* - * Comparison function type definition. - * Developers should be careful in their use of the _private argument. If you - * break interface guarantees, you get undefined behavior. - */ -typedef int uu_compare_fn_t(const void *__left, const void *__right, - void *__private); - -/* - * Walk variant flags. - * A data structure need not provide support for all variants and - * combinations. Refer to the appropriate documentation. - */ -#define UU_WALK_ROBUST 0x00000001 /* walk can survive removes */ -#define UU_WALK_REVERSE 0x00000002 /* reverse walk order */ - -#define UU_WALK_PREORDER 0x00000010 /* walk tree in pre-order */ -#define UU_WALK_POSTORDER 0x00000020 /* walk tree in post-order */ - -/* - * Walk callback function return codes. - */ -#define UU_WALK_ERROR -1 -#define UU_WALK_NEXT 0 -#define UU_WALK_DONE 1 - -/* - * Walk callback function type definition. - */ -typedef int uu_walk_fn_t(void *_elem, void *_private); - -/* - * lists: opaque structures - */ -typedef struct uu_list_pool uu_list_pool_t; -typedef struct uu_list uu_list_t; - -typedef struct uu_list_node { - uintptr_t uln_opaque[2]; -} uu_list_node_t; - -typedef struct uu_list_walk uu_list_walk_t; - -typedef uintptr_t uu_list_index_t; - -/* - * lists: interface - * - * basic usage: - * typedef struct foo { - * ... - * uu_list_node_t foo_node; - * ... 
- * } foo_t; - * - * static int - * foo_compare(void *l_arg, void *r_arg, void *private) - * { - * foo_t *l = l_arg; - * foo_t *r = r_arg; - * - * if (... l greater than r ...) - * return (1); - * if (... l less than r ...) - * return (-1); - * return (0); - * } - * - * ... - * // at initialization time - * foo_pool = uu_list_pool_create("foo_pool", - * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, - * debugging? 0 : UU_AVL_POOL_DEBUG); - * ... - */ -uu_list_pool_t *uu_list_pool_create(const char *, size_t, size_t, - uu_compare_fn_t *, uint32_t); -#define UU_LIST_POOL_DEBUG 0x00000001 - -void uu_list_pool_destroy(uu_list_pool_t *); - -/* - * usage: - * - * foo_t *a; - * a = malloc(sizeof(*a)); - * uu_list_node_init(a, &a->foo_list, pool); - * ... - * uu_list_node_fini(a, &a->foo_list, pool); - * free(a); - */ -void uu_list_node_init(void *, uu_list_node_t *, uu_list_pool_t *); -void uu_list_node_fini(void *, uu_list_node_t *, uu_list_pool_t *); - -uu_list_t *uu_list_create(uu_list_pool_t *, void *_parent, uint32_t); -#define UU_LIST_DEBUG 0x00000001 -#define UU_LIST_SORTED 0x00000002 /* list is sorted */ - -void uu_list_destroy(uu_list_t *); /* list must be empty */ - -size_t uu_list_numnodes(uu_list_t *); - -void *uu_list_first(uu_list_t *); -void *uu_list_last(uu_list_t *); - -void *uu_list_next(uu_list_t *, void *); -void *uu_list_prev(uu_list_t *, void *); - -int uu_list_walk(uu_list_t *, uu_walk_fn_t *, void *, uint32_t); - -uu_list_walk_t *uu_list_walk_start(uu_list_t *, uint32_t); -void *uu_list_walk_next(uu_list_walk_t *); -void uu_list_walk_end(uu_list_walk_t *); - -void *uu_list_find(uu_list_t *, void *, void *, uu_list_index_t *); -void uu_list_insert(uu_list_t *, void *, uu_list_index_t); - -void *uu_list_nearest_next(uu_list_t *, uu_list_index_t); -void *uu_list_nearest_prev(uu_list_t *, uu_list_index_t); - -void *uu_list_teardown(uu_list_t *, void **); - -void uu_list_remove(uu_list_t *, void *); - -/* - * lists: interfaces for non-sorted 
lists only - */ -int uu_list_insert_before(uu_list_t *, void *_target, void *_elem); -int uu_list_insert_after(uu_list_t *, void *_target, void *_elem); - -/* - * avl trees: opaque structures - */ -typedef struct uu_avl_pool uu_avl_pool_t; -typedef struct uu_avl uu_avl_t; - -typedef struct uu_avl_node { -#ifdef _LP64 - uintptr_t uan_opaque[3]; -#else - uintptr_t uan_opaque[4]; -#endif -} uu_avl_node_t; - -typedef struct uu_avl_walk uu_avl_walk_t; - -typedef uintptr_t uu_avl_index_t; - -/* - * avl trees: interface - * - * basic usage: - * typedef struct foo { - * ... - * uu_avl_node_t foo_node; - * ... - * } foo_t; - * - * static int - * foo_compare(void *l_arg, void *r_arg, void *private) - * { - * foo_t *l = l_arg; - * foo_t *r = r_arg; - * - * if (... l greater than r ...) - * return (1); - * if (... l less than r ...) - * return (-1); - * return (0); - * } - * - * ... - * // at initialization time - * foo_pool = uu_avl_pool_create("foo_pool", - * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, - * debugging? 0 : UU_AVL_POOL_DEBUG); - * ... - */ -uu_avl_pool_t *uu_avl_pool_create(const char *, size_t, size_t, - uu_compare_fn_t *, uint32_t); -#define UU_AVL_POOL_DEBUG 0x00000001 - -void uu_avl_pool_destroy(uu_avl_pool_t *); - -/* - * usage: - * - * foo_t *a; - * a = malloc(sizeof(*a)); - * uu_avl_node_init(a, &a->foo_avl, pool); - * ... 
- * uu_avl_node_fini(a, &a->foo_avl, pool); - * free(a); - */ -void uu_avl_node_init(void *, uu_avl_node_t *, uu_avl_pool_t *); -void uu_avl_node_fini(void *, uu_avl_node_t *, uu_avl_pool_t *); - -uu_avl_t *uu_avl_create(uu_avl_pool_t *, void *_parent, uint32_t); -#define UU_AVL_DEBUG 0x00000001 - -void uu_avl_destroy(uu_avl_t *); /* list must be empty */ - -size_t uu_avl_numnodes(uu_avl_t *); - -void *uu_avl_first(uu_avl_t *); -void *uu_avl_last(uu_avl_t *); - -void *uu_avl_next(uu_avl_t *, void *); -void *uu_avl_prev(uu_avl_t *, void *); - -int uu_avl_walk(uu_avl_t *, uu_walk_fn_t *, void *, uint32_t); - -uu_avl_walk_t *uu_avl_walk_start(uu_avl_t *, uint32_t); -void *uu_avl_walk_next(uu_avl_walk_t *); -void uu_avl_walk_end(uu_avl_walk_t *); - -void *uu_avl_find(uu_avl_t *, void *, void *, uu_avl_index_t *); -void uu_avl_insert(uu_avl_t *, void *, uu_avl_index_t); - -void *uu_avl_nearest_next(uu_avl_t *, uu_avl_index_t); -void *uu_avl_nearest_prev(uu_avl_t *, uu_avl_index_t); - -void *uu_avl_teardown(uu_avl_t *, void **); - -void uu_avl_remove(uu_avl_t *, void *); - -#ifdef __cplusplus -} -#endif - -#endif /* _LIBUUTIL_H */ diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_common.h b/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_common.h deleted file mode 100644 index 9ebaaedfd237..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_common.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _LIBUUTIL_COMMON_H -#define _LIBUUTIL_COMMON_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include - -#endif /* _LIBUUTIL_COMMON_H */ diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_impl.h b/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_impl.h deleted file mode 100644 index 9466e5974581..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/libuutil_impl.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _LIBUUTIL_IMPL_H -#define _LIBUUTIL_IMPL_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -void uu_set_error(uint_t); -#pragma rarely_called(uu_set_error) - -/*PRINTFLIKE1*/ -void uu_panic(const char *format, ...); -#pragma rarely_called(uu_panic) - -struct uu_dprintf { - char *uud_name; - uu_dprintf_severity_t uud_severity; - uint_t uud_flags; -}; - -/* - * For debugging purposes, libuutil keeps around linked lists of all uu_lists - * and uu_avls, along with pointers to their parents. These can cause false - * negatives when looking for memory leaks, so we encode the pointers by - * storing them with swapped endianness; this is not perfect, but it's about - * the best we can do without wasting a lot of space. - */ -#ifdef _LP64 -#define UU_PTR_ENCODE(ptr) BSWAP_64((uintptr_t)(void *)(ptr)) -#else -#define UU_PTR_ENCODE(ptr) BSWAP_32((uintptr_t)(void *)(ptr)) -#endif - -#define UU_PTR_DECODE(ptr) ((void *)UU_PTR_ENCODE(ptr)) - -/* - * uu_list structures - */ -typedef struct uu_list_node_impl { - struct uu_list_node_impl *uln_next; - struct uu_list_node_impl *uln_prev; -} uu_list_node_impl_t; - -struct uu_list_walk { - uu_list_walk_t *ulw_next; - uu_list_walk_t *ulw_prev; - - uu_list_t *ulw_list; - int8_t ulw_dir; - uint8_t ulw_robust; - uu_list_node_impl_t *ulw_next_result; -}; - -struct uu_list { - uintptr_t ul_next_enc; - uintptr_t ul_prev_enc; - - uu_list_pool_t *ul_pool; - uintptr_t ul_parent_enc; /* encoded parent pointer */ - size_t ul_offset; - size_t ul_numnodes; - uint8_t ul_debug; - uint8_t ul_sorted; - uint8_t ul_index; /* mark for uu_list_index_ts */ - - uu_list_node_impl_t ul_null_node; - uu_list_walk_t ul_null_walk; /* for robust walkers */ -}; - -#define UU_LIST_PTR(ptr) ((uu_list_t *)UU_PTR_DECODE(ptr)) - -#define UU_LIST_POOL_MAXNAME 64 - -struct uu_list_pool { - uu_list_pool_t *ulp_next; - uu_list_pool_t *ulp_prev; - - char 
ulp_name[UU_LIST_POOL_MAXNAME]; - size_t ulp_nodeoffset; - size_t ulp_objsize; - uu_compare_fn_t *ulp_cmp; - uint8_t ulp_debug; - uint8_t ulp_last_index; - pthread_mutex_t ulp_lock; /* protects null_list */ - uu_list_t ulp_null_list; -}; - -/* - * uu_avl structures - */ -typedef struct avl_node uu_avl_node_impl_t; - -struct uu_avl_walk { - uu_avl_walk_t *uaw_next; - uu_avl_walk_t *uaw_prev; - - uu_avl_t *uaw_avl; - void *uaw_next_result; - int8_t uaw_dir; - uint8_t uaw_robust; -}; - -struct uu_avl { - uintptr_t ua_next_enc; - uintptr_t ua_prev_enc; - - uu_avl_pool_t *ua_pool; - uintptr_t ua_parent_enc; - uint8_t ua_debug; - uint8_t ua_index; /* mark for uu_avl_index_ts */ - - struct avl_tree ua_tree; - uu_avl_walk_t ua_null_walk; -}; - -#define UU_AVL_PTR(x) ((uu_avl_t *)UU_PTR_DECODE(x)) - -#define UU_AVL_POOL_MAXNAME 64 - -struct uu_avl_pool { - uu_avl_pool_t *uap_next; - uu_avl_pool_t *uap_prev; - - char uap_name[UU_AVL_POOL_MAXNAME]; - size_t uap_nodeoffset; - size_t uap_objsize; - uu_compare_fn_t *uap_cmp; - uint8_t uap_debug; - uint8_t uap_last_index; - pthread_mutex_t uap_lock; /* protects null_avl */ - uu_avl_t uap_null_avl; -}; - -/* - * atfork() handlers - */ -void uu_avl_lockup(void); -void uu_avl_release(void); - -void uu_list_lockup(void); -void uu_list_release(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _LIBUUTIL_IMPL_H */ diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c deleted file mode 100644 index 2bef759d525e..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_alloc.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include "libuutil_common.h" - -#include -#include -#include -#include - -void * -uu_zalloc(size_t n) -{ - void *p = malloc(n); - - if (p == NULL) { - uu_set_error(UU_ERROR_SYSTEM); - return (NULL); - } - - (void) memset(p, 0, n); - - return (p); -} - -void -uu_free(void *p) -{ - free(p); -} - -char * -uu_strdup(const char *str) -{ - char *buf = NULL; - - if (str != NULL) { - size_t sz; - - sz = strlen(str) + 1; - buf = uu_zalloc(sz); - if (buf != NULL) - (void) memcpy(buf, str, sz); - } - return (buf); -} - -/* - * Duplicate up to n bytes of a string. Kind of sort of like - * strdup(strlcpy(s, n)). - */ -char * -uu_strndup(const char *s, size_t n) -{ - size_t len; - char *p; - - len = strnlen(s, n); - p = uu_zalloc(len + 1); - if (p == NULL) - return (NULL); - - if (len > 0) - (void) memcpy(p, s, len); - p[len] = '\0'; - - return (p); -} - -/* - * Duplicate a block of memory. Combines malloc with memcpy, much as - * strdup combines malloc, strlen, and strcpy. - */ -void * -uu_memdup(const void *buf, size_t sz) -{ - void *p; - - p = uu_zalloc(sz); - if (p == NULL) - return (NULL); - (void) memcpy(p, buf, sz); - return (p); -} - -char * -uu_msprintf(const char *format, ...) 
-{ - va_list args; - char attic[1]; - uint_t M, m; - char *b; - - va_start(args, format); - M = vsnprintf(attic, 1, format, args); - va_end(args); - - for (;;) { - m = M; - if ((b = uu_zalloc(m + 1)) == NULL) - return (NULL); - - va_start(args, format); - M = vsnprintf(b, m + 1, format, args); - va_end(args); - - if (M == m) - break; /* sizes match */ - - uu_free(b); - } - - return (b); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_avl.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_avl.c deleted file mode 100644 index 5e78ececeec9..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_avl.c +++ /dev/null @@ -1,570 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "libuutil_common.h" - -#include -#include -#include -#include - -static uu_avl_pool_t uu_null_apool = { &uu_null_apool, &uu_null_apool }; -static pthread_mutex_t uu_apool_list_lock = PTHREAD_MUTEX_INITIALIZER; - -/* - * The index mark change on every insert and delete, to catch stale - * references. 
- * - * We leave the low bit alone, since the avl code uses it. - */ -#define INDEX_MAX (sizeof (uintptr_t) - 2) -#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 2 : ((m) + 2) & INDEX_MAX) - -#define INDEX_DECODE(i) ((i) & ~INDEX_MAX) -#define INDEX_ENCODE(p, n) (((n) & ~INDEX_MAX) | (p)->ua_index) -#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ua_index) -#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0) - -/* - * When an element is inactive (not in a tree), we keep a marked pointer to - * its containing pool in its first word, and a NULL pointer in its second. - * - * On insert, we use these to verify that it comes from the correct pool. - */ -#define NODE_ARRAY(p, n) ((uintptr_t *)((uintptr_t)(n) + \ - (pp)->uap_nodeoffset)) - -#define POOL_TO_MARKER(pp) (((uintptr_t)(pp) | 1)) - -#define DEAD_MARKER 0xc4 - -uu_avl_pool_t * -uu_avl_pool_create(const char *name, size_t objsize, size_t nodeoffset, - uu_compare_fn_t *compare_func, uint32_t flags) -{ - uu_avl_pool_t *pp, *next, *prev; - - if (name == NULL || - uu_check_name(name, UU_NAME_DOMAIN) == -1 || - nodeoffset + sizeof (uu_avl_node_t) > objsize || - compare_func == NULL) { - uu_set_error(UU_ERROR_INVALID_ARGUMENT); - return (NULL); - } - - if (flags & ~UU_AVL_POOL_DEBUG) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (NULL); - } - - pp = uu_zalloc(sizeof (uu_avl_pool_t)); - if (pp == NULL) { - uu_set_error(UU_ERROR_NO_MEMORY); - return (NULL); - } - - (void) strlcpy(pp->uap_name, name, sizeof (pp->uap_name)); - pp->uap_nodeoffset = nodeoffset; - pp->uap_objsize = objsize; - pp->uap_cmp = compare_func; - if (flags & UU_AVL_POOL_DEBUG) - pp->uap_debug = 1; - pp->uap_last_index = 0; - - (void) pthread_mutex_init(&pp->uap_lock, NULL); - - pp->uap_null_avl.ua_next_enc = UU_PTR_ENCODE(&pp->uap_null_avl); - pp->uap_null_avl.ua_prev_enc = UU_PTR_ENCODE(&pp->uap_null_avl); - - (void) pthread_mutex_lock(&uu_apool_list_lock); - pp->uap_next = next = &uu_null_apool; - pp->uap_prev = prev = next->uap_prev; - 
next->uap_prev = pp; - prev->uap_next = pp; - (void) pthread_mutex_unlock(&uu_apool_list_lock); - - return (pp); -} - -void -uu_avl_pool_destroy(uu_avl_pool_t *pp) -{ - if (pp->uap_debug) { - if (pp->uap_null_avl.ua_next_enc != - UU_PTR_ENCODE(&pp->uap_null_avl) || - pp->uap_null_avl.ua_prev_enc != - UU_PTR_ENCODE(&pp->uap_null_avl)) { - uu_panic("uu_avl_pool_destroy: Pool \"%.*s\" (%p) has " - "outstanding avls, or is corrupt.\n", - (int)sizeof (pp->uap_name), pp->uap_name, - (void *)pp); - } - } - (void) pthread_mutex_lock(&uu_apool_list_lock); - pp->uap_next->uap_prev = pp->uap_prev; - pp->uap_prev->uap_next = pp->uap_next; - (void) pthread_mutex_unlock(&uu_apool_list_lock); - (void) pthread_mutex_destroy(&pp->uap_lock); - pp->uap_prev = NULL; - pp->uap_next = NULL; - uu_free(pp); -} - -void -uu_avl_node_init(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp) -{ - uintptr_t *na = (uintptr_t *)np; - - if (pp->uap_debug) { - uintptr_t offset = (uintptr_t)np - (uintptr_t)base; - if (offset + sizeof (*np) > pp->uap_objsize) { - uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): " - "offset %ld doesn't fit in object (size %ld)\n", - base, (void *)np, (void *)pp, pp->uap_name, - (long)offset, (long)pp->uap_objsize); - } - if (offset != pp->uap_nodeoffset) { - uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): " - "offset %ld doesn't match pool's offset (%ld)\n", - base, (void *)np, (void *)pp, pp->uap_name, - (long)offset, (long)pp->uap_objsize); - } - } - - na[0] = POOL_TO_MARKER(pp); - na[1] = 0; -} - -void -uu_avl_node_fini(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp) -{ - uintptr_t *na = (uintptr_t *)np; - - if (pp->uap_debug) { - if (na[0] == DEAD_MARKER && na[1] == DEAD_MARKER) { - uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): " - "node already finied\n", - base, (void *)np, (void *)pp, pp->uap_name); - } - if (na[0] != POOL_TO_MARKER(pp) || na[1] != 0) { - uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): " - "node corrupt, in tree, or in different pool\n", - 
base, (void *)np, (void *)pp, pp->uap_name); - } - } - - na[0] = DEAD_MARKER; - na[1] = DEAD_MARKER; - na[2] = DEAD_MARKER; -} - -struct uu_avl_node_compare_info { - uu_compare_fn_t *ac_compare; - void *ac_private; - void *ac_right; - void *ac_found; -}; - -static int -uu_avl_node_compare(const void *l, const void *r) -{ - struct uu_avl_node_compare_info *info = - (struct uu_avl_node_compare_info *)l; - - int res = info->ac_compare(r, info->ac_right, info->ac_private); - - if (res == 0) { - if (info->ac_found == NULL) - info->ac_found = (void *)r; - return (-1); - } - if (res < 0) - return (1); - return (-1); -} - -uu_avl_t * -uu_avl_create(uu_avl_pool_t *pp, void *parent, uint32_t flags) -{ - uu_avl_t *ap, *next, *prev; - - if (flags & ~UU_AVL_DEBUG) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (NULL); - } - - ap = uu_zalloc(sizeof (*ap)); - if (ap == NULL) { - uu_set_error(UU_ERROR_NO_MEMORY); - return (NULL); - } - - ap->ua_pool = pp; - ap->ua_parent_enc = UU_PTR_ENCODE(parent); - ap->ua_debug = pp->uap_debug || (flags & UU_AVL_DEBUG); - ap->ua_index = (pp->uap_last_index = INDEX_NEXT(pp->uap_last_index)); - - avl_create(&ap->ua_tree, &uu_avl_node_compare, pp->uap_objsize, - pp->uap_nodeoffset); - - ap->ua_null_walk.uaw_next = &ap->ua_null_walk; - ap->ua_null_walk.uaw_prev = &ap->ua_null_walk; - - (void) pthread_mutex_lock(&pp->uap_lock); - next = &pp->uap_null_avl; - prev = UU_PTR_DECODE(next->ua_prev_enc); - ap->ua_next_enc = UU_PTR_ENCODE(next); - ap->ua_prev_enc = UU_PTR_ENCODE(prev); - next->ua_prev_enc = UU_PTR_ENCODE(ap); - prev->ua_next_enc = UU_PTR_ENCODE(ap); - (void) pthread_mutex_unlock(&pp->uap_lock); - - return (ap); -} - -void -uu_avl_destroy(uu_avl_t *ap) -{ - uu_avl_pool_t *pp = ap->ua_pool; - - if (ap->ua_debug) { - if (avl_numnodes(&ap->ua_tree) != 0) { - uu_panic("uu_avl_destroy(%p): tree not empty\n", - (void *)ap); - } - if (ap->ua_null_walk.uaw_next != &ap->ua_null_walk || - ap->ua_null_walk.uaw_prev != &ap->ua_null_walk) { - 
uu_panic("uu_avl_destroy(%p): outstanding walkers\n", - (void *)ap); - } - } - (void) pthread_mutex_lock(&pp->uap_lock); - UU_AVL_PTR(ap->ua_next_enc)->ua_prev_enc = ap->ua_prev_enc; - UU_AVL_PTR(ap->ua_prev_enc)->ua_next_enc = ap->ua_next_enc; - (void) pthread_mutex_unlock(&pp->uap_lock); - ap->ua_prev_enc = UU_PTR_ENCODE(NULL); - ap->ua_next_enc = UU_PTR_ENCODE(NULL); - - ap->ua_pool = NULL; - avl_destroy(&ap->ua_tree); - - uu_free(ap); -} - -size_t -uu_avl_numnodes(uu_avl_t *ap) -{ - return (avl_numnodes(&ap->ua_tree)); -} - -void * -uu_avl_first(uu_avl_t *ap) -{ - return (avl_first(&ap->ua_tree)); -} - -void * -uu_avl_last(uu_avl_t *ap) -{ - return (avl_last(&ap->ua_tree)); -} - -void * -uu_avl_next(uu_avl_t *ap, void *node) -{ - return (AVL_NEXT(&ap->ua_tree, node)); -} - -void * -uu_avl_prev(uu_avl_t *ap, void *node) -{ - return (AVL_PREV(&ap->ua_tree, node)); -} - -static void -_avl_walk_init(uu_avl_walk_t *wp, uu_avl_t *ap, uint32_t flags) -{ - uu_avl_walk_t *next, *prev; - - int robust = (flags & UU_WALK_ROBUST); - int direction = (flags & UU_WALK_REVERSE)? -1 : 1; - - (void) memset(wp, 0, sizeof (*wp)); - wp->uaw_avl = ap; - wp->uaw_robust = robust; - wp->uaw_dir = direction; - - if (direction > 0) - wp->uaw_next_result = avl_first(&ap->ua_tree); - else - wp->uaw_next_result = avl_last(&ap->ua_tree); - - if (ap->ua_debug || robust) { - wp->uaw_next = next = &ap->ua_null_walk; - wp->uaw_prev = prev = next->uaw_prev; - next->uaw_prev = wp; - prev->uaw_next = wp; - } -} - -static void * -_avl_walk_advance(uu_avl_walk_t *wp, uu_avl_t *ap) -{ - void *np = wp->uaw_next_result; - - avl_tree_t *t = &ap->ua_tree; - - if (np == NULL) - return (NULL); - - wp->uaw_next_result = (wp->uaw_dir > 0)? 
AVL_NEXT(t, np) : - AVL_PREV(t, np); - - return (np); -} - -static void -_avl_walk_fini(uu_avl_walk_t *wp) -{ - if (wp->uaw_next != NULL) { - wp->uaw_next->uaw_prev = wp->uaw_prev; - wp->uaw_prev->uaw_next = wp->uaw_next; - wp->uaw_next = NULL; - wp->uaw_prev = NULL; - } - wp->uaw_avl = NULL; - wp->uaw_next_result = NULL; -} - -uu_avl_walk_t * -uu_avl_walk_start(uu_avl_t *ap, uint32_t flags) -{ - uu_avl_walk_t *wp; - - if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (NULL); - } - - wp = uu_zalloc(sizeof (*wp)); - if (wp == NULL) { - uu_set_error(UU_ERROR_NO_MEMORY); - return (NULL); - } - - _avl_walk_init(wp, ap, flags); - return (wp); -} - -void * -uu_avl_walk_next(uu_avl_walk_t *wp) -{ - return (_avl_walk_advance(wp, wp->uaw_avl)); -} - -void -uu_avl_walk_end(uu_avl_walk_t *wp) -{ - _avl_walk_fini(wp); - uu_free(wp); -} - -int -uu_avl_walk(uu_avl_t *ap, uu_walk_fn_t *func, void *private, uint32_t flags) -{ - void *e; - uu_avl_walk_t my_walk; - - int status = UU_WALK_NEXT; - - if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (-1); - } - - _avl_walk_init(&my_walk, ap, flags); - while (status == UU_WALK_NEXT && - (e = _avl_walk_advance(&my_walk, ap)) != NULL) - status = (*func)(e, private); - _avl_walk_fini(&my_walk); - - if (status >= 0) - return (0); - uu_set_error(UU_ERROR_CALLBACK_FAILED); - return (-1); -} - -void -uu_avl_remove(uu_avl_t *ap, void *elem) -{ - uu_avl_walk_t *wp; - uu_avl_pool_t *pp = ap->ua_pool; - uintptr_t *na = NODE_ARRAY(pp, elem); - - if (ap->ua_debug) { - /* - * invalidate outstanding uu_avl_index_ts. - */ - ap->ua_index = INDEX_NEXT(ap->ua_index); - } - - /* - * Robust walkers most be advanced, if we are removing the node - * they are currently using. In debug mode, non-robust walkers - * are also on the walker list. 
- */ - for (wp = ap->ua_null_walk.uaw_next; wp != &ap->ua_null_walk; - wp = wp->uaw_next) { - if (wp->uaw_robust) { - if (elem == wp->uaw_next_result) - (void) _avl_walk_advance(wp, ap); - } else if (wp->uaw_next_result != NULL) { - uu_panic("uu_avl_remove(%p, %p): active non-robust " - "walker\n", (void *)ap, elem); - } - } - - avl_remove(&ap->ua_tree, elem); - - na[0] = POOL_TO_MARKER(pp); - na[1] = 0; -} - -void * -uu_avl_teardown(uu_avl_t *ap, void **cookie) -{ - void *elem = avl_destroy_nodes(&ap->ua_tree, cookie); - - if (elem != NULL) { - uu_avl_pool_t *pp = ap->ua_pool; - uintptr_t *na = NODE_ARRAY(pp, elem); - - na[0] = POOL_TO_MARKER(pp); - na[1] = 0; - } - return (elem); -} - -void * -uu_avl_find(uu_avl_t *ap, void *elem, void *private, uu_avl_index_t *out) -{ - struct uu_avl_node_compare_info info; - void *result; - - info.ac_compare = ap->ua_pool->uap_cmp; - info.ac_private = private; - info.ac_right = elem; - info.ac_found = NULL; - - result = avl_find(&ap->ua_tree, &info, out); - if (out != NULL) - *out = INDEX_ENCODE(ap, *out); - - if (ap->ua_debug && result != NULL) - uu_panic("uu_avl_find: internal error: avl_find succeeded\n"); - - return (info.ac_found); -} - -void -uu_avl_insert(uu_avl_t *ap, void *elem, uu_avl_index_t idx) -{ - if (ap->ua_debug) { - uu_avl_pool_t *pp = ap->ua_pool; - uintptr_t *na = NODE_ARRAY(pp, elem); - - if (na[1] != 0) - uu_panic("uu_avl_insert(%p, %p, %p): node already " - "in tree, or corrupt\n", - (void *)ap, elem, (void *)idx); - if (na[0] == 0) - uu_panic("uu_avl_insert(%p, %p, %p): node not " - "initialized\n", - (void *)ap, elem, (void *)idx); - if (na[0] != POOL_TO_MARKER(pp)) - uu_panic("uu_avl_insert(%p, %p, %p): node from " - "other pool, or corrupt\n", - (void *)ap, elem, (void *)idx); - - if (!INDEX_VALID(ap, idx)) - uu_panic("uu_avl_insert(%p, %p, %p): %s\n", - (void *)ap, elem, (void *)idx, - INDEX_CHECK(idx)? "outdated index" : - "invalid index"); - - /* - * invalidate outstanding uu_avl_index_ts. 
- */ - ap->ua_index = INDEX_NEXT(ap->ua_index); - } - avl_insert(&ap->ua_tree, elem, INDEX_DECODE(idx)); -} - -void * -uu_avl_nearest_next(uu_avl_t *ap, uu_avl_index_t idx) -{ - if (ap->ua_debug && !INDEX_VALID(ap, idx)) - uu_panic("uu_avl_nearest_next(%p, %p): %s\n", - (void *)ap, (void *)idx, INDEX_CHECK(idx)? - "outdated index" : "invalid index"); - return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_AFTER)); -} - -void * -uu_avl_nearest_prev(uu_avl_t *ap, uu_avl_index_t idx) -{ - if (ap->ua_debug && !INDEX_VALID(ap, idx)) - uu_panic("uu_avl_nearest_prev(%p, %p): %s\n", - (void *)ap, (void *)idx, INDEX_CHECK(idx)? - "outdated index" : "invalid index"); - return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_BEFORE)); -} - -/* - * called from uu_lockup() and uu_release(), as part of our fork1()-safety. - */ -void -uu_avl_lockup(void) -{ - uu_avl_pool_t *pp; - - (void) pthread_mutex_lock(&uu_apool_list_lock); - for (pp = uu_null_apool.uap_next; pp != &uu_null_apool; - pp = pp->uap_next) - (void) pthread_mutex_lock(&pp->uap_lock); -} - -void -uu_avl_release(void) -{ - uu_avl_pool_t *pp; - - for (pp = uu_null_apool.uap_next; pp != &uu_null_apool; - pp = pp->uap_next) - (void) pthread_mutex_unlock(&pp->uap_lock); - (void) pthread_mutex_unlock(&uu_apool_list_lock); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_dprintf.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_dprintf.c deleted file mode 100644 index 528c3e7f6d25..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_dprintf.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "libuutil_common.h" - -#include -#include -#include -#include -#include -#include - -#define FACILITY_FMT "%s (%s): " - -#if !defined(TEXT_DOMAIN) -#define TEXT_DOMAIN "SYS_TEST" -#endif - -static const char * -strseverity(uu_dprintf_severity_t severity) -{ - switch (severity) { - case UU_DPRINTF_SILENT: - return (dgettext(TEXT_DOMAIN, "silent")); - case UU_DPRINTF_FATAL: - return (dgettext(TEXT_DOMAIN, "FATAL")); - case UU_DPRINTF_WARNING: - return (dgettext(TEXT_DOMAIN, "WARNING")); - case UU_DPRINTF_NOTICE: - return (dgettext(TEXT_DOMAIN, "note")); - case UU_DPRINTF_INFO: - return (dgettext(TEXT_DOMAIN, "info")); - case UU_DPRINTF_DEBUG: - return (dgettext(TEXT_DOMAIN, "debug")); - default: - return (dgettext(TEXT_DOMAIN, "unspecified")); - } -} - -uu_dprintf_t * -uu_dprintf_create(const char *name, uu_dprintf_severity_t severity, - uint_t flags) -{ - uu_dprintf_t *D; - - if (uu_check_name(name, UU_NAME_DOMAIN) == -1) { - uu_set_error(UU_ERROR_INVALID_ARGUMENT); - return (NULL); - } - - if ((D = uu_zalloc(sizeof (uu_dprintf_t))) == NULL) - return (NULL); - - if (name != NULL) { - D->uud_name = strdup(name); - if (D->uud_name == NULL) { - uu_free(D); - return (NULL); - } - } else { - D->uud_name = NULL; - } - - D->uud_severity = severity; - D->uud_flags = flags; - - return (D); -} - -/*PRINTFLIKE3*/ -void 
-uu_dprintf(uu_dprintf_t *D, uu_dprintf_severity_t severity, - const char *format, ...) -{ - va_list alist; - - /* XXX Assert that severity is not UU_DPRINTF_SILENT. */ - - if (severity > D->uud_severity) - return; - - (void) fprintf(stderr, FACILITY_FMT, D->uud_name, - strseverity(severity)); - - va_start(alist, format); - (void) vfprintf(stderr, format, alist); - va_end(alist); -} - -void -uu_dprintf_destroy(uu_dprintf_t *D) -{ - if (D->uud_name) - free(D->uud_name); - - uu_free(D); -} - -const char * -uu_dprintf_getname(uu_dprintf_t *D) -{ - return (D->uud_name); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_ident.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_ident.c deleted file mode 100644 index 9a643845f8c2..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_ident.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "libuutil_common.h" - -#include - -/* - * We require names of the form: - * [provider,]identifier[/[provider,]identifier]... - * - * Where provider is either a stock symbol (SUNW) or a java-style reversed - * domain name (com.sun). - * - * Both providers and identifiers must start with a letter, and may - * only contain alphanumerics, dashes, and underlines. Providers - * may also contain periods. - * - * Note that we do _not_ use the macros in , since they are affected - * by the current locale settings. - */ - -#define IS_ALPHA(c) \ - (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) - -#define IS_DIGIT(c) \ - ((c) >= '0' && (c) <= '9') - -static int -is_valid_ident(const char *s, const char *e, int allowdot) -{ - char c; - - if (s >= e) - return (0); /* name is empty */ - - c = *s++; - if (!IS_ALPHA(c)) - return (0); /* does not start with letter */ - - while (s < e && (c = *s++) != 0) { - if (IS_ALPHA(c) || IS_DIGIT(c) || c == '-' || c == '_' || - (allowdot && c == '.')) - continue; - return (0); /* invalid character */ - } - return (1); -} - -static int -is_valid_component(const char *b, const char *e, uint_t flags) -{ - char *sp; - - if (flags & UU_NAME_DOMAIN) { - sp = strchr(b, ','); - if (sp != NULL && sp < e) { - if (!is_valid_ident(b, sp, 1)) - return (0); - b = sp + 1; - } - } - - return (is_valid_ident(b, e, 0)); -} - -int -uu_check_name(const char *name, uint_t flags) -{ - const char *end = name + strlen(name); - const char *p; - - if (flags & ~(UU_NAME_DOMAIN | UU_NAME_PATH)) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (-1); - } - - if (!(flags & UU_NAME_PATH)) { - if (!is_valid_component(name, end, flags)) - goto bad; - return (0); - } - - while ((p = strchr(name, '/')) != NULL) { - if (!is_valid_component(name, p - 1, flags)) - goto bad; - name = p + 1; - } - if (!is_valid_component(name, end, flags)) - goto bad; - - return (0); - -bad: - uu_set_error(UU_ERROR_INVALID_ARGUMENT); 
- return (-1); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_list.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_list.c deleted file mode 100644 index 35c7ba800103..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_list.c +++ /dev/null @@ -1,718 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "libuutil_common.h" - -#include -#include -#include -#include - -#define ELEM_TO_NODE(lp, e) \ - ((uu_list_node_impl_t *)((uintptr_t)(e) + (lp)->ul_offset)) - -#define NODE_TO_ELEM(lp, n) \ - ((void *)((uintptr_t)(n) - (lp)->ul_offset)) - -/* - * uu_list_index_ts define a location for insertion. They are simply a - * pointer to the object after the insertion point. We store a mark - * in the low-bits of the index, to help prevent mistakes. - * - * When debugging, the index mark changes on every insert and delete, to - * catch stale references. - */ -#define INDEX_MAX (sizeof (uintptr_t) - 1) -#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 
1 : ((m) + 1) & INDEX_MAX) - -#define INDEX_TO_NODE(i) ((uu_list_node_impl_t *)((i) & ~INDEX_MAX)) -#define NODE_TO_INDEX(p, n) (((uintptr_t)(n) & ~INDEX_MAX) | (p)->ul_index) -#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ul_index) -#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0) - -#define POOL_TO_MARKER(pp) ((void *)((uintptr_t)(pp) | 1)) - -static uu_list_pool_t uu_null_lpool = { &uu_null_lpool, &uu_null_lpool }; -static pthread_mutex_t uu_lpool_list_lock = PTHREAD_MUTEX_INITIALIZER; - -uu_list_pool_t * -uu_list_pool_create(const char *name, size_t objsize, - size_t nodeoffset, uu_compare_fn_t *compare_func, uint32_t flags) -{ - uu_list_pool_t *pp, *next, *prev; - - if (name == NULL || - uu_check_name(name, UU_NAME_DOMAIN) == -1 || - nodeoffset + sizeof (uu_list_node_t) > objsize) { - uu_set_error(UU_ERROR_INVALID_ARGUMENT); - return (NULL); - } - - if (flags & ~UU_LIST_POOL_DEBUG) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (NULL); - } - - pp = uu_zalloc(sizeof (uu_list_pool_t)); - if (pp == NULL) { - uu_set_error(UU_ERROR_NO_MEMORY); - return (NULL); - } - - (void) strlcpy(pp->ulp_name, name, sizeof (pp->ulp_name)); - pp->ulp_nodeoffset = nodeoffset; - pp->ulp_objsize = objsize; - pp->ulp_cmp = compare_func; - if (flags & UU_LIST_POOL_DEBUG) - pp->ulp_debug = 1; - pp->ulp_last_index = 0; - - (void) pthread_mutex_init(&pp->ulp_lock, NULL); - - pp->ulp_null_list.ul_next_enc = UU_PTR_ENCODE(&pp->ulp_null_list); - pp->ulp_null_list.ul_prev_enc = UU_PTR_ENCODE(&pp->ulp_null_list); - - (void) pthread_mutex_lock(&uu_lpool_list_lock); - pp->ulp_next = next = &uu_null_lpool; - pp->ulp_prev = prev = next->ulp_prev; - next->ulp_prev = pp; - prev->ulp_next = pp; - (void) pthread_mutex_unlock(&uu_lpool_list_lock); - - return (pp); -} - -void -uu_list_pool_destroy(uu_list_pool_t *pp) -{ - if (pp->ulp_debug) { - if (pp->ulp_null_list.ul_next_enc != - UU_PTR_ENCODE(&pp->ulp_null_list) || - pp->ulp_null_list.ul_prev_enc != - UU_PTR_ENCODE(&pp->ulp_null_list)) { - 
uu_panic("uu_list_pool_destroy: Pool \"%.*s\" (%p) has " - "outstanding lists, or is corrupt.\n", - (int)sizeof (pp->ulp_name), pp->ulp_name, - (void *)pp); - } - } - (void) pthread_mutex_lock(&uu_lpool_list_lock); - pp->ulp_next->ulp_prev = pp->ulp_prev; - pp->ulp_prev->ulp_next = pp->ulp_next; - (void) pthread_mutex_unlock(&uu_lpool_list_lock); - pp->ulp_prev = NULL; - pp->ulp_next = NULL; - uu_free(pp); -} - -void -uu_list_node_init(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp) -{ - uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg; - - if (pp->ulp_debug) { - uintptr_t offset = (uintptr_t)np - (uintptr_t)base; - if (offset + sizeof (*np) > pp->ulp_objsize) { - uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): " - "offset %ld doesn't fit in object (size %ld)\n", - base, (void *)np, (void *)pp, pp->ulp_name, - (long)offset, (long)pp->ulp_objsize); - } - if (offset != pp->ulp_nodeoffset) { - uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): " - "offset %ld doesn't match pool's offset (%ld)\n", - base, (void *)np, (void *)pp, pp->ulp_name, - (long)offset, (long)pp->ulp_objsize); - } - } - np->uln_next = POOL_TO_MARKER(pp); - np->uln_prev = NULL; -} - -void -uu_list_node_fini(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp) -{ - uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg; - - if (pp->ulp_debug) { - if (np->uln_next == NULL && - np->uln_prev == NULL) { - uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): " - "node already finied\n", - base, (void *)np_arg, (void *)pp, pp->ulp_name); - } - if (np->uln_next != POOL_TO_MARKER(pp) || - np->uln_prev != NULL) { - uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): " - "node corrupt or on list\n", - base, (void *)np_arg, (void *)pp, pp->ulp_name); - } - } - np->uln_next = NULL; - np->uln_prev = NULL; -} - -uu_list_t * -uu_list_create(uu_list_pool_t *pp, void *parent, uint32_t flags) -{ - uu_list_t *lp, *next, *prev; - - if (flags & ~(UU_LIST_DEBUG | UU_LIST_SORTED)) { - 
uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (NULL); - } - - if ((flags & UU_LIST_SORTED) && pp->ulp_cmp == NULL) { - if (pp->ulp_debug) - uu_panic("uu_list_create(%p, ...): requested " - "UU_LIST_SORTED, but pool has no comparison func\n", - (void *)pp); - uu_set_error(UU_ERROR_NOT_SUPPORTED); - return (NULL); - } - - lp = uu_zalloc(sizeof (*lp)); - if (lp == NULL) { - uu_set_error(UU_ERROR_NO_MEMORY); - return (NULL); - } - - lp->ul_pool = pp; - lp->ul_parent_enc = UU_PTR_ENCODE(parent); - lp->ul_offset = pp->ulp_nodeoffset; - lp->ul_debug = pp->ulp_debug || (flags & UU_LIST_DEBUG); - lp->ul_sorted = (flags & UU_LIST_SORTED); - lp->ul_numnodes = 0; - lp->ul_index = (pp->ulp_last_index = INDEX_NEXT(pp->ulp_last_index)); - - lp->ul_null_node.uln_next = &lp->ul_null_node; - lp->ul_null_node.uln_prev = &lp->ul_null_node; - - lp->ul_null_walk.ulw_next = &lp->ul_null_walk; - lp->ul_null_walk.ulw_prev = &lp->ul_null_walk; - - (void) pthread_mutex_lock(&pp->ulp_lock); - next = &pp->ulp_null_list; - prev = UU_PTR_DECODE(next->ul_prev_enc); - lp->ul_next_enc = UU_PTR_ENCODE(next); - lp->ul_prev_enc = UU_PTR_ENCODE(prev); - next->ul_prev_enc = UU_PTR_ENCODE(lp); - prev->ul_next_enc = UU_PTR_ENCODE(lp); - (void) pthread_mutex_unlock(&pp->ulp_lock); - - return (lp); -} - -void -uu_list_destroy(uu_list_t *lp) -{ - uu_list_pool_t *pp = lp->ul_pool; - - if (lp->ul_debug) { - if (lp->ul_null_node.uln_next != &lp->ul_null_node || - lp->ul_null_node.uln_prev != &lp->ul_null_node) { - uu_panic("uu_list_destroy(%p): list not empty\n", - (void *)lp); - } - if (lp->ul_numnodes != 0) { - uu_panic("uu_list_destroy(%p): numnodes is nonzero, " - "but list is empty\n", (void *)lp); - } - if (lp->ul_null_walk.ulw_next != &lp->ul_null_walk || - lp->ul_null_walk.ulw_prev != &lp->ul_null_walk) { - uu_panic("uu_list_destroy(%p): outstanding walkers\n", - (void *)lp); - } - } - - (void) pthread_mutex_lock(&pp->ulp_lock); - UU_LIST_PTR(lp->ul_next_enc)->ul_prev_enc = lp->ul_prev_enc; - 
UU_LIST_PTR(lp->ul_prev_enc)->ul_next_enc = lp->ul_next_enc; - (void) pthread_mutex_unlock(&pp->ulp_lock); - lp->ul_prev_enc = UU_PTR_ENCODE(NULL); - lp->ul_next_enc = UU_PTR_ENCODE(NULL); - lp->ul_pool = NULL; - uu_free(lp); -} - -static void -list_insert(uu_list_t *lp, uu_list_node_impl_t *np, uu_list_node_impl_t *prev, - uu_list_node_impl_t *next) -{ - if (lp->ul_debug) { - if (next->uln_prev != prev || prev->uln_next != next) - uu_panic("insert(%p): internal error: %p and %p not " - "neighbors\n", (void *)lp, (void *)next, - (void *)prev); - - if (np->uln_next != POOL_TO_MARKER(lp->ul_pool) || - np->uln_prev != NULL) { - uu_panic("insert(%p): elem %p node %p corrupt, " - "not initialized, or already in a list.\n", - (void *)lp, NODE_TO_ELEM(lp, np), (void *)np); - } - /* - * invalidate outstanding uu_list_index_ts. - */ - lp->ul_index = INDEX_NEXT(lp->ul_index); - } - np->uln_next = next; - np->uln_prev = prev; - next->uln_prev = np; - prev->uln_next = np; - - lp->ul_numnodes++; -} - -void -uu_list_insert(uu_list_t *lp, void *elem, uu_list_index_t idx) -{ - uu_list_node_impl_t *np; - - np = INDEX_TO_NODE(idx); - if (np == NULL) - np = &lp->ul_null_node; - - if (lp->ul_debug) { - if (!INDEX_VALID(lp, idx)) - uu_panic("uu_list_insert(%p, %p, %p): %s\n", - (void *)lp, elem, (void *)idx, - INDEX_CHECK(idx)? 
"outdated index" : - "invalid index"); - if (np->uln_prev == NULL) - uu_panic("uu_list_insert(%p, %p, %p): out-of-date " - "index\n", (void *)lp, elem, (void *)idx); - } - - list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np); -} - -void * -uu_list_find(uu_list_t *lp, void *elem, void *private, uu_list_index_t *out) -{ - int sorted = lp->ul_sorted; - uu_compare_fn_t *func = lp->ul_pool->ulp_cmp; - uu_list_node_impl_t *np; - - if (func == NULL) { - if (out != NULL) - *out = 0; - uu_set_error(UU_ERROR_NOT_SUPPORTED); - return (NULL); - } - for (np = lp->ul_null_node.uln_next; np != &lp->ul_null_node; - np = np->uln_next) { - void *ep = NODE_TO_ELEM(lp, np); - int cmp = func(ep, elem, private); - if (cmp == 0) { - if (out != NULL) - *out = NODE_TO_INDEX(lp, np); - return (ep); - } - if (sorted && cmp > 0) { - if (out != NULL) - *out = NODE_TO_INDEX(lp, np); - return (NULL); - } - } - if (out != NULL) - *out = NODE_TO_INDEX(lp, 0); - return (NULL); -} - -void * -uu_list_nearest_next(uu_list_t *lp, uu_list_index_t idx) -{ - uu_list_node_impl_t *np = INDEX_TO_NODE(idx); - - if (np == NULL) - np = &lp->ul_null_node; - - if (lp->ul_debug) { - if (!INDEX_VALID(lp, idx)) - uu_panic("uu_list_nearest_next(%p, %p): %s\n", - (void *)lp, (void *)idx, - INDEX_CHECK(idx)? "outdated index" : - "invalid index"); - if (np->uln_prev == NULL) - uu_panic("uu_list_nearest_next(%p, %p): out-of-date " - "index\n", (void *)lp, (void *)idx); - } - - if (np == &lp->ul_null_node) - return (NULL); - else - return (NODE_TO_ELEM(lp, np)); -} - -void * -uu_list_nearest_prev(uu_list_t *lp, uu_list_index_t idx) -{ - uu_list_node_impl_t *np = INDEX_TO_NODE(idx); - - if (np == NULL) - np = &lp->ul_null_node; - - if (lp->ul_debug) { - if (!INDEX_VALID(lp, idx)) - uu_panic("uu_list_nearest_prev(%p, %p): %s\n", - (void *)lp, (void *)idx, INDEX_CHECK(idx)? 
- "outdated index" : "invalid index"); - if (np->uln_prev == NULL) - uu_panic("uu_list_nearest_prev(%p, %p): out-of-date " - "index\n", (void *)lp, (void *)idx); - } - - if ((np = np->uln_prev) == &lp->ul_null_node) - return (NULL); - else - return (NODE_TO_ELEM(lp, np)); -} - -static void -list_walk_init(uu_list_walk_t *wp, uu_list_t *lp, uint32_t flags) -{ - uu_list_walk_t *next, *prev; - - int robust = (flags & UU_WALK_ROBUST); - int direction = (flags & UU_WALK_REVERSE)? -1 : 1; - - (void) memset(wp, 0, sizeof (*wp)); - wp->ulw_list = lp; - wp->ulw_robust = robust; - wp->ulw_dir = direction; - if (direction > 0) - wp->ulw_next_result = lp->ul_null_node.uln_next; - else - wp->ulw_next_result = lp->ul_null_node.uln_prev; - - if (lp->ul_debug || robust) { - /* - * Add this walker to the list's list of walkers so - * uu_list_remove() can advance us if somebody tries to - * remove ulw_next_result. - */ - wp->ulw_next = next = &lp->ul_null_walk; - wp->ulw_prev = prev = next->ulw_prev; - next->ulw_prev = wp; - prev->ulw_next = wp; - } -} - -static uu_list_node_impl_t * -list_walk_advance(uu_list_walk_t *wp, uu_list_t *lp) -{ - uu_list_node_impl_t *np = wp->ulw_next_result; - uu_list_node_impl_t *next; - - if (np == &lp->ul_null_node) - return (NULL); - - next = (wp->ulw_dir > 0)? np->uln_next : np->uln_prev; - - wp->ulw_next_result = next; - return (np); -} - -static void -list_walk_fini(uu_list_walk_t *wp) -{ - /* GLXXX debugging? 
*/ - if (wp->ulw_next != NULL) { - wp->ulw_next->ulw_prev = wp->ulw_prev; - wp->ulw_prev->ulw_next = wp->ulw_next; - wp->ulw_next = NULL; - wp->ulw_prev = NULL; - } - wp->ulw_list = NULL; - wp->ulw_next_result = NULL; -} - -uu_list_walk_t * -uu_list_walk_start(uu_list_t *lp, uint32_t flags) -{ - uu_list_walk_t *wp; - - if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (NULL); - } - - wp = uu_zalloc(sizeof (*wp)); - if (wp == NULL) { - uu_set_error(UU_ERROR_NO_MEMORY); - return (NULL); - } - - list_walk_init(wp, lp, flags); - return (wp); -} - -void * -uu_list_walk_next(uu_list_walk_t *wp) -{ - uu_list_t *lp = wp->ulw_list; - uu_list_node_impl_t *np = list_walk_advance(wp, lp); - - if (np == NULL) - return (NULL); - - return (NODE_TO_ELEM(lp, np)); -} - -void -uu_list_walk_end(uu_list_walk_t *wp) -{ - list_walk_fini(wp); - uu_free(wp); -} - -int -uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags) -{ - uu_list_node_impl_t *np; - - int status = UU_WALK_NEXT; - - int robust = (flags & UU_WALK_ROBUST); - int reverse = (flags & UU_WALK_REVERSE); - - if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { - uu_set_error(UU_ERROR_UNKNOWN_FLAG); - return (-1); - } - - if (lp->ul_debug || robust) { - uu_list_walk_t my_walk; - void *e; - - list_walk_init(&my_walk, lp, flags); - while (status == UU_WALK_NEXT && - (e = uu_list_walk_next(&my_walk)) != NULL) - status = (*func)(e, private); - list_walk_fini(&my_walk); - } else { - if (!reverse) { - for (np = lp->ul_null_node.uln_next; - status == UU_WALK_NEXT && np != &lp->ul_null_node; - np = np->uln_next) { - status = (*func)(NODE_TO_ELEM(lp, np), private); - } - } else { - for (np = lp->ul_null_node.uln_prev; - status == UU_WALK_NEXT && np != &lp->ul_null_node; - np = np->uln_prev) { - status = (*func)(NODE_TO_ELEM(lp, np), private); - } - } - } - if (status >= 0) - return (0); - uu_set_error(UU_ERROR_CALLBACK_FAILED); - return (-1); -} - -void 
-uu_list_remove(uu_list_t *lp, void *elem) -{ - uu_list_node_impl_t *np = ELEM_TO_NODE(lp, elem); - uu_list_walk_t *wp; - - if (lp->ul_debug) { - if (np->uln_prev == NULL) - uu_panic("uu_list_remove(%p, %p): elem not on list\n", - (void *)lp, elem); - /* - * invalidate outstanding uu_list_index_ts. - */ - lp->ul_index = INDEX_NEXT(lp->ul_index); - } - - /* - * robust walkers must be advanced. In debug mode, non-robust - * walkers are also on the list. If there are any, it's an error. - */ - for (wp = lp->ul_null_walk.ulw_next; wp != &lp->ul_null_walk; - wp = wp->ulw_next) { - if (wp->ulw_robust) { - if (np == wp->ulw_next_result) - (void) list_walk_advance(wp, lp); - } else if (wp->ulw_next_result != NULL) { - uu_panic("uu_list_remove(%p, %p): active non-robust " - "walker\n", (void *)lp, elem); - } - } - - np->uln_next->uln_prev = np->uln_prev; - np->uln_prev->uln_next = np->uln_next; - - lp->ul_numnodes--; - - np->uln_next = POOL_TO_MARKER(lp->ul_pool); - np->uln_prev = NULL; -} - -void * -uu_list_teardown(uu_list_t *lp, void **cookie) -{ - void *ep; - - /* - * XXX: disable list modification until list is empty - */ - if (lp->ul_debug && *cookie != NULL) - uu_panic("uu_list_teardown(%p, %p): unexpected cookie\n", - (void *)lp, (void *)cookie); - - ep = uu_list_first(lp); - if (ep) - uu_list_remove(lp, ep); - return (ep); -} - -int -uu_list_insert_before(uu_list_t *lp, void *target, void *elem) -{ - uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target); - - if (target == NULL) - np = &lp->ul_null_node; - - if (lp->ul_debug) { - if (np->uln_prev == NULL) - uu_panic("uu_list_insert_before(%p, %p, %p): %p is " - "not currently on a list\n", - (void *)lp, target, elem, target); - } - if (lp->ul_sorted) { - if (lp->ul_debug) - uu_panic("uu_list_insert_before(%p, ...): list is " - "UU_LIST_SORTED\n", (void *)lp); - uu_set_error(UU_ERROR_NOT_SUPPORTED); - return (-1); - } - - list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np); - return (0); -} - -int 
-uu_list_insert_after(uu_list_t *lp, void *target, void *elem) -{ - uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target); - - if (target == NULL) - np = &lp->ul_null_node; - - if (lp->ul_debug) { - if (np->uln_prev == NULL) - uu_panic("uu_list_insert_after(%p, %p, %p): %p is " - "not currently on a list\n", - (void *)lp, target, elem, target); - } - if (lp->ul_sorted) { - if (lp->ul_debug) - uu_panic("uu_list_insert_after(%p, ...): list is " - "UU_LIST_SORTED\n", (void *)lp); - uu_set_error(UU_ERROR_NOT_SUPPORTED); - return (-1); - } - - list_insert(lp, ELEM_TO_NODE(lp, elem), np, np->uln_next); - return (0); -} - -size_t -uu_list_numnodes(uu_list_t *lp) -{ - return (lp->ul_numnodes); -} - -void * -uu_list_first(uu_list_t *lp) -{ - uu_list_node_impl_t *n = lp->ul_null_node.uln_next; - if (n == &lp->ul_null_node) - return (NULL); - return (NODE_TO_ELEM(lp, n)); -} - -void * -uu_list_last(uu_list_t *lp) -{ - uu_list_node_impl_t *n = lp->ul_null_node.uln_prev; - if (n == &lp->ul_null_node) - return (NULL); - return (NODE_TO_ELEM(lp, n)); -} - -void * -uu_list_next(uu_list_t *lp, void *elem) -{ - uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem); - - n = n->uln_next; - if (n == &lp->ul_null_node) - return (NULL); - return (NODE_TO_ELEM(lp, n)); -} - -void * -uu_list_prev(uu_list_t *lp, void *elem) -{ - uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem); - - n = n->uln_prev; - if (n == &lp->ul_null_node) - return (NULL); - return (NODE_TO_ELEM(lp, n)); -} - -/* - * called from uu_lockup() and uu_release(), as part of our fork1()-safety. 
- */ -void -uu_list_lockup(void) -{ - uu_list_pool_t *pp; - - (void) pthread_mutex_lock(&uu_lpool_list_lock); - for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool; - pp = pp->ulp_next) - (void) pthread_mutex_lock(&pp->ulp_lock); -} - -void -uu_list_release(void) -{ - uu_list_pool_t *pp; - - for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool; - pp = pp->ulp_next) - (void) pthread_mutex_unlock(&pp->ulp_lock); - (void) pthread_mutex_unlock(&uu_lpool_list_lock); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c deleted file mode 100644 index b673834e4dcf..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_misc.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#include "libuutil_common.h" - -#define HAVE_ASSFAIL 1 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if !defined(TEXT_DOMAIN) -#define TEXT_DOMAIN "SYS_TEST" -#endif - -/* - * All of the old code under !defined(PTHREAD_ONCE_KEY_NP) - * is here to enable the building of a native version of - * libuutil.so when the build machine has not yet been upgraded - * to a version of libc that provides pthread_key_create_once_np(). - * It should all be deleted when solaris_nevada ships. - * The code is not MT-safe in a relaxed memory model. - */ - -#if defined(PTHREAD_ONCE_KEY_NP) -static pthread_key_t uu_error_key = PTHREAD_ONCE_KEY_NP; -#else /* PTHREAD_ONCE_KEY_NP */ -static pthread_key_t uu_error_key = 0; -static pthread_mutex_t uu_key_lock = PTHREAD_MUTEX_INITIALIZER; -#endif /* PTHREAD_ONCE_KEY_NP */ - -static int uu_error_key_setup = 0; - -static pthread_mutex_t uu_panic_lock = PTHREAD_MUTEX_INITIALIZER; -/* LINTED static unused */ -static const char *uu_panic_format; -/* LINTED static unused */ -static va_list uu_panic_args; -static pthread_t uu_panic_thread; - -static uint32_t _uu_main_error; - -void -uu_set_error(uint_t code) -{ - -#if defined(PTHREAD_ONCE_KEY_NP) - if (pthread_key_create_once_np(&uu_error_key, NULL) != 0) - uu_error_key_setup = -1; - else - uu_error_key_setup = 1; -#else /* PTHREAD_ONCE_KEY_NP */ - if (uu_error_key_setup == 0) { - (void) pthread_mutex_lock(&uu_key_lock); - if (uu_error_key_setup == 0) { - if (pthread_key_create(&uu_error_key, NULL) != 0) - uu_error_key_setup = -1; - else - uu_error_key_setup = 1; - } - (void) pthread_mutex_unlock(&uu_key_lock); - } -#endif /* PTHREAD_ONCE_KEY_NP */ - if (uu_error_key_setup > 0) - (void) pthread_setspecific(uu_error_key, - (void *)(uintptr_t)code); -} - -uint32_t -uu_error(void) -{ - - if (uu_error_key_setup < 0) /* can't happen? 
*/ - return (UU_ERROR_UNKNOWN); - - /* - * Because UU_ERROR_NONE == 0, if uu_set_error() was - * never called, then this will return UU_ERROR_NONE: - */ - return ((uint32_t)(uintptr_t)pthread_getspecific(uu_error_key)); -} - -const char * -uu_strerror(uint32_t code) -{ - const char *str; - - switch (code) { - case UU_ERROR_NONE: - str = dgettext(TEXT_DOMAIN, "No error"); - break; - - case UU_ERROR_INVALID_ARGUMENT: - str = dgettext(TEXT_DOMAIN, "Invalid argument"); - break; - - case UU_ERROR_UNKNOWN_FLAG: - str = dgettext(TEXT_DOMAIN, "Unknown flag passed"); - break; - - case UU_ERROR_NO_MEMORY: - str = dgettext(TEXT_DOMAIN, "Out of memory"); - break; - - case UU_ERROR_CALLBACK_FAILED: - str = dgettext(TEXT_DOMAIN, "Callback-initiated failure"); - break; - - case UU_ERROR_NOT_SUPPORTED: - str = dgettext(TEXT_DOMAIN, "Operation not supported"); - break; - - case UU_ERROR_EMPTY: - str = dgettext(TEXT_DOMAIN, "No value provided"); - break; - - case UU_ERROR_UNDERFLOW: - str = dgettext(TEXT_DOMAIN, "Value too small"); - break; - - case UU_ERROR_OVERFLOW: - str = dgettext(TEXT_DOMAIN, "Value too large"); - break; - - case UU_ERROR_INVALID_CHAR: - str = dgettext(TEXT_DOMAIN, - "Value contains unexpected character"); - break; - - case UU_ERROR_INVALID_DIGIT: - str = dgettext(TEXT_DOMAIN, - "Value contains digit not in base"); - break; - - case UU_ERROR_SYSTEM: - str = dgettext(TEXT_DOMAIN, "Underlying system error"); - break; - - case UU_ERROR_UNKNOWN: - str = dgettext(TEXT_DOMAIN, "Error status not known"); - break; - - default: - errno = ESRCH; - str = NULL; - break; - } - return (str); -} - -void -uu_panic(const char *format, ...) 
-{ - va_list args; - - va_start(args, format); - - (void) pthread_mutex_lock(&uu_panic_lock); - if (uu_panic_thread == 0) { - uu_panic_thread = pthread_self(); - uu_panic_format = format; - va_copy(uu_panic_args, args); - } - (void) pthread_mutex_unlock(&uu_panic_lock); - - (void) vfprintf(stderr, format, args); - - if (uu_panic_thread == pthread_self()) - abort(); - else - for (;;) - (void) pause(); -} - -int -assfail(const char *astring, const char *file, int line) -{ - __assert(astring, file, line); - /*NOTREACHED*/ - return (0); -} - -static void -uu_lockup(void) -{ - (void) pthread_mutex_lock(&uu_panic_lock); -#if !defined(PTHREAD_ONCE_KEY_NP) - (void) pthread_mutex_lock(&uu_key_lock); -#endif - uu_avl_lockup(); - uu_list_lockup(); -} - -static void -uu_release(void) -{ - (void) pthread_mutex_unlock(&uu_panic_lock); -#if !defined(PTHREAD_ONCE_KEY_NP) - (void) pthread_mutex_unlock(&uu_key_lock); -#endif - uu_avl_release(); - uu_list_release(); -} - -static void -uu_release_child(void) -{ - uu_panic_format = NULL; - uu_panic_thread = 0; - - uu_release(); -} - -#pragma init(uu_init) -static void -uu_init(void) -{ - (void) pthread_atfork(uu_lockup, uu_release, uu_release_child); -} - -/* - * Dump a block of memory in hex+ascii, for debugging - */ -void -uu_dump(FILE *out, const char *prefix, const void *buf, size_t len) -{ - const unsigned char *p = buf; - int i; - - for (i = 0; i < len; i += 16) { - int j; - - (void) fprintf(out, "%s", prefix); - for (j = 0; j < 16 && i + j < len; j++) { - (void) fprintf(out, "%2.2x ", p[i + j]); - } - for (; j < 16; j++) { - (void) fprintf(out, " "); - } - for (j = 0; j < 16 && i + j < len; j++) { - (void) fprintf(out, "%c", - isprint(p[i + j]) ? 
p[i + j] : '.'); - } - (void) fprintf(out, "\n"); - } -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_open.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_open.c deleted file mode 100644 index 7256662e38f6..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_open.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "libuutil_common.h" - -#include - -#include -#include -#include -#include -#include - -#ifdef _LP64 -#define TMPPATHFMT "%s/uu%ld" -#else /* _LP64 */ -#define TMPPATHFMT "%s/uu%lld" -#endif /* _LP64 */ - -/*ARGSUSED*/ -int -uu_open_tmp(const char *dir, uint_t uflags) -{ - int f; - char *fname = uu_zalloc(PATH_MAX); - - if (fname == NULL) - return (-1); - - for (;;) { - (void) snprintf(fname, PATH_MAX, "%s/uu%lld", dir, gethrtime()); - - f = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600); - - if (f >= 0 || errno != EEXIST) - break; - } - - if (f >= 0) - (void) unlink(fname); - - uu_free(fname); - - return (f); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_pname.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_pname.c deleted file mode 100644 index 20626ace6b2f..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_pname.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "libuutil_common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static const char PNAME_FMT[] = "%s: "; -static const char ERRNO_FMT[] = ": %s\n"; - -static const char *pname; - -static void -uu_die_internal(int status, const char *format, va_list alist) __NORETURN; - -int uu_exit_ok_value = EXIT_SUCCESS; -int uu_exit_fatal_value = EXIT_FAILURE; -int uu_exit_usage_value = 2; - -int * -uu_exit_ok(void) -{ - return (&uu_exit_ok_value); -} - -int * -uu_exit_fatal(void) -{ - return (&uu_exit_fatal_value); -} - -int * -uu_exit_usage(void) -{ - return (&uu_exit_usage_value); -} - -void -uu_alt_exit(int profile) -{ - switch (profile) { - case UU_PROFILE_DEFAULT: - uu_exit_ok_value = EXIT_SUCCESS; - uu_exit_fatal_value = EXIT_FAILURE; - uu_exit_usage_value = 2; - break; - case UU_PROFILE_LAUNCHER: - uu_exit_ok_value = EXIT_SUCCESS; - uu_exit_fatal_value = 124; - uu_exit_usage_value = 125; - break; - } -} - -static void -uu_warn_internal(int err, const char *format, va_list alist) -{ - if (pname != NULL) - (void) fprintf(stderr, PNAME_FMT, pname); - - (void) vfprintf(stderr, format, alist); - - if (strrchr(format, '\n') == NULL) - (void) fprintf(stderr, ERRNO_FMT, strerror(err)); -} - -void -uu_vwarn(const char *format, va_list alist) -{ - uu_warn_internal(errno, format, alist); -} - -/*PRINTFLIKE1*/ -void -uu_warn(const char *format, ...) 
-{ - va_list alist; - va_start(alist, format); - uu_warn_internal(errno, format, alist); - va_end(alist); -} - -static void -uu_die_internal(int status, const char *format, va_list alist) -{ - uu_warn_internal(errno, format, alist); -#ifdef DEBUG - { - char *cp; - - if (!issetugid()) { - cp = getenv("UU_DIE_ABORTS"); - if (cp != NULL && *cp != '\0') - abort(); - } - } -#endif - exit(status); -} - -void -uu_vdie(const char *format, va_list alist) -{ - uu_die_internal(UU_EXIT_FATAL, format, alist); -} - -/*PRINTFLIKE1*/ -void -uu_die(const char *format, ...) -{ - va_list alist; - va_start(alist, format); - uu_die_internal(UU_EXIT_FATAL, format, alist); - va_end(alist); -} - -void -uu_vxdie(int status, const char *format, va_list alist) -{ - uu_die_internal(status, format, alist); -} - -/*PRINTFLIKE2*/ -void -uu_xdie(int status, const char *format, ...) -{ - va_list alist; - va_start(alist, format); - uu_die_internal(status, format, alist); - va_end(alist); -} - -const char * -uu_setpname(char *arg0) -{ - /* - * Having a NULL argv[0], while uncommon, is possible. It - * makes more sense to handle this event in uu_setpname rather - * than in each of its consumers. - */ - if (arg0 == NULL) { - pname = "unknown_command"; - return (pname); - } - - /* - * Guard against '/' at end of command invocation. 
- */ - for (;;) { - char *p = strrchr(arg0, '/'); - if (p == NULL) { - pname = arg0; - break; - } else { - if (*(p + 1) == '\0') { - *p = '\0'; - continue; - } - - pname = p + 1; - break; - } - } - - return (pname); -} - -const char * -uu_getpname(void) -{ - return (pname); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c deleted file mode 100644 index 66afba05e849..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_string.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -/* - * String helper functions - */ - -#include -#include -#include -#include -#include -#include "libuutil.h" - -/* Return true if strings are equal */ -boolean_t -uu_streq(const char *a, const char *b) -{ - return (strcmp(a, b) == 0); -} - -/* Return true if strings are equal, case-insensitively */ -boolean_t -uu_strcaseeq(const char *a, const char *b) -{ - return (strcasecmp(a, b) == 0); -} - -/* Return true if string a Begins With string b */ -boolean_t -uu_strbw(const char *a, const char *b) -{ - return (strncmp(a, b, strlen(b)) == 0); -} diff --git a/cddl/contrib/opensolaris/lib/libuutil/common/uu_strtoint.c b/cddl/contrib/opensolaris/lib/libuutil/common/uu_strtoint.c deleted file mode 100644 index 8fd1148365cb..000000000000 --- a/cddl/contrib/opensolaris/lib/libuutil/common/uu_strtoint.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include "libuutil_common.h" - -#include -#include - -#define MAX_BASE 36 - -#define IS_DIGIT(x) ((x) >= '0' && (x) <= '9') - -#define CTOI(x) (((x) >= '0' && (x) <= '9') ? (x) - '0' : \ - ((x) >= 'a' && (x) <= 'z') ? (x) + 10 - 'a' : (x) + 10 - 'A') - -static int -strtoint(const char *s_arg, uint64_t *out, uint32_t base, int sign) -{ - const unsigned char *s = (const unsigned char *)s_arg; - - uint64_t val = 0; - uint64_t multmax; - - unsigned c, i; - - int neg = 0; - - int bad_digit = 0; - int bad_char = 0; - int overflow = 0; - - if (s == NULL || base == 1 || base > MAX_BASE) { - uu_set_error(UU_ERROR_INVALID_ARGUMENT); - return (-1); - } - - while ((c = *s) != 0 && isspace(c)) - s++; - - switch (c) { - case '-': - if (!sign) - overflow = 1; /* becomes underflow below */ - neg = 1; - /*FALLTHRU*/ - case '+': - c = *++s; - break; - default: - break; - } - - if (c == '\0') { - uu_set_error(UU_ERROR_EMPTY); - return (-1); - } - - if (base == 0) { - if (c != '0') - base = 10; - else if (s[1] == 'x' || s[1] == 'X') - base = 16; - else - base = 8; - } - - if (base == 16 && c == '0' && (s[1] == 'x' || s[1] == 'X')) - c = *(s += 2); - - if ((val = CTOI(c)) >= base) { - if (IS_DIGIT(c)) - bad_digit = 1; - else - bad_char = 1; - val = 0; - } - - multmax = (uint64_t)UINT64_MAX / (uint64_t)base; - - for (c = *++s; c != '\0'; c = *++s) { - if ((i = CTOI(c)) >= base) { - if (isspace(c)) - break; - if (IS_DIGIT(c)) - bad_digit = 1; - else - bad_char = 1; - i = 0; - } - - if (val > multmax) - overflow = 1; - - val *= base; - if ((uint64_t)UINT64_MAX - val < (uint64_t)i) - overflow = 1; - - val += i; - } - - while ((c = *s) != 0) { - if (!isspace(c)) - bad_char = 1; - s++; - } - - if (sign) { - if (neg) { - if (val > -(uint64_t)INT64_MIN) - overflow = 1; - } else { - if (val > INT64_MAX) - overflow = 1; - } - } - - if (neg) - val = -val; - - if (bad_char | bad_digit | overflow) { - if (bad_char) - uu_set_error(UU_ERROR_INVALID_CHAR); 
- else if (bad_digit) - uu_set_error(UU_ERROR_INVALID_DIGIT); - else if (overflow) { - if (neg) - uu_set_error(UU_ERROR_UNDERFLOW); - else - uu_set_error(UU_ERROR_OVERFLOW); - } - return (-1); - } - - *out = val; - return (0); -} - -int -uu_strtoint(const char *s, void *v, size_t sz, int base, - int64_t min, int64_t max) -{ - uint64_t val_u; - int64_t val; - - if (min > max) - goto bad_argument; - - switch (sz) { - case 1: - if (max > INT8_MAX || min < INT8_MIN) - goto bad_argument; - break; - case 2: - if (max > INT16_MAX || min < INT16_MIN) - goto bad_argument; - break; - case 4: - if (max > INT32_MAX || min < INT32_MIN) - goto bad_argument; - break; - case 8: - if (max > INT64_MAX || min < INT64_MIN) - goto bad_argument; - break; - default: - goto bad_argument; - } - - if (min == 0 && max == 0) { - min = -(1ULL << (8 * sz - 1)); - max = (1ULL << (8 * sz - 1)) - 1; - } - - if (strtoint(s, &val_u, base, 1) == -1) - return (-1); - - val = (int64_t)val_u; - - if (val < min) { - uu_set_error(UU_ERROR_UNDERFLOW); - return (-1); - } else if (val > max) { - uu_set_error(UU_ERROR_OVERFLOW); - return (-1); - } - - switch (sz) { - case 1: - *(int8_t *)v = val; - return (0); - case 2: - *(int16_t *)v = val; - return (0); - case 4: - *(int32_t *)v = val; - return (0); - case 8: - *(int64_t *)v = val; - return (0); - default: - break; /* fall through to bad_argument */ - } - -bad_argument: - uu_set_error(UU_ERROR_INVALID_ARGUMENT); - return (-1); -} - -int -uu_strtouint(const char *s, void *v, size_t sz, int base, - uint64_t min, uint64_t max) -{ - uint64_t val; - - if (min > max) - goto bad_argument; - - switch (sz) { - case 1: - if (max > UINT8_MAX) - goto bad_argument; - break; - case 2: - if (max > UINT16_MAX) - goto bad_argument; - break; - case 4: - if (max > UINT32_MAX) - goto bad_argument; - break; - case 8: - if (max > UINT64_MAX) - goto bad_argument; - break; - default: - goto bad_argument; - } - - if (min == 0 && max == 0) { - /* we have to be careful, since << can 
overflow */ - max = (1ULL << (8 * sz - 1)) * 2 - 1; - } - - if (strtoint(s, &val, base, 0) == -1) - return (-1); - - if (val < min) { - uu_set_error(UU_ERROR_UNDERFLOW); - return (-1); - } else if (val > max) { - uu_set_error(UU_ERROR_OVERFLOW); - return (-1); - } - - switch (sz) { - case 1: - *(uint8_t *)v = val; - return (0); - case 2: - *(uint16_t *)v = val; - return (0); - case 4: - *(uint32_t *)v = val; - return (0); - case 8: - *(uint64_t *)v = val; - return (0); - default: - break; /* shouldn't happen, fall through */ - } - -bad_argument: - uu_set_error(UU_ERROR_INVALID_ARGUMENT); - return (-1); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h deleted file mode 100644 index 1899e318d53e..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ /dev/null @@ -1,894 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. - * Copyright 2019 Joyent, Inc. 
- * Copyright (c) 2012 Martin Matuska . All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. - * Copyright (c) 2019 Datto Inc. - */ - -#ifndef _LIBZFS_H -#define _LIBZFS_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Miscellaneous ZFS constants - */ -#define ZFS_MAXPROPLEN MAXPATHLEN -#define ZPOOL_MAXPROPLEN MAXPATHLEN - -/* - * libzfs errors - */ -typedef enum zfs_error { - EZFS_SUCCESS = 0, /* no error -- success */ - EZFS_NOMEM = 2000, /* out of memory */ - EZFS_BADPROP, /* invalid property value */ - EZFS_PROPREADONLY, /* cannot set readonly property */ - EZFS_PROPTYPE, /* property does not apply to dataset type */ - EZFS_PROPNONINHERIT, /* property is not inheritable */ - EZFS_PROPSPACE, /* bad quota or reservation */ - EZFS_BADTYPE, /* dataset is not of appropriate type */ - EZFS_BUSY, /* pool or dataset is busy */ - EZFS_EXISTS, /* pool or dataset already exists */ - EZFS_NOENT, /* no such pool or dataset */ - EZFS_BADSTREAM, /* bad backup stream */ - EZFS_DSREADONLY, /* dataset is readonly */ - EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ - EZFS_INVALIDNAME, /* invalid dataset name */ - EZFS_BADRESTORE, /* unable to restore to destination */ - EZFS_BADBACKUP, /* backup failed */ - EZFS_BADTARGET, /* bad attach/detach/replace target */ - EZFS_NODEVICE, /* no such device in pool */ - EZFS_BADDEV, /* invalid device to add */ - EZFS_NOREPLICAS, /* no valid replicas */ - EZFS_RESILVERING, /* currently resilvering */ - EZFS_BADVERSION, /* unsupported version */ - EZFS_POOLUNAVAIL, /* pool is currently unavailable */ - EZFS_DEVOVERFLOW, /* too many devices in one vdev */ - EZFS_BADPATH, /* must be an absolute path */ - EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ - EZFS_ZONED, /* used improperly in local 
zone */ - EZFS_MOUNTFAILED, /* failed to mount dataset */ - EZFS_UMOUNTFAILED, /* failed to unmount dataset */ - EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ - EZFS_SHARENFSFAILED, /* share(1M) failed */ - EZFS_PERM, /* permission denied */ - EZFS_NOSPC, /* out of space */ - EZFS_FAULT, /* bad address */ - EZFS_IO, /* I/O error */ - EZFS_INTR, /* signal received */ - EZFS_ISSPARE, /* device is a hot spare */ - EZFS_INVALCONFIG, /* invalid vdev configuration */ - EZFS_RECURSIVE, /* recursive dependency */ - EZFS_NOHISTORY, /* no history object */ - EZFS_POOLPROPS, /* couldn't retrieve pool props */ - EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ - EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ - EZFS_NAMETOOLONG, /* dataset name is too long */ - EZFS_OPENFAILED, /* open of device failed */ - EZFS_NOCAP, /* couldn't get capacity */ - EZFS_LABELFAILED, /* write of label failed */ - EZFS_BADWHO, /* invalid permission who */ - EZFS_BADPERM, /* invalid permission */ - EZFS_BADPERMSET, /* invalid permission set name */ - EZFS_NODELEGATION, /* delegated administration is disabled */ - EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ - EZFS_SHARESMBFAILED, /* failed to share over smb */ - EZFS_BADCACHE, /* bad cache file */ - EZFS_ISL2CACHE, /* device is for the level 2 ARC */ - EZFS_VDEVNOTSUP, /* unsupported vdev type */ - EZFS_NOTSUP, /* ops not supported on this dataset */ - EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ - EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ - EZFS_REFTAG_RELE, /* snapshot release: tag not found */ - EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ - EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ - EZFS_PIPEFAILED, /* pipe create failed */ - EZFS_THREADCREATEFAILED, /* thread create failed */ - EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ - EZFS_SCRUBBING, /* currently scrubbing */ - EZFS_NO_SCRUB, /* no active scrub */ - EZFS_DIFF, /* 
general failure of zfs diff */ - EZFS_DIFFDATA, /* bad zfs diff data */ - EZFS_POOLREADONLY, /* pool is in read-only mode */ - EZFS_SCRUB_PAUSED, /* scrub currently paused */ - EZFS_ACTIVE_POOL, /* pool is imported on a different system */ - EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ - EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */ - EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */ - EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ - EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ - EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ - EZFS_TOOMANY, /* argument list too long */ - EZFS_INITIALIZING, /* currently initializing */ - EZFS_NO_INITIALIZE, /* no active initialize */ - EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */ - EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */ - EZFS_UNKNOWN -} zfs_error_t; - -/* - * UEFI boot support parameters. When creating whole disk boot pool, - * zpool create should allow to create EFI System partition for UEFI boot - * program. In case of BIOS, the EFI System partition is not used - * even if it does exist. - */ -typedef enum zpool_boot_label { - ZPOOL_NO_BOOT_LABEL = 0, - ZPOOL_CREATE_BOOT_LABEL, - ZPOOL_COPY_BOOT_LABEL -} zpool_boot_label_t; - -/* - * The following data structures are all part - * of the zfs_allow_t data structure which is - * used for printing 'allow' permissions. - * It is a linked list of zfs_allow_t's which - * then contain avl tree's for user/group/sets/... - * and each one of the entries in those trees have - * avl tree's for the permissions they belong to and - * whether they are local,descendent or local+descendent - * permissions. The AVL trees are used primarily for - * sorting purposes, but also so that we can quickly find - * a given user and or permission. 
- */ -typedef struct zfs_perm_node { - avl_node_t z_node; - char z_pname[MAXPATHLEN]; -} zfs_perm_node_t; - -typedef struct zfs_allow_node { - avl_node_t z_node; - char z_key[MAXPATHLEN]; /* name, such as joe */ - avl_tree_t z_localdescend; /* local+descendent perms */ - avl_tree_t z_local; /* local permissions */ - avl_tree_t z_descend; /* descendent permissions */ -} zfs_allow_node_t; - -typedef struct zfs_allow { - struct zfs_allow *z_next; - char z_setpoint[MAXPATHLEN]; - avl_tree_t z_sets; - avl_tree_t z_crperms; - avl_tree_t z_user; - avl_tree_t z_group; - avl_tree_t z_everyone; -} zfs_allow_t; - -/* - * Basic handle types - */ -typedef struct zfs_handle zfs_handle_t; -typedef struct zpool_handle zpool_handle_t; -typedef struct libzfs_handle libzfs_handle_t; - -/* - * Library initialization - */ -extern libzfs_handle_t *libzfs_init(void); -extern void libzfs_fini(libzfs_handle_t *); - -extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *); -extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); - -extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); - -extern void zfs_save_arguments(int argc, char **, char *, int); -extern int zpool_log_history(libzfs_handle_t *, const char *); - -extern int libzfs_errno(libzfs_handle_t *); -extern const char *libzfs_error_action(libzfs_handle_t *); -extern const char *libzfs_error_description(libzfs_handle_t *); -extern int zfs_standard_error(libzfs_handle_t *, int, const char *); -extern void libzfs_mnttab_init(libzfs_handle_t *); -extern void libzfs_mnttab_fini(libzfs_handle_t *); -extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); -extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, - struct mnttab *); -extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, - const char *, const char *); -extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); - -/* - * Basic handle functions - */ -extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); -extern 
zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); -extern void zpool_close(zpool_handle_t *); -extern const char *zpool_get_name(zpool_handle_t *); -extern int zpool_get_state(zpool_handle_t *); -extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); -extern const char *zpool_pool_state_to_name(pool_state_t); -extern void zpool_free_handles(libzfs_handle_t *); -extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); - -/* - * Iterate over all active pools in the system. - */ -typedef int (*zpool_iter_f)(zpool_handle_t *, void *); -extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); -extern boolean_t zpool_skip_pool(const char *); - -/* - * Functions to create and destroy pools - */ -extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, - nvlist_t *, nvlist_t *); -extern int zpool_destroy(zpool_handle_t *, const char *); -extern int zpool_add(zpool_handle_t *, nvlist_t *); - -typedef struct splitflags { - /* do not split, but return the config that would be split off */ - int dryrun : 1; - - /* after splitting, import the pool */ - int import : 1; - int name_flags; -} splitflags_t; - -/* - * Functions to manipulate pool and vdev state - */ -extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); -extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, - nvlist_t *); -extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); -extern int zpool_reguid(zpool_handle_t *); -extern int zpool_reopen(zpool_handle_t *); - -extern int zpool_sync_one(zpool_handle_t *, void *); - -extern int zpool_vdev_online(zpool_handle_t *, const char *, int, - vdev_state_t *); -extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); -extern int zpool_vdev_attach(zpool_handle_t *, const char *, - const char *, nvlist_t *, int); -extern int zpool_vdev_detach(zpool_handle_t *, const char *); -extern int zpool_vdev_remove(zpool_handle_t *, const char *); 
-extern int zpool_vdev_remove_cancel(zpool_handle_t *); -extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); -extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, - splitflags_t); - -extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); -extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); -extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); - -extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, - boolean_t *, boolean_t *); -extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, - boolean_t *, boolean_t *, boolean_t *); -extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *, - zpool_boot_label_t, uint64_t, int *); - -/* - * Functions to manage pool properties - */ -extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); -extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, - size_t proplen, zprop_source_t *, boolean_t); -extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, - zprop_source_t *); - -extern const char *zpool_prop_to_name(zpool_prop_t); -extern const char *zpool_prop_values(zpool_prop_t); - -/* - * Pool health statistics. - */ -typedef enum { - /* - * The following correspond to faults as defined in the (fault.fs.zfs.*) - * event namespace. Each is associated with a corresponding message ID. - * This must be kept in sync with the zfs_msgid_table in - * lib/libzfs/libzfs_status.c. 
- */ - ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ - ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ - ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ - ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ - ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ - ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ - ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ - ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ - ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ - ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ - ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ - ZPOOL_STATUS_HOSTID_ACTIVE, /* currently active on another system */ - ZPOOL_STATUS_HOSTID_REQUIRED, /* multihost=on and hostid=0 */ - ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ - ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ - ZPOOL_STATUS_IO_FAILURE_MMP, /* failed MMP, failmode not 'panic' */ - ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ - - /* - * If the pool has unsupported features but can still be opened in - * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the - * pool has unsupported features but cannot be opened at all, its - * status is ZPOOL_STATUS_UNSUP_FEAT_READ. - */ - ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ - ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ - - /* - * These faults have no corresponding message ID. At the time we are - * checking the status, the original reason for the FMA fault (I/O or - * checksum errors) has been lost. - */ - ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ - ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ - - /* - * The following are not faults per se, but still an error possibly - * requiring administrative attention. 
There is no corresponding - * message ID. - */ - ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ - ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ - ZPOOL_STATUS_RESILVERING, /* device being resilvered */ - ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ - ZPOOL_STATUS_REMOVED_DEV, /* removed device */ - ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ - - /* - * Finally, the following indicates a healthy pool. - */ - ZPOOL_STATUS_OK -} zpool_status_t; - -extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); -extern zpool_status_t zpool_import_status(nvlist_t *, char **); -extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); - -/* - * Statistics and configuration functions. - */ -extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); -extern nvlist_t *zpool_get_features(zpool_handle_t *); -extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); -extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); -extern boolean_t zpool_is_bootable(zpool_handle_t *); - -/* - * Import and export functions - */ -extern int zpool_export(zpool_handle_t *, boolean_t, const char *); -extern int zpool_export_force(zpool_handle_t *, const char *); -extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, - char *altroot); -extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, - nvlist_t *, int); -extern void zpool_print_unsup_feat(nvlist_t *config); - -/* - * Search for pools to import - */ - -typedef struct importargs { - char **path; /* a list of paths to search */ - int paths; /* number of paths to search */ - char *poolname; /* name of a pool to find */ - uint64_t guid; /* guid of a pool to find */ - char *cachefile; /* cachefile to use for import */ - int can_be_active : 1; /* can the pool be active? */ - int unique : 1; /* does 'poolname' already exist? 
*/ - int exists : 1; /* set on return if pool already exists */ - nvlist_t *policy; /* load policy (max txg, rewind, etc.) */ -} importargs_t; - -extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); -extern int zpool_tryimport(libzfs_handle_t *hdl, char *target, - nvlist_t **configp, importargs_t *args); - -/* legacy pool search routines */ -extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); -extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, - char *, uint64_t); - -/* - * Miscellaneous pool functions - */ -struct zfs_cmd; - -extern const char *zfs_history_event_names[]; - -typedef enum { - VDEV_NAME_PATH = 1 << 0, - VDEV_NAME_GUID = 1 << 1, - VDEV_NAME_FOLLOW_LINKS = 1 << 2, - VDEV_NAME_TYPE_ID = 1 << 3, -} vdev_name_t; - -extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, - int name_flags); -extern int zpool_upgrade(zpool_handle_t *, uint64_t); -extern int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *, - boolean_t *); -extern int zpool_history_unpack(char *, uint64_t, uint64_t *, - nvlist_t ***, uint_t *); -extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, - size_t len); -extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *); -extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); -extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, - nvlist_t *); -extern int zpool_checkpoint(zpool_handle_t *); -extern int zpool_discard_checkpoint(zpool_handle_t *); - -/* - * Basic handle manipulations. These functions do not create or destroy the - * underlying datasets, only the references to them. 
- */ -extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); -extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); -extern void zfs_close(zfs_handle_t *); -extern zfs_type_t zfs_get_type(const zfs_handle_t *); -extern const char *zfs_get_name(const zfs_handle_t *); -extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); -extern const char *zfs_get_pool_name(const zfs_handle_t *); - -/* - * Property management functions. Some functions are shared with the kernel, - * and are found in sys/fs/zfs.h. - */ - -/* - * zfs dataset property management - */ -extern const char *zfs_prop_default_string(zfs_prop_t); -extern uint64_t zfs_prop_default_numeric(zfs_prop_t); -extern const char *zfs_prop_column_name(zfs_prop_t); -extern boolean_t zfs_prop_align_right(zfs_prop_t); - -extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, - nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, const char *); - -extern const char *zfs_prop_to_name(zfs_prop_t); -extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); -extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); -extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, - zprop_source_t *, char *, size_t, boolean_t); -extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, - boolean_t); -extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, - zprop_source_t *, char *, size_t); -extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, - uint64_t *propvalue); -extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, - char *propbuf, int proplen, boolean_t literal); -extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, - uint64_t *propvalue); -extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, - char *propbuf, int proplen, boolean_t literal); -extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, - char *buf, size_t len); 
-extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); -extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); -extern const char *zfs_prop_values(zfs_prop_t); -extern int zfs_prop_is_string(zfs_prop_t prop); -extern nvlist_t *zfs_get_user_props(zfs_handle_t *); -extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); -extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); - - -typedef struct zprop_list { - int pl_prop; - char *pl_user_prop; - struct zprop_list *pl_next; - boolean_t pl_all; - size_t pl_width; - size_t pl_recvd_width; - boolean_t pl_fixed; -} zprop_list_t; - -extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, - boolean_t); -extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); - -#define ZFS_MOUNTPOINT_NONE "none" -#define ZFS_MOUNTPOINT_LEGACY "legacy" - -#define ZFS_FEATURE_DISABLED "disabled" -#define ZFS_FEATURE_ENABLED "enabled" -#define ZFS_FEATURE_ACTIVE "active" - -#define ZFS_UNSUPPORTED_INACTIVE "inactive" -#define ZFS_UNSUPPORTED_READONLY "readonly" - -/* - * zpool property management - */ -extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); -extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, - size_t); -extern const char *zpool_prop_default_string(zpool_prop_t); -extern uint64_t zpool_prop_default_numeric(zpool_prop_t); -extern const char *zpool_prop_column_name(zpool_prop_t); -extern boolean_t zpool_prop_align_right(zpool_prop_t); - -/* - * Functions shared by zfs and zpool property management. 
- */ -extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, - boolean_t ordered, zfs_type_t type); -extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, - zfs_type_t); -extern void zprop_free_list(zprop_list_t *); - -#define ZFS_GET_NCOLS 5 - -typedef enum { - GET_COL_NONE, - GET_COL_NAME, - GET_COL_PROPERTY, - GET_COL_VALUE, - GET_COL_RECVD, - GET_COL_SOURCE -} zfs_get_column_t; - -/* - * Functions for printing zfs or zpool properties - */ -typedef struct zprop_get_cbdata { - int cb_sources; - zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; - int cb_colwidths[ZFS_GET_NCOLS + 1]; - boolean_t cb_scripted; - boolean_t cb_literal; - boolean_t cb_first; - zprop_list_t *cb_proplist; - zfs_type_t cb_type; -} zprop_get_cbdata_t; - -void zprop_print_one_property(const char *, zprop_get_cbdata_t *, - const char *, const char *, zprop_source_t, const char *, - const char *); - -/* - * Iterator functions. - */ -typedef int (*zfs_iter_f)(zfs_handle_t *, void *); -extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); -extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, - uint64_t, uint64_t); -extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, - uint64_t, uint64_t); -extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); -extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); - -typedef struct get_all_cb { - zfs_handle_t **cb_handles; - size_t cb_alloc; - size_t cb_used; -} get_all_cb_t; - -void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, - zfs_iter_f, void*, boolean_t); - -void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); - -/* - * Functions to create and destroy datasets. 
- */ -extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, - nvlist_t *); -extern int zfs_create_ancestors(libzfs_handle_t *, const char *); -extern int zfs_destroy(zfs_handle_t *, boolean_t); -extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); -extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); -extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); -extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); -extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, - nvlist_t *props); -extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); - -typedef struct renameflags { - /* recursive rename */ - int recurse : 1; - - /* don't unmount file systems */ - int nounmount : 1; - - /* force unmount file systems */ - int forceunmount : 1; -} renameflags_t; - -extern int zfs_rename(zfs_handle_t *, const char *, const char *, - renameflags_t flags); - -typedef struct sendflags { - /* print informational messages (ie, -v was specified) */ - boolean_t verbose; - - /* recursive send (ie, -R) */ - boolean_t replicate; - - /* for incrementals, do all intermediate snapshots */ - boolean_t doall; - - /* if dataset is a clone, do incremental from its origin */ - boolean_t fromorigin; - - /* do deduplication */ - boolean_t dedup; - - /* send properties (ie, -p) */ - boolean_t props; - - /* do not send (no-op, ie. -n) */ - boolean_t dryrun; - - /* parsable verbose output (ie. -P) */ - boolean_t parsable; - - /* show progress (ie. -v) */ - boolean_t progress; - - /* large blocks (>128K) are permitted */ - boolean_t largeblock; - - /* WRITE_EMBEDDED records of type DATA are permitted */ - boolean_t embed_data; - - /* compressed WRITE records are permitted */ - boolean_t compress; - - /* show progress as process title(ie. 
-V) */ - boolean_t progressastitle; -} sendflags_t; - -typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); - -extern int zfs_send(zfs_handle_t *, const char *, const char *, - sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); -extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t flags); -extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, - const char *); -extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, - const char *token); - -extern int zfs_promote(zfs_handle_t *); -extern int zfs_hold(zfs_handle_t *, const char *, const char *, - boolean_t, int); -extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); -extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); -extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); -extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); - -typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, - uid_t rid, uint64_t space); - -extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, - zfs_userspace_cb_t, void *); - -extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); -extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); - -typedef struct recvflags { - /* print informational messages (ie, -v was specified) */ - boolean_t verbose; - - /* the destination is a prefix, not the exact fs (ie, -d) */ - boolean_t isprefix; - - /* - * Only the tail of the sent snapshot path is appended to the - * destination to determine the received snapshot name (ie, -e). 
- */ - boolean_t istail; - - /* do not actually do the recv, just check if it would work (ie, -n) */ - boolean_t dryrun; - - /* rollback/destroy filesystems as necessary (eg, -F) */ - boolean_t force; - - /* set "canmount=off" on all modified filesystems */ - boolean_t canmountoff; - - /* - * Mark the file systems as "resumable" and do not destroy them if the - * receive is interrupted - */ - boolean_t resumable; - - /* byteswap flag is used internally; callers need not specify */ - boolean_t byteswap; - - /* do not mount file systems as they are extracted (private) */ - boolean_t nomount; - - /* force unmount while recv snapshot (private) */ - boolean_t forceunmount; -} recvflags_t; - -extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, - recvflags_t *, int, avl_tree_t *); - -typedef enum diff_flags { - ZFS_DIFF_PARSEABLE = 0x1, - ZFS_DIFF_TIMESTAMP = 0x2, - ZFS_DIFF_CLASSIFY = 0x4 -} diff_flags_t; - -extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, - int); - -/* - * Miscellaneous functions. - */ -extern const char *zfs_type_to_name(zfs_type_t); -extern void zfs_refresh_properties(zfs_handle_t *); -extern int zfs_name_valid(const char *, zfs_type_t); -extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t); -extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, - zfs_type_t); -extern int zfs_spa_version(zfs_handle_t *, int *); -extern boolean_t zfs_bookmark_exists(const char *path); -extern ulong_t get_system_hostid(void); - -/* - * Mount support functions. - */ -extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); -extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); -extern int zfs_mount(zfs_handle_t *, const char *, int); -extern int zfs_mount_at(zfs_handle_t *, const char *, int, const char *); -extern int zfs_unmount(zfs_handle_t *, const char *, int); -extern int zfs_unmountall(zfs_handle_t *, int); - -/* - * Share support functions. 
- */ -extern boolean_t zfs_is_shared(zfs_handle_t *); -extern int zfs_share(zfs_handle_t *); -extern int zfs_unshare(zfs_handle_t *); - -/* - * Protocol-specific share support functions. - */ -extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); -extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); -extern int zfs_share_nfs(zfs_handle_t *); -extern int zfs_share_smb(zfs_handle_t *); -extern int zfs_shareall(zfs_handle_t *); -extern int zfs_unshare_nfs(zfs_handle_t *, const char *); -extern int zfs_unshare_smb(zfs_handle_t *, const char *); -extern int zfs_unshareall_nfs(zfs_handle_t *); -extern int zfs_unshareall_smb(zfs_handle_t *); -extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); -extern int zfs_unshareall(zfs_handle_t *); -extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, - void *, void *, int, zfs_share_op_t); - -/* - * FreeBSD-specific jail support function. - */ -extern int zfs_jail(zfs_handle_t *, int, int); - -/* - * When dealing with nvlists, verify() is extremely useful - */ -#ifndef verify -#ifdef NDEBUG -#define verify(EX) ((void)(EX)) -#else -#define verify(EX) assert(EX) -#endif -#endif - -/* - * Utility function to convert a number to a human-readable form. - */ -extern void zfs_nicenum(uint64_t, char *, size_t); -extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); - -/* - * Given a device or file, determine if it is part of a pool. - */ -extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, - boolean_t *); - -/* - * Label manipulation. - */ -extern int zpool_read_label(int, nvlist_t **); -extern int zpool_read_all_labels(int, nvlist_t **); -extern int zpool_clear_label(int); -extern int zpool_set_bootenv(zpool_handle_t *, const char *); -extern int zpool_get_bootenv(zpool_handle_t *, char *, size_t, off_t); - -/* is this zvol valid for use as a dump device? 
*/ -extern int zvol_check_dump_config(char *); - -/* - * Management interfaces for SMB ACL files - */ - -int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); -int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); -int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); -int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); - -/* - * Enable and disable datasets within a pool by mounting/unmounting and - * sharing/unsharing them. - */ -extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); -extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); - -/* - * Mappings between vdev and FRU. - */ -extern void libzfs_fru_refresh(libzfs_handle_t *); -extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *); -extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *); -extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *, - const char *); -extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); -extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); - -#ifndef illumos -extern int zmount(const char *, const char *, int, char *, char *, int, char *, - int); -#endif -extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *); - -/* Allow consumers to initialize libshare externally for optimal performance */ -extern int zfs_init_libshare_arg(libzfs_handle_t *, int, void *); -/* - * For most consumers, zfs_init_libshare_arg is sufficient on its own, and - * zfs_uninit_libshare is unnecessary. zfs_uninit_libshare should only be called - * if the caller has already initialized libshare for one set of zfs handles, - * and wishes to share or unshare filesystems outside of that set. In that case, - * the caller should uninitialize libshare, and then re-initialize it with the - * new handles being shared or unshared. 
- */ -extern void zfs_uninit_libshare(libzfs_handle_t *); -#ifdef __cplusplus -} -#endif - -#endif /* _LIBZFS_H */ diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c deleted file mode 100644 index 7bbb68328f29..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_changelist.c +++ /dev/null @@ -1,736 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * - * Portions Copyright 2007 Ramprakash Jelari - * Copyright (c) 2011 Pawel Jakub Dawidek . - * All rights reserved. - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright 2016 Igor Kozhukhov - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "libzfs_impl.h" - -/* - * Structure to keep track of dataset state. Before changing the 'sharenfs' or - * 'mountpoint' property, we record whether the filesystem was previously - * mounted/shared. This prior state dictates whether we remount/reshare the - * dataset after the property has been changed. 
- * - * The interface consists of the following sequence of functions: - * - * changelist_gather() - * changelist_prefix() - * < change property > - * changelist_postfix() - * changelist_free() - * - * Other interfaces: - * - * changelist_remove() - remove a node from a gathered list - * changelist_rename() - renames all datasets appropriately when doing a rename - * changelist_unshare() - unshares all the nodes in a given changelist - * changelist_haszonedchild() - check if there is any child exported to - * a local zone - */ -typedef struct prop_changenode { - zfs_handle_t *cn_handle; - int cn_shared; - int cn_mounted; - int cn_zoned; - boolean_t cn_needpost; /* is postfix() needed? */ - uu_list_node_t cn_listnode; -} prop_changenode_t; - -struct prop_changelist { - zfs_prop_t cl_prop; - zfs_prop_t cl_realprop; - zfs_prop_t cl_shareprop; /* used with sharenfs/sharesmb */ - uu_list_pool_t *cl_pool; - uu_list_t *cl_list; - boolean_t cl_waslegacy; - boolean_t cl_allchildren; - boolean_t cl_alldependents; - int cl_mflags; /* Mount flags */ - int cl_gflags; /* Gather request flags */ - boolean_t cl_haszonedchild; - boolean_t cl_sorted; -}; - -/* - * If the property is 'mountpoint', go through and unmount filesystems as - * necessary. We don't do the same for 'sharenfs', because we can just re-share - * with different options without interrupting service. We do handle 'sharesmb' - * since there may be old resource names that need to be removed. - */ -int -changelist_prefix(prop_changelist_t *clp) -{ - prop_changenode_t *cn; - int ret = 0; - - if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && - clp->cl_prop != ZFS_PROP_SHARESMB) - return (0); - - for (cn = uu_list_first(clp->cl_list); cn != NULL; - cn = uu_list_next(clp->cl_list, cn)) { - - /* if a previous loop failed, set the remaining to false */ - if (ret == -1) { - cn->cn_needpost = B_FALSE; - continue; - } - - /* - * If we are in the global zone, but this dataset is exported - * to a local zone, do nothing. 
- */ - if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) - continue; - - if (!ZFS_IS_VOLUME(cn->cn_handle)) { - /* - * Do the property specific processing. - */ - switch (clp->cl_prop) { - case ZFS_PROP_MOUNTPOINT: - if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) - break; - if (zfs_unmount(cn->cn_handle, NULL, - clp->cl_mflags) != 0) { - ret = -1; - cn->cn_needpost = B_FALSE; - } - break; - case ZFS_PROP_SHARESMB: - (void) zfs_unshare_smb(cn->cn_handle, NULL); - break; - - default: - break; - } - } - } - - if (ret == -1) - (void) changelist_postfix(clp); - - return (ret); -} - -/* - * If the property is 'mountpoint' or 'sharenfs', go through and remount and/or - * reshare the filesystems as necessary. In changelist_gather() we recorded - * whether the filesystem was previously shared or mounted. The action we take - * depends on the previous state, and whether the value was previously 'legacy'. - * For non-legacy properties, we only remount/reshare the filesystem if it was - * previously mounted/shared. Otherwise, we always remount/reshare the - * filesystem. - */ -int -changelist_postfix(prop_changelist_t *clp) -{ - prop_changenode_t *cn; - char shareopts[ZFS_MAXPROPLEN]; - int errors = 0; - libzfs_handle_t *hdl; -#ifdef illumos - size_t num_datasets = 0, i; - zfs_handle_t **zhandle_arr; - sa_init_selective_arg_t sharearg; -#endif - - /* - * If we're changing the mountpoint, attempt to destroy the underlying - * mountpoint. All other datasets will have inherited from this dataset - * (in which case their mountpoints exist in the filesystem in the new - * location), or have explicit mountpoints set (in which case they won't - * be in the changelist). - */ - if ((cn = uu_list_last(clp->cl_list)) == NULL) - return (0); - - if (clp->cl_prop == ZFS_PROP_MOUNTPOINT && - !(clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)) { - remove_mountpoint(cn->cn_handle); - } - - /* - * It is possible that the changelist_prefix() used libshare - * to unshare some entries. 
Since libshare caches data, an - * attempt to reshare during postfix can fail unless libshare - * is uninitialized here so that it will reinitialize later. - */ - if (cn->cn_handle != NULL) { - hdl = cn->cn_handle->zfs_hdl; - assert(hdl != NULL); - zfs_uninit_libshare(hdl); - -#ifdef illumos - /* - * For efficiencies sake, we initialize libshare for only a few - * shares (the ones affected here). Future initializations in - * this process should just use the cached initialization. - */ - for (cn = uu_list_last(clp->cl_list); cn != NULL; - cn = uu_list_prev(clp->cl_list, cn)) { - num_datasets++; - } - - zhandle_arr = zfs_alloc(hdl, - num_datasets * sizeof (zfs_handle_t *)); - for (i = 0, cn = uu_list_last(clp->cl_list); cn != NULL; - cn = uu_list_prev(clp->cl_list, cn)) { - zhandle_arr[i++] = cn->cn_handle; - zfs_refresh_properties(cn->cn_handle); - } - assert(i == num_datasets); - sharearg.zhandle_arr = zhandle_arr; - sharearg.zhandle_len = num_datasets; - errors = zfs_init_libshare_arg(hdl, SA_INIT_SHARE_API_SELECTIVE, - &sharearg); - free(zhandle_arr); -#endif - } - /* - * We walk the datasets in reverse, because we want to mount any parent - * datasets before mounting the children. We walk all datasets even if - * there are errors. - */ - for (cn = uu_list_last(clp->cl_list); cn != NULL; - cn = uu_list_prev(clp->cl_list, cn)) { - - boolean_t sharenfs; - boolean_t sharesmb; - boolean_t mounted; - - /* - * If we are in the global zone, but this dataset is exported - * to a local zone, do nothing. - */ - if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) - continue; - - /* Only do post-processing if it's required */ - if (!cn->cn_needpost) - continue; - cn->cn_needpost = B_FALSE; - -#ifndef illumos - zfs_refresh_properties(cn->cn_handle); -#endif - - if (ZFS_IS_VOLUME(cn->cn_handle)) - continue; - - /* - * Remount if previously mounted or mountpoint was legacy, - * or sharenfs or sharesmb property is set. 
- */ - sharenfs = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARENFS, - shareopts, sizeof (shareopts), NULL, NULL, 0, - B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); - - sharesmb = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARESMB, - shareopts, sizeof (shareopts), NULL, NULL, 0, - B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); - - mounted = (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) || - zfs_is_mounted(cn->cn_handle, NULL); - - if (!mounted && (cn->cn_mounted || - ((sharenfs || sharesmb || clp->cl_waslegacy) && - (zfs_prop_get_int(cn->cn_handle, - ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { - - if (zfs_mount(cn->cn_handle, NULL, 0) != 0) - errors++; - else - mounted = TRUE; - } - - /* - * If the file system is mounted we always re-share even - * if the filesystem is currently shared, so that we can - * adopt any new options. - */ - if (sharenfs && mounted) - errors += zfs_share_nfs(cn->cn_handle); - else if (cn->cn_shared || clp->cl_waslegacy) - errors += zfs_unshare_nfs(cn->cn_handle, NULL); - if (sharesmb && mounted) - errors += zfs_share_smb(cn->cn_handle); - else if (cn->cn_shared || clp->cl_waslegacy) - errors += zfs_unshare_smb(cn->cn_handle, NULL); - } - - return (errors ? -1 : 0); -} - -/* - * Is this "dataset" a child of "parent"? - */ -boolean_t -isa_child_of(const char *dataset, const char *parent) -{ - int len; - - len = strlen(parent); - - if (strncmp(dataset, parent, len) == 0 && - (dataset[len] == '@' || dataset[len] == '/' || - dataset[len] == '\0')) - return (B_TRUE); - else - return (B_FALSE); - -} - -/* - * If we rename a filesystem, child filesystem handles are no longer valid - * since we identify each dataset by its name in the ZFS namespace. As a - * result, we have to go through and fix up all the names appropriately. We - * could do this automatically if libzfs kept track of all open handles, but - * this is a lot less work. 
- */ -void -changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) -{ - prop_changenode_t *cn; - char newname[ZFS_MAX_DATASET_NAME_LEN]; - - for (cn = uu_list_first(clp->cl_list); cn != NULL; - cn = uu_list_next(clp->cl_list, cn)) { - /* - * Do not rename a clone that's not in the source hierarchy. - */ - if (!isa_child_of(cn->cn_handle->zfs_name, src)) - continue; - - /* - * Destroy the previous mountpoint if needed. - */ - remove_mountpoint(cn->cn_handle); - - (void) strlcpy(newname, dst, sizeof (newname)); - (void) strcat(newname, cn->cn_handle->zfs_name + strlen(src)); - - (void) strlcpy(cn->cn_handle->zfs_name, newname, - sizeof (cn->cn_handle->zfs_name)); - } -} - -/* - * Given a gathered changelist for the 'sharenfs' or 'sharesmb' property, - * unshare all the datasets in the list. - */ -int -changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) -{ - prop_changenode_t *cn; - int ret = 0; - - if (clp->cl_prop != ZFS_PROP_SHARENFS && - clp->cl_prop != ZFS_PROP_SHARESMB) - return (0); - - for (cn = uu_list_first(clp->cl_list); cn != NULL; - cn = uu_list_next(clp->cl_list, cn)) { - if (zfs_unshare_proto(cn->cn_handle, NULL, proto) != 0) - ret = -1; - } - - return (ret); -} - -/* - * Check if there is any child exported to a local zone in a given changelist. - * This information has already been recorded while gathering the changelist - * via changelist_gather(). - */ -int -changelist_haszonedchild(prop_changelist_t *clp) -{ - return (clp->cl_haszonedchild); -} - -/* - * Remove a node from a gathered list. - */ -void -changelist_remove(prop_changelist_t *clp, const char *name) -{ - prop_changenode_t *cn; - - for (cn = uu_list_first(clp->cl_list); cn != NULL; - cn = uu_list_next(clp->cl_list, cn)) { - - if (strcmp(cn->cn_handle->zfs_name, name) == 0) { - uu_list_remove(clp->cl_list, cn); - zfs_close(cn->cn_handle); - free(cn); - return; - } - } -} - -/* - * Release any memory associated with a changelist. 
- */ -void -changelist_free(prop_changelist_t *clp) -{ - prop_changenode_t *cn; - void *cookie; - - if (clp->cl_list) { - cookie = NULL; - while ((cn = uu_list_teardown(clp->cl_list, &cookie)) != NULL) { - zfs_close(cn->cn_handle); - free(cn); - } - - uu_list_destroy(clp->cl_list); - } - if (clp->cl_pool) - uu_list_pool_destroy(clp->cl_pool); - - free(clp); -} - -static int -change_one(zfs_handle_t *zhp, void *data) -{ - prop_changelist_t *clp = data; - char property[ZFS_MAXPROPLEN]; - char where[64]; - prop_changenode_t *cn; - zprop_source_t sourcetype; - zprop_source_t share_sourcetype; - - /* - * We only want to unmount/unshare those filesystems that may inherit - * from the target filesystem. If we find any filesystem with a - * locally set mountpoint, we ignore any children since changing the - * property will not affect them. If this is a rename, we iterate - * over all children regardless, since we need them unmounted in - * order to do the rename. Also, if this is a volume and we're doing - * a rename, then always add it to the changelist. 
- */ - - if (!(ZFS_IS_VOLUME(zhp) && clp->cl_realprop == ZFS_PROP_NAME) && - zfs_prop_get(zhp, clp->cl_prop, property, - sizeof (property), &sourcetype, where, sizeof (where), - B_FALSE) != 0) { - zfs_close(zhp); - return (0); - } - - /* - * If we are "watching" sharenfs or sharesmb - * then check out the companion property which is tracked - * in cl_shareprop - */ - if (clp->cl_shareprop != ZPROP_INVAL && - zfs_prop_get(zhp, clp->cl_shareprop, property, - sizeof (property), &share_sourcetype, where, sizeof (where), - B_FALSE) != 0) { - zfs_close(zhp); - return (0); - } - - if (clp->cl_alldependents || clp->cl_allchildren || - sourcetype == ZPROP_SRC_DEFAULT || - sourcetype == ZPROP_SRC_INHERITED || - (clp->cl_shareprop != ZPROP_INVAL && - (share_sourcetype == ZPROP_SRC_DEFAULT || - share_sourcetype == ZPROP_SRC_INHERITED))) { - if ((cn = zfs_alloc(zfs_get_handle(zhp), - sizeof (prop_changenode_t))) == NULL) { - zfs_close(zhp); - return (-1); - } - - cn->cn_handle = zhp; - cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || - zfs_is_mounted(zhp, NULL); - cn->cn_shared = zfs_is_shared(zhp); - cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); - cn->cn_needpost = B_TRUE; - - /* Indicate if any child is exported to a local zone. */ - if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) - clp->cl_haszonedchild = B_TRUE; - - uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool); - - if (clp->cl_sorted) { - uu_list_index_t idx; - - (void) uu_list_find(clp->cl_list, cn, NULL, - &idx); - uu_list_insert(clp->cl_list, cn, idx); - } else { - /* - * Add this child to beginning of the list. Children - * below this one in the hierarchy will get added above - * this one in the list. This produces a list in - * reverse dataset name order. - * This is necessary when the original mountpoint - * is legacy or none. 
- */ - verify(uu_list_insert_before(clp->cl_list, - uu_list_first(clp->cl_list), cn) == 0); - } - - if (!clp->cl_alldependents) - return (zfs_iter_children(zhp, change_one, data)); - } else { - zfs_close(zhp); - } - - return (0); -} - -/*ARGSUSED*/ -static int -compare_mountpoints(const void *a, const void *b, void *unused) -{ - const prop_changenode_t *ca = a; - const prop_changenode_t *cb = b; - - char mounta[MAXPATHLEN]; - char mountb[MAXPATHLEN]; - - boolean_t hasmounta, hasmountb; - - /* - * When unsharing or unmounting filesystems, we need to do it in - * mountpoint order. This allows the user to have a mountpoint - * hierarchy that is different from the dataset hierarchy, and still - * allow it to be changed. However, if either dataset doesn't have a - * mountpoint (because it is a volume or a snapshot), we place it at the - * end of the list, because it doesn't affect our change at all. - */ - hasmounta = (zfs_prop_get(ca->cn_handle, ZFS_PROP_MOUNTPOINT, mounta, - sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); - hasmountb = (zfs_prop_get(cb->cn_handle, ZFS_PROP_MOUNTPOINT, mountb, - sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); - - if (!hasmounta && hasmountb) - return (-1); - else if (hasmounta && !hasmountb) - return (1); - else if (!hasmounta && !hasmountb) - return (0); - else - return (strcmp(mountb, mounta)); -} - -/* - * Given a ZFS handle and a property, construct a complete list of datasets - * that need to be modified as part of this process. For anything but the - * 'mountpoint' and 'sharenfs' properties, this just returns an empty list. - * Otherwise, we iterate over all children and look for any datasets that - * inherit the property. For each such dataset, we add it to the list and - * mark whether it was shared beforehand. 
- */ -prop_changelist_t * -changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, - int mnt_flags) -{ - prop_changelist_t *clp; - prop_changenode_t *cn; - zfs_handle_t *temp; - char property[ZFS_MAXPROPLEN]; - uu_compare_fn_t *compare = NULL; - boolean_t legacy = B_FALSE; - - if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL) - return (NULL); - - /* - * For mountpoint-related tasks, we want to sort everything by - * mountpoint, so that we mount and unmount them in the appropriate - * order, regardless of their position in the hierarchy. - */ - if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED || - prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS || - prop == ZFS_PROP_SHARESMB) { - - if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, - property, sizeof (property), - NULL, NULL, 0, B_FALSE) == 0 && - (strcmp(property, "legacy") == 0 || - strcmp(property, "none") == 0)) { - - legacy = B_TRUE; - } - if (!legacy) { - compare = compare_mountpoints; - clp->cl_sorted = B_TRUE; - } - } - - clp->cl_pool = uu_list_pool_create("changelist_pool", - sizeof (prop_changenode_t), - offsetof(prop_changenode_t, cn_listnode), - compare, 0); - if (clp->cl_pool == NULL) { - assert(uu_error() == UU_ERROR_NO_MEMORY); - (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); - changelist_free(clp); - return (NULL); - } - - clp->cl_list = uu_list_create(clp->cl_pool, NULL, - clp->cl_sorted ? UU_LIST_SORTED : 0); - clp->cl_gflags = gather_flags; - clp->cl_mflags = mnt_flags; - - if (clp->cl_list == NULL) { - assert(uu_error() == UU_ERROR_NO_MEMORY); - (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); - changelist_free(clp); - return (NULL); - } - - /* - * If this is a rename or the 'zoned' property, we pretend we're - * changing the mountpoint and flag it so we can catch all children in - * change_one(). - * - * Flag cl_alldependents to catch all children plus the dependents - * (clones) that are not in the hierarchy. 
- */ - if (prop == ZFS_PROP_NAME) { - clp->cl_prop = ZFS_PROP_MOUNTPOINT; - clp->cl_alldependents = B_TRUE; - } else if (prop == ZFS_PROP_ZONED) { - clp->cl_prop = ZFS_PROP_MOUNTPOINT; - clp->cl_allchildren = B_TRUE; - } else if (prop == ZFS_PROP_CANMOUNT) { - clp->cl_prop = ZFS_PROP_MOUNTPOINT; - } else if (prop == ZFS_PROP_VOLSIZE) { - clp->cl_prop = ZFS_PROP_MOUNTPOINT; - } else { - clp->cl_prop = prop; - } - clp->cl_realprop = prop; - - if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && - clp->cl_prop != ZFS_PROP_SHARENFS && - clp->cl_prop != ZFS_PROP_SHARESMB) - return (clp); - - /* - * If watching SHARENFS or SHARESMB then - * also watch its companion property. - */ - if (clp->cl_prop == ZFS_PROP_SHARENFS) - clp->cl_shareprop = ZFS_PROP_SHARESMB; - else if (clp->cl_prop == ZFS_PROP_SHARESMB) - clp->cl_shareprop = ZFS_PROP_SHARENFS; - - if (clp->cl_alldependents) { - if (zfs_iter_dependents(zhp, B_TRUE, change_one, clp) != 0) { - changelist_free(clp); - return (NULL); - } - } else if (zfs_iter_children(zhp, change_one, clp) != 0) { - changelist_free(clp); - return (NULL); - } - - /* - * We have to re-open ourselves because we auto-close all the handles - * and can't tell the difference. - */ - if ((temp = zfs_open(zhp->zfs_hdl, zfs_get_name(zhp), - ZFS_TYPE_DATASET)) == NULL) { - changelist_free(clp); - return (NULL); - } - - /* - * Always add ourself to the list. We add ourselves to the end so that - * we're the last to be unmounted. 
- */ - if ((cn = zfs_alloc(zhp->zfs_hdl, - sizeof (prop_changenode_t))) == NULL) { - zfs_close(temp); - changelist_free(clp); - return (NULL); - } - - cn->cn_handle = temp; - cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || - zfs_is_mounted(temp, NULL); - cn->cn_shared = zfs_is_shared(temp); - cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); - cn->cn_needpost = B_TRUE; - - uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool); - if (clp->cl_sorted) { - uu_list_index_t idx; - (void) uu_list_find(clp->cl_list, cn, NULL, &idx); - uu_list_insert(clp->cl_list, cn, idx); - } else { - /* - * Add the target dataset to the end of the list. - * The list is not really unsorted. The list will be - * in reverse dataset name order. This is necessary - * when the original mountpoint is legacy or none. - */ - verify(uu_list_insert_after(clp->cl_list, - uu_list_last(clp->cl_list), cn) == 0); - } - - /* - * If the mountpoint property was previously 'legacy', or 'none', - * record it as the behavior of changelist_postfix() will be different. - */ - if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) { - /* - * do not automatically mount ex-legacy datasets if - * we specifically set canmount to noauto - */ - if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) != - ZFS_CANMOUNT_NOAUTO) - clp->cl_waslegacy = B_TRUE; - } - - return (clp); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c deleted file mode 100644 index 7545331b40b4..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * CDDL HEADER SART - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 Martin Matuska . All rights reserved. - */ - -#include "libzfs_compat.h" - -int zfs_ioctl_version = ZFS_IOCVER_UNDEF; -static int zfs_spa_version = -1; - -/* - * Get zfs_ioctl_version - */ -int -get_zfs_ioctl_version(void) -{ - size_t ver_size; - int ver = ZFS_IOCVER_NONE; - - ver_size = sizeof(ver); - sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); - - return (ver); -} - -/* - * Get the SPA version - */ -static int -get_zfs_spa_version(void) -{ - size_t ver_size; - int ver = 0; - - ver_size = sizeof(ver); - sysctlbyname("vfs.zfs.version.spa", &ver, &ver_size, NULL, 0); - - return (ver); -} - -/* - * This is FreeBSD version of ioctl, because Solaris' ioctl() updates - * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an - * error is returned zc_nvlist_dst_size won't be updated. 
- */ -int -zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) -{ - size_t oldsize; - int ret, cflag = ZFS_CMD_COMPAT_NONE; - - if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) - zfs_ioctl_version = get_zfs_ioctl_version(); - - if (zfs_ioctl_version >= ZFS_IOCVER_DEADMAN) { - switch (zfs_ioctl_version) { - case ZFS_IOCVER_INLANES: - cflag = ZFS_CMD_COMPAT_INLANES; - break; - case ZFS_IOCVER_RESUME: - cflag = ZFS_CMD_COMPAT_RESUME; - break; - case ZFS_IOCVER_EDBP: - cflag = ZFS_CMD_COMPAT_EDBP; - break; - case ZFS_IOCVER_ZCMD: - cflag = ZFS_CMD_COMPAT_ZCMD; - break; - case ZFS_IOCVER_LZC: - cflag = ZFS_CMD_COMPAT_LZC; - break; - case ZFS_IOCVER_DEADMAN: - cflag = ZFS_CMD_COMPAT_DEADMAN; - break; - } - } else { - /* - * If vfs.zfs.version.ioctl is not defined, assume we have v28 - * compatible binaries and use vfs.zfs.version.spa to test for v15 - */ - cflag = ZFS_CMD_COMPAT_V28; - - if (zfs_spa_version < 0) - zfs_spa_version = get_zfs_spa_version(); - - if (zfs_spa_version == SPA_VERSION_15 || - zfs_spa_version == SPA_VERSION_14 || - zfs_spa_version == SPA_VERSION_13) - cflag = ZFS_CMD_COMPAT_V15; - } - - oldsize = zc->zc_nvlist_dst_size; - ret = zcmd_ioctl_compat(fd, request, zc, cflag); - - if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { - ret = -1; - errno = ENOMEM; - } - - return (ret); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h deleted file mode 100644 index 37616683330a..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * CDDL HEADER SART - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 Martin Matuska . All rights reserved. - */ - -#ifndef _LIBZFS_COMPAT_H -#define _LIBZFS_COMPAT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -int get_zfs_ioctl_version(void); -int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc); - -#define ioctl(fd, ioc, zc) zcmd_ioctl((fd), (ioc), (zc)) - -#ifdef __cplusplus -} -#endif - -#endif /* _LIBZFS_COMPAT_H */ diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c deleted file mode 100644 index b33d86432dc5..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_config.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2015 by Syneto S.R.L. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. - */ - -/* - * The pool configuration repository is stored in /etc/zfs/zpool.cache as a - * single packed nvlist. While it would be nice to just read in this - * file from userland, this wouldn't work from a local zone. So we have to have - * a zpool ioctl to return the complete configuration for all pools. In the - * global zone, this will be identical to reading the file and unpacking it in - * userland. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "libzfs_impl.h" - -typedef struct config_node { - char *cn_name; - nvlist_t *cn_config; - uu_avl_node_t cn_avl; -} config_node_t; - -/* ARGSUSED */ -static int -config_node_compare(const void *a, const void *b, void *unused) -{ - int ret; - - const config_node_t *ca = (config_node_t *)a; - const config_node_t *cb = (config_node_t *)b; - - ret = strcmp(ca->cn_name, cb->cn_name); - - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); -} - -void -namespace_clear(libzfs_handle_t *hdl) -{ - if (hdl->libzfs_ns_avl) { - config_node_t *cn; - void *cookie = NULL; - - while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl, - &cookie)) != NULL) { - nvlist_free(cn->cn_config); - free(cn->cn_name); - free(cn); - } - - uu_avl_destroy(hdl->libzfs_ns_avl); - hdl->libzfs_ns_avl = NULL; - } - - if (hdl->libzfs_ns_avlpool) { - uu_avl_pool_destroy(hdl->libzfs_ns_avlpool); - hdl->libzfs_ns_avlpool = NULL; - } -} - -/* - * Loads the pool namespace, or 
re-loads it if the cache has changed. - */ -static int -namespace_reload(libzfs_handle_t *hdl) -{ - nvlist_t *config; - config_node_t *cn; - nvpair_t *elem; - zfs_cmd_t zc = { 0 }; - void *cookie; - - if (hdl->libzfs_ns_gen == 0) { - /* - * This is the first time we've accessed the configuration - * cache. Initialize the AVL tree and then fall through to the - * common code. - */ - if ((hdl->libzfs_ns_avlpool = uu_avl_pool_create("config_pool", - sizeof (config_node_t), - offsetof(config_node_t, cn_avl), - config_node_compare, UU_DEFAULT)) == NULL) - return (no_memory(hdl)); - - if ((hdl->libzfs_ns_avl = uu_avl_create(hdl->libzfs_ns_avlpool, - NULL, UU_DEFAULT)) == NULL) - return (no_memory(hdl)); - } - - if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) - return (-1); - - for (;;) { - zc.zc_cookie = hdl->libzfs_ns_gen; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_CONFIGS, &zc) != 0) { - switch (errno) { - case EEXIST: - /* - * The namespace hasn't changed. - */ - zcmd_free_nvlists(&zc); - return (0); - - case ENOMEM: - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - break; - - default: - zcmd_free_nvlists(&zc); - return (zfs_standard_error(hdl, errno, - dgettext(TEXT_DOMAIN, "failed to read " - "pool configuration"))); - } - } else { - hdl->libzfs_ns_gen = zc.zc_cookie; - break; - } - } - - if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - - zcmd_free_nvlists(&zc); - - /* - * Clear out any existing configuration information. 
- */ - cookie = NULL; - while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl, &cookie)) != NULL) { - nvlist_free(cn->cn_config); - free(cn->cn_name); - free(cn); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(config, elem)) != NULL) { - nvlist_t *child; - uu_avl_index_t where; - - if ((cn = zfs_alloc(hdl, sizeof (config_node_t))) == NULL) { - nvlist_free(config); - return (-1); - } - - if ((cn->cn_name = zfs_strdup(hdl, - nvpair_name(elem))) == NULL) { - free(cn); - nvlist_free(config); - return (-1); - } - - verify(nvpair_value_nvlist(elem, &child) == 0); - if (nvlist_dup(child, &cn->cn_config, 0) != 0) { - free(cn->cn_name); - free(cn); - nvlist_free(config); - return (no_memory(hdl)); - } - verify(uu_avl_find(hdl->libzfs_ns_avl, cn, NULL, &where) - == NULL); - - uu_avl_insert(hdl->libzfs_ns_avl, cn, where); - } - - nvlist_free(config); - return (0); -} - -/* - * Retrieve the configuration for the given pool. The configuration is a nvlist - * describing the vdevs, as well as the statistics associated with each one. - */ -nvlist_t * -zpool_get_config(zpool_handle_t *zhp, nvlist_t **oldconfig) -{ - if (oldconfig) - *oldconfig = zhp->zpool_old_config; - return (zhp->zpool_config); -} - -/* - * Retrieves a list of enabled features and their refcounts and caches it in - * the pool handle. - */ -nvlist_t * -zpool_get_features(zpool_handle_t *zhp) -{ - nvlist_t *config, *features; - - config = zpool_get_config(zhp, NULL); - - if (config == NULL || !nvlist_exists(config, - ZPOOL_CONFIG_FEATURE_STATS)) { - int error; - boolean_t missing = B_FALSE; - - error = zpool_refresh_stats(zhp, &missing); - - if (error != 0 || missing) - return (NULL); - - config = zpool_get_config(zhp, NULL); - } - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, - &features) != 0) - return (NULL); - - return (features); -} - -/* - * Refresh the vdev statistics associated with the given pool. 
This is used in - * iostat to show configuration changes and determine the delta from the last - * time the function was called. This function can fail, in case the pool has - * been destroyed. - */ -int -zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) -{ - zfs_cmd_t zc = { 0 }; - int error; - nvlist_t *config; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - *missing = B_FALSE; - (void) strcpy(zc.zc_name, zhp->zpool_name); - - if (zhp->zpool_config_size == 0) - zhp->zpool_config_size = 1 << 16; - - if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size) != 0) - return (-1); - - for (;;) { - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_POOL_STATS, - &zc) == 0) { - /* - * The real error is returned in the zc_cookie field. - */ - error = zc.zc_cookie; - break; - } - - if (errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - } else { - zcmd_free_nvlists(&zc); - if (errno == ENOENT || errno == EINVAL) - *missing = B_TRUE; - zhp->zpool_state = POOL_STATE_UNAVAIL; - return (0); - } - } - - if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - - zcmd_free_nvlists(&zc); - - zhp->zpool_config_size = zc.zc_nvlist_dst_size; - - if (zhp->zpool_config != NULL) { - uint64_t oldtxg, newtxg; - - verify(nvlist_lookup_uint64(zhp->zpool_config, - ZPOOL_CONFIG_POOL_TXG, &oldtxg) == 0); - verify(nvlist_lookup_uint64(config, - ZPOOL_CONFIG_POOL_TXG, &newtxg) == 0); - - nvlist_free(zhp->zpool_old_config); - - if (oldtxg != newtxg) { - nvlist_free(zhp->zpool_config); - zhp->zpool_old_config = NULL; - } else { - zhp->zpool_old_config = zhp->zpool_config; - } - } - - zhp->zpool_config = config; - if (error) - zhp->zpool_state = POOL_STATE_UNAVAIL; - else - zhp->zpool_state = POOL_STATE_ACTIVE; - - return (0); -} - -/* - * The following environment variables are undocumented - * and should be used for testing purposes only: - * - * __ZFS_POOL_EXCLUDE - don't iterate over the 
pools it lists - * __ZFS_POOL_RESTRICT - iterate only over the pools it lists - * - * This function returns B_TRUE if the pool should be skipped - * during iteration. - */ -boolean_t -zpool_skip_pool(const char *poolname) -{ - static boolean_t initialized = B_FALSE; - static const char *exclude = NULL; - static const char *restricted = NULL; - - const char *cur, *end; - int len; - int namelen = strlen(poolname); - - if (!initialized) { - initialized = B_TRUE; - exclude = getenv("__ZFS_POOL_EXCLUDE"); - restricted = getenv("__ZFS_POOL_RESTRICT"); - } - - if (exclude != NULL) { - cur = exclude; - do { - end = strchr(cur, ' '); - len = (NULL == end) ? strlen(cur) : (end - cur); - if (len == namelen && 0 == strncmp(cur, poolname, len)) - return (B_TRUE); - cur += (len + 1); - } while (NULL != end); - } - - if (NULL == restricted) - return (B_FALSE); - - cur = restricted; - do { - end = strchr(cur, ' '); - len = (NULL == end) ? strlen(cur) : (end - cur); - - if (len == namelen && 0 == strncmp(cur, poolname, len)) { - return (B_FALSE); - } - - cur += (len + 1); - } while (NULL != end); - - return (B_TRUE); -} - -/* - * Iterate over all pools in the system. - */ -int -zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data) -{ - config_node_t *cn; - zpool_handle_t *zhp; - int ret; - - /* - * If someone makes a recursive call to zpool_iter(), we want to avoid - * refreshing the namespace because that will invalidate the parent - * context. We allow recursive calls, but simply re-use the same - * namespace AVL tree. 
- */ - if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0) - return (-1); - - hdl->libzfs_pool_iter++; - for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; - cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { - - if (zpool_skip_pool(cn->cn_name)) - continue; - - if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) { - hdl->libzfs_pool_iter--; - return (-1); - } - - if (zhp == NULL) - continue; - - if ((ret = func(zhp, data)) != 0) { - hdl->libzfs_pool_iter--; - return (ret); - } - } - hdl->libzfs_pool_iter--; - - return (0); -} - -/* - * Iterate over root datasets, calling the given function for each. The zfs - * handle passed each time must be explicitly closed by the callback. - */ -int -zfs_iter_root(libzfs_handle_t *hdl, zfs_iter_f func, void *data) -{ - config_node_t *cn; - zfs_handle_t *zhp; - int ret; - - if (namespace_reload(hdl) != 0) - return (-1); - - for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; - cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { - - if (zpool_skip_pool(cn->cn_name)) - continue; - - if ((zhp = make_dataset_handle(hdl, cn->cn_name)) == NULL) - continue; - - if ((ret = func(zhp, data)) != 0) - return (ret); - } - - return (0); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c deleted file mode 100644 index 7075d060c78d..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c +++ /dev/null @@ -1,5284 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. - * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2013 Martin Matuska. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - * Copyright 2016 Igor Kozhukhov - * Copyright 2017-2018 RackTop Systems. - * Copyright (c) 2019 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef illumos -#include -#endif - -#include -#include -#include -#include -#include - -#include "zfs_namecheck.h" -#include "zfs_prop.h" -#include "libzfs_impl.h" -#include "zfs_deleg.h" - -static int userquota_propname_decode(const char *propname, boolean_t zoned, - zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); - -/* - * Given a single type (not a mask of types), return the type in a human - * readable form. 
- */ -const char * -zfs_type_to_name(zfs_type_t type) -{ - switch (type) { - case ZFS_TYPE_FILESYSTEM: - return (dgettext(TEXT_DOMAIN, "filesystem")); - case ZFS_TYPE_SNAPSHOT: - return (dgettext(TEXT_DOMAIN, "snapshot")); - case ZFS_TYPE_VOLUME: - return (dgettext(TEXT_DOMAIN, "volume")); - case ZFS_TYPE_POOL: - return (dgettext(TEXT_DOMAIN, "pool")); - case ZFS_TYPE_BOOKMARK: - return (dgettext(TEXT_DOMAIN, "bookmark")); - default: - assert(!"unhandled zfs_type_t"); - } - - return (NULL); -} - -/* - * Validate a ZFS path. This is used even before trying to open the dataset, to - * provide a more meaningful error message. We call zfs_error_aux() to - * explain exactly why the name was not valid. - */ -int -zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, - boolean_t modifying) -{ - namecheck_err_t why; - char what; - - if (entity_namecheck(path, &why, &what) != 0) { - if (hdl != NULL) { - switch (why) { - case NAME_ERR_TOOLONG: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "name is too long")); - break; - - case NAME_ERR_LEADING_SLASH: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "leading slash in name")); - break; - - case NAME_ERR_EMPTY_COMPONENT: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "empty component in name")); - break; - - case NAME_ERR_TRAILING_SLASH: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "trailing slash in name")); - break; - - case NAME_ERR_INVALCHAR: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "invalid character " - "'%c' in name"), what); - break; - - case NAME_ERR_MULTIPLE_DELIMITERS: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "multiple '@' and/or '#' delimiters in " - "name")); - break; - - case NAME_ERR_NOLETTER: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool doesn't begin with a letter")); - break; - - case NAME_ERR_RESERVED: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "name is reserved")); - break; - - case NAME_ERR_DISKLIKE: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "reserved disk name")); - break; - - 
default: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "(%d) not defined"), why); - break; - } - } - - return (0); - } - - if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { - if (hdl != NULL) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "snapshot delimiter '@' is not expected here")); - return (0); - } - - if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { - if (hdl != NULL) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "missing '@' delimiter in snapshot name")); - return (0); - } - - if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) { - if (hdl != NULL) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "bookmark delimiter '#' is not expected here")); - return (0); - } - - if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { - if (hdl != NULL) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "missing '#' delimiter in bookmark name")); - return (0); - } - - if (modifying && strchr(path, '%') != NULL) { - if (hdl != NULL) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid character %c in name"), '%'); - return (0); - } - - return (-1); -} - -int -zfs_name_valid(const char *name, zfs_type_t type) -{ - if (type == ZFS_TYPE_POOL) - return (zpool_name_valid(NULL, B_FALSE, name)); - return (zfs_validate_name(NULL, name, type, B_FALSE)); -} - -/* - * This function takes the raw DSL properties, and filters out the user-defined - * properties into a separate nvlist. 
- */ -static nvlist_t * -process_user_props(zfs_handle_t *zhp, nvlist_t *props) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - nvpair_t *elem; - nvlist_t *propval; - nvlist_t *nvl; - - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { - (void) no_memory(hdl); - return (NULL); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - if (!zfs_prop_user(nvpair_name(elem))) - continue; - - verify(nvpair_value_nvlist(elem, &propval) == 0); - if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { - nvlist_free(nvl); - (void) no_memory(hdl); - return (NULL); - } - } - - return (nvl); -} - -static zpool_handle_t * -zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - zpool_handle_t *zph; - - if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { - if (hdl->libzfs_pool_handles != NULL) - zph->zpool_next = hdl->libzfs_pool_handles; - hdl->libzfs_pool_handles = zph; - } - return (zph); -} - -static zpool_handle_t * -zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - zpool_handle_t *zph = hdl->libzfs_pool_handles; - - while ((zph != NULL) && - (strncmp(pool_name, zpool_get_name(zph), len) != 0)) - zph = zph->zpool_next; - return (zph); -} - -/* - * Returns a handle to the pool that contains the provided dataset. - * If a handle to that pool already exists then that handle is returned. - * Otherwise, a new handle is created and added to the list of handles. 
- */ -static zpool_handle_t * -zpool_handle(zfs_handle_t *zhp) -{ - char *pool_name; - int len; - zpool_handle_t *zph; - - len = strcspn(zhp->zfs_name, "/@#") + 1; - pool_name = zfs_alloc(zhp->zfs_hdl, len); - (void) strlcpy(pool_name, zhp->zfs_name, len); - - zph = zpool_find_handle(zhp, pool_name, len); - if (zph == NULL) - zph = zpool_add_handle(zhp, pool_name); - - free(pool_name); - return (zph); -} - -void -zpool_free_handles(libzfs_handle_t *hdl) -{ - zpool_handle_t *next, *zph = hdl->libzfs_pool_handles; - - while (zph != NULL) { - next = zph->zpool_next; - zpool_close(zph); - zph = next; - } - hdl->libzfs_pool_handles = NULL; -} - -/* - * Utility function to gather stats (objset and zpl) for the given object. - */ -static int -get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - - (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); - - while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) { - if (errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, zc) != 0) { - return (-1); - } - } else { - return (-1); - } - } - return (0); -} - -/* - * Utility function to get the received properties of the given object. 
- */ -static int -get_recvd_props_ioctl(zfs_handle_t *zhp) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - nvlist_t *recvdprops; - zfs_cmd_t zc = { 0 }; - int err; - - if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) - return (-1); - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { - if (errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - return (-1); - } - } else { - zcmd_free_nvlists(&zc); - return (-1); - } - } - - err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); - zcmd_free_nvlists(&zc); - if (err != 0) - return (-1); - - nvlist_free(zhp->zfs_recvd_props); - zhp->zfs_recvd_props = recvdprops; - - return (0); -} - -static int -put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) -{ - nvlist_t *allprops, *userprops; - - zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ - - if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { - return (-1); - } - - /* - * XXX Why do we store the user props separately, in addition to - * storing them in zfs_props? - */ - if ((userprops = process_user_props(zhp, allprops)) == NULL) { - nvlist_free(allprops); - return (-1); - } - - nvlist_free(zhp->zfs_props); - nvlist_free(zhp->zfs_user_props); - - zhp->zfs_props = allprops; - zhp->zfs_user_props = userprops; - - return (0); -} - -static int -get_stats(zfs_handle_t *zhp) -{ - int rc = 0; - zfs_cmd_t zc = { 0 }; - - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - if (get_stats_ioctl(zhp, &zc) != 0) - rc = -1; - else if (put_stats_zhdl(zhp, &zc) != 0) - rc = -1; - zcmd_free_nvlists(&zc); - return (rc); -} - -/* - * Refresh the properties currently stored in the handle. - */ -void -zfs_refresh_properties(zfs_handle_t *zhp) -{ - (void) get_stats(zhp); -} - -/* - * Makes a handle from the given dataset name. Used by zfs_open() and - * zfs_iter_* to create child handles on the fly. 
- */ -static int -make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) -{ - if (put_stats_zhdl(zhp, zc) != 0) - return (-1); - - /* - * We've managed to open the dataset and gather statistics. Determine - * the high-level type. - */ - if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) - zhp->zfs_head_type = ZFS_TYPE_VOLUME; - else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) - zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM; - else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) - return (-1); - else - abort(); - - if (zhp->zfs_dmustats.dds_is_snapshot) - zhp->zfs_type = ZFS_TYPE_SNAPSHOT; - else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) - zhp->zfs_type = ZFS_TYPE_VOLUME; - else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) - zhp->zfs_type = ZFS_TYPE_FILESYSTEM; - else - abort(); /* we should never see any other types */ - - if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) - return (-1); - - return (0); -} - -zfs_handle_t * -make_dataset_handle(libzfs_handle_t *hdl, const char *path) -{ - zfs_cmd_t zc = { 0 }; - - zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); - - if (zhp == NULL) - return (NULL); - - zhp->zfs_hdl = hdl; - (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); - if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { - free(zhp); - return (NULL); - } - if (get_stats_ioctl(zhp, &zc) == -1) { - zcmd_free_nvlists(&zc); - free(zhp); - return (NULL); - } - if (make_dataset_handle_common(zhp, &zc) == -1) { - free(zhp); - zhp = NULL; - } - zcmd_free_nvlists(&zc); - return (zhp); -} - -zfs_handle_t * -make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) -{ - zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); - - if (zhp == NULL) - return (NULL); - - zhp->zfs_hdl = hdl; - (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); - if (make_dataset_handle_common(zhp, zc) == -1) { - free(zhp); - return (NULL); - } - return (zhp); -} - -zfs_handle_t * -make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) -{ - 
zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); - - if (zhp == NULL) - return (NULL); - - zhp->zfs_hdl = pzhp->zfs_hdl; - (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); - zhp->zfs_head_type = pzhp->zfs_type; - zhp->zfs_type = ZFS_TYPE_SNAPSHOT; - zhp->zpool_hdl = zpool_handle(zhp); - return (zhp); -} - -zfs_handle_t * -zfs_handle_dup(zfs_handle_t *zhp_orig) -{ - zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); - - if (zhp == NULL) - return (NULL); - - zhp->zfs_hdl = zhp_orig->zfs_hdl; - zhp->zpool_hdl = zhp_orig->zpool_hdl; - (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, - sizeof (zhp->zfs_name)); - zhp->zfs_type = zhp_orig->zfs_type; - zhp->zfs_head_type = zhp_orig->zfs_head_type; - zhp->zfs_dmustats = zhp_orig->zfs_dmustats; - if (zhp_orig->zfs_props != NULL) { - if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { - (void) no_memory(zhp->zfs_hdl); - zfs_close(zhp); - return (NULL); - } - } - if (zhp_orig->zfs_user_props != NULL) { - if (nvlist_dup(zhp_orig->zfs_user_props, - &zhp->zfs_user_props, 0) != 0) { - (void) no_memory(zhp->zfs_hdl); - zfs_close(zhp); - return (NULL); - } - } - if (zhp_orig->zfs_recvd_props != NULL) { - if (nvlist_dup(zhp_orig->zfs_recvd_props, - &zhp->zfs_recvd_props, 0)) { - (void) no_memory(zhp->zfs_hdl); - zfs_close(zhp); - return (NULL); - } - } - zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; - if (zhp_orig->zfs_mntopts != NULL) { - zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, - zhp_orig->zfs_mntopts); - } - zhp->zfs_props_table = zhp_orig->zfs_props_table; - return (zhp); -} - -boolean_t -zfs_bookmark_exists(const char *path) -{ - nvlist_t *bmarks; - nvlist_t *props; - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - char *bmark_name; - char *pound; - int err; - boolean_t rv; - - - (void) strlcpy(fsname, path, sizeof (fsname)); - pound = strchr(fsname, '#'); - if (pound == NULL) - return (B_FALSE); - - *pound = '\0'; - bmark_name = pound + 1; - props = fnvlist_alloc(); - err = 
lzc_get_bookmarks(fsname, props, &bmarks); - nvlist_free(props); - if (err != 0) { - nvlist_free(bmarks); - return (B_FALSE); - } - - rv = nvlist_exists(bmarks, bmark_name); - nvlist_free(bmarks); - return (rv); -} - -zfs_handle_t * -make_bookmark_handle(zfs_handle_t *parent, const char *path, - nvlist_t *bmark_props) -{ - zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); - - if (zhp == NULL) - return (NULL); - - /* Fill in the name. */ - zhp->zfs_hdl = parent->zfs_hdl; - (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); - - /* Set the property lists. */ - if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { - free(zhp); - return (NULL); - } - - /* Set the types. */ - zhp->zfs_head_type = parent->zfs_head_type; - zhp->zfs_type = ZFS_TYPE_BOOKMARK; - - if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { - nvlist_free(zhp->zfs_props); - free(zhp); - return (NULL); - } - - return (zhp); -} - -struct zfs_open_bookmarks_cb_data { - const char *path; - zfs_handle_t *zhp; -}; - -static int -zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data) -{ - struct zfs_open_bookmarks_cb_data *dp = data; - - /* - * Is it the one we are looking for? - */ - if (strcmp(dp->path, zfs_get_name(zhp)) == 0) { - /* - * We found it. Save it and let the caller know we are done. - */ - dp->zhp = zhp; - return (EEXIST); - } - - /* - * Not found. Close the handle and ask for another one. - */ - zfs_close(zhp); - return (0); -} - -/* - * Opens the given snapshot, bookmark, filesystem, or volume. The 'types' - * argument is a mask of acceptable types. The function will print an - * appropriate error message and return NULL if it can't be opened. - */ -zfs_handle_t * -zfs_open(libzfs_handle_t *hdl, const char *path, int types) -{ - zfs_handle_t *zhp; - char errbuf[1024]; - char *bookp; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); - - /* - * Validate the name before we even try to open it. 
- */ - if (!zfs_validate_name(hdl, path, types, B_FALSE)) { - (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); - return (NULL); - } - - /* - * Bookmarks needs to be handled separately. - */ - bookp = strchr(path, '#'); - if (bookp == NULL) { - /* - * Try to get stats for the dataset, which will tell us if it - * exists. - */ - errno = 0; - if ((zhp = make_dataset_handle(hdl, path)) == NULL) { - (void) zfs_standard_error(hdl, errno, errbuf); - return (NULL); - } - } else { - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - zfs_handle_t *pzhp; - struct zfs_open_bookmarks_cb_data cb_data = {path, NULL}; - - /* - * We need to cut out '#' and everything after '#' - * to get the parent dataset name only. - */ - assert(bookp - path < sizeof (dsname)); - (void) strncpy(dsname, path, bookp - path); - dsname[bookp - path] = '\0'; - - /* - * Create handle for the parent dataset. - */ - errno = 0; - if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) { - (void) zfs_standard_error(hdl, errno, errbuf); - return (NULL); - } - - /* - * Iterate bookmarks to find the right one. - */ - errno = 0; - if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb, - &cb_data) == 0) && (cb_data.zhp == NULL)) { - (void) zfs_error(hdl, EZFS_NOENT, errbuf); - zfs_close(pzhp); - return (NULL); - } - if (cb_data.zhp == NULL) { - (void) zfs_standard_error(hdl, errno, errbuf); - zfs_close(pzhp); - return (NULL); - } - zhp = cb_data.zhp; - - /* - * Cleanup. 
- */ - zfs_close(pzhp); - } - - if (zhp == NULL) { - char *at = strchr(path, '@'); - - if (at != NULL) - *at = '\0'; - errno = 0; - if ((zhp = make_dataset_handle(hdl, path)) == NULL) { - (void) zfs_standard_error(hdl, errno, errbuf); - return (NULL); - } - if (at != NULL) - *at = '@'; - (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); - zhp->zfs_type = ZFS_TYPE_SNAPSHOT; - } - - if (!(types & zhp->zfs_type)) { - (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); - zfs_close(zhp); - return (NULL); - } - - return (zhp); -} - -/* - * Release a ZFS handle. Nothing to do but free the associated memory. - */ -void -zfs_close(zfs_handle_t *zhp) -{ - if (zhp->zfs_mntopts) - free(zhp->zfs_mntopts); - nvlist_free(zhp->zfs_props); - nvlist_free(zhp->zfs_user_props); - nvlist_free(zhp->zfs_recvd_props); - free(zhp); -} - -typedef struct mnttab_node { - struct mnttab mtn_mt; - avl_node_t mtn_node; -} mnttab_node_t; - -static int -libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) -{ - const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1; - const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2; - int rv; - - rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); - - return (AVL_ISIGN(rv)); -} - -void -libzfs_mnttab_init(libzfs_handle_t *hdl) -{ - pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); - assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); - avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, - sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); -} - -void -libzfs_mnttab_update(libzfs_handle_t *hdl) -{ - struct mnttab entry; - - rewind(hdl->libzfs_mnttab); - while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { - mnttab_node_t *mtn; - - if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) - continue; - mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); - mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); - mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); - mtn->mtn_mt.mnt_fstype = 
zfs_strdup(hdl, entry.mnt_fstype); - mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); - avl_add(&hdl->libzfs_mnttab_cache, mtn); - } -} - -void -libzfs_mnttab_fini(libzfs_handle_t *hdl) -{ - void *cookie = NULL; - mnttab_node_t *mtn; - - while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) - != NULL) { - free(mtn->mtn_mt.mnt_special); - free(mtn->mtn_mt.mnt_mountp); - free(mtn->mtn_mt.mnt_fstype); - free(mtn->mtn_mt.mnt_mntopts); - free(mtn); - } - avl_destroy(&hdl->libzfs_mnttab_cache); - (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); -} - -void -libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) -{ - hdl->libzfs_mnttab_enable = enable; -} - -int -libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, - struct mnttab *entry) -{ - mnttab_node_t find; - mnttab_node_t *mtn; - int ret = ENOENT; - - if (!hdl->libzfs_mnttab_enable) { - struct mnttab srch = { 0 }; - - if (avl_numnodes(&hdl->libzfs_mnttab_cache)) - libzfs_mnttab_fini(hdl); - rewind(hdl->libzfs_mnttab); - srch.mnt_special = (char *)fsname; - srch.mnt_fstype = MNTTYPE_ZFS; - if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) - return (0); - else - return (ENOENT); - } - - pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); - if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) - libzfs_mnttab_update(hdl); - - find.mtn_mt.mnt_special = (char *)fsname; - mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); - if (mtn) { - *entry = mtn->mtn_mt; - ret = 0; - } - pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); - return (ret); -} - -void -libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, - const char *mountp, const char *mntopts) -{ - mnttab_node_t *mtn; - - pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); - if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) { - mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); - mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); - mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); - 
mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); - mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); - avl_add(&hdl->libzfs_mnttab_cache, mtn); - } - pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); -} - -void -libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) -{ - mnttab_node_t find; - mnttab_node_t *ret; - - pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); - find.mtn_mt.mnt_special = (char *)fsname; - if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) - != NULL) { - avl_remove(&hdl->libzfs_mnttab_cache, ret); - free(ret->mtn_mt.mnt_special); - free(ret->mtn_mt.mnt_mountp); - free(ret->mtn_mt.mnt_fstype); - free(ret->mtn_mt.mnt_mntopts); - free(ret); - } - pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); -} - -int -zfs_spa_version(zfs_handle_t *zhp, int *spa_version) -{ - zpool_handle_t *zpool_handle = zhp->zpool_hdl; - - if (zpool_handle == NULL) - return (-1); - - *spa_version = zpool_get_prop_int(zpool_handle, - ZPOOL_PROP_VERSION, NULL); - return (0); -} - -/* - * The choice of reservation property depends on the SPA version. - */ -static int -zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) -{ - int spa_version; - - if (zfs_spa_version(zhp, &spa_version) < 0) - return (-1); - - if (spa_version >= SPA_VERSION_REFRESERVATION) - *resv_prop = ZFS_PROP_REFRESERVATION; - else - *resv_prop = ZFS_PROP_RESERVATION; - - return (0); -} - -/* - * Given an nvlist of properties to set, validates that they are correct, and - * parses any numeric properties (index, boolean, etc) if they are specified as - * strings. 
- */ -nvlist_t * -zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, - uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, - const char *errbuf) -{ - nvpair_t *elem; - uint64_t intval; - char *strval; - zfs_prop_t prop; - nvlist_t *ret; - int chosen_normal = -1; - int chosen_utf = -1; - - if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { - (void) no_memory(hdl); - return (NULL); - } - - /* - * Make sure this property is valid and applies to this type. - */ - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - - prop = zfs_name_to_prop(propname); - if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { - /* - * This is a user property: make sure it's a - * string, and that it's less than ZAP_MAXNAMELEN. - */ - if (nvpair_type(elem) != DATA_TYPE_STRING) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be a string"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property name '%s' is too long"), - propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - (void) nvpair_value_string(elem, &strval); - if (nvlist_add_string(ret, propname, strval) != 0) { - (void) no_memory(hdl); - goto error; - } - continue; - } - - /* - * Currently, only user properties can be modified on - * snapshots. 
- */ - if (type == ZFS_TYPE_SNAPSHOT) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "this property can not be modified for snapshots")); - (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); - goto error; - } - - if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { - zfs_userquota_prop_t uqtype; - char newpropname[128]; - char domain[128]; - uint64_t rid; - uint64_t valary[3]; - - if (userquota_propname_decode(propname, zoned, - &uqtype, domain, sizeof (domain), &rid) != 0) { - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, - "'%s' has an invalid user/group name"), - propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (uqtype != ZFS_PROP_USERQUOTA && - uqtype != ZFS_PROP_GROUPQUOTA) { - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "'%s' is readonly"), - propname); - (void) zfs_error(hdl, EZFS_PROPREADONLY, - errbuf); - goto error; - } - - if (nvpair_type(elem) == DATA_TYPE_STRING) { - (void) nvpair_value_string(elem, &strval); - if (strcmp(strval, "none") == 0) { - intval = 0; - } else if (zfs_nicestrtonum(hdl, - strval, &intval) != 0) { - (void) zfs_error(hdl, - EZFS_BADPROP, errbuf); - goto error; - } - } else if (nvpair_type(elem) == - DATA_TYPE_UINT64) { - (void) nvpair_value_uint64(elem, &intval); - if (intval == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "use 'none' to disable " - "userquota/groupquota")); - goto error; - } - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be a number"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - /* - * Encode the prop name as - * userquota@-domain, to make it easy - * for the kernel to decode. 
- */ - (void) snprintf(newpropname, sizeof (newpropname), - "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], - (longlong_t)rid, domain); - valary[0] = uqtype; - valary[1] = rid; - valary[2] = intval; - if (nvlist_add_uint64_array(ret, newpropname, - valary, 3) != 0) { - (void) no_memory(hdl); - goto error; - } - continue; - } else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' is readonly"), - propname); - (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); - goto error; - } - - if (prop == ZPROP_INVAL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property '%s'"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (!zfs_prop_valid_for_type(prop, type)) { - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "'%s' does not " - "apply to datasets of this type"), propname); - (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); - goto error; - } - - if (zfs_prop_readonly(prop) && - (!zfs_prop_setonce(prop) || zhp != NULL)) { - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "'%s' is readonly"), - propname); - (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); - goto error; - } - - if (zprop_parse_value(hdl, elem, prop, type, ret, - &strval, &intval, errbuf) != 0) - goto error; - - /* - * Perform some additional checks for specific properties. 
- */ - switch (prop) { - case ZFS_PROP_VERSION: - { - int version; - - if (zhp == NULL) - break; - version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - if (intval < version) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Can not downgrade; already at version %u"), - version); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - } - - case ZFS_PROP_VOLBLOCKSIZE: - case ZFS_PROP_RECORDSIZE: - { - int maxbs = SPA_MAXBLOCKSIZE; - if (zpool_hdl != NULL) { - maxbs = zpool_get_prop_int(zpool_hdl, - ZPOOL_PROP_MAXBLOCKSIZE, NULL); - } - /* - * Volumes are limited to a volblocksize of 128KB, - * because they typically service workloads with - * small random writes, which incur a large performance - * penalty with large blocks. - */ - if (prop == ZFS_PROP_VOLBLOCKSIZE) - maxbs = SPA_OLD_MAXBLOCKSIZE; - /* - * The value must be a power of two between - * SPA_MINBLOCKSIZE and maxbs. - */ - if (intval < SPA_MINBLOCKSIZE || - intval > maxbs || !ISP2(intval)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be power of 2 from 512B " - "to %uKB"), propname, maxbs >> 10); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - } - - case ZFS_PROP_SPECIAL_SMALL_BLOCKS: - if (zpool_hdl != NULL) { - char state[64] = ""; - - /* - * Issue a warning but do not fail so that - * tests for setable properties succeed. 
- */ - if (zpool_prop_get_feature(zpool_hdl, - "feature@allocation_classes", state, - sizeof (state)) != 0 || - strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { - (void) fprintf(stderr, gettext( - "%s: property requires a special " - "device in the pool\n"), propname); - } - } - if (intval != 0 && - (intval < SPA_MINBLOCKSIZE || - intval > SPA_OLD_MAXBLOCKSIZE || !ISP2(intval))) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid '%s=%d' property: must be zero or " - "a power of 2 from 512B to 128K"), propname, - intval); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - - case ZFS_PROP_MLSLABEL: - { -#ifdef illumos - /* - * Verify the mlslabel string and convert to - * internal hex label string. - */ - - m_label_t *new_sl; - char *hex = NULL; /* internal label string */ - - /* Default value is already OK. */ - if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) - break; - - /* Verify the label can be converted to binary form */ - if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || - (str_to_label(strval, &new_sl, MAC_LABEL, - L_NO_CORRECTION, NULL) == -1)) { - goto badlabel; - } - - /* Now translate to hex internal label string */ - if (label_to_str(new_sl, &hex, M_INTERNAL, - DEF_NAMES) != 0) { - if (hex) - free(hex); - goto badlabel; - } - m_label_free(new_sl); - - /* If string is already in internal form, we're done. */ - if (strcmp(strval, hex) == 0) { - free(hex); - break; - } - - /* Replace the label string with the internal form. 
*/ - (void) nvlist_remove(ret, zfs_prop_to_name(prop), - DATA_TYPE_STRING); - verify(nvlist_add_string(ret, zfs_prop_to_name(prop), - hex) == 0); - free(hex); - - break; - -badlabel: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid mlslabel '%s'"), strval); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - m_label_free(new_sl); /* OK if null */ -#else /* !illumos */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "mlslabel is not supported on FreeBSD")); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); -#endif /* illumos */ - goto error; - - } - - case ZFS_PROP_MOUNTPOINT: - { - namecheck_err_t why; - - if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || - strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) - break; - - if (mountpoint_namecheck(strval, &why)) { - switch (why) { - case NAME_ERR_LEADING_SLASH: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, - "'%s' must be an absolute path, " - "'none', or 'legacy'"), propname); - break; - case NAME_ERR_TOOLONG: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, - "component of '%s' is too long"), - propname); - break; - - default: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, - "(%d) not defined"), - why); - break; - } - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - } - - /*FALLTHRU*/ - - case ZFS_PROP_SHARESMB: - case ZFS_PROP_SHARENFS: - /* - * For the mountpoint and sharenfs or sharesmb - * properties, check if it can be set in a - * global/non-global zone based on - * the zoned property value: - * - * global zone non-global zone - * -------------------------------------------------- - * zoned=on mountpoint (no) mountpoint (yes) - * sharenfs (no) sharenfs (no) - * sharesmb (no) sharesmb (no) - * - * zoned=off mountpoint (yes) N/A - * sharenfs (yes) - * sharesmb (yes) - */ - if (zoned) { - if (getzoneid() == GLOBAL_ZONEID) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' cannot be set on " - "dataset in a non-global zone"), - propname); - (void) zfs_error(hdl, EZFS_ZONED, - errbuf); - goto error; - } else if (prop 
== ZFS_PROP_SHARENFS || - prop == ZFS_PROP_SHARESMB) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' cannot be set in " - "a non-global zone"), propname); - (void) zfs_error(hdl, EZFS_ZONED, - errbuf); - goto error; - } - } else if (getzoneid() != GLOBAL_ZONEID) { - /* - * If zoned property is 'off', this must be in - * a global zone. If not, something is wrong. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' cannot be set while dataset " - "'zoned' property is set"), propname); - (void) zfs_error(hdl, EZFS_ZONED, errbuf); - goto error; - } - - /* - * At this point, it is legitimate to set the - * property. Now we want to make sure that the - * property value is valid if it is sharenfs. - */ - if ((prop == ZFS_PROP_SHARENFS || - prop == ZFS_PROP_SHARESMB) && - strcmp(strval, "on") != 0 && - strcmp(strval, "off") != 0) { - zfs_share_proto_t proto; - - if (prop == ZFS_PROP_SHARESMB) - proto = PROTO_SMB; - else - proto = PROTO_NFS; - - /* - * Must be an valid sharing protocol - * option string so init the libshare - * in order to enable the parser and - * then parse the options. We use the - * control API since we don't care about - * the current configuration and don't - * want the overhead of loading it - * until we actually do something. - */ - - if (zfs_init_libshare(hdl, - SA_INIT_CONTROL_API) != SA_OK) { - /* - * An error occurred so we can't do - * anything - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' cannot be set: problem " - "in share initialization"), - propname); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - goto error; - } - - if (zfs_parse_options(strval, proto) != SA_OK) { - /* - * There was an error in parsing so - * deal with it by issuing an error - * message and leaving after - * uninitializing the the libshare - * interface. 
- */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' cannot be set to invalid " - "options"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - zfs_uninit_libshare(hdl); - goto error; - } - zfs_uninit_libshare(hdl); - } - - break; - - case ZFS_PROP_UTF8ONLY: - chosen_utf = (int)intval; - break; - - case ZFS_PROP_NORMALIZE: - chosen_normal = (int)intval; - break; - - default: - break; - } - - /* - * For changes to existing volumes, we have some additional - * checks to enforce. - */ - if (type == ZFS_TYPE_VOLUME && zhp != NULL) { - uint64_t volsize = zfs_prop_get_int(zhp, - ZFS_PROP_VOLSIZE); - uint64_t blocksize = zfs_prop_get_int(zhp, - ZFS_PROP_VOLBLOCKSIZE); - char buf[64]; - - switch (prop) { - case ZFS_PROP_RESERVATION: - if (intval > volsize) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' is greater than current " - "volume size"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - goto error; - } - break; - - case ZFS_PROP_REFRESERVATION: - if (intval > volsize && intval != UINT64_MAX) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' is greater than current " - "volume size"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - goto error; - } - break; - - case ZFS_PROP_VOLSIZE: - if (intval % blocksize != 0) { - zfs_nicenum(blocksize, buf, - sizeof (buf)); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be a multiple of " - "volume block size (%s)"), - propname, buf); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - goto error; - } - - if (intval == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' cannot be zero"), - propname); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - goto error; - } - break; - - default: - break; - } - } - } - - /* - * If normalization was chosen, but no UTF8 choice was made, - * enforce rejection of non-UTF8 names. - * - * If normalization was chosen, but rejecting non-UTF8 names - * was explicitly not chosen, it is an error. 
- */ - if (chosen_normal > 0 && chosen_utf < 0) { - if (nvlist_add_uint64(ret, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { - (void) no_memory(hdl); - goto error; - } - } else if (chosen_normal > 0 && chosen_utf == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be set 'on' if normalization chosen"), - zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - return (ret); - -error: - nvlist_free(ret); - return (NULL); -} - -int -zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) -{ - uint64_t old_volsize; - uint64_t new_volsize; - uint64_t old_reservation; - uint64_t new_reservation; - zfs_prop_t resv_prop; - nvlist_t *props; - - /* - * If this is an existing volume, and someone is setting the volsize, - * make sure that it matches the reservation, or add it if necessary. - */ - old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); - if (zfs_which_resv_prop(zhp, &resv_prop) < 0) - return (-1); - old_reservation = zfs_prop_get_int(zhp, resv_prop); - - props = fnvlist_alloc(); - fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); - - if ((zvol_volsize_to_reservation(old_volsize, props) != - old_reservation) || nvlist_exists(nvl, - zfs_prop_to_name(resv_prop))) { - fnvlist_free(props); - return (0); - } - if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), - &new_volsize) != 0) { - fnvlist_free(props); - return (-1); - } - new_reservation = zvol_volsize_to_reservation(new_volsize, props); - fnvlist_free(props); - - if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), - new_reservation) != 0) { - (void) no_memory(zhp->zfs_hdl); - return (-1); - } - return (1); -} - -/* - * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after - * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinal value. - * Return codes must match zfs_add_synthetic_resv(). 
- */ -static int -zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) -{ - uint64_t volsize; - uint64_t resvsize; - zfs_prop_t prop; - nvlist_t *props; - - if (!ZFS_IS_VOLUME(zhp)) { - return (0); - } - - if (zfs_which_resv_prop(zhp, &prop) != 0) { - return (-1); - } - - if (prop != ZFS_PROP_REFRESERVATION) { - return (0); - } - - if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) { - /* No value being set, so it can't be "auto" */ - return (0); - } - if (resvsize != UINT64_MAX) { - /* Being set to a value other than "auto" */ - return (0); - } - - props = fnvlist_alloc(); - - fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); - - if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), - &volsize) != 0) { - volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); - } - - resvsize = zvol_volsize_to_reservation(volsize, props); - fnvlist_free(props); - - (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); - if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) { - (void) no_memory(zhp->zfs_hdl); - return (-1); - } - return (1); -} - -void -zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, - char *errbuf) -{ - switch (err) { - - case ENOSPC: - /* - * For quotas and reservations, ENOSPC indicates - * something different; setting a quota or reservation - * doesn't use any disk space. 
- */ - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_REFQUOTA: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is less than current used or " - "reserved space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFRESERVATION: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is greater than available space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - default: - (void) zfs_standard_error(hdl, err, errbuf); - break; - } - break; - - case EBUSY: - (void) zfs_standard_error(hdl, EBUSY, errbuf); - break; - - case EROFS: - (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); - break; - - case E2BIG: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property value too long")); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - break; - - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool and or dataset must be upgraded to set this " - "property or value")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - - case ERANGE: - case EDOM: - if (prop == ZFS_PROP_COMPRESSION || - prop == ZFS_PROP_RECORDSIZE) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property setting is not allowed on " - "bootable datasets")); - (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); - } else if (prop == ZFS_PROP_CHECKSUM || - prop == ZFS_PROP_DEDUP) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property setting is not allowed on " - "root pools")); - (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); - } else { - (void) zfs_standard_error(hdl, err, errbuf); - } - break; - - case EINVAL: - if (prop == ZPROP_INVAL) { - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - } else { - (void) zfs_standard_error(hdl, err, errbuf); - } - break; - - case EOVERFLOW: - /* - * This platform can't address a volume this big. 
- */ -#ifdef _ILP32 - if (prop == ZFS_PROP_VOLSIZE) { - (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); - break; - } -#endif - /* FALLTHROUGH */ - default: - (void) zfs_standard_error(hdl, err, errbuf); - } -} - -/* - * Given a property name and value, set the property for the given dataset. - */ -int -zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) -{ - int ret = -1; - char errbuf[1024]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - nvlist_t *nvl = NULL; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), - zhp->zfs_name); - - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || - nvlist_add_string(nvl, propname, propval) != 0) { - (void) no_memory(hdl); - goto error; - } - - ret = zfs_prop_set_list(zhp, nvl); - -error: - nvlist_free(nvl); - return (ret); -} - - - -/* - * Given an nvlist of property names and values, set the properties for the - * given dataset. - */ -int -zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) -{ - zfs_cmd_t zc = { 0 }; - int ret = -1; - prop_changelist_t **cls = NULL; - int cl_idx; - char errbuf[1024]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - nvlist_t *nvl; - int nvl_len; - int added_resv = 0; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), - zhp->zfs_name); - - if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, - zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, - errbuf)) == NULL) - goto error; - - /* - * We have to check for any extra properties which need to be added - * before computing the length of the nvlist. 
- */ - for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL); - elem != NULL; - elem = nvlist_next_nvpair(nvl, elem)) { - if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && - (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { - goto error; - } - } - - if (added_resv != 1 && - (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) { - goto error; - } - - /* - * Check how many properties we're setting and allocate an array to - * store changelist pointers for postfix(). - */ - nvl_len = 0; - for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL); - elem != NULL; - elem = nvlist_next_nvpair(nvl, elem)) - nvl_len++; - if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) - goto error; - - cl_idx = 0; - for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL); - elem != NULL; - elem = nvlist_next_nvpair(nvl, elem)) { - - zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem)); - - assert(cl_idx < nvl_len); - /* - * We don't want to unmount & remount the dataset when changing - * its canmount property to 'on' or 'noauto'. We only use - * the changelist logic to unmount when setting canmount=off. - */ - if (prop != ZFS_PROP_CANMOUNT || - (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && - zfs_is_mounted(zhp, NULL))) { - cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); - if (cls[cl_idx] == NULL) - goto error; - } - - if (prop == ZFS_PROP_MOUNTPOINT && - changelist_haszonedchild(cls[cl_idx])) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "child dataset with inherited mountpoint is used " - "in a non-global zone")); - ret = zfs_error(hdl, EZFS_ZONED, errbuf); - goto error; - } - - /* We don't support those properties on FreeBSD. 
*/ - switch (prop) { - case ZFS_PROP_DEVICES: - case ZFS_PROP_ISCSIOPTIONS: - case ZFS_PROP_XATTR: - case ZFS_PROP_VSCAN: - case ZFS_PROP_NBMAND: - case ZFS_PROP_MLSLABEL: - (void) snprintf(errbuf, sizeof (errbuf), - "property '%s' not supported on FreeBSD", - nvpair_name(elem)); - ret = zfs_error(hdl, EZFS_PERM, errbuf); - goto error; - } - - if (cls[cl_idx] != NULL && - (ret = changelist_prefix(cls[cl_idx])) != 0) - goto error; - - cl_idx++; - } - assert(cl_idx == nvl_len); - - /* - * Execute the corresponding ioctl() to set this list of properties. - */ - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 || - (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0) - goto error; - - ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); - - if (ret != 0) { - if (zc.zc_nvlist_dst_filled == B_FALSE) { - (void) zfs_standard_error(hdl, errno, errbuf); - goto error; - } - - /* Get the list of unset properties back and report them. */ - nvlist_t *errorprops = NULL; - if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) - goto error; - for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL); - elem != NULL; - elem = nvlist_next_nvpair(errorprops, elem)) { - zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem)); - zfs_setprop_error(hdl, prop, errno, errbuf); - } - nvlist_free(errorprops); - - if (added_resv && errno == ENOSPC) { - /* clean up the volsize property we tried to set */ - uint64_t old_volsize = zfs_prop_get_int(zhp, - ZFS_PROP_VOLSIZE); - nvlist_free(nvl); - nvl = NULL; - zcmd_free_nvlists(&zc); - - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) - goto error; - if (nvlist_add_uint64(nvl, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), - old_volsize) != 0) - goto error; - if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) - goto error; - (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); - } - } else { - for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { - if (cls[cl_idx] != NULL) { - int clp_err = 
changelist_postfix(cls[cl_idx]); - if (clp_err != 0) - ret = clp_err; - } - } - - /* - * Refresh the statistics so the new property value - * is reflected. - */ - if (ret == 0) - (void) get_stats(zhp); - } - -error: - nvlist_free(nvl); - zcmd_free_nvlists(&zc); - if (cls != NULL) { - for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { - if (cls[cl_idx] != NULL) - changelist_free(cls[cl_idx]); - } - free(cls); - } - return (ret); -} - -/* - * Given a property, inherit the value from the parent dataset, or if received - * is TRUE, revert to the received value, if any. - */ -int -zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) -{ - zfs_cmd_t zc = { 0 }; - int ret; - prop_changelist_t *cl; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char errbuf[1024]; - zfs_prop_t prop; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot inherit %s for '%s'"), propname, zhp->zfs_name); - - zc.zc_cookie = received; - if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { - /* - * For user properties, the amount of work we have to do is very - * small, so just do it here. - */ - if (!zfs_prop_user(propname)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property")); - return (zfs_error(hdl, EZFS_BADPROP, errbuf)); - } - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); - - if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) - return (zfs_standard_error(hdl, errno, errbuf)); - - return (0); - } - - /* - * Verify that this property is inheritable. 
- */ - if (zfs_prop_readonly(prop)) - return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); - - if (!zfs_prop_inheritable(prop) && !received) - return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); - - /* - * Check to see if the value applies to this type - */ - if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) - return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); - - /* - * Normalize the name, to get rid of shorthand abbreviations. - */ - propname = zfs_prop_to_name(prop); - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); - - if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && - zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset is used in a non-global zone")); - return (zfs_error(hdl, EZFS_ZONED, errbuf)); - } - - /* - * Determine datasets which will be affected by this change, if any. - */ - if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) - return (-1); - - if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "child dataset with inherited mountpoint is used " - "in a non-global zone")); - ret = zfs_error(hdl, EZFS_ZONED, errbuf); - goto error; - } - - if ((ret = changelist_prefix(cl)) != 0) - goto error; - - if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) { - return (zfs_standard_error(hdl, errno, errbuf)); - } else { - - if ((ret = changelist_postfix(cl)) != 0) - goto error; - - /* - * Refresh the statistics so the new property is reflected. - */ - (void) get_stats(zhp); - } - -error: - changelist_free(cl); - return (ret); -} - -/* - * True DSL properties are stored in an nvlist. The following two functions - * extract them appropriately. 
- */ -static uint64_t -getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) -{ - nvlist_t *nv; - uint64_t value; - - *source = NULL; - if (nvlist_lookup_nvlist(zhp->zfs_props, - zfs_prop_to_name(prop), &nv) == 0) { - verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); - (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); - } else { - verify(!zhp->zfs_props_table || - zhp->zfs_props_table[prop] == B_TRUE); - value = zfs_prop_default_numeric(prop); - *source = ""; - } - - return (value); -} - -static const char * -getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) -{ - nvlist_t *nv; - const char *value; - - *source = NULL; - if (nvlist_lookup_nvlist(zhp->zfs_props, - zfs_prop_to_name(prop), &nv) == 0) { - value = fnvlist_lookup_string(nv, ZPROP_VALUE); - (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); - } else { - verify(!zhp->zfs_props_table || - zhp->zfs_props_table[prop] == B_TRUE); - value = zfs_prop_default_string(prop); - *source = ""; - } - - return (value); -} - -static boolean_t -zfs_is_recvd_props_mode(zfs_handle_t *zhp) -{ - return (zhp->zfs_props == zhp->zfs_recvd_props); -} - -static void -zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) -{ - *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; - zhp->zfs_props = zhp->zfs_recvd_props; -} - -static void -zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) -{ - zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; - *cookie = 0; -} - -/* - * Internal function for getting a numeric property. Both zfs_prop_get() and - * zfs_prop_get_int() are built using this interface. - * - * Certain properties can be overridden using 'mount -o'. In this case, scan - * the contents of the /etc/mnttab entry, searching for the appropriate options. - * If they differ from the on-disk values, report the current values and mark - * the source "temporary". 
- */ -static int -get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - char **source, uint64_t *val) -{ - zfs_cmd_t zc = { 0 }; - nvlist_t *zplprops = NULL; - struct mnttab mnt; - char *mntopt_on = NULL; - char *mntopt_off = NULL; - boolean_t received = zfs_is_recvd_props_mode(zhp); - - *source = NULL; - - switch (prop) { - case ZFS_PROP_ATIME: - mntopt_on = MNTOPT_ATIME; - mntopt_off = MNTOPT_NOATIME; - break; - - case ZFS_PROP_DEVICES: - mntopt_on = MNTOPT_DEVICES; - mntopt_off = MNTOPT_NODEVICES; - break; - - case ZFS_PROP_EXEC: - mntopt_on = MNTOPT_EXEC; - mntopt_off = MNTOPT_NOEXEC; - break; - - case ZFS_PROP_READONLY: - mntopt_on = MNTOPT_RO; - mntopt_off = MNTOPT_RW; - break; - - case ZFS_PROP_SETUID: - mntopt_on = MNTOPT_SETUID; - mntopt_off = MNTOPT_NOSETUID; - break; - - case ZFS_PROP_XATTR: - mntopt_on = MNTOPT_XATTR; - mntopt_off = MNTOPT_NOXATTR; - break; - - case ZFS_PROP_NBMAND: - mntopt_on = MNTOPT_NBMAND; - mntopt_off = MNTOPT_NONBMAND; - break; - - default: - break; - } - - /* - * Because looking up the mount options is potentially expensive - * (iterating over all of /etc/mnttab), we defer its calculation until - * we're looking up a property which requires its presence. 
- */ - if (!zhp->zfs_mntcheck && - (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { - libzfs_handle_t *hdl = zhp->zfs_hdl; - struct mnttab entry; - - if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) { - zhp->zfs_mntopts = zfs_strdup(hdl, - entry.mnt_mntopts); - if (zhp->zfs_mntopts == NULL) - return (-1); - } - - zhp->zfs_mntcheck = B_TRUE; - } - - if (zhp->zfs_mntopts == NULL) - mnt.mnt_mntopts = ""; - else - mnt.mnt_mntopts = zhp->zfs_mntopts; - - switch (prop) { - case ZFS_PROP_ATIME: - case ZFS_PROP_DEVICES: - case ZFS_PROP_EXEC: - case ZFS_PROP_READONLY: - case ZFS_PROP_SETUID: - case ZFS_PROP_XATTR: - case ZFS_PROP_NBMAND: - *val = getprop_uint64(zhp, prop, source); - - if (received) - break; - - if (hasmntopt(&mnt, mntopt_on) && !*val) { - *val = B_TRUE; - if (src) - *src = ZPROP_SRC_TEMPORARY; - } else if (hasmntopt(&mnt, mntopt_off) && *val) { - *val = B_FALSE; - if (src) - *src = ZPROP_SRC_TEMPORARY; - } - break; - - case ZFS_PROP_CANMOUNT: - case ZFS_PROP_VOLSIZE: - case ZFS_PROP_QUOTA: - case ZFS_PROP_REFQUOTA: - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFRESERVATION: - case ZFS_PROP_FILESYSTEM_LIMIT: - case ZFS_PROP_SNAPSHOT_LIMIT: - case ZFS_PROP_FILESYSTEM_COUNT: - case ZFS_PROP_SNAPSHOT_COUNT: - *val = getprop_uint64(zhp, prop, source); - - if (*source == NULL) { - /* not default, must be local */ - *source = zhp->zfs_name; - } - break; - - case ZFS_PROP_MOUNTED: - *val = (zhp->zfs_mntopts != NULL); - break; - - case ZFS_PROP_NUMCLONES: - *val = zhp->zfs_dmustats.dds_num_clones; - break; - - case ZFS_PROP_VERSION: - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - case ZFS_PROP_CASE: - if (!zfs_prop_valid_for_type(prop, zhp->zfs_head_type) || - zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { - zcmd_free_nvlists(&zc); - return (-1); - } - if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, 
&zplprops) != 0 || - nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), - val) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - nvlist_free(zplprops); - zcmd_free_nvlists(&zc); - break; - - case ZFS_PROP_INCONSISTENT: - *val = zhp->zfs_dmustats.dds_inconsistent; - break; - - default: - switch (zfs_prop_get_type(prop)) { - case PROP_TYPE_NUMBER: - case PROP_TYPE_INDEX: - *val = getprop_uint64(zhp, prop, source); - /* - * If we tried to use a default value for a - * readonly property, it means that it was not - * present. Note this only applies to "truly" - * readonly properties, not set-once properties - * like volblocksize. - */ - if (zfs_prop_readonly(prop) && - !zfs_prop_setonce(prop) && - *source != NULL && (*source)[0] == '\0') { - *source = NULL; - return (-1); - } - break; - - case PROP_TYPE_STRING: - default: - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "cannot get non-numeric property")); - return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, - dgettext(TEXT_DOMAIN, "internal error"))); - } - } - - return (0); -} - -/* - * Calculate the source type, given the raw source string. 
- */ -static void -get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, - char *statbuf, size_t statlen) -{ - if (statbuf == NULL || *srctype == ZPROP_SRC_TEMPORARY) - return; - - if (source == NULL) { - *srctype = ZPROP_SRC_NONE; - } else if (source[0] == '\0') { - *srctype = ZPROP_SRC_DEFAULT; - } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { - *srctype = ZPROP_SRC_RECEIVED; - } else { - if (strcmp(source, zhp->zfs_name) == 0) { - *srctype = ZPROP_SRC_LOCAL; - } else { - (void) strlcpy(statbuf, source, statlen); - *srctype = ZPROP_SRC_INHERITED; - } - } - -} - -int -zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, - size_t proplen, boolean_t literal) -{ - zfs_prop_t prop; - int err = 0; - - if (zhp->zfs_recvd_props == NULL) - if (get_recvd_props_ioctl(zhp) != 0) - return (-1); - - prop = zfs_name_to_prop(propname); - - if (prop != ZPROP_INVAL) { - uint64_t cookie; - if (!nvlist_exists(zhp->zfs_recvd_props, propname)) - return (-1); - zfs_set_recvd_props_mode(zhp, &cookie); - err = zfs_prop_get(zhp, prop, propbuf, proplen, - NULL, NULL, 0, literal); - zfs_unset_recvd_props_mode(zhp, &cookie); - } else { - nvlist_t *propval; - char *recvdval; - if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, - propname, &propval) != 0) - return (-1); - verify(nvlist_lookup_string(propval, ZPROP_VALUE, - &recvdval) == 0); - (void) strlcpy(propbuf, recvdval, proplen); - } - - return (err == 0 ? 
0 : -1); -} - -static int -get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) -{ - nvlist_t *value; - nvpair_t *pair; - - value = zfs_get_clones_nvl(zhp); - if (value == NULL) - return (-1); - - propbuf[0] = '\0'; - for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; - pair = nvlist_next_nvpair(value, pair)) { - if (propbuf[0] != '\0') - (void) strlcat(propbuf, ",", proplen); - (void) strlcat(propbuf, nvpair_name(pair), proplen); - } - - return (0); -} - -struct get_clones_arg { - uint64_t numclones; - nvlist_t *value; - const char *origin; - char buf[ZFS_MAX_DATASET_NAME_LEN]; -}; - -int -get_clones_cb(zfs_handle_t *zhp, void *arg) -{ - struct get_clones_arg *gca = arg; - - if (gca->numclones == 0) { - zfs_close(zhp); - return (0); - } - - if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), - NULL, NULL, 0, B_TRUE) != 0) - goto out; - if (strcmp(gca->buf, gca->origin) == 0) { - fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); - gca->numclones--; - } - -out: - (void) zfs_iter_children(zhp, get_clones_cb, gca); - zfs_close(zhp); - return (0); -} - -nvlist_t * -zfs_get_clones_nvl(zfs_handle_t *zhp) -{ - nvlist_t *nv, *value; - - if (nvlist_lookup_nvlist(zhp->zfs_props, - zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { - struct get_clones_arg gca; - - /* - * if this is a snapshot, then the kernel wasn't able - * to get the clones. Do it by slowly iterating. 
- */ - if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) - return (NULL); - if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) - return (NULL); - if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { - nvlist_free(nv); - return (NULL); - } - - gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); - gca.value = value; - gca.origin = zhp->zfs_name; - - if (gca.numclones != 0) { - zfs_handle_t *root; - char pool[ZFS_MAX_DATASET_NAME_LEN]; - char *cp = pool; - - /* get the pool name */ - (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); - (void) strsep(&cp, "/@"); - root = zfs_open(zhp->zfs_hdl, pool, - ZFS_TYPE_FILESYSTEM); - - (void) get_clones_cb(root, &gca); - } - - if (gca.numclones != 0 || - nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || - nvlist_add_nvlist(zhp->zfs_props, - zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { - nvlist_free(nv); - nvlist_free(value); - return (NULL); - } - nvlist_free(nv); - nvlist_free(value); - verify(0 == nvlist_lookup_nvlist(zhp->zfs_props, - zfs_prop_to_name(ZFS_PROP_CLONES), &nv)); - } - - verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0); - - return (value); -} - -/* - * Accepts a property and value and checks that the value - * matches the one found by the channel program. If they are - * not equal, print both of them. 
- */ -void -zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval, - const char *strval) -{ - if (!zhp->zfs_hdl->libzfs_prop_debug) - return; - int error; - char *poolname = zhp->zpool_hdl->zpool_name; - const char *program = - "args = ...\n" - "ds = args['dataset']\n" - "prop = args['property']\n" - "value, setpoint = zfs.get_prop(ds, prop)\n" - "return {value=value, setpoint=setpoint}\n"; - nvlist_t *outnvl; - nvlist_t *retnvl; - nvlist_t *argnvl = fnvlist_alloc(); - - fnvlist_add_string(argnvl, "dataset", zhp->zfs_name); - fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop)); - - error = lzc_channel_program_nosync(poolname, program, - 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl); - - if (error == 0) { - retnvl = fnvlist_lookup_nvlist(outnvl, "return"); - if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) { - int64_t ans; - error = nvlist_lookup_int64(retnvl, "value", &ans); - if (error != 0) { - (void) fprintf(stderr, "zcp check error: %u\n", - error); - return; - } - if (ans != intval) { - (void) fprintf(stderr, - "%s: zfs found %lld, but zcp found %lld\n", - zfs_prop_to_name(prop), - (longlong_t)intval, (longlong_t)ans); - } - } else { - char *str_ans; - error = nvlist_lookup_string(retnvl, "value", &str_ans); - if (error != 0) { - (void) fprintf(stderr, "zcp check error: %u\n", - error); - return; - } - if (strcmp(strval, str_ans) != 0) { - (void) fprintf(stderr, - "%s: zfs found %s, but zcp found %s\n", - zfs_prop_to_name(prop), - strval, str_ans); - } - } - } else { - (void) fprintf(stderr, - "zcp check failed, channel program error: %u\n", error); - } - nvlist_free(argnvl); - nvlist_free(outnvl); -} - -/* - * Retrieve a property from the given object. If 'literal' is specified, then - * numbers are left as exact values. Otherwise, numbers are converted to a - * human-readable form. - * - * Returns 0 on success, or -1 on error. 
- */ -int -zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, - zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) -{ - char *source = NULL; - uint64_t val; - const char *str; - const char *strval; - boolean_t received = zfs_is_recvd_props_mode(zhp); - - /* - * Check to see if this property applies to our object - */ - if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) - return (-1); - - if (received && zfs_prop_readonly(prop)) - return (-1); - - if (src) - *src = ZPROP_SRC_NONE; - - switch (prop) { - case ZFS_PROP_CREATION: - /* - * 'creation' is a time_t stored in the statistics. We convert - * this into a string unless 'literal' is specified. - */ - { - val = getprop_uint64(zhp, prop, &source); - time_t time = (time_t)val; - struct tm t; - - if (literal || - localtime_r(&time, &t) == NULL || - strftime(propbuf, proplen, "%a %b %e %k:%M %Y", - &t) == 0) - (void) snprintf(propbuf, proplen, "%llu", val); - } - zcp_check(zhp, prop, val, NULL); - break; - - case ZFS_PROP_MOUNTPOINT: - /* - * Getting the precise mountpoint can be tricky. - * - * - for 'none' or 'legacy', return those values. - * - for inherited mountpoints, we want to take everything - * after our ancestor and append it to the inherited value. - * - * If the pool has an alternate root, we want to prepend that - * root to any values we return. - */ - - str = getprop_string(zhp, prop, &source); - - if (str[0] == '/') { - char buf[MAXPATHLEN]; - char *root = buf; - const char *relpath; - - /* - * If we inherit the mountpoint, even from a dataset - * with a received value, the source will be the path of - * the dataset we inherit from. If source is - * ZPROP_SOURCE_VAL_RECVD, the received value is not - * inherited. 
- */ - if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { - relpath = ""; - } else { - relpath = zhp->zfs_name + strlen(source); - if (relpath[0] == '/') - relpath++; - } - - if ((zpool_get_prop(zhp->zpool_hdl, - ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, - B_FALSE)) || (strcmp(root, "-") == 0)) - root[0] = '\0'; - /* - * Special case an alternate root of '/'. This will - * avoid having multiple leading slashes in the - * mountpoint path. - */ - if (strcmp(root, "/") == 0) - root++; - - /* - * If the mountpoint is '/' then skip over this - * if we are obtaining either an alternate root or - * an inherited mountpoint. - */ - if (str[1] == '\0' && (root[0] != '\0' || - relpath[0] != '\0')) - str++; - - if (relpath[0] == '\0') - (void) snprintf(propbuf, proplen, "%s%s", - root, str); - else - (void) snprintf(propbuf, proplen, "%s%s%s%s", - root, str, relpath[0] == '@' ? "" : "/", - relpath); - } else { - /* 'legacy' or 'none' */ - (void) strlcpy(propbuf, str, proplen); - } - zcp_check(zhp, prop, NULL, propbuf); - break; - - case ZFS_PROP_ORIGIN: - str = getprop_string(zhp, prop, &source); - if (str == NULL) - return (-1); - (void) strlcpy(propbuf, str, proplen); - zcp_check(zhp, prop, NULL, str); - break; - - case ZFS_PROP_CLONES: - if (get_clones_string(zhp, propbuf, proplen) != 0) - return (-1); - break; - - case ZFS_PROP_QUOTA: - case ZFS_PROP_REFQUOTA: - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFRESERVATION: - - if (get_numeric_property(zhp, prop, src, &source, &val) != 0) - return (-1); - /* - * If quota or reservation is 0, we translate this into 'none' - * (unless literal is set), and indicate that it's the default - * value. Otherwise, we print the number nicely and indicate - * that its set locally. 
- */ - if (val == 0) { - if (literal) - (void) strlcpy(propbuf, "0", proplen); - else - (void) strlcpy(propbuf, "none", proplen); - } else { - if (literal) - (void) snprintf(propbuf, proplen, "%llu", - (u_longlong_t)val); - else - zfs_nicenum(val, propbuf, proplen); - } - zcp_check(zhp, prop, val, NULL); - break; - - case ZFS_PROP_FILESYSTEM_LIMIT: - case ZFS_PROP_SNAPSHOT_LIMIT: - case ZFS_PROP_FILESYSTEM_COUNT: - case ZFS_PROP_SNAPSHOT_COUNT: - - if (get_numeric_property(zhp, prop, src, &source, &val) != 0) - return (-1); - - /* - * If limit is UINT64_MAX, we translate this into 'none' (unless - * literal is set), and indicate that it's the default value. - * Otherwise, we print the number nicely and indicate that it's - * set locally. - */ - if (literal) { - (void) snprintf(propbuf, proplen, "%llu", - (u_longlong_t)val); - } else if (val == UINT64_MAX) { - (void) strlcpy(propbuf, "none", proplen); - } else { - zfs_nicenum(val, propbuf, proplen); - } - - zcp_check(zhp, prop, val, NULL); - break; - - case ZFS_PROP_REFRATIO: - case ZFS_PROP_COMPRESSRATIO: - if (get_numeric_property(zhp, prop, src, &source, &val) != 0) - return (-1); - (void) snprintf(propbuf, proplen, "%llu.%02llux", - (u_longlong_t)(val / 100), - (u_longlong_t)(val % 100)); - zcp_check(zhp, prop, val, NULL); - break; - - case ZFS_PROP_TYPE: - switch (zhp->zfs_type) { - case ZFS_TYPE_FILESYSTEM: - str = "filesystem"; - break; - case ZFS_TYPE_VOLUME: - str = "volume"; - break; - case ZFS_TYPE_SNAPSHOT: - str = "snapshot"; - break; - case ZFS_TYPE_BOOKMARK: - str = "bookmark"; - break; - default: - abort(); - } - (void) snprintf(propbuf, proplen, "%s", str); - zcp_check(zhp, prop, NULL, propbuf); - break; - - case ZFS_PROP_MOUNTED: - /* - * The 'mounted' property is a pseudo-property that described - * whether the filesystem is currently mounted. Even though - * it's a boolean value, the typical values of "on" and "off" - * don't make sense, so we translate to "yes" and "no". 
- */ - if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, - src, &source, &val) != 0) - return (-1); - if (val) - (void) strlcpy(propbuf, "yes", proplen); - else - (void) strlcpy(propbuf, "no", proplen); - break; - - case ZFS_PROP_NAME: - /* - * The 'name' property is a pseudo-property derived from the - * dataset name. It is presented as a real property to simplify - * consumers. - */ - (void) strlcpy(propbuf, zhp->zfs_name, proplen); - zcp_check(zhp, prop, NULL, propbuf); - break; - - case ZFS_PROP_MLSLABEL: - { -#ifdef illumos - m_label_t *new_sl = NULL; - char *ascii = NULL; /* human readable label */ - - (void) strlcpy(propbuf, - getprop_string(zhp, prop, &source), proplen); - - if (literal || (strcasecmp(propbuf, - ZFS_MLSLABEL_DEFAULT) == 0)) - break; - - /* - * Try to translate the internal hex string to - * human-readable output. If there are any - * problems just use the hex string. - */ - - if (str_to_label(propbuf, &new_sl, MAC_LABEL, - L_NO_CORRECTION, NULL) == -1) { - m_label_free(new_sl); - break; - } - - if (label_to_str(new_sl, &ascii, M_LABEL, - DEF_NAMES) != 0) { - if (ascii) - free(ascii); - m_label_free(new_sl); - break; - } - m_label_free(new_sl); - - (void) strlcpy(propbuf, ascii, proplen); - free(ascii); -#else /* !illumos */ - propbuf[0] = '\0'; -#endif /* illumos */ - } - break; - - case ZFS_PROP_GUID: - case ZFS_PROP_CREATETXG: - /* - * GUIDs are stored as numbers, but they are identifiers. - * We don't want them to be pretty printed, because pretty - * printing mangles the ID into a truncated and useless value. 
- */ - if (get_numeric_property(zhp, prop, src, &source, &val) != 0) - return (-1); - (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); - zcp_check(zhp, prop, val, NULL); - break; - - default: - switch (zfs_prop_get_type(prop)) { - case PROP_TYPE_NUMBER: - if (get_numeric_property(zhp, prop, src, - &source, &val) != 0) { - return (-1); - } - - if (literal) { - (void) snprintf(propbuf, proplen, "%llu", - (u_longlong_t)val); - } else { - zfs_nicenum(val, propbuf, proplen); - } - zcp_check(zhp, prop, val, NULL); - break; - - case PROP_TYPE_STRING: - str = getprop_string(zhp, prop, &source); - if (str == NULL) - return (-1); - - (void) strlcpy(propbuf, str, proplen); - zcp_check(zhp, prop, NULL, str); - break; - - case PROP_TYPE_INDEX: - if (get_numeric_property(zhp, prop, src, - &source, &val) != 0) - return (-1); - if (zfs_prop_index_to_string(prop, val, &strval) != 0) - return (-1); - - (void) strlcpy(propbuf, strval, proplen); - zcp_check(zhp, prop, NULL, strval); - break; - - default: - abort(); - } - } - - get_source(zhp, src, source, statbuf, statlen); - - return (0); -} - -/* - * Utility function to get the given numeric property. Does no validation that - * the given property is the appropriate type; should only be used with - * hard-coded property types. - */ -uint64_t -zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) -{ - char *source; - uint64_t val; - - (void) get_numeric_property(zhp, prop, NULL, &source, &val); - - return (val); -} - -int -zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) -{ - char buf[64]; - - (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); - return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); -} - -/* - * Similar to zfs_prop_get(), but returns the value as an integer. 
- */ -int -zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, - zprop_source_t *src, char *statbuf, size_t statlen) -{ - char *source; - - /* - * Check to see if this property applies to our object - */ - if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) { - return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, - dgettext(TEXT_DOMAIN, "cannot get property '%s'"), - zfs_prop_to_name(prop))); - } - - if (src) - *src = ZPROP_SRC_NONE; - - if (get_numeric_property(zhp, prop, src, &source, value) != 0) - return (-1); - - get_source(zhp, src, source, statbuf, statlen); - - return (0); -} - -static int -idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, - char **domainp, idmap_rid_t *ridp) -{ -#ifdef illumos - idmap_get_handle_t *get_hdl = NULL; - idmap_stat status; - int err = EINVAL; - - if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) - goto out; - - if (isuser) { - err = idmap_get_sidbyuid(get_hdl, id, - IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); - } else { - err = idmap_get_sidbygid(get_hdl, id, - IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); - } - if (err == IDMAP_SUCCESS && - idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && - status == IDMAP_SUCCESS) - err = 0; - else - err = EINVAL; -out: - if (get_hdl) - idmap_get_destroy(get_hdl); - return (err); -#else /* !illumos */ - assert(!"invalid code path"); - return (EINVAL); // silence compiler warning -#endif /* illumos */ -} - -/* - * convert the propname into parameters needed by kernel - * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 - * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 - */ -static int -userquota_propname_decode(const char *propname, boolean_t zoned, - zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) -{ - zfs_userquota_prop_t type; - char *cp, *end; - char *numericsid = NULL; - boolean_t isuser; - - domain[0] = '\0'; - *ridp = 0; - /* Figure out the property type ({user|group}{quota|space}) */ - for (type = 
0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { - if (strncmp(propname, zfs_userquota_prop_prefixes[type], - strlen(zfs_userquota_prop_prefixes[type])) == 0) - break; - } - if (type == ZFS_NUM_USERQUOTA_PROPS) - return (EINVAL); - *typep = type; - - isuser = (type == ZFS_PROP_USERQUOTA || - type == ZFS_PROP_USERUSED); - - cp = strchr(propname, '@') + 1; - - if (strchr(cp, '@')) { -#ifdef illumos - /* - * It's a SID name (eg "user@domain") that needs to be - * turned into S-1-domainID-RID. - */ - int flag = 0; - idmap_stat stat, map_stat; - uid_t pid; - idmap_rid_t rid; - idmap_get_handle_t *gh = NULL; - - stat = idmap_get_create(&gh); - if (stat != IDMAP_SUCCESS) { - idmap_get_destroy(gh); - return (ENOMEM); - } - if (zoned && getzoneid() == GLOBAL_ZONEID) - return (ENOENT); - if (isuser) { - stat = idmap_getuidbywinname(cp, NULL, flag, &pid); - if (stat < 0) - return (ENOENT); - stat = idmap_get_sidbyuid(gh, pid, flag, &numericsid, - &rid, &map_stat); - } else { - stat = idmap_getgidbywinname(cp, NULL, flag, &pid); - if (stat < 0) - return (ENOENT); - stat = idmap_get_sidbygid(gh, pid, flag, &numericsid, - &rid, &map_stat); - } - if (stat < 0) { - idmap_get_destroy(gh); - return (ENOENT); - } - stat = idmap_get_mappings(gh); - idmap_get_destroy(gh); - - if (stat < 0) { - return (ENOENT); - } - if (numericsid == NULL) - return (ENOENT); - cp = numericsid; - *ridp = rid; - /* will be further decoded below */ -#else /* !illumos */ - return (ENOENT); -#endif /* illumos */ - } - - if (strncmp(cp, "S-1-", 4) == 0) { - /* It's a numeric SID (eg "S-1-234-567-89") */ - (void) strlcpy(domain, cp, domainlen); - errno = 0; - if (*ridp == 0) { - cp = strrchr(domain, '-'); - *cp = '\0'; - cp++; - *ridp = strtoull(cp, &end, 10); - } else { - end = ""; - } - if (numericsid) { - free(numericsid); - numericsid = NULL; - } - if (errno != 0 || *end != '\0') - return (EINVAL); - } else if (!isdigit(*cp)) { - /* - * It's a user/group name (eg "user") that needs to be - * turned into a 
uid/gid - */ - if (zoned && getzoneid() == GLOBAL_ZONEID) - return (ENOENT); - if (isuser) { - struct passwd *pw; - pw = getpwnam(cp); - if (pw == NULL) - return (ENOENT); - *ridp = pw->pw_uid; - } else { - struct group *gr; - gr = getgrnam(cp); - if (gr == NULL) - return (ENOENT); - *ridp = gr->gr_gid; - } - } else { - /* It's a user/group ID (eg "12345"). */ - uid_t id = strtoul(cp, &end, 10); - idmap_rid_t rid; - char *mapdomain; - - if (*end != '\0') - return (EINVAL); - if (id > MAXUID) { - /* It's an ephemeral ID. */ - if (idmap_id_to_numeric_domain_rid(id, isuser, - &mapdomain, &rid) != 0) - return (ENOENT); - (void) strlcpy(domain, mapdomain, domainlen); - *ridp = rid; - } else { - *ridp = id; - } - } - - ASSERT3P(numericsid, ==, NULL); - return (0); -} - -static int -zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, - uint64_t *propvalue, zfs_userquota_prop_t *typep) -{ - int err; - zfs_cmd_t zc = { 0 }; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - err = userquota_propname_decode(propname, - zfs_prop_get_int(zhp, ZFS_PROP_ZONED), - typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); - zc.zc_objset_type = *typep; - if (err) - return (err); - - err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc); - if (err) - return (err); - - *propvalue = zc.zc_cookie; - return (0); -} - -int -zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, - uint64_t *propvalue) -{ - zfs_userquota_prop_t type; - - return (zfs_prop_get_userquota_common(zhp, propname, propvalue, - &type)); -} - -int -zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, - char *propbuf, int proplen, boolean_t literal) -{ - int err; - uint64_t propvalue; - zfs_userquota_prop_t type; - - err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, - &type); - - if (err) - return (err); - - if (literal) { - (void) snprintf(propbuf, proplen, "%llu", propvalue); - } else if (propvalue == 0 && - (type == 
ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) { - (void) strlcpy(propbuf, "none", proplen); - } else { - zfs_nicenum(propvalue, propbuf, proplen); - } - return (0); -} - -int -zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, - uint64_t *propvalue) -{ - int err; - zfs_cmd_t zc = { 0 }; - const char *snapname; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - snapname = strchr(propname, '@') + 1; - if (strchr(snapname, '@')) { - (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); - } else { - /* snapname is the short name, append it to zhp's fsname */ - char *cp; - - (void) strlcpy(zc.zc_value, zhp->zfs_name, - sizeof (zc.zc_value)); - cp = strchr(zc.zc_value, '@'); - if (cp != NULL) - *cp = '\0'; - (void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value)); - (void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value)); - } - - err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc); - if (err) - return (err); - - *propvalue = zc.zc_cookie; - return (0); -} - -int -zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, - char *propbuf, int proplen, boolean_t literal) -{ - int err; - uint64_t propvalue; - - err = zfs_prop_get_written_int(zhp, propname, &propvalue); - - if (err) - return (err); - - if (literal) { - (void) snprintf(propbuf, proplen, "%llu", propvalue); - } else { - zfs_nicenum(propvalue, propbuf, proplen); - } - return (0); -} - -/* - * Returns the name of the given zfs handle. - */ -const char * -zfs_get_name(const zfs_handle_t *zhp) -{ - return (zhp->zfs_name); -} - -/* - * Returns the name of the parent pool for the given zfs handle. - */ -const char * -zfs_get_pool_name(const zfs_handle_t *zhp) -{ - return (zhp->zpool_hdl->zpool_name); -} - -/* - * Returns the type of the given zfs handle. - */ -zfs_type_t -zfs_get_type(const zfs_handle_t *zhp) -{ - return (zhp->zfs_type); -} - -/* - * Is one dataset name a child dataset of another? 
- * - * Needs to handle these cases: - * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" - * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" - * Descendant? No. No. No. Yes. - */ -static boolean_t -is_descendant(const char *ds1, const char *ds2) -{ - size_t d1len = strlen(ds1); - - /* ds2 can't be a descendant if it's smaller */ - if (strlen(ds2) < d1len) - return (B_FALSE); - - /* otherwise, compare strings and verify that there's a '/' char */ - return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); -} - -/* - * Given a complete name, return just the portion that refers to the parent. - * Will return -1 if there is no parent (path is just the name of the - * pool). - */ -static int -parent_name(const char *path, char *buf, size_t buflen) -{ - char *slashp; - - (void) strlcpy(buf, path, buflen); - - if ((slashp = strrchr(buf, '/')) == NULL) - return (-1); - *slashp = '\0'; - - return (0); -} - -/* - * If accept_ancestor is false, then check to make sure that the given path has - * a parent, and that it exists. If accept_ancestor is true, then find the - * closest existing ancestor for the given path. In prefixlen return the - * length of already existing prefix of the given path. We also fetch the - * 'zoned' property, which is used to validate property settings when creating - * new datasets. 
- */ -static int -check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, - boolean_t accept_ancestor, int *prefixlen) -{ - zfs_cmd_t zc = { 0 }; - char parent[ZFS_MAX_DATASET_NAME_LEN]; - char *slash; - zfs_handle_t *zhp; - char errbuf[1024]; - uint64_t is_zoned; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); - - /* get parent, and check to see if this is just a pool */ - if (parent_name(path, parent, sizeof (parent)) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "missing dataset name")); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - - /* check to see if the pool exists */ - if ((slash = strchr(parent, '/')) == NULL) - slash = parent + strlen(parent); - (void) strncpy(zc.zc_name, parent, slash - parent); - zc.zc_name[slash - parent] = '\0'; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 && - errno == ENOENT) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "no such pool '%s'"), zc.zc_name); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - } - - /* check to see if the parent dataset exists */ - while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { - if (errno == ENOENT && accept_ancestor) { - /* - * Go deeper to find an ancestor, give up on top level. 
- */ - if (parent_name(parent, parent, sizeof (parent)) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "no such pool '%s'"), zc.zc_name); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - } - } else if (errno == ENOENT) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "parent does not exist")); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - } else - return (zfs_standard_error(hdl, errno, errbuf)); - } - - is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); - if (zoned != NULL) - *zoned = is_zoned; - - /* we are in a non-global zone, but parent is in the global zone */ - if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { - (void) zfs_standard_error(hdl, EPERM, errbuf); - zfs_close(zhp); - return (-1); - } - - /* make sure parent is a filesystem */ - if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "parent is not a filesystem")); - (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); - zfs_close(zhp); - return (-1); - } - - zfs_close(zhp); - if (prefixlen != NULL) - *prefixlen = strlen(parent); - return (0); -} - -/* - * Finds whether the dataset of the given type(s) exists. - */ -boolean_t -zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) -{ - zfs_handle_t *zhp; - - if (!zfs_validate_name(hdl, path, types, B_FALSE)) - return (B_FALSE); - - /* - * Try to get stats for the dataset, which will tell us if it exists. - */ - if ((zhp = make_dataset_handle(hdl, path)) != NULL) { - int ds_type = zhp->zfs_type; - - zfs_close(zhp); - if (types & ds_type) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Given a path to 'target', create all the ancestors between - * the prefixlen portion of the path, and the target itself. - * Fail if the initial prefixlen-ancestor does not already exist. 
- */ -int -create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) -{ - zfs_handle_t *h; - char *cp; - const char *opname; - - /* make sure prefix exists */ - cp = target + prefixlen; - if (*cp != '/') { - assert(strchr(cp, '/') == NULL); - h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); - } else { - *cp = '\0'; - h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); - *cp = '/'; - } - if (h == NULL) - return (-1); - zfs_close(h); - - /* - * Attempt to create, mount, and share any ancestor filesystems, - * up to the prefixlen-long one. - */ - for (cp = target + prefixlen + 1; - (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { - - *cp = '\0'; - - h = make_dataset_handle(hdl, target); - if (h) { - /* it already exists, nothing to do here */ - zfs_close(h); - continue; - } - - if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, - NULL) != 0) { - opname = dgettext(TEXT_DOMAIN, "create"); - goto ancestorerr; - } - - h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); - if (h == NULL) { - opname = dgettext(TEXT_DOMAIN, "open"); - goto ancestorerr; - } - - if (zfs_mount(h, NULL, 0) != 0) { - opname = dgettext(TEXT_DOMAIN, "mount"); - goto ancestorerr; - } - - if (zfs_share(h) != 0) { - opname = dgettext(TEXT_DOMAIN, "share"); - goto ancestorerr; - } - - zfs_close(h); - } - - return (0); - -ancestorerr: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "failed to %s ancestor '%s'"), opname, target); - return (-1); -} - -/* - * Creates non-existing ancestors of the given path. - */ -int -zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) -{ - int prefix; - char *path_copy; - char errbuf[1024]; - int rc = 0; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot create '%s'"), path); - - /* - * Check that we are not passing the nesting limit - * before we start creating any ancestors. 
- */ - if (dataset_nestcheck(path) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "maximum name nesting depth exceeded")); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - - if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) - return (-1); - - if ((path_copy = strdup(path)) != NULL) { - rc = create_parents(hdl, path_copy, prefix); - free(path_copy); - } - if (path_copy == NULL || rc != 0) - return (-1); - - return (0); -} - -/* - * Create a new filesystem or volume. - */ -int -zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - nvlist_t *props) -{ - int ret; - uint64_t size = 0; - uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); - char errbuf[1024]; - uint64_t zoned; - enum lzc_dataset_type ost; - zpool_handle_t *zpool_handle; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot create '%s'"), path); - - /* validate the path, taking care to note the extended error message */ - if (!zfs_validate_name(hdl, path, type, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - - if (dataset_nestcheck(path) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "maximum name nesting depth exceeded")); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - - /* validate parents exist */ - if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) - return (-1); - - /* - * The failure modes when creating a dataset of a different type over - * one that already exists is a little strange. In particular, if you - * try to create a dataset on top of an existing dataset, the ioctl() - * will return ENOENT, not EEXIST. To prevent this from happening, we - * first try to see if the dataset exists. 
- */ - if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset already exists")); - return (zfs_error(hdl, EZFS_EXISTS, errbuf)); - } - - if (type == ZFS_TYPE_VOLUME) - ost = LZC_DATSET_TYPE_ZVOL; - else - ost = LZC_DATSET_TYPE_ZFS; - - /* open zpool handle for prop validation */ - char pool_path[ZFS_MAX_DATASET_NAME_LEN]; - (void) strlcpy(pool_path, path, sizeof (pool_path)); - - /* truncate pool_path at first slash */ - char *p = strchr(pool_path, '/'); - if (p != NULL) - *p = '\0'; - - if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL) - return (-1); - - if (props && (props = zfs_valid_proplist(hdl, type, props, - zoned, NULL, zpool_handle, errbuf)) == 0) { - zpool_close(zpool_handle); - return (-1); - } - zpool_close(zpool_handle); - - if (type == ZFS_TYPE_VOLUME) { - /* - * If we are creating a volume, the size and block size must - * satisfy a few restraints. First, the blocksize must be a - * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the - * volsize must be a multiple of the block size, and cannot be - * zero. 
- */ - if (props == NULL || nvlist_lookup_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { - nvlist_free(props); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "missing volume size")); - return (zfs_error(hdl, EZFS_BADPROP, errbuf)); - } - - if ((ret = nvlist_lookup_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &blocksize)) != 0) { - if (ret == ENOENT) { - blocksize = zfs_prop_default_numeric( - ZFS_PROP_VOLBLOCKSIZE); - } else { - nvlist_free(props); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "missing volume block size")); - return (zfs_error(hdl, EZFS_BADPROP, errbuf)); - } - } - - if (size == 0) { - nvlist_free(props); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "volume size cannot be zero")); - return (zfs_error(hdl, EZFS_BADPROP, errbuf)); - } - - if (size % blocksize != 0) { - nvlist_free(props); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "volume size must be a multiple of volume block " - "size")); - return (zfs_error(hdl, EZFS_BADPROP, errbuf)); - } - } - - /* create the dataset */ - ret = lzc_create(path, ost, props); - nvlist_free(props); - - /* check for failure */ - if (ret != 0) { - char parent[ZFS_MAX_DATASET_NAME_LEN]; - (void) parent_name(path, parent, sizeof (parent)); - - switch (errno) { - case ENOENT: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "no such parent '%s'"), parent); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded to set this " - "property or value")); - return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); - case ERANGE: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property value(s) specified")); - return (zfs_error(hdl, EZFS_BADPROP, errbuf)); -#ifdef _ILP32 - case EOVERFLOW: - /* - * This platform can't address a volume this big. 
- */ - if (type == ZFS_TYPE_VOLUME) - return (zfs_error(hdl, EZFS_VOLTOOBIG, - errbuf)); -#endif - /* FALLTHROUGH */ - default: - return (zfs_standard_error(hdl, errno, errbuf)); - } - } - - return (0); -} - -/* - * Destroys the given dataset. The caller must make sure that the filesystem - * isn't mounted, and that there are no active dependents. If the file system - * does not exist this function does nothing. - */ -int -zfs_destroy(zfs_handle_t *zhp, boolean_t defer) -{ - int error; - - if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer) - return (EINVAL); - - if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { - nvlist_t *nv = fnvlist_alloc(); - fnvlist_add_boolean(nv, zhp->zfs_name); - error = lzc_destroy_bookmarks(nv, NULL); - fnvlist_free(nv); - if (error != 0) { - return (zfs_standard_error_fmt(zhp->zfs_hdl, error, - dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), - zhp->zfs_name)); - } - return (0); - } - - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { - nvlist_t *nv = fnvlist_alloc(); - fnvlist_add_boolean(nv, zhp->zfs_name); - error = lzc_destroy_snaps(nv, defer, NULL); - fnvlist_free(nv); - } else { - error = lzc_destroy(zhp->zfs_name); - } - - if (error != 0 && error != ENOENT) { - return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), - zhp->zfs_name)); - } - - remove_mountpoint(zhp); - - return (0); -} - -struct destroydata { - nvlist_t *nvl; - const char *snapname; -}; - -static int -zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) -{ - struct destroydata *dd = arg; - char name[ZFS_MAX_DATASET_NAME_LEN]; - int rv = 0; - - (void) snprintf(name, sizeof (name), - "%s@%s", zhp->zfs_name, dd->snapname); - - if (lzc_exists(name)) - verify(nvlist_add_boolean(dd->nvl, name) == 0); - - rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); - zfs_close(zhp); - return (rv); -} - -/* - * Destroys all snapshots with the given name in zhp & descendants. 
- */ -int -zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) -{ - int ret; - struct destroydata dd = { 0 }; - - dd.snapname = snapname; - verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); - (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); - - if (nvlist_empty(dd.nvl)) { - ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, - dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), - zhp->zfs_name, snapname); - } else { - ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); - } - nvlist_free(dd.nvl); - return (ret); -} - -/* - * Destroys all the snapshots named in the nvlist. - */ -int -zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) -{ - int ret; - nvlist_t *errlist = NULL; - - ret = lzc_destroy_snaps(snaps, defer, &errlist); - - if (ret == 0) { - nvlist_free(errlist); - return (0); - } - - if (nvlist_empty(errlist)) { - char errbuf[1024]; - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); - - ret = zfs_standard_error(hdl, ret, errbuf); - } - for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL); - pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { - char errbuf[1024]; - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), - nvpair_name(pair)); - - switch (fnvpair_value_int32(pair)) { - case EEXIST: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "snapshot is cloned")); - ret = zfs_error(hdl, EZFS_EXISTS, errbuf); - break; - default: - ret = zfs_standard_error(hdl, errno, errbuf); - break; - } - } - - nvlist_free(errlist); - return (ret); -} - -/* - * Clones the given dataset. The target must be of the same type as the source. 
- */ -int -zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) -{ - char parent[ZFS_MAX_DATASET_NAME_LEN]; - int ret; - char errbuf[1024]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - uint64_t zoned; - - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot create '%s'"), target); - - /* validate the target/clone name */ - if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - - /* validate parents exist */ - if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) - return (-1); - - (void) parent_name(target, parent, sizeof (parent)); - - /* do the clone */ - - if (props) { - zfs_type_t type; - - if (ZFS_IS_VOLUME(zhp)) { - type = ZFS_TYPE_VOLUME; - } else { - type = ZFS_TYPE_FILESYSTEM; - } - if ((props = zfs_valid_proplist(hdl, type, props, zoned, - zhp, zhp->zpool_hdl, errbuf)) == NULL) - return (-1); - if (zfs_fix_auto_resv(zhp, props) == -1) { - nvlist_free(props); - return (-1); - } - } - - ret = lzc_clone(target, zhp->zfs_name, props); - nvlist_free(props); - - if (ret != 0) { - switch (errno) { - - case ENOENT: - /* - * The parent doesn't exist. We should have caught this - * above, but there may a race condition that has since - * destroyed the parent. - * - * At this point, we don't know whether it's the source - * that doesn't exist anymore, or whether the target - * dataset doesn't exist. - */ - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "no such parent '%s'"), parent); - return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); - - case EXDEV: - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "source and target pools differ")); - return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, - errbuf)); - - default: - return (zfs_standard_error(zhp->zfs_hdl, errno, - errbuf)); - } - } - - return (ret); -} - -/* - * Promotes the given clone fs to be the clone parent. 
- */ -int -zfs_promote(zfs_handle_t *zhp) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - int ret; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot promote '%s'"), zhp->zfs_name); - - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "snapshots can not be promoted")); - return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); - } - - if (zhp->zfs_dmustats.dds_origin[0] == '\0') { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "not a cloned filesystem")); - return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); - } - - if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - - ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname)); - - if (ret != 0) { - switch (ret) { - case EEXIST: - /* There is a conflicting snapshot name. */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "conflicting snapshot '%s' from parent '%s'"), - snapname, zhp->zfs_dmustats.dds_origin); - return (zfs_error(hdl, EZFS_EXISTS, errbuf)); - - default: - return (zfs_standard_error(hdl, ret, errbuf)); - } - } - return (ret); -} - -typedef struct snapdata { - nvlist_t *sd_nvl; - const char *sd_snapname; -} snapdata_t; - -static int -zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) -{ - snapdata_t *sd = arg; - char name[ZFS_MAX_DATASET_NAME_LEN]; - int rv = 0; - - if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { - (void) snprintf(name, sizeof (name), - "%s@%s", zfs_get_name(zhp), sd->sd_snapname); - - fnvlist_add_boolean(sd->sd_nvl, name); - - rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); - } - zfs_close(zhp); - - return (rv); -} - -int -zfs_remap_indirects(libzfs_handle_t *hdl, const char *fs) -{ - int err; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot remap dataset '%s'"), fs); - - err = lzc_remap(fs); - - if (err != 0) { - switch (err) { - case ENOTSUP: - 
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case EINVAL: - (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - default: - (void) zfs_standard_error(hdl, err, errbuf); - break; - } - } - - return (err); -} - -/* - * Creates snapshots. The keys in the snaps nvlist are the snapshots to be - * created. - */ -int -zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) -{ - int ret; - char errbuf[1024]; - nvpair_t *elem; - nvlist_t *errors; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot create snapshots ")); - - elem = NULL; - while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { - const char *snapname = nvpair_name(elem); - - /* validate the target name */ - if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, - B_TRUE)) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot create snapshot '%s'"), snapname); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - } - - /* - * get pool handle for prop validation. assumes all snaps are in the - * same pool, as does lzc_snapshot (below). 
- */ - char pool[ZFS_MAX_DATASET_NAME_LEN]; - elem = nvlist_next_nvpair(snaps, NULL); - (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); - pool[strcspn(pool, "/@")] = '\0'; - zpool_handle_t *zpool_hdl = zpool_open(hdl, pool); - - if (props != NULL && - (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, - props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) { - zpool_close(zpool_hdl); - return (-1); - } - zpool_close(zpool_hdl); - - ret = lzc_snapshot(snaps, props, &errors); - - if (ret != 0) { - boolean_t printed = B_FALSE; - for (elem = nvlist_next_nvpair(errors, NULL); - elem != NULL; - elem = nvlist_next_nvpair(errors, elem)) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot create snapshot '%s'"), nvpair_name(elem)); - (void) zfs_standard_error(hdl, - fnvpair_value_int32(elem), errbuf); - printed = B_TRUE; - } - if (!printed) { - switch (ret) { - case EXDEV: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "multiple snapshots of same " - "fs not allowed")); - (void) zfs_error(hdl, EZFS_EXISTS, errbuf); - - break; - default: - (void) zfs_standard_error(hdl, ret, errbuf); - } - } - } - - nvlist_free(props); - nvlist_free(errors); - return (ret); -} - -int -zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, - nvlist_t *props) -{ - int ret; - snapdata_t sd = { 0 }; - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - char *cp; - zfs_handle_t *zhp; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot snapshot %s"), path); - - if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - - (void) strlcpy(fsname, path, sizeof (fsname)); - cp = strchr(fsname, '@'); - *cp = '\0'; - sd.sd_snapname = cp + 1; - - if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME)) == NULL) { - return (-1); - } - - verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); - if (recursive) { - (void) 
zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); - } else { - fnvlist_add_boolean(sd.sd_nvl, path); - } - - ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); - nvlist_free(sd.sd_nvl); - zfs_close(zhp); - return (ret); -} - -/* - * Destroy any more recent snapshots. We invoke this callback on any dependents - * of the snapshot first. If the 'cb_dependent' member is non-zero, then this - * is a dependent and we should just destroy it without checking the transaction - * group. - */ -typedef struct rollback_data { - const char *cb_target; /* the snapshot */ - uint64_t cb_create; /* creation time reference */ - boolean_t cb_error; - boolean_t cb_force; -} rollback_data_t; - -static int -rollback_destroy_dependent(zfs_handle_t *zhp, void *data) -{ - rollback_data_t *cbp = data; - prop_changelist_t *clp; - - /* We must destroy this clone; first unmount it */ - clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - cbp->cb_force ? MS_FORCE: 0); - if (clp == NULL || changelist_prefix(clp) != 0) { - cbp->cb_error = B_TRUE; - zfs_close(zhp); - return (0); - } - if (zfs_destroy(zhp, B_FALSE) != 0) - cbp->cb_error = B_TRUE; - else - changelist_remove(clp, zhp->zfs_name); - (void) changelist_postfix(clp); - changelist_free(clp); - - zfs_close(zhp); - return (0); -} - -static int -rollback_destroy(zfs_handle_t *zhp, void *data) -{ - rollback_data_t *cbp = data; - - if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { - cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, - rollback_destroy_dependent, cbp); - - cbp->cb_error |= zfs_destroy(zhp, B_FALSE); - } - - zfs_close(zhp); - return (0); -} - -/* - * Given a dataset, rollback to a specific snapshot, discarding any - * data changes since then and making it the active dataset. - * - * Any snapshots and bookmarks more recent than the target are - * destroyed, along with their dependents (i.e. clones). 
- */ -int -zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) -{ - rollback_data_t cb = { 0 }; - int err; - boolean_t restore_resv = 0; - uint64_t min_txg = 0, old_volsize = 0, new_volsize; - zfs_prop_t resv_prop; - - assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || - zhp->zfs_type == ZFS_TYPE_VOLUME); - - /* - * Destroy all recent snapshots and their dependents. - */ - cb.cb_force = force; - cb.cb_target = snap->zfs_name; - cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); - - if (cb.cb_create > 0) - min_txg = cb.cb_create; - - (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb, - min_txg, 0); - - (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb); - - if (cb.cb_error) - return (-1); - - /* - * Now that we have verified that the snapshot is the latest, - * rollback to the given snapshot. - */ - - if (zhp->zfs_type == ZFS_TYPE_VOLUME) { - if (zfs_which_resv_prop(zhp, &resv_prop) < 0) - return (-1); - old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); - restore_resv = - (old_volsize == zfs_prop_get_int(zhp, resv_prop)); - } - - /* - * Pass both the filesystem and the wanted snapshot names, - * we would get an error back if the snapshot is destroyed or - * a new snapshot is created before this request is processed. 
- */ - err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name); - if (err != 0) { - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), - zhp->zfs_name); - switch (err) { - case EEXIST: - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "there is a snapshot or bookmark more recent " - "than '%s'"), snap->zfs_name); - (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf); - break; - case ESRCH: - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "'%s' is not found among snapshots of '%s'"), - snap->zfs_name, zhp->zfs_name); - (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf); - break; - case EINVAL: - (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf); - break; - default: - (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf); - } - return (err); - } - - /* - * For volumes, if the pre-rollback volsize matched the pre- - * rollback reservation and the volsize has changed then set - * the reservation property to the post-rollback volsize. - * Make a new handle since the rollback closed the dataset. - */ - if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && - (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { - if (restore_resv) { - new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); - if (old_volsize != new_volsize) - err = zfs_prop_set_int(zhp, resv_prop, - new_volsize); - } - zfs_close(zhp); - } - return (err); -} - -/* - * Renames the given dataset. 
- */ -int -zfs_rename(zfs_handle_t *zhp, const char *source, const char *target, - renameflags_t flags) -{ - int ret = 0; - zfs_cmd_t zc = { 0 }; - char *delim; - prop_changelist_t *cl = NULL; - zfs_handle_t *zhrp = NULL; - char *parentname = NULL; - char parent[ZFS_MAX_DATASET_NAME_LEN]; - char property[ZFS_MAXPROPLEN]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char errbuf[1024]; - - /* if we have the same exact name, just return success */ - if (strcmp(zhp->zfs_name, target) == 0) - return (0); - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot rename to '%s'"), target); - - if (source != NULL) { - /* - * This is recursive snapshots rename, put snapshot name - * (that might not exist) into zfs_name. - */ - assert(flags.recurse); - - (void) strlcat(zhp->zfs_name, "@", sizeof(zhp->zfs_name)); - (void) strlcat(zhp->zfs_name, source, sizeof(zhp->zfs_name)); - zhp->zfs_type = ZFS_TYPE_SNAPSHOT; - } - - /* make sure source name is valid */ - if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - - /* - * Make sure the target name is valid - */ - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT || - zhp->zfs_type == ZFS_TYPE_BOOKMARK) { - const char sep = zhp->zfs_type == ZFS_TYPE_SNAPSHOT ? '@' : '#'; - - if ((strchr(target, sep) == NULL) || *target == sep) { - /* - * Snapshot target name is abbreviated, - * reconstruct full dataset name - */ - (void) strlcpy(parent, zhp->zfs_name, sizeof (parent)); - delim = strchr(parent, sep); - if (strchr(target, sep) == NULL) - *(++delim) = '\0'; - else - *delim = '\0'; - (void) strlcat(parent, target, sizeof (parent)); - target = parent; - } else { - /* - * Make sure we're renaming within the same dataset. 
- */ - delim = strchr(target, sep); - if (strncmp(zhp->zfs_name, target, delim - target) - != 0 || zhp->zfs_name[delim - target] != sep) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "%s must be part of same dataset"), - zhp->zfs_type == ZFS_TYPE_SNAPSHOT ? - "snapshots" : "bookmarks"); - return (zfs_error(hdl, EZFS_CROSSTARGET, - errbuf)); - } - } - - if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } else { - if (flags.recurse) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "recursive rename must be a snapshot")); - return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); - } - - if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - - /* validate parents */ - if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) - return (-1); - - /* make sure we're in the same pool */ - verify((delim = strchr(target, '/')) != NULL); - if (strncmp(zhp->zfs_name, target, delim - target) != 0 || - zhp->zfs_name[delim - target] != '/') { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "datasets must be within same pool")); - return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); - } - - /* new name cannot be a child of the current dataset name */ - if (is_descendant(zhp->zfs_name, target)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "New dataset name cannot be a descendant of " - "current dataset name")); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - } - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); - - if (getzoneid() == GLOBAL_ZONEID && - zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset is used in a non-global zone")); - return (zfs_error(hdl, EZFS_ZONED, errbuf)); - } - - /* - * Avoid unmounting file systems with mountpoint property set to - * 'legacy' or 'none' even if -u option is not given. 
- */ - if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && - !flags.recurse && !flags.nounmount && - zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, - sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && - (strcmp(property, "legacy") == 0 || - strcmp(property, "none") == 0)) { - flags.nounmount = B_TRUE; - } - if (flags.recurse) { - parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); - if (parentname == NULL) { - ret = -1; - goto error; - } - delim = strchr(parentname, '@'); - *delim = '\0'; - zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); - if (zhrp == NULL) { - ret = -1; - goto error; - } - } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && - zhp->zfs_type != ZFS_TYPE_BOOKMARK) { - if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, - flags.nounmount ? CL_GATHER_DONT_UNMOUNT : 0, - flags.forceunmount ? MS_FORCE : 0)) == NULL) { - return (-1); - } - - if (changelist_haszonedchild(cl)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "child dataset with inherited mountpoint is used " - "in a non-global zone")); - (void) zfs_error(hdl, EZFS_ZONED, errbuf); - ret = -1; - goto error; - } - - if ((ret = changelist_prefix(cl)) != 0) - goto error; - } - - if (ZFS_IS_VOLUME(zhp)) - zc.zc_objset_type = DMU_OST_ZVOL; - else - zc.zc_objset_type = DMU_OST_ZFS; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - - zc.zc_cookie = flags.recurse ? 
1 : 0; - if (flags.nounmount) - zc.zc_cookie |= 2; - - if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { - /* - * if it was recursive, the one that actually failed will - * be in zc.zc_name - */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot rename '%s'"), zc.zc_name); - - if (flags.recurse && errno == EEXIST) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "a child dataset already has a snapshot " - "with the new name")); - (void) zfs_error(hdl, EZFS_EXISTS, errbuf); - } else if (errno == EINVAL) { - (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); - } else { - (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); - } - - /* - * On failure, we still want to remount any filesystems that - * were previously mounted, so we don't alter the system state. - */ - if (cl != NULL) - (void) changelist_postfix(cl); - } else { - if (cl != NULL) { - changelist_rename(cl, zfs_get_name(zhp), target); - ret = changelist_postfix(cl); - } - } - -error: - if (parentname != NULL) { - free(parentname); - } - if (zhrp != NULL) { - zfs_close(zhrp); - } - if (cl != NULL) { - changelist_free(cl); - } - return (ret); -} - -nvlist_t * -zfs_get_user_props(zfs_handle_t *zhp) -{ - return (zhp->zfs_user_props); -} - -nvlist_t * -zfs_get_recvd_props(zfs_handle_t *zhp) -{ - if (zhp->zfs_recvd_props == NULL) - if (get_recvd_props_ioctl(zhp) != 0) - return (NULL); - return (zhp->zfs_recvd_props); -} - -/* - * This function is used by 'zfs list' to determine the exact set of columns to - * display, and their maximum widths. This does two main things: - * - * - If this is a list of all properties, then expand the list to include - * all native properties, and set a flag so that for each dataset we look - * for new unique user properties and add them to the list. - * - * - For non fixed-width properties, keep track of the maximum width seen - * so that we can size the column appropriately. 
If the user has - * requested received property values, we also need to compute the width - * of the RECEIVED column. - */ -int -zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, - boolean_t literal) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - zprop_list_t *entry; - zprop_list_t **last, **start; - nvlist_t *userprops, *propval; - nvpair_t *elem; - char *strval; - char buf[ZFS_MAXPROPLEN]; - - if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) - return (-1); - - userprops = zfs_get_user_props(zhp); - - entry = *plp; - if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { - /* - * Go through and add any user properties as necessary. We - * start by incrementing our list pointer to the first - * non-native property. - */ - start = plp; - while (*start != NULL) { - if ((*start)->pl_prop == ZPROP_INVAL) - break; - start = &(*start)->pl_next; - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { - /* - * See if we've already found this property in our list. 
- */ - for (last = start; *last != NULL; - last = &(*last)->pl_next) { - if (strcmp((*last)->pl_user_prop, - nvpair_name(elem)) == 0) - break; - } - - if (*last == NULL) { - if ((entry = zfs_alloc(hdl, - sizeof (zprop_list_t))) == NULL || - ((entry->pl_user_prop = zfs_strdup(hdl, - nvpair_name(elem)))) == NULL) { - free(entry); - return (-1); - } - - entry->pl_prop = ZPROP_INVAL; - entry->pl_width = strlen(nvpair_name(elem)); - entry->pl_all = B_TRUE; - *last = entry; - } - } - } - - /* - * Now go through and check the width of any non-fixed columns - */ - for (entry = *plp; entry != NULL; entry = entry->pl_next) { - if (entry->pl_fixed && !literal) - continue; - - if (entry->pl_prop != ZPROP_INVAL) { - if (zfs_prop_get(zhp, entry->pl_prop, - buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { - if (strlen(buf) > entry->pl_width) - entry->pl_width = strlen(buf); - } - if (received && zfs_prop_get_recvd(zhp, - zfs_prop_to_name(entry->pl_prop), - buf, sizeof (buf), literal) == 0) - if (strlen(buf) > entry->pl_recvd_width) - entry->pl_recvd_width = strlen(buf); - } else { - if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, - &propval) == 0) { - verify(nvlist_lookup_string(propval, - ZPROP_VALUE, &strval) == 0); - if (strlen(strval) > entry->pl_width) - entry->pl_width = strlen(strval); - } - if (received && zfs_prop_get_recvd(zhp, - entry->pl_user_prop, - buf, sizeof (buf), literal) == 0) - if (strlen(buf) > entry->pl_recvd_width) - entry->pl_recvd_width = strlen(buf); - } - } - - return (0); -} - -int -zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, - char *resource, void *export, void *sharetab, - int sharemax, zfs_share_op_t operation) -{ - zfs_cmd_t zc = { 0 }; - int error; - - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); - if (resource) - (void) strlcpy(zc.zc_string, resource, sizeof (zc.zc_string)); - zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab; - 
zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export; - zc.zc_share.z_sharetype = operation; - zc.zc_share.z_sharemax = sharemax; - error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc); - return (error); -} - -void -zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) -{ - nvpair_t *curr; - - /* - * Keep a reference to the props-table against which we prune the - * properties. - */ - zhp->zfs_props_table = props; - - curr = nvlist_next_nvpair(zhp->zfs_props, NULL); - - while (curr) { - zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); - nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr); - - /* - * User properties will result in ZPROP_INVAL, and since we - * only know how to prune standard ZFS properties, we always - * leave these in the list. This can also happen if we - * encounter an unknown DSL property (when running older - * software, for example). - */ - if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) - (void) nvlist_remove(zhp->zfs_props, - nvpair_name(curr), nvpair_type(curr)); - curr = next; - } -} - -#ifdef illumos -static int -zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, - zfs_smb_acl_op_t cmd, char *resource1, char *resource2) -{ - zfs_cmd_t zc = { 0 }; - nvlist_t *nvlist = NULL; - int error; - - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); - zc.zc_cookie = (uint64_t)cmd; - - if (cmd == ZFS_SMB_ACL_RENAME) { - if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { - (void) no_memory(hdl); - return (0); - } - } - - switch (cmd) { - case ZFS_SMB_ACL_ADD: - case ZFS_SMB_ACL_REMOVE: - (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); - break; - case ZFS_SMB_ACL_RENAME: - if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, - resource1) != 0) { - (void) no_memory(hdl); - return (-1); - } - if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, - resource2) != 0) { - (void) no_memory(hdl); - return (-1); - } - if 
(zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { - nvlist_free(nvlist); - return (-1); - } - break; - case ZFS_SMB_ACL_PURGE: - break; - default: - return (-1); - } - error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); - nvlist_free(nvlist); - return (error); -} - -int -zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, - char *path, char *resource) -{ - return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, - resource, NULL)); -} - -int -zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, - char *path, char *resource) -{ - return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, - resource, NULL)); -} - -int -zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) -{ - return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, - NULL, NULL)); -} - -int -zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, - char *oldname, char *newname) -{ - return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, - oldname, newname)); -} -#endif /* illumos */ - -int -zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - zfs_userspace_cb_t func, void *arg) -{ - zfs_cmd_t zc = { 0 }; - zfs_useracct_t buf[100]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - int ret; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - zc.zc_objset_type = type; - zc.zc_nvlist_dst = (uintptr_t)buf; - - for (;;) { - zfs_useracct_t *zua = buf; - - zc.zc_nvlist_dst_size = sizeof (buf); - if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot get used/quota for %s"), zc.zc_name); - return (zfs_standard_error_fmt(hdl, errno, errbuf)); - } - if (zc.zc_nvlist_dst_size == 0) - break; - - while (zc.zc_nvlist_dst_size > 0) { - if ((ret = func(arg, zua->zu_domain, zua->zu_rid, - zua->zu_space)) != 0) - return (ret); - zua++; - zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); - } - } - - return (0); -} - -struct holdarg { - nvlist_t 
*nvl; - const char *snapname; - const char *tag; - boolean_t recursive; - int error; -}; - -static int -zfs_hold_one(zfs_handle_t *zhp, void *arg) -{ - struct holdarg *ha = arg; - char name[ZFS_MAX_DATASET_NAME_LEN]; - int rv = 0; - - (void) snprintf(name, sizeof (name), - "%s@%s", zhp->zfs_name, ha->snapname); - - if (lzc_exists(name)) - fnvlist_add_string(ha->nvl, name, ha->tag); - - if (ha->recursive) - rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); - zfs_close(zhp); - return (rv); -} - -int -zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, - boolean_t recursive, int cleanup_fd) -{ - int ret; - struct holdarg ha; - - ha.nvl = fnvlist_alloc(); - ha.snapname = snapname; - ha.tag = tag; - ha.recursive = recursive; - (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); - - if (nvlist_empty(ha.nvl)) { - char errbuf[1024]; - - fnvlist_free(ha.nvl); - ret = ENOENT; - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot hold snapshot '%s@%s'"), - zhp->zfs_name, snapname); - (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); - return (ret); - } - - ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); - fnvlist_free(ha.nvl); - - return (ret); -} - -int -zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) -{ - int ret; - nvlist_t *errors; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char errbuf[1024]; - nvpair_t *elem; - - errors = NULL; - ret = lzc_hold(holds, cleanup_fd, &errors); - - if (ret == 0) { - /* There may be errors even in the success case. 
*/ - fnvlist_free(errors); - return (0); - } - - if (nvlist_empty(errors)) { - /* no hold-specific errors */ - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot hold")); - switch (ret) { - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case EINVAL: - (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - default: - (void) zfs_standard_error(hdl, ret, errbuf); - } - } - - for (elem = nvlist_next_nvpair(errors, NULL); - elem != NULL; - elem = nvlist_next_nvpair(errors, elem)) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot hold snapshot '%s'"), nvpair_name(elem)); - switch (fnvpair_value_int32(elem)) { - case E2BIG: - /* - * Temporary tags wind up having the ds object id - * prepended. So even if we passed the length check - * above, it's still possible for the tag to wind - * up being slightly too long. - */ - (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); - break; - case EINVAL: - (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - case EEXIST: - (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); - break; - default: - (void) zfs_standard_error(hdl, - fnvpair_value_int32(elem), errbuf); - } - } - - fnvlist_free(errors); - return (ret); -} - -static int -zfs_release_one(zfs_handle_t *zhp, void *arg) -{ - struct holdarg *ha = arg; - char name[ZFS_MAX_DATASET_NAME_LEN]; - int rv = 0; - nvlist_t *existing_holds; - - (void) snprintf(name, sizeof (name), - "%s@%s", zhp->zfs_name, ha->snapname); - - if (lzc_get_holds(name, &existing_holds) != 0) { - ha->error = ENOENT; - } else if (!nvlist_exists(existing_holds, ha->tag)) { - ha->error = ESRCH; - } else { - nvlist_t *torelease = fnvlist_alloc(); - fnvlist_add_boolean(torelease, ha->tag); - fnvlist_add_nvlist(ha->nvl, name, torelease); - fnvlist_free(torelease); - } - - if (ha->recursive) - rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); - zfs_close(zhp); - return 
(rv); -} - -int -zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - boolean_t recursive) -{ - int ret; - struct holdarg ha; - nvlist_t *errors = NULL; - nvpair_t *elem; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char errbuf[1024]; - - ha.nvl = fnvlist_alloc(); - ha.snapname = snapname; - ha.tag = tag; - ha.recursive = recursive; - ha.error = 0; - (void) zfs_release_one(zfs_handle_dup(zhp), &ha); - - if (nvlist_empty(ha.nvl)) { - fnvlist_free(ha.nvl); - ret = ha.error; - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot release hold from snapshot '%s@%s'"), - zhp->zfs_name, snapname); - if (ret == ESRCH) { - (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); - } else { - (void) zfs_standard_error(hdl, ret, errbuf); - } - return (ret); - } - - ret = lzc_release(ha.nvl, &errors); - fnvlist_free(ha.nvl); - - if (ret == 0) { - /* There may be errors even in the success case. */ - fnvlist_free(errors); - return (0); - } - - if (nvlist_empty(errors)) { - /* no hold-specific errors */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot release")); - switch (errno) { - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - default: - (void) zfs_standard_error_fmt(hdl, errno, errbuf); - } - } - - for (elem = nvlist_next_nvpair(errors, NULL); - elem != NULL; - elem = nvlist_next_nvpair(errors, elem)) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot release hold from snapshot '%s'"), - nvpair_name(elem)); - switch (fnvpair_value_int32(elem)) { - case ESRCH: - (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); - break; - case EINVAL: - (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - default: - (void) zfs_standard_error_fmt(hdl, - fnvpair_value_int32(elem), errbuf); - } - } - - fnvlist_free(errors); - return (ret); -} - -int -zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) -{ - zfs_cmd_t 
zc = { 0 }; - libzfs_handle_t *hdl = zhp->zfs_hdl; - int nvsz = 2048; - void *nvbuf; - int err = 0; - char errbuf[1024]; - - assert(zhp->zfs_type == ZFS_TYPE_VOLUME || - zhp->zfs_type == ZFS_TYPE_FILESYSTEM); - -tryagain: - - nvbuf = malloc(nvsz); - if (nvbuf == NULL) { - err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); - goto out; - } - - zc.zc_nvlist_dst_size = nvsz; - zc.zc_nvlist_dst = (uintptr_t)nvbuf; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), - zc.zc_name); - switch (errno) { - case ENOMEM: - free(nvbuf); - nvsz = zc.zc_nvlist_dst_size; - goto tryagain; - - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - err = zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case EINVAL: - err = zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - case ENOENT: - err = zfs_error(hdl, EZFS_NOENT, errbuf); - break; - default: - err = zfs_standard_error_fmt(hdl, errno, errbuf); - break; - } - } else { - /* success */ - int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); - if (rc) { - (void) snprintf(errbuf, sizeof (errbuf), dgettext( - TEXT_DOMAIN, "cannot get permissions on '%s'"), - zc.zc_name); - err = zfs_standard_error_fmt(hdl, rc, errbuf); - } - } - - free(nvbuf); -out: - return (err); -} - -int -zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) -{ - zfs_cmd_t zc = { 0 }; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char *nvbuf; - char errbuf[1024]; - size_t nvsz; - int err; - - assert(zhp->zfs_type == ZFS_TYPE_VOLUME || - zhp->zfs_type == ZFS_TYPE_FILESYSTEM); - - err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); - assert(err == 0); - - nvbuf = malloc(nvsz); - - err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); - assert(err == 0); - - zc.zc_nvlist_src_size = nvsz; - zc.zc_nvlist_src = (uintptr_t)nvbuf; - 
zc.zc_perm_action = un; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), - zc.zc_name); - switch (errno) { - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - err = zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case EINVAL: - err = zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - case ENOENT: - err = zfs_error(hdl, EZFS_NOENT, errbuf); - break; - default: - err = zfs_standard_error_fmt(hdl, errno, errbuf); - break; - } - } - - free(nvbuf); - - return (err); -} - -int -zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) -{ - int err; - char errbuf[1024]; - - err = lzc_get_holds(zhp->zfs_name, nvl); - - if (err != 0) { - libzfs_handle_t *hdl = zhp->zfs_hdl; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), - zhp->zfs_name); - switch (err) { - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - err = zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case EINVAL: - err = zfs_error(hdl, EZFS_BADTYPE, errbuf); - break; - case ENOENT: - err = zfs_error(hdl, EZFS_NOENT, errbuf); - break; - default: - err = zfs_standard_error_fmt(hdl, errno, errbuf); - break; - } - } - - return (err); -} - -/* - * Convert the zvol's volume size to an appropriate reservation. - * Note: If this routine is updated, it is necessary to update the ZFS test - * suite's shell version in reservation.kshlib. 
- */ -uint64_t -zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) -{ - uint64_t numdb; - uint64_t nblocks, volblocksize; - int ncopies; - char *strval; - - if (nvlist_lookup_string(props, - zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) - ncopies = atoi(strval); - else - ncopies = 1; - if (nvlist_lookup_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize) != 0) - volblocksize = ZVOL_DEFAULT_BLOCKSIZE; - nblocks = volsize/volblocksize; - /* start with metadnode L0-L6 */ - numdb = 7; - /* calculate number of indirects */ - while (nblocks > 1) { - nblocks += DNODES_PER_LEVEL - 1; - nblocks /= DNODES_PER_LEVEL; - numdb += nblocks; - } - numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); - volsize *= ncopies; - /* - * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't - * compressed, but in practice they compress down to about - * 1100 bytes - */ - numdb *= 1ULL << DN_MAX_INDBLKSHIFT; - volsize += numdb; - return (volsize); -} - -/* - * Attach/detach the given filesystem to/from the given jail. - */ -int -zfs_jail(zfs_handle_t *zhp, int jailid, int attach) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - zfs_cmd_t zc = { 0 }; - char errbuf[1024]; - unsigned long cmd; - int ret; - - if (attach) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name); - } else { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name); - } - - switch (zhp->zfs_type) { - case ZFS_TYPE_VOLUME: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "volumes can not be jailed")); - return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); - case ZFS_TYPE_SNAPSHOT: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "snapshots can not be jailed")); - return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); - } - assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM); - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - zc.zc_objset_type = DMU_OST_ZFS; - zc.zc_jailid = jailid; - - cmd = attach ? 
ZFS_IOC_JAIL : ZFS_IOC_UNJAIL; - if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0) - zfs_standard_error(hdl, errno, errbuf); - - return (ret); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c deleted file mode 100644 index db132190154c..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_diff.c +++ /dev/null @@ -1,834 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2015, 2018 by Delphix. All rights reserved. - * Copyright 2016 Joyent, Inc. 
- * Copyright 2016 Igor Kozhukhov - */ - -/* - * zfs diff support - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "libzfs_impl.h" - -#define ZDIFF_SNAPDIR "/.zfs/snapshot/" -#define ZDIFF_SHARESDIR "/.zfs/shares/" -#define ZDIFF_PREFIX "zfs-diff-%d" - -#define ZDIFF_ADDED '+' -#define ZDIFF_MODIFIED 'M' -#define ZDIFF_REMOVED '-' -#define ZDIFF_RENAMED 'R' - -typedef struct differ_info { - zfs_handle_t *zhp; - char *fromsnap; - char *frommnt; - char *tosnap; - char *tomnt; - char *ds; - char *dsmnt; - char *tmpsnap; - char errbuf[1024]; - boolean_t isclone; - boolean_t scripted; - boolean_t classify; - boolean_t timestamped; - uint64_t shares; - int zerr; - int cleanupfd; - int outputfd; - int datafd; -} differ_info_t; - -/* - * Given a {dsname, object id}, get the object path - */ -static int -get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, - char *pn, int maxlen, zfs_stat_t *sb) -{ - zfs_cmd_t zc = { 0 }; - int error; - - (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); - zc.zc_obj = obj; - - errno = 0; - error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc); - di->zerr = errno; - - /* we can get stats even if we failed to get a path */ - (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t)); - if (error == 0) { - ASSERT(di->zerr == 0); - (void) strlcpy(pn, zc.zc_value, maxlen); - return (0); - } - - if (di->zerr == ESTALE) { - (void) snprintf(pn, maxlen, "(on_delete_queue)"); - return (0); - } else if (di->zerr == EPERM) { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, - "The sys_config privilege or diff delegated permission " - "is needed\nto discover path names")); - return (-1); - } else { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, - "Unable to determine path or stats for " - "object %jd in %s"), (uintmax_t)obj, dsname); - return (-1); - } -} - -/* - 
* stream_bytes - * - * Prints a file name out a character at a time. If the character is - * not in the range of what we consider "printable" ASCII, display it - * as an escaped 3-digit octal value. ASCII values less than a space - * are all control characters and we declare the upper end as the - * DELete character. This also is the last 7-bit ASCII character. - * We choose to treat all 8-bit ASCII as not printable for this - * application. - */ -static void -stream_bytes(FILE *fp, const char *string) -{ - char c; - - while ((c = *string++) != '\0') { - if (c > ' ' && c != '\\' && c < '\177') { - (void) fprintf(fp, "%c", c); - } else { - (void) fprintf(fp, "\\%03o", (uint8_t)c); - } - } -} - -static void -print_what(FILE *fp, mode_t what) -{ - char symbol; - - switch (what & S_IFMT) { - case S_IFBLK: - symbol = 'B'; - break; - case S_IFCHR: - symbol = 'C'; - break; - case S_IFDIR: - symbol = '/'; - break; -#ifdef S_IFDOOR - case S_IFDOOR: - symbol = '>'; - break; -#endif - case S_IFIFO: - symbol = '|'; - break; - case S_IFLNK: - symbol = '@'; - break; -#ifdef S_IFPORT - case S_IFPORT: - symbol = 'P'; - break; -#endif - case S_IFSOCK: - symbol = '='; - break; - case S_IFREG: - symbol = 'F'; - break; - default: - symbol = '?'; - break; - } - (void) fprintf(fp, "%c", symbol); -} - -static void -print_cmn(FILE *fp, differ_info_t *di, const char *file) -{ - stream_bytes(fp, di->dsmnt); - stream_bytes(fp, file); -} - -static void -print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new, - zfs_stat_t *isb) -{ - if (di->timestamped) - (void) fprintf(fp, "%10lld.%09lld\t", - (longlong_t)isb->zs_ctime[0], - (longlong_t)isb->zs_ctime[1]); - (void) fprintf(fp, "%c\t", ZDIFF_RENAMED); - if (di->classify) { - print_what(fp, isb->zs_mode); - (void) fprintf(fp, "\t"); - } - print_cmn(fp, di, old); - if (di->scripted) - (void) fprintf(fp, "\t"); - else - (void) fprintf(fp, " -> "); - print_cmn(fp, di, new); - (void) fprintf(fp, "\n"); -} - -static void 
-print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file, - zfs_stat_t *isb) -{ - if (di->timestamped) - (void) fprintf(fp, "%10lld.%09lld\t", - (longlong_t)isb->zs_ctime[0], - (longlong_t)isb->zs_ctime[1]); - (void) fprintf(fp, "%c\t", ZDIFF_MODIFIED); - if (di->classify) { - print_what(fp, isb->zs_mode); - (void) fprintf(fp, "\t"); - } - print_cmn(fp, di, file); - (void) fprintf(fp, "\t(%+d)", delta); - (void) fprintf(fp, "\n"); -} - -static void -print_file(FILE *fp, differ_info_t *di, char type, const char *file, - zfs_stat_t *isb) -{ - if (di->timestamped) - (void) fprintf(fp, "%10lld.%09lld\t", - (longlong_t)isb->zs_ctime[0], - (longlong_t)isb->zs_ctime[1]); - (void) fprintf(fp, "%c\t", type); - if (di->classify) { - print_what(fp, isb->zs_mode); - (void) fprintf(fp, "\t"); - } - print_cmn(fp, di, file); - (void) fprintf(fp, "\n"); -} - -static int -write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) -{ - struct zfs_stat fsb, tsb; - mode_t fmode, tmode; - char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN]; - int fobjerr, tobjerr; - int change; - - if (dobj == di->shares) - return (0); - - /* - * Check the from and to snapshots for info on the object. If - * we get ENOENT, then the object just didn't exist in that - * snapshot. If we get ENOTSUP, then we tried to get - * info on a non-ZPL object, which we don't care about anyway. 
- */ - fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname, - MAXPATHLEN, &fsb); - if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) - return (-1); - - tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname, - MAXPATHLEN, &tsb); - if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) - return (-1); - - /* - * Unallocated object sharing the same meta dnode block - */ - if (fobjerr && tobjerr) { - ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP); - di->zerr = 0; - return (0); - } - - di->zerr = 0; /* negate get_stats_for_obj() from side that failed */ - fmode = fsb.zs_mode & S_IFMT; - tmode = tsb.zs_mode & S_IFMT; - if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 || - tsb.zs_links == 0) - change = 0; - else - change = tsb.zs_links - fsb.zs_links; - - if (fobjerr) { - if (change) { - print_link_change(fp, di, change, tobjname, &tsb); - return (0); - } - print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); - return (0); - } else if (tobjerr) { - if (change) { - print_link_change(fp, di, change, fobjname, &fsb); - return (0); - } - print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); - return (0); - } - - if (fmode != tmode && fsb.zs_gen == tsb.zs_gen) - tsb.zs_gen++; /* Force a generational difference */ - - /* Simple modification or no change */ - if (fsb.zs_gen == tsb.zs_gen) { - /* No apparent changes. Could we assert !this? */ - if (fsb.zs_ctime[0] == tsb.zs_ctime[0] && - fsb.zs_ctime[1] == tsb.zs_ctime[1]) - return (0); - if (change) { - print_link_change(fp, di, change, - change > 0 ? 
fobjname : tobjname, &tsb); - } else if (strcmp(fobjname, tobjname) == 0) { - print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb); - } else { - print_rename(fp, di, fobjname, tobjname, &tsb); - } - return (0); - } else { - /* file re-created or object re-used */ - print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); - print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); - return (0); - } -} - -static int -write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) -{ - uint64_t o; - int err; - - for (o = dr->ddr_first; o <= dr->ddr_last; o++) { - if ((err = write_inuse_diffs_one(fp, di, o)) != 0) - return (err); - } - return (0); -} - -static int -describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf, - int maxlen) -{ - struct zfs_stat sb; - - if (get_stats_for_obj(di, di->fromsnap, object, namebuf, - maxlen, &sb) != 0) { - return (-1); - } - /* Don't print if in the delete queue on from side */ - if (di->zerr == ESTALE) { - di->zerr = 0; - return (0); - } - - print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb); - return (0); -} - -static int -write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) -{ - zfs_cmd_t zc = { 0 }; - libzfs_handle_t *lhdl = di->zhp->zfs_hdl; - char fobjname[MAXPATHLEN]; - - (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name)); - zc.zc_obj = dr->ddr_first - 1; - - ASSERT(di->zerr == 0); - - while (zc.zc_obj < dr->ddr_last) { - int err; - - err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc); - if (err == 0) { - if (zc.zc_obj == di->shares) { - zc.zc_obj++; - continue; - } - if (zc.zc_obj > dr->ddr_last) { - break; - } - err = describe_free(fp, di, zc.zc_obj, fobjname, - MAXPATHLEN); - if (err) - break; - } else if (errno == ESRCH) { - break; - } else { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, - "next allocated object (> %jd) find failure"), - (uintmax_t)zc.zc_obj); - di->zerr = errno; - break; - } - } - if (di->zerr) - return (-1); - return (0); -} - -static void * 
-differ(void *arg) -{ - differ_info_t *di = arg; - dmu_diff_record_t dr; - FILE *ofp; - int err = 0; - - if ((ofp = fdopen(di->outputfd, "w")) == NULL) { - di->zerr = errno; - (void) strerror_r(errno, di->errbuf, sizeof (di->errbuf)); - (void) close(di->datafd); - return ((void *)-1); - } - - for (;;) { - char *cp = (char *)&dr; - int len = sizeof (dr); - int rv; - - do { - rv = read(di->datafd, cp, len); - cp += rv; - len -= rv; - } while (len > 0 && rv > 0); - - if (rv < 0 || (rv == 0 && len != sizeof (dr))) { - di->zerr = EPIPE; - break; - } else if (rv == 0) { - /* end of file at a natural breaking point */ - break; - } - - switch (dr.ddr_type) { - case DDR_FREE: - err = write_free_diffs(ofp, di, &dr); - break; - case DDR_INUSE: - err = write_inuse_diffs(ofp, di, &dr); - break; - default: - di->zerr = EPIPE; - break; - } - - if (err || di->zerr) - break; - } - - (void) fclose(ofp); - (void) close(di->datafd); - if (err) - return ((void *)-1); - if (di->zerr) { - ASSERT(di->zerr == EPIPE); - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, - "Internal error: bad data from diff IOCTL")); - return ((void *)-1); - } - return ((void *)0); -} - -static int -find_shares_object(differ_info_t *di) -{ - char fullpath[MAXPATHLEN]; - struct stat64 sb = { 0 }; - - (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); - (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); - - if (stat64(fullpath, &sb) != 0) { -#ifdef illumos - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); - return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); -#else - return (0); -#endif - } - - di->shares = (uint64_t)sb.st_ino; - return (0); -} - -static int -make_temp_snapshot(differ_info_t *di) -{ - libzfs_handle_t *hdl = di->zhp->zfs_hdl; - zfs_cmd_t zc = { 0 }; - - (void) snprintf(zc.zc_value, sizeof (zc.zc_value), - ZDIFF_PREFIX, getpid()); - (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name)); - zc.zc_cleanup_fd 
= di->cleanupfd; - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { - int err = errno; - if (err == EPERM) { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, "The diff delegated " - "permission is needed in order\nto create a " - "just-in-time snapshot for diffing\n")); - return (zfs_error(hdl, EZFS_DIFF, di->errbuf)); - } else { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, "Cannot create just-in-time " - "snapshot of '%s'"), zc.zc_name); - return (zfs_standard_error(hdl, err, di->errbuf)); - } - } - - di->tmpsnap = zfs_strdup(hdl, zc.zc_value); - di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap); - return (0); -} - -static void -teardown_differ_info(differ_info_t *di) -{ - free(di->ds); - free(di->dsmnt); - free(di->fromsnap); - free(di->frommnt); - free(di->tosnap); - free(di->tmpsnap); - free(di->tomnt); - (void) close(di->cleanupfd); -} - -static int -get_snapshot_names(differ_info_t *di, const char *fromsnap, - const char *tosnap) -{ - libzfs_handle_t *hdl = di->zhp->zfs_hdl; - char *atptrf = NULL; - char *atptrt = NULL; - int fdslen, fsnlen; - int tdslen, tsnlen; - - /* - * Can accept - * dataset@snap1 - * dataset@snap1 dataset@snap2 - * dataset@snap1 @snap2 - * dataset@snap1 dataset - * @snap1 dataset@snap2 - */ - if (tosnap == NULL) { - /* only a from snapshot given, must be valid */ - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, - "Badly formed snapshot name %s"), fromsnap); - - if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT, - B_FALSE)) { - return (zfs_error(hdl, EZFS_INVALIDNAME, - di->errbuf)); - } - - atptrf = strchr(fromsnap, '@'); - ASSERT(atptrf != NULL); - fdslen = atptrf - fromsnap; - - di->fromsnap = zfs_strdup(hdl, fromsnap); - di->ds = zfs_strdup(hdl, fromsnap); - di->ds[fdslen] = '\0'; - - /* the to snap will be a just-in-time snap of the head */ - return (make_temp_snapshot(di)); - } - - (void) snprintf(di->errbuf, sizeof 
(di->errbuf), - dgettext(TEXT_DOMAIN, - "Unable to determine which snapshots to compare")); - - atptrf = strchr(fromsnap, '@'); - atptrt = strchr(tosnap, '@'); - fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap); - tdslen = atptrt ? atptrt - tosnap : strlen(tosnap); - fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */ - tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */ - - if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) || - (fsnlen == 0 && tsnlen == 0)) { - return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); - } else if ((fdslen > 0 && tdslen > 0) && - ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) { - /* - * not the same dataset name, might be okay if - * tosnap is a clone of a fromsnap descendant. - */ - char origin[ZFS_MAX_DATASET_NAME_LEN]; - zprop_source_t src; - zfs_handle_t *zhp; - - di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1); - (void) strncpy(di->ds, tosnap, tdslen); - di->ds[tdslen] = '\0'; - - zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM); - while (zhp != NULL) { - if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, - sizeof (origin), &src, NULL, 0, B_FALSE) != 0) { - (void) zfs_close(zhp); - zhp = NULL; - break; - } - if (strncmp(origin, fromsnap, fsnlen) == 0) - break; - - (void) zfs_close(zhp); - zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM); - } - - if (zhp == NULL) { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, - "Not an earlier snapshot from the same fs")); - return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); - } else { - (void) zfs_close(zhp); - } - - di->isclone = B_TRUE; - di->fromsnap = zfs_strdup(hdl, fromsnap); - if (tsnlen) { - di->tosnap = zfs_strdup(hdl, tosnap); - } else { - return (make_temp_snapshot(di)); - } - } else { - int dslen = fdslen ? fdslen : tdslen; - - di->ds = zfs_alloc(hdl, dslen + 1); - (void) strncpy(di->ds, fdslen ? 
fromsnap : tosnap, dslen); - di->ds[dslen] = '\0'; - - di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf); - if (tsnlen) { - di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt); - } else { - return (make_temp_snapshot(di)); - } - } - return (0); -} - -static int -get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt) -{ - boolean_t mounted; - - mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt); - if (mounted == B_FALSE) { - (void) snprintf(di->errbuf, sizeof (di->errbuf), - dgettext(TEXT_DOMAIN, - "Cannot diff an unmounted snapshot")); - return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf)); - } - - /* Avoid a double slash at the beginning of root-mounted datasets */ - if (**mntpt == '/' && *(*mntpt + 1) == '\0') - **mntpt = '\0'; - return (0); -} - -static int -get_mountpoints(differ_info_t *di) -{ - char *strptr; - char *frommntpt; - - /* - * first get the mountpoint for the parent dataset - */ - if (get_mountpoint(di, di->ds, &di->dsmnt) != 0) - return (-1); - - strptr = strchr(di->tosnap, '@'); - ASSERT3P(strptr, !=, NULL); - di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt, - ZDIFF_SNAPDIR, ++strptr); - - strptr = strchr(di->fromsnap, '@'); - ASSERT3P(strptr, !=, NULL); - - frommntpt = di->dsmnt; - if (di->isclone) { - char *mntpt; - int err; - - *strptr = '\0'; - err = get_mountpoint(di, di->fromsnap, &mntpt); - *strptr = '@'; - if (err != 0) - return (-1); - frommntpt = mntpt; - } - - di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt, - ZDIFF_SNAPDIR, ++strptr); - - if (di->isclone) - free(frommntpt); - - return (0); -} - -static int -setup_differ_info(zfs_handle_t *zhp, const char *fromsnap, - const char *tosnap, differ_info_t *di) -{ - di->zhp = zhp; - - di->cleanupfd = open(ZFS_DEV, O_RDWR|O_EXCL); - VERIFY(di->cleanupfd >= 0); - - if (get_snapshot_names(di, fromsnap, tosnap) != 0) - return (-1); - - if (get_mountpoints(di) != 0) - return (-1); - - if (find_shares_object(di) != 0) - return (-1); - - 
return (0); -} - -int -zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, - const char *tosnap, int flags) -{ - zfs_cmd_t zc = { 0 }; - char errbuf[1024]; - differ_info_t di = { 0 }; - pthread_t tid; - int pipefd[2]; - int iocerr; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "zfs diff failed")); - - if (setup_differ_info(zhp, fromsnap, tosnap, &di)) { - teardown_differ_info(&di); - return (-1); - } - - if (pipe(pipefd)) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); - teardown_differ_info(&di); - return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); - } - - di.scripted = (flags & ZFS_DIFF_PARSEABLE); - di.classify = (flags & ZFS_DIFF_CLASSIFY); - di.timestamped = (flags & ZFS_DIFF_TIMESTAMP); - - di.outputfd = outfd; - di.datafd = pipefd[0]; - - if (pthread_create(&tid, NULL, differ, &di)) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); - (void) close(pipefd[0]); - (void) close(pipefd[1]); - teardown_differ_info(&di); - return (zfs_error(zhp->zfs_hdl, - EZFS_THREADCREATEFAILED, errbuf)); - } - - /* do the ioctl() */ - (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1); - (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1); - zc.zc_cookie = pipefd[1]; - - iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc); - if (iocerr != 0) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "Unable to obtain diffs")); - if (errno == EPERM) { - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "\n The sys_mount privilege or diff delegated " - "permission is needed\n to execute the " - "diff ioctl")); - } else if (errno == EXDEV) { - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "\n Not an earlier snapshot from the same fs")); - } else if (errno != EPIPE || di.zerr == 0) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); - } - (void) close(pipefd[1]); - (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); - teardown_differ_info(&di); - if (di.zerr != 0 && di.zerr != 
EPIPE) { - zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); - return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); - } else { - return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf)); - } - } - - (void) close(pipefd[1]); - (void) pthread_join(tid, NULL); - - if (di.zerr != 0) { - zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); - return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); - } - teardown_differ_info(&di); - return (0); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c deleted file mode 100644 index 474470c416ea..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_fru.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "libzfs_impl.h" - -/* - * This file is responsible for determining the relationship between I/O - * devices paths and physical locations. 
In the world of MPxIO and external - * enclosures, the device path is not synonymous with the physical location. - * If you remove a drive and insert it into a different slot, it will end up - * with the same path under MPxIO. If you recable storage enclosures, the - * device paths may change. All of this makes it difficult to implement the - * 'autoreplace' property, which is supposed to automatically manage disk - * replacement based on physical slot. - * - * In order to work around these limitations, we have a per-vdev FRU property - * that is the libtopo path (minus disk-specific authority information) to the - * physical location of the device on the system. This is an optional - * property, and is only needed when using the 'autoreplace' property or when - * generating FMA faults against vdevs. - */ - -/* - * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case - * it is not present. We only need this once per library instance, so it is - * not part of the libzfs handle. 
- */ -static void *_topo_dlhandle; -static topo_hdl_t *(*_topo_open)(int, const char *, int *); -static void (*_topo_close)(topo_hdl_t *); -static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *); -static void (*_topo_snap_release)(topo_hdl_t *); -static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *, - topo_walk_cb_t, void *, int *); -static int (*_topo_walk_step)(topo_walk_t *, int); -static void (*_topo_walk_fini)(topo_walk_t *); -static void (*_topo_hdl_strfree)(topo_hdl_t *, char *); -static char *(*_topo_node_name)(tnode_t *); -static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *, - char **, int *); -static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *); -static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *); -static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *, - const char *); - -#define ZFS_FRU_HASH_SIZE 257 - -static size_t -fru_strhash(const char *key) -{ - ulong_t g, h = 0; - const char *p; - - for (p = key; *p != '\0'; p++) { - h = (h << 4) + *p; - - if ((g = (h & 0xf0000000)) != 0) { - h ^= (g >> 24); - h ^= g; - } - } - - return (h % ZFS_FRU_HASH_SIZE); -} - -static int -libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg) -{ - libzfs_handle_t *hdl = arg; - nvlist_t *fru; - char *devpath, *frustr; - int err; - libzfs_fru_t *frup; - size_t idx; - - /* - * If this is the chassis node, and we don't yet have the system - * chassis ID, then fill in this value now. - */ - if (hdl->libzfs_chassis_id[0] == '\0' && - strcmp(_topo_node_name(tn), "chassis") == 0) { - if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY, - FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0) - (void) strlcpy(hdl->libzfs_chassis_id, devpath, - sizeof (hdl->libzfs_chassis_id)); - } - - /* - * Skip non-disk nodes. - */ - if (strcmp(_topo_node_name(tn), "disk") != 0) - return (TOPO_WALK_NEXT); - - /* - * Get the devfs path and FRU. 
- */ - if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0) - return (TOPO_WALK_NEXT); - - if (libzfs_fru_lookup(hdl, devpath) != NULL) { - _topo_hdl_strfree(thp, devpath); - return (TOPO_WALK_NEXT); - } - - if (_topo_node_fru(tn, &fru, NULL, &err) != 0) { - _topo_hdl_strfree(thp, devpath); - return (TOPO_WALK_NEXT); - } - - /* - * Convert the FRU into a string. - */ - if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) { - nvlist_free(fru); - _topo_hdl_strfree(thp, devpath); - return (TOPO_WALK_NEXT); - } - - nvlist_free(fru); - - /* - * Finally, we have a FRU string and device path. Add it to the hash. - */ - if ((frup = calloc(sizeof (libzfs_fru_t), 1)) == NULL) { - _topo_hdl_strfree(thp, devpath); - _topo_hdl_strfree(thp, frustr); - return (TOPO_WALK_NEXT); - } - - if ((frup->zf_device = strdup(devpath)) == NULL || - (frup->zf_fru = strdup(frustr)) == NULL) { - free(frup->zf_device); - free(frup); - _topo_hdl_strfree(thp, devpath); - _topo_hdl_strfree(thp, frustr); - return (TOPO_WALK_NEXT); - } - - _topo_hdl_strfree(thp, devpath); - _topo_hdl_strfree(thp, frustr); - - idx = fru_strhash(frup->zf_device); - frup->zf_chain = hdl->libzfs_fru_hash[idx]; - hdl->libzfs_fru_hash[idx] = frup; - frup->zf_next = hdl->libzfs_fru_list; - hdl->libzfs_fru_list = frup; - - return (TOPO_WALK_NEXT); -} - -/* - * Called during initialization to setup the dynamic libtopo connection. 
- */ -#pragma init(libzfs_init_fru) -static void -libzfs_init_fru(void) -{ - char path[MAXPATHLEN]; - char isa[257]; - -#if defined(_LP64) - if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0) - isa[0] = '\0'; -#else - isa[0] = '\0'; -#endif - (void) snprintf(path, sizeof (path), - "/usr/lib/fm/%s/libtopo.so", isa); - - if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL) - return; - - _topo_open = (topo_hdl_t *(*)()) - dlsym(_topo_dlhandle, "topo_open"); - _topo_close = (void (*)()) - dlsym(_topo_dlhandle, "topo_close"); - _topo_snap_hold = (char *(*)()) - dlsym(_topo_dlhandle, "topo_snap_hold"); - _topo_snap_release = (void (*)()) - dlsym(_topo_dlhandle, "topo_snap_release"); - _topo_walk_init = (topo_walk_t *(*)()) - dlsym(_topo_dlhandle, "topo_walk_init"); - _topo_walk_step = (int (*)()) - dlsym(_topo_dlhandle, "topo_walk_step"); - _topo_walk_fini = (void (*)()) - dlsym(_topo_dlhandle, "topo_walk_fini"); - _topo_hdl_strfree = (void (*)()) - dlsym(_topo_dlhandle, "topo_hdl_strfree"); - _topo_node_name = (char *(*)()) - dlsym(_topo_dlhandle, "topo_node_name"); - _topo_prop_get_string = (int (*)()) - dlsym(_topo_dlhandle, "topo_prop_get_string"); - _topo_node_fru = (int (*)()) - dlsym(_topo_dlhandle, "topo_node_fru"); - _topo_fmri_nvl2str = (int (*)()) - dlsym(_topo_dlhandle, "topo_fmri_nvl2str"); - _topo_fmri_strcmp_noauth = (int (*)()) - dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth"); - - if (_topo_open == NULL || _topo_close == NULL || - _topo_snap_hold == NULL || _topo_snap_release == NULL || - _topo_walk_init == NULL || _topo_walk_step == NULL || - _topo_walk_fini == NULL || _topo_hdl_strfree == NULL || - _topo_node_name == NULL || _topo_prop_get_string == NULL || - _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL || - _topo_fmri_strcmp_noauth == NULL) { - (void) dlclose(_topo_dlhandle); - _topo_dlhandle = NULL; - } -} - -/* - * Refresh the mappings from device path -> FMRI. 
We do this by walking the - * hc topology looking for disk nodes, and recording the io/devfs-path and FRU. - * Note that we strip out the disk-specific authority information (serial, - * part, revision, etc) so that we are left with only the identifying - * characteristics of the slot (hc path and chassis-id). - */ -void -libzfs_fru_refresh(libzfs_handle_t *hdl) -{ - int err; - char *uuid; - topo_hdl_t *thp; - topo_walk_t *twp; - - if (_topo_dlhandle == NULL) - return; - - /* - * Clear the FRU hash and initialize our basic structures. - */ - libzfs_fru_clear(hdl, B_FALSE); - - if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION, - NULL, &err)) == NULL) - return; - - thp = hdl->libzfs_topo_hdl; - - if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL) - return; - - _topo_hdl_strfree(thp, uuid); - - if (hdl->libzfs_fru_hash == NULL && - (hdl->libzfs_fru_hash = - calloc(ZFS_FRU_HASH_SIZE, sizeof (void *))) == NULL) - return; - - /* - * We now have a topo snapshot, so iterate over the hc topology looking - * for disks to add to the hash. - */ - twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC, - libzfs_fru_gather, hdl, &err); - if (twp != NULL) { - (void) _topo_walk_step(twp, TOPO_WALK_CHILD); - _topo_walk_fini(twp); - } -} - -/* - * Given a devfs path, return the FRU for the device, if known. This will - * automatically call libzfs_fru_refresh() if it hasn't already been called by - * the consumer. The string returned is valid until the next call to - * libzfs_fru_refresh(). - */ -const char * -libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath) -{ - size_t idx = fru_strhash(devpath); - libzfs_fru_t *frup; - - if (hdl->libzfs_fru_hash == NULL) - libzfs_fru_refresh(hdl); - - if (hdl->libzfs_fru_hash == NULL) - return (NULL); - - for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL; - frup = frup->zf_chain) { - if (strcmp(devpath, frup->zf_device) == 0) - return (frup->zf_fru); - } - - return (NULL); -} - -/* - * Given a fru path, return the device path. 
This will automatically call - * libzfs_fru_refresh() if it hasn't already been called by the consumer. The - * string returned is valid until the next call to libzfs_fru_refresh(). - */ -const char * -libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru) -{ - libzfs_fru_t *frup; - size_t idx; - - if (hdl->libzfs_fru_hash == NULL) - libzfs_fru_refresh(hdl); - - if (hdl->libzfs_fru_hash == NULL) - return (NULL); - - for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) { - for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL; - frup = frup->zf_next) { - if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, - fru, frup->zf_fru)) - return (frup->zf_device); - } - } - - return (NULL); -} - -/* - * Change the stored FRU for the given vdev. - */ -int -zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru) -{ - zfs_cmd_t zc = { 0 }; - - (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - (void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value)); - zc.zc_guid = vdev_guid; - - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0) - return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, - dgettext(TEXT_DOMAIN, "cannot set FRU"))); - - return (0); -} - -/* - * Compare to two FRUs, ignoring any authority information. - */ -boolean_t -libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b) -{ - if (hdl->libzfs_fru_hash == NULL) - libzfs_fru_refresh(hdl); - - if (hdl->libzfs_fru_hash == NULL) - return (strcmp(a, b) == 0); - - return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b)); -} - -/* - * This special function checks to see whether the FRU indicates it's supposed - * to be in the system chassis, but the chassis-id doesn't match. This can - * happen in a clustered case, where both head nodes have the same logical - * disk, but opening the device on the other head node is meaningless. 
- */ -boolean_t -libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru) -{ - const char *chassisid; - size_t len; - - if (hdl->libzfs_fru_hash == NULL) - libzfs_fru_refresh(hdl); - - if (hdl->libzfs_chassis_id[0] == '\0') - return (B_FALSE); - - if (strstr(fru, "/chassis=0/") == NULL) - return (B_FALSE); - - if ((chassisid = strstr(fru, ":chassis-id=")) == NULL) - return (B_FALSE); - - chassisid += 12; - len = strlen(hdl->libzfs_chassis_id); - if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 && - (chassisid[len] == '/' || chassisid[len] == ':')) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Clear memory associated with the FRU hash. - */ -void -libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final) -{ - libzfs_fru_t *frup; - - while ((frup = hdl->libzfs_fru_list) != NULL) { - hdl->libzfs_fru_list = frup->zf_next; - free(frup->zf_device); - free(frup->zf_fru); - free(frup); - } - - hdl->libzfs_fru_list = NULL; - - if (hdl->libzfs_topo_hdl != NULL) { - _topo_snap_release(hdl->libzfs_topo_hdl); - _topo_close(hdl->libzfs_topo_hdl); - hdl->libzfs_topo_hdl = NULL; - } - - if (final) { - free(hdl->libzfs_fru_hash); - } else if (hdl->libzfs_fru_hash != NULL) { - bzero(hdl->libzfs_fru_hash, - ZFS_FRU_HASH_SIZE * sizeof (void *)); - } -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h deleted file mode 100644 index a0338afadb8f..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h +++ /dev/null @@ -1,228 +0,0 @@ -/* - * CDDL HEADER SART - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 Martin Matuska . All rights reserved. - */ - -#ifndef _LIBZFS_IMPL_H -#define _LIBZFS_IMPL_H - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef VERIFY -#undef VERIFY -#endif -#define VERIFY verify - -typedef struct libzfs_fru { - char *zf_device; - char *zf_fru; - struct libzfs_fru *zf_chain; - struct libzfs_fru *zf_next; -} libzfs_fru_t; - -struct libzfs_handle { - int libzfs_error; - int libzfs_fd; - FILE *libzfs_mnttab; - FILE *libzfs_sharetab; - zpool_handle_t *libzfs_pool_handles; - uu_avl_pool_t *libzfs_ns_avlpool; - uu_avl_t *libzfs_ns_avl; - uint64_t libzfs_ns_gen; - int libzfs_desc_active; - char libzfs_action[1024]; - char libzfs_desc[1024]; - int libzfs_printerr; - int libzfs_storeerr; /* stuff error messages into buffer */ - void *libzfs_sharehdl; /* libshare handle */ - boolean_t libzfs_mnttab_enable; - /* - * We need a lock to handle the case where parallel mount - * threads are populating the mnttab cache simultaneously. The - * lock only protects the integrity of the avl tree, and does - * not protect the contents of the mnttab entries themselves. 
- */ - pthread_mutex_t libzfs_mnttab_cache_lock; - avl_tree_t libzfs_mnttab_cache; - int libzfs_pool_iter; - libzfs_fru_t **libzfs_fru_hash; - libzfs_fru_t *libzfs_fru_list; - char libzfs_chassis_id[256]; - boolean_t libzfs_prop_debug; -}; - -struct zfs_handle { - libzfs_handle_t *zfs_hdl; - zpool_handle_t *zpool_hdl; - char zfs_name[ZFS_MAX_DATASET_NAME_LEN]; - zfs_type_t zfs_type; /* type including snapshot */ - zfs_type_t zfs_head_type; /* type excluding snapshot */ - dmu_objset_stats_t zfs_dmustats; - nvlist_t *zfs_props; - nvlist_t *zfs_user_props; - nvlist_t *zfs_recvd_props; - boolean_t zfs_mntcheck; - char *zfs_mntopts; - uint8_t *zfs_props_table; -}; - -/* - * This is different from checking zfs_type, because it will also catch - * snapshots of volumes. - */ -#define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) - -struct zpool_handle { - libzfs_handle_t *zpool_hdl; - zpool_handle_t *zpool_next; - char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; - int zpool_state; - size_t zpool_config_size; - nvlist_t *zpool_config; - nvlist_t *zpool_old_config; - nvlist_t *zpool_props; - diskaddr_t zpool_start_block; -}; - -typedef enum { - PROTO_NFS = 0, - PROTO_SMB = 1, - PROTO_END = 2 -} zfs_share_proto_t; - -/* - * The following can be used as a bitmask and any new values - * added must preserve that capability. 
- */ -typedef enum { - SHARED_NOT_SHARED = 0x0, - SHARED_NFS = 0x2, - SHARED_SMB = 0x4 -} zfs_share_type_t; - -#define CONFIG_BUF_MINSIZE 262144 - -int zfs_error(libzfs_handle_t *, int, const char *); -int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...); -void zfs_error_aux(libzfs_handle_t *, const char *, ...); -void *zfs_alloc(libzfs_handle_t *, size_t); -void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); -char *zfs_asprintf(libzfs_handle_t *, const char *, ...); -char *zfs_strdup(libzfs_handle_t *, const char *); -int no_memory(libzfs_handle_t *); - -int zfs_standard_error(libzfs_handle_t *, int, const char *); -int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); -int zpool_standard_error(libzfs_handle_t *, int, const char *); -int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); - -int get_dependents(libzfs_handle_t *, boolean_t, const char *, char ***, - size_t *); -zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); -zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); - -int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, - nvlist_t *, char **, uint64_t *, const char *); -int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, - zfs_type_t type); - -/* - * Use this changelist_gather() flag to force attempting mounts - * on each change node regardless of whether or not it is currently - * mounted. - */ -#define CL_GATHER_MOUNT_ALWAYS 0x01 -/* - * Use this changelist_gather() flag to prevent unmounting of file systems. 
- */ -#define CL_GATHER_DONT_UNMOUNT 0x02 - -typedef struct prop_changelist prop_changelist_t; - -int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); -int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); -int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); -int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); -int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); -void zcmd_free_nvlists(zfs_cmd_t *); - -int changelist_prefix(prop_changelist_t *); -int changelist_postfix(prop_changelist_t *); -void changelist_rename(prop_changelist_t *, const char *, const char *); -void changelist_remove(prop_changelist_t *, const char *); -void changelist_free(prop_changelist_t *); -prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int); -int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); -int changelist_haszonedchild(prop_changelist_t *); - -void remove_mountpoint(zfs_handle_t *); -int create_parents(libzfs_handle_t *, char *, int); -boolean_t isa_child_of(const char *dataset, const char *parent); - -zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); -zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *, - nvlist_t *props); - -int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); - -boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); - -int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, - boolean_t modifying); - -void namespace_clear(libzfs_handle_t *); - -/* - * libshare (sharemgr) interfaces used internally. 
- */ - -extern int zfs_init_libshare(libzfs_handle_t *, int); -extern int zfs_parse_options(char *, zfs_share_proto_t); - -extern int zfs_unshare_proto(zfs_handle_t *, - const char *, zfs_share_proto_t *); - -extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _LIBZFS_IMPL_H */ diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c deleted file mode 100644 index 87c8dd14898b..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c +++ /dev/null @@ -1,1929 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright 2015 RackTop Systems. - * Copyright 2016 Nexenta Systems, Inc. - */ - -/* - * Pool import support functions. - * - * To import a pool, we rely on reading the configuration information from the - * ZFS label of each device. 
If we successfully read the label, then we - * organize the configuration information in the following hierarchy: - * - * pool guid -> toplevel vdev guid -> label txg - * - * Duplicate entries matching this same tuple will be discarded. Once we have - * examined every device, we pick the best label txg config for each toplevel - * vdev. We then arrange these toplevel vdevs into a complete pool config, and - * update any paths that have changed. Finally, we attempt to import the pool - * using our derived config, and record the results. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "libzfs.h" -#include "libzfs_impl.h" - -/* - * Intermediate structures used to gather configuration information. - */ -typedef struct config_entry { - uint64_t ce_txg; - nvlist_t *ce_config; - struct config_entry *ce_next; -} config_entry_t; - -typedef struct vdev_entry { - uint64_t ve_guid; - config_entry_t *ve_configs; - struct vdev_entry *ve_next; -} vdev_entry_t; - -typedef struct pool_entry { - uint64_t pe_guid; - vdev_entry_t *pe_vdevs; - struct pool_entry *pe_next; -} pool_entry_t; - -typedef struct name_entry { - char *ne_name; - uint64_t ne_guid; - struct name_entry *ne_next; -} name_entry_t; - -typedef struct pool_list { - pool_entry_t *pools; - name_entry_t *names; -} pool_list_t; - -static char * -get_devid(const char *path) -{ -#ifdef have_devid - int fd; - ddi_devid_t devid; - char *minor, *ret; - - if ((fd = open(path, O_RDONLY)) < 0) - return (NULL); - - minor = NULL; - ret = NULL; - if (devid_get(fd, &devid) == 0) { - if (devid_get_minor_name(fd, &minor) == 0) - ret = devid_str_encode(devid, minor); - if (minor != NULL) - devid_str_free(minor); - devid_free(devid); - } - (void) close(fd); - - return (ret); -#else - return (NULL); -#endif -} - - -/* - * Go through and fix up any path and/or devid information for the given vdev - * configuration. 
- */ -static int -fix_paths(nvlist_t *nv, name_entry_t *names) -{ - nvlist_t **child; - uint_t c, children; - uint64_t guid; - name_entry_t *ne, *best; - char *path, *devid; - int matched; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if (fix_paths(child[c], names) != 0) - return (-1); - return (0); - } - - /* - * This is a leaf (file or disk) vdev. In either case, go through - * the name list and see if we find a matching guid. If so, replace - * the path and see if we can calculate a new devid. - * - * There may be multiple names associated with a particular guid, in - * which case we have overlapping slices or multiple paths to the same - * disk. If this is the case, then we want to pick the path that is - * the most similar to the original, where "most similar" is the number - * of matching characters starting from the end of the path. This will - * preserve slice numbers even if the disks have been reorganized, and - * will also catch preferred disk names if multiple paths exist. - */ - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) - path = NULL; - - matched = 0; - best = NULL; - for (ne = names; ne != NULL; ne = ne->ne_next) { - if (ne->ne_guid == guid) { - const char *src, *dst; - int count; - - if (path == NULL) { - best = ne; - break; - } - - src = ne->ne_name + strlen(ne->ne_name) - 1; - dst = path + strlen(path) - 1; - for (count = 0; src >= ne->ne_name && dst >= path; - src--, dst--, count++) - if (*src != *dst) - break; - - /* - * At this point, 'count' is the number of characters - * matched from the end. 
- */ - if (count > matched || best == NULL) { - best = ne; - matched = count; - } - } - } - - if (best == NULL) - return (0); - - if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) - return (-1); - - if ((devid = get_devid(best->ne_name)) == NULL) { - (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); - } else { - if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) { - devid_str_free(devid); - return (-1); - } - devid_str_free(devid); - } - - return (0); -} - -/* - * Add the given configuration to the list of known devices. - */ -static int -add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path, - nvlist_t *config) -{ - uint64_t pool_guid, vdev_guid, top_guid, txg, state; - pool_entry_t *pe; - vdev_entry_t *ve; - config_entry_t *ce; - name_entry_t *ne; - - /* - * If this is a hot spare not currently in use or level 2 cache - * device, add it to the list of names to translate, but don't do - * anything else. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &state) == 0 && - (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && - nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { - if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) - return (-1); - - if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { - free(ne); - return (-1); - } - - ne->ne_guid = vdev_guid; - ne->ne_next = pl->names; - pl->names = ne; - - return (0); - } - - /* - * If we have a valid config but cannot read any of these fields, then - * it means we have a half-initialized label. In vdev_label_init() - * we write a label with txg == 0 so that we can identify the device - * in case the user refers to the same disk later on. If we fail to - * create the pool, we'll be left with a label in this state - * which should not be considered part of a valid pool. 
- */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pool_guid) != 0 || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, - &vdev_guid) != 0 || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, - &top_guid) != 0 || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0) { - return (0); - } - - /* - * First, see if we know about this pool. If not, then add it to the - * list of known pools. - */ - for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { - if (pe->pe_guid == pool_guid) - break; - } - - if (pe == NULL) { - if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) { - return (-1); - } - pe->pe_guid = pool_guid; - pe->pe_next = pl->pools; - pl->pools = pe; - } - - /* - * Second, see if we know about this toplevel vdev. Add it if its - * missing. - */ - for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { - if (ve->ve_guid == top_guid) - break; - } - - if (ve == NULL) { - if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { - return (-1); - } - ve->ve_guid = top_guid; - ve->ve_next = pe->pe_vdevs; - pe->pe_vdevs = ve; - } - - /* - * Third, see if we have a config with a matching transaction group. If - * so, then we do nothing. Otherwise, add it to the list of known - * configs. - */ - for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { - if (ce->ce_txg == txg) - break; - } - - if (ce == NULL) { - if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) { - return (-1); - } - ce->ce_txg = txg; - ce->ce_config = fnvlist_dup(config); - ce->ce_next = ve->ve_configs; - ve->ve_configs = ce; - } - - /* - * At this point we've successfully added our config to the list of - * known configs. The last thing to do is add the vdev guid -> path - * mappings so that we can fix up the configuration as necessary before - * doing the import. 
- */ - if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) - return (-1); - - if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { - free(ne); - return (-1); - } - - ne->ne_guid = vdev_guid; - ne->ne_next = pl->names; - pl->names = ne; - - return (0); -} - -/* - * Returns true if the named pool matches the given GUID. - */ -static int -pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid, - boolean_t *isactive) -{ - zpool_handle_t *zhp; - uint64_t theguid; - - if (zpool_open_silent(hdl, name, &zhp) != 0) - return (-1); - - if (zhp == NULL) { - *isactive = B_FALSE; - return (0); - } - - verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID, - &theguid) == 0); - - zpool_close(zhp); - - *isactive = (theguid == guid); - return (0); -} - -static nvlist_t * -refresh_config(libzfs_handle_t *hdl, nvlist_t *config) -{ - nvlist_t *nvl; - zfs_cmd_t zc = { 0 }; - int err, dstbuf_size; - - if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) - return (NULL); - - dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4); - - if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) { - zcmd_free_nvlists(&zc); - return (NULL); - } - - while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT, - &zc)) != 0 && errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (NULL); - } - } - - if (err) { - zcmd_free_nvlists(&zc); - return (NULL); - } - - if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) { - zcmd_free_nvlists(&zc); - return (NULL); - } - - zcmd_free_nvlists(&zc); - return (nvl); -} - -/* - * Determine if the vdev id is a hole in the namespace. - */ -boolean_t -vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) -{ - for (int c = 0; c < holes; c++) { - - /* Top-level is a hole */ - if (hole_array[c] == id) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Convert our list of pools into the definitive set of configurations. 
We - * start by picking the best config for each toplevel vdev. Once that's done, - * we assemble the toplevel vdevs into a full config for the pool. We make a - * pass to fix up any incorrect paths, and then add it to the main list to - * return to the user. - */ -static nvlist_t * -get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, - nvlist_t *policy) -{ - pool_entry_t *pe; - vdev_entry_t *ve; - config_entry_t *ce; - nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; - nvlist_t **spares, **l2cache; - uint_t i, nspares, nl2cache; - boolean_t config_seen; - uint64_t best_txg; - char *name, *hostname = NULL; - uint64_t guid; - uint_t children = 0; - nvlist_t **child = NULL; - uint_t holes; - uint64_t *hole_array, max_id; - uint_t c; - boolean_t isactive; - uint64_t hostid; - nvlist_t *nvl; - boolean_t found_one = B_FALSE; - boolean_t valid_top_config = B_FALSE; - - if (nvlist_alloc(&ret, 0, 0) != 0) - goto nomem; - - for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { - uint64_t id, max_txg = 0; - - if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) - goto nomem; - config_seen = B_FALSE; - - /* - * Iterate over all toplevel vdevs. Grab the pool configuration - * from the first one we find, and then go through the rest and - * add them as necessary to the 'vdevs' member of the config. - */ - for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { - - /* - * Determine the best configuration for this vdev by - * selecting the config with the latest transaction - * group. - */ - best_txg = 0; - for (ce = ve->ve_configs; ce != NULL; - ce = ce->ce_next) { - - if (ce->ce_txg > best_txg) { - tmp = ce->ce_config; - best_txg = ce->ce_txg; - } - } - - /* - * We rely on the fact that the max txg for the - * pool will contain the most up-to-date information - * about the valid top-levels in the vdev namespace. 
- */ - if (best_txg > max_txg) { - (void) nvlist_remove(config, - ZPOOL_CONFIG_VDEV_CHILDREN, - DATA_TYPE_UINT64); - (void) nvlist_remove(config, - ZPOOL_CONFIG_HOLE_ARRAY, - DATA_TYPE_UINT64_ARRAY); - - max_txg = best_txg; - hole_array = NULL; - holes = 0; - max_id = 0; - valid_top_config = B_FALSE; - - if (nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { - verify(nvlist_add_uint64(config, - ZPOOL_CONFIG_VDEV_CHILDREN, - max_id) == 0); - valid_top_config = B_TRUE; - } - - if (nvlist_lookup_uint64_array(tmp, - ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, - &holes) == 0) { - verify(nvlist_add_uint64_array(config, - ZPOOL_CONFIG_HOLE_ARRAY, - hole_array, holes) == 0); - } - } - - if (!config_seen) { - /* - * Copy the relevant pieces of data to the pool - * configuration: - * - * version - * pool guid - * name - * comment (if available) - * pool state - * hostid (if available) - * hostname (if available) - */ - uint64_t state, version; - char *comment = NULL; - - version = fnvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_VERSION); - fnvlist_add_uint64(config, - ZPOOL_CONFIG_VERSION, version); - guid = fnvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_GUID); - fnvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_GUID, guid); - name = fnvlist_lookup_string(tmp, - ZPOOL_CONFIG_POOL_NAME); - fnvlist_add_string(config, - ZPOOL_CONFIG_POOL_NAME, name); - - if (nvlist_lookup_string(tmp, - ZPOOL_CONFIG_COMMENT, &comment) == 0) - fnvlist_add_string(config, - ZPOOL_CONFIG_COMMENT, comment); - - state = fnvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_STATE); - fnvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_STATE, state); - - hostid = 0; - if (nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - fnvlist_add_uint64(config, - ZPOOL_CONFIG_HOSTID, hostid); - hostname = fnvlist_lookup_string(tmp, - ZPOOL_CONFIG_HOSTNAME); - fnvlist_add_string(config, - ZPOOL_CONFIG_HOSTNAME, hostname); - } - - config_seen = B_TRUE; - } - - /* - * Add this top-level vdev to the child 
array. - */ - verify(nvlist_lookup_nvlist(tmp, - ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); - verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, - &id) == 0); - - if (id >= children) { - nvlist_t **newchild; - - newchild = zfs_alloc(hdl, (id + 1) * - sizeof (nvlist_t *)); - if (newchild == NULL) - goto nomem; - - for (c = 0; c < children; c++) - newchild[c] = child[c]; - - free(child); - child = newchild; - children = id + 1; - } - if (nvlist_dup(nvtop, &child[id], 0) != 0) - goto nomem; - - } - - /* - * If we have information about all the top-levels then - * clean up the nvlist which we've constructed. This - * means removing any extraneous devices that are - * beyond the valid range or adding devices to the end - * of our array which appear to be missing. - */ - if (valid_top_config) { - if (max_id < children) { - for (c = max_id; c < children; c++) - nvlist_free(child[c]); - children = max_id; - } else if (max_id > children) { - nvlist_t **newchild; - - newchild = zfs_alloc(hdl, (max_id) * - sizeof (nvlist_t *)); - if (newchild == NULL) - goto nomem; - - for (c = 0; c < children; c++) - newchild[c] = child[c]; - - free(child); - child = newchild; - children = max_id; - } - } - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &guid) == 0); - - /* - * The vdev namespace may contain holes as a result of - * device removal. We must add them back into the vdev - * tree before we process any missing devices. - */ - if (holes > 0) { - ASSERT(valid_top_config); - - for (c = 0; c < children; c++) { - nvlist_t *holey; - - if (child[c] != NULL || - !vdev_is_hole(hole_array, holes, c)) - continue; - - if (nvlist_alloc(&holey, NV_UNIQUE_NAME, - 0) != 0) - goto nomem; - - /* - * Holes in the namespace are treated as - * "hole" top-level vdevs and have a - * special flag set on them. 
- */ - if (nvlist_add_string(holey, - ZPOOL_CONFIG_TYPE, - VDEV_TYPE_HOLE) != 0 || - nvlist_add_uint64(holey, - ZPOOL_CONFIG_ID, c) != 0 || - nvlist_add_uint64(holey, - ZPOOL_CONFIG_GUID, 0ULL) != 0) { - nvlist_free(holey); - goto nomem; - } - child[c] = holey; - } - } - - /* - * Look for any missing top-level vdevs. If this is the case, - * create a faked up 'missing' vdev as a placeholder. We cannot - * simply compress the child array, because the kernel performs - * certain checks to make sure the vdev IDs match their location - * in the configuration. - */ - for (c = 0; c < children; c++) { - if (child[c] == NULL) { - nvlist_t *missing; - if (nvlist_alloc(&missing, NV_UNIQUE_NAME, - 0) != 0) - goto nomem; - if (nvlist_add_string(missing, - ZPOOL_CONFIG_TYPE, - VDEV_TYPE_MISSING) != 0 || - nvlist_add_uint64(missing, - ZPOOL_CONFIG_ID, c) != 0 || - nvlist_add_uint64(missing, - ZPOOL_CONFIG_GUID, 0ULL) != 0) { - nvlist_free(missing); - goto nomem; - } - child[c] = missing; - } - } - - /* - * Put all of this pool's top-level vdevs into a root vdev. - */ - if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) - goto nomem; - if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) != 0 || - nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || - nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || - nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - child, children) != 0) { - nvlist_free(nvroot); - goto nomem; - } - - for (c = 0; c < children; c++) - nvlist_free(child[c]); - free(child); - children = 0; - child = NULL; - - /* - * Go through and fix up any paths and/or devids based on our - * known list of vdev GUID -> path mappings. - */ - if (fix_paths(nvroot, pl->names) != 0) { - nvlist_free(nvroot); - goto nomem; - } - - /* - * Add the root vdev to this pool's configuration. 
- */ - if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - nvroot) != 0) { - nvlist_free(nvroot); - goto nomem; - } - nvlist_free(nvroot); - - /* - * zdb uses this path to report on active pools that were - * imported or created using -R. - */ - if (active_ok) - goto add_pool; - - /* - * Determine if this pool is currently active, in which case we - * can't actually import it. - */ - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &guid) == 0); - - if (pool_active(hdl, name, guid, &isactive) != 0) - goto error; - - if (isactive) { - nvlist_free(config); - config = NULL; - continue; - } - - if (policy != NULL) { - if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, - policy) != 0) - goto nomem; - } - - if ((nvl = refresh_config(hdl, config)) == NULL) { - nvlist_free(config); - config = NULL; - continue; - } - - nvlist_free(config); - config = nvl; - - /* - * Go through and update the paths for spares, now that we have - * them. - */ - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - if (fix_paths(spares[i], pl->names) != 0) - goto nomem; - } - } - - /* - * Update the paths for l2cache devices. - */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - for (i = 0; i < nl2cache; i++) { - if (fix_paths(l2cache[i], pl->names) != 0) - goto nomem; - } - } - - /* - * Restore the original information read from the actual label. 
- */ - (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, - DATA_TYPE_UINT64); - (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, - DATA_TYPE_STRING); - if (hostid != 0) { - verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, - hostid) == 0); - verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, - hostname) == 0); - } - -add_pool: - /* - * Add this pool to the list of configs. - */ - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); - if (nvlist_add_nvlist(ret, name, config) != 0) - goto nomem; - - found_one = B_TRUE; - nvlist_free(config); - config = NULL; - } - - if (!found_one) { - nvlist_free(ret); - ret = NULL; - } - - return (ret); - -nomem: - (void) no_memory(hdl); -error: - nvlist_free(config); - nvlist_free(ret); - for (c = 0; c < children; c++) - nvlist_free(child[c]); - free(child); - - return (NULL); -} - -/* - * Return the offset of the given label. - */ -static uint64_t -label_offset(uint64_t size, int l) -{ - ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); - return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? - 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); -} - -/* - * Given a file descriptor, read the label information and return an nvlist - * describing the configuration, if there is one. 
- * Return 0 on success, or -1 on failure - */ -int -zpool_read_label(int fd, nvlist_t **config) -{ - struct stat64 statbuf; - int l; - vdev_label_t *label; - uint64_t state, txg, size; - - *config = NULL; - - if (fstat64(fd, &statbuf) == -1) - return (-1); - size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - - if ((label = malloc(sizeof (vdev_label_t))) == NULL) - return (-1); - - for (l = 0; l < VDEV_LABELS; l++) { - if (pread64(fd, label, sizeof (vdev_label_t), - label_offset(size, l)) != sizeof (vdev_label_t)) - continue; - - if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, - sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) - continue; - - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state > POOL_STATE_L2CACHE) { - nvlist_free(*config); - continue; - } - - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0)) { - nvlist_free(*config); - continue; - } - - free(label); - return (0); - } - - free(label); - *config = NULL; - errno = ENOENT; - return (-1); -} - -/* - * Given a file descriptor, read the label information and return an nvlist - * describing the configuration, if there is one. - * returns the number of valid labels found - * If a label is found, returns it via config. The caller is responsible for - * freeing it. 
- */ -int -zpool_read_all_labels(int fd, nvlist_t **config) -{ - struct stat64 statbuf; - struct aiocb aiocbs[VDEV_LABELS]; - struct aiocb *aiocbps[VDEV_LABELS]; - int l; - vdev_phys_t *labels; - uint64_t state, txg, size; - int nlabels = 0; - - *config = NULL; - - if (fstat64(fd, &statbuf) == -1) - return (0); - size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - - if ((labels = calloc(VDEV_LABELS, sizeof (vdev_phys_t))) == NULL) - return (0); - - memset(aiocbs, 0, sizeof(aiocbs)); - for (l = 0; l < VDEV_LABELS; l++) { - aiocbs[l].aio_fildes = fd; - aiocbs[l].aio_offset = label_offset(size, l) + VDEV_SKIP_SIZE; - aiocbs[l].aio_buf = &labels[l]; - aiocbs[l].aio_nbytes = sizeof(vdev_phys_t); - aiocbs[l].aio_lio_opcode = LIO_READ; - aiocbps[l] = &aiocbs[l]; - } - - if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) { - if (errno == EAGAIN || errno == EINTR || errno == EIO) { - for (l = 0; l < VDEV_LABELS; l++) { - errno = 0; - int r = aio_error(&aiocbs[l]); - if (r != EINVAL) - (void)aio_return(&aiocbs[l]); - } - } - free(labels); - return (0); - } - - for (l = 0; l < VDEV_LABELS; l++) { - nvlist_t *temp = NULL; - - if (aio_return(&aiocbs[l]) != sizeof(vdev_phys_t)) - continue; - - if (nvlist_unpack(labels[l].vp_nvlist, - sizeof (labels[l].vp_nvlist), &temp, 0) != 0) - continue; - - if (nvlist_lookup_uint64(temp, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state > POOL_STATE_L2CACHE) { - nvlist_free(temp); - temp = NULL; - continue; - } - - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(temp, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0)) { - nvlist_free(temp); - temp = NULL; - continue; - } - if (temp) - *config = temp; - - nlabels++; - } - - free(labels); - return (nlabels); -} - -typedef struct rdsk_node { - char *rn_name; - int rn_dfd; - libzfs_handle_t *rn_hdl; - nvlist_t *rn_config; - avl_tree_t *rn_avl; - avl_node_t rn_node; - boolean_t rn_nozpool; -} rdsk_node_t; - -static int 
-slice_cache_compare(const void *arg1, const void *arg2) -{ - const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; - const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; - char *nm1slice, *nm2slice; - int rv; - - /* - * slices zero and two are the most likely to provide results, - * so put those first - */ - nm1slice = strstr(nm1, "s0"); - nm2slice = strstr(nm2, "s0"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } - nm1slice = strstr(nm1, "s2"); - nm2slice = strstr(nm2, "s2"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } - - rv = strcmp(nm1, nm2); - if (rv == 0) - return (0); - return (rv > 0 ? 1 : -1); -} - -#ifdef illumos -static void -check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, - diskaddr_t size, uint_t blksz) -{ - rdsk_node_t tmpnode; - rdsk_node_t *node; - char sname[MAXNAMELEN]; - - tmpnode.rn_name = &sname[0]; - (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", - diskname, partno); - /* - * protect against division by zero for disk labels that - * contain a bogus sector size - */ - if (blksz == 0) - blksz = DEV_BSIZE; - /* too small to contain a zpool? 
*/ - if ((size < (SPA_MINDEVSIZE / blksz)) && - (node = avl_find(r, &tmpnode, NULL))) - node->rn_nozpool = B_TRUE; -} -#endif /* illumos */ - -static void -nozpool_all_slices(avl_tree_t *r, const char *sname) -{ -#ifdef illumos - char diskname[MAXNAMELEN]; - char *ptr; - int i; - - (void) strncpy(diskname, sname, MAXNAMELEN); - if (((ptr = strrchr(diskname, 's')) == NULL) && - ((ptr = strrchr(diskname, 'p')) == NULL)) - return; - ptr[0] = 's'; - ptr[1] = '\0'; - for (i = 0; i < NDKMAP; i++) - check_one_slice(r, diskname, i, 0, 1); - ptr[0] = 'p'; - for (i = 0; i <= FD_NUMPART; i++) - check_one_slice(r, diskname, i, 0, 1); -#endif /* illumos */ -} - -#ifdef illumos -static void -check_slices(avl_tree_t *r, int fd, const char *sname) -{ - struct extvtoc vtoc; - struct dk_gpt *gpt; - char diskname[MAXNAMELEN]; - char *ptr; - int i; - - (void) strncpy(diskname, sname, MAXNAMELEN); - if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) - return; - ptr[1] = '\0'; - - if (read_extvtoc(fd, &vtoc) >= 0) { - for (i = 0; i < NDKMAP; i++) - check_one_slice(r, diskname, i, - vtoc.v_part[i].p_size, vtoc.v_sectorsz); - } else if (efi_alloc_and_read(fd, &gpt) >= 0) { - /* - * on x86 we'll still have leftover links that point - * to slices s[9-15], so use NDKMAP instead - */ - for (i = 0; i < NDKMAP; i++) - check_one_slice(r, diskname, i, - gpt->efi_parts[i].p_size, gpt->efi_lbasize); - /* nodes p[1-4] are never used with EFI labels */ - ptr[0] = 'p'; - for (i = 1; i <= FD_NUMPART; i++) - check_one_slice(r, diskname, i, 0, 1); - efi_free(gpt); - } -} -#endif /* illumos */ - -static void -zpool_open_func(void *arg) -{ - rdsk_node_t *rn = arg; - struct stat64 statbuf; - nvlist_t *config; - int fd; - - if (rn->rn_nozpool) - return; - if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { - /* symlink to a device that's no longer there */ - if (errno == ENOENT) - nozpool_all_slices(rn->rn_avl, rn->rn_name); - return; - } - /* - * Ignore failed stats. 
We only want regular - * files, character devs and block devs. - */ - if (fstat64(fd, &statbuf) != 0 || - (!S_ISREG(statbuf.st_mode) && - !S_ISCHR(statbuf.st_mode) && - !S_ISBLK(statbuf.st_mode))) { - (void) close(fd); - return; - } - /* this file is too small to hold a zpool */ -#ifdef illumos - if (S_ISREG(statbuf.st_mode) && - statbuf.st_size < SPA_MINDEVSIZE) { - (void) close(fd); - return; - } else if (!S_ISREG(statbuf.st_mode)) { - /* - * Try to read the disk label first so we don't have to - * open a bunch of minor nodes that can't have a zpool. - */ - check_slices(rn->rn_avl, fd, rn->rn_name); - } -#else /* !illumos */ - if (statbuf.st_size < SPA_MINDEVSIZE) { - (void) close(fd); - return; - } -#endif /* illumos */ - - if ((zpool_read_label(fd, &config)) != 0 && errno == ENOMEM) { - (void) close(fd); - (void) no_memory(rn->rn_hdl); - return; - } - (void) close(fd); - - rn->rn_config = config; -} - -/* - * Given a file descriptor, clear (zero) the label information. - */ -int -zpool_clear_label(int fd) -{ - struct stat64 statbuf; - int l; - vdev_label_t *label; - uint64_t size; - - if (fstat64(fd, &statbuf) == -1) - return (0); - size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - - if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL) - return (-1); - - for (l = 0; l < VDEV_LABELS; l++) { - if (pwrite64(fd, label, sizeof (vdev_label_t), - label_offset(size, l)) != sizeof (vdev_label_t)) { - free(label); - return (-1); - } - } - - free(label); - return (0); -} - -/* - * Given a list of directories to search, find all pools stored on disk. This - * includes partial pools which are not available to import. If no args are - * given (argc is 0), then the default directory (/dev/dsk) is searched. - * poolname or guid (but not both) are provided by the caller when trying - * to import a specific pool. 
- */ -static nvlist_t * -zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) -{ - int i, dirs = iarg->paths; - struct dirent64 *dp; - char path[MAXPATHLEN]; - char *end, **dir = iarg->path; - size_t pathleft; - nvlist_t *ret = NULL; - static char *default_dir = "/dev"; - pool_list_t pools = { 0 }; - pool_entry_t *pe, *penext; - vdev_entry_t *ve, *venext; - config_entry_t *ce, *cenext; - name_entry_t *ne, *nenext; - avl_tree_t slice_cache; - rdsk_node_t *slice; - void *cookie; - boolean_t skip_zvols = B_FALSE; - int value; - size_t size = sizeof(value); - - if (dirs == 0) { - dirs = 1; - dir = &default_dir; - } - - if (sysctlbyname("vfs.zfs.vol.recursive", &value, &size, NULL, 0) == 0 - && value == 0) { - skip_zvols = B_TRUE; - } - - /* - * Go through and read the label configuration information from every - * possible device, organizing the information according to pool GUID - * and toplevel GUID. - */ - for (i = 0; i < dirs; i++) { - tpool_t *t; - char rdsk[MAXPATHLEN]; - int dfd; - boolean_t config_failed = B_FALSE; - DIR *dirp; - - /* use realpath to normalize the path */ - if (realpath(dir[i], path) == 0) { - (void) zfs_error_fmt(hdl, EZFS_BADPATH, - dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]); - goto error; - } - end = &path[strlen(path)]; - *end++ = '/'; - *end = 0; - pathleft = &path[sizeof (path)] - end; - -#ifdef illumos - /* - * Using raw devices instead of block devices when we're - * reading the labels skips a bunch of slow operations during - * close(2) processing, so we replace /dev/dsk with /dev/rdsk. 
- */ - if (strcmp(path, ZFS_DISK_ROOTD) == 0) - (void) strlcpy(rdsk, ZFS_RDISK_ROOTD, sizeof (rdsk)); - else -#endif - (void) strlcpy(rdsk, path, sizeof (rdsk)); - - if ((dfd = open64(rdsk, O_RDONLY)) < 0 || - (dirp = fdopendir(dfd)) == NULL) { - if (dfd >= 0) - (void) close(dfd); - zfs_error_aux(hdl, strerror(errno)); - (void) zfs_error_fmt(hdl, EZFS_BADPATH, - dgettext(TEXT_DOMAIN, "cannot open '%s'"), - rdsk); - goto error; - } - - avl_create(&slice_cache, slice_cache_compare, - sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); - - if (strcmp(rdsk, "/dev/") == 0) { - struct gmesh mesh; - struct gclass *mp; - struct ggeom *gp; - struct gprovider *pp; - - errno = geom_gettree(&mesh); - if (errno != 0) { - zfs_error_aux(hdl, strerror(errno)); - (void) zfs_error_fmt(hdl, EZFS_BADPATH, - dgettext(TEXT_DOMAIN, "cannot get GEOM tree")); - goto error; - } - - LIST_FOREACH(mp, &mesh.lg_class, lg_class) { - if (skip_zvols && - strcmp(mp->lg_name, "ZFS::ZVOL") == 0) { - continue; - } - LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { - LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - slice->rn_name = zfs_strdup(hdl, pp->lg_name); - slice->rn_avl = &slice_cache; - slice->rn_dfd = dfd; - slice->rn_hdl = hdl; - slice->rn_nozpool = B_FALSE; - avl_add(&slice_cache, slice); - } - } - } - - geom_deletetree(&mesh); - goto skipdir; - } - - /* - * This is not MT-safe, but we have no MT consumers of libzfs - */ - while ((dp = readdir64(dirp)) != NULL) { - const char *name = dp->d_name; - if (name[0] == '.' && - (name[1] == 0 || (name[1] == '.' 
&& name[2] == 0))) - continue; - - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - slice->rn_name = zfs_strdup(hdl, name); - slice->rn_avl = &slice_cache; - slice->rn_dfd = dfd; - slice->rn_hdl = hdl; - slice->rn_nozpool = B_FALSE; - avl_add(&slice_cache, slice); - } -skipdir: - /* - * create a thread pool to do all of this in parallel; - * rn_nozpool is not protected, so this is racy in that - * multiple tasks could decide that the same slice can - * not hold a zpool, which is benign. Also choose - * double the number of processors; we hold a lot of - * locks in the kernel, so going beyond this doesn't - * buy us much. - */ - t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), - 0, NULL); - for (slice = avl_first(&slice_cache); slice; - (slice = avl_walk(&slice_cache, slice, - AVL_AFTER))) - (void) tpool_dispatch(t, zpool_open_func, slice); - tpool_wait(t); - tpool_destroy(t); - - cookie = NULL; - while ((slice = avl_destroy_nodes(&slice_cache, - &cookie)) != NULL) { - if (slice->rn_config != NULL && !config_failed) { - nvlist_t *config = slice->rn_config; - boolean_t matched = B_TRUE; - - if (iarg->poolname != NULL) { - char *pname; - - matched = nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, - &pname) == 0 && - strcmp(iarg->poolname, pname) == 0; - } else if (iarg->guid != 0) { - uint64_t this_guid; - - matched = nvlist_lookup_uint64(config, - ZPOOL_CONFIG_POOL_GUID, - &this_guid) == 0 && - iarg->guid == this_guid; - } - if (matched) { - /* - * use the non-raw path for the config - */ - (void) strlcpy(end, slice->rn_name, - pathleft); - if (add_config(hdl, &pools, path, - config) != 0) - config_failed = B_TRUE; - } - nvlist_free(config); - } - free(slice->rn_name); - free(slice); - } - avl_destroy(&slice_cache); - - (void) closedir(dirp); - - if (config_failed) - goto error; - } - - ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy); - -error: - for (pe = pools.pools; pe != NULL; pe = penext) { - penext = pe->pe_next; - for (ve = 
pe->pe_vdevs; ve != NULL; ve = venext) { - venext = ve->ve_next; - for (ce = ve->ve_configs; ce != NULL; ce = cenext) { - cenext = ce->ce_next; - nvlist_free(ce->ce_config); - free(ce); - } - free(ve); - } - free(pe); - } - - for (ne = pools.names; ne != NULL; ne = nenext) { - nenext = ne->ne_next; - free(ne->ne_name); - free(ne); - } - - return (ret); -} - -nvlist_t * -zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) -{ - importargs_t iarg = { 0 }; - - iarg.paths = argc; - iarg.path = argv; - - return (zpool_find_import_impl(hdl, &iarg)); -} - -/* - * Given a cache file, return the contents as a list of importable pools. - * poolname or guid (but not both) are provided by the caller when trying - * to import a specific pool. - */ -nvlist_t * -zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile, - char *poolname, uint64_t guid) -{ - char *buf; - int fd; - struct stat64 statbuf; - nvlist_t *raw, *src, *dst; - nvlist_t *pools; - nvpair_t *elem; - char *name; - uint64_t this_guid; - boolean_t active; - - verify(poolname == NULL || guid == 0); - - if ((fd = open(cachefile, O_RDONLY)) < 0) { - zfs_error_aux(hdl, "%s", strerror(errno)); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, "failed to open cache file")); - return (NULL); - } - - if (fstat64(fd, &statbuf) != 0) { - zfs_error_aux(hdl, "%s", strerror(errno)); - (void) close(fd); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, "failed to get size of cache file")); - return (NULL); - } - - if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) { - (void) close(fd); - return (NULL); - } - - if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { - (void) close(fd); - free(buf); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, - "failed to read cache file contents")); - return (NULL); - } - - (void) close(fd); - - if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) { - free(buf); - (void) zfs_error(hdl, EZFS_BADCACHE, - dgettext(TEXT_DOMAIN, 
- "invalid or corrupt cache file contents")); - return (NULL); - } - - free(buf); - - /* - * Go through and get the current state of the pools and refresh their - * state. - */ - if (nvlist_alloc(&pools, 0, 0) != 0) { - (void) no_memory(hdl); - nvlist_free(raw); - return (NULL); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) { - src = fnvpair_value_nvlist(elem); - - name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); - if (poolname != NULL && strcmp(poolname, name) != 0) - continue; - - this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); - if (guid != 0 && guid != this_guid) - continue; - - if (pool_active(hdl, name, this_guid, &active) != 0) { - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - - if (active) - continue; - - if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, - cachefile) != 0) { - (void) no_memory(hdl); - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - - if ((dst = refresh_config(hdl, src)) == NULL) { - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - - if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) { - (void) no_memory(hdl); - nvlist_free(dst); - nvlist_free(raw); - nvlist_free(pools); - return (NULL); - } - nvlist_free(dst); - } - - nvlist_free(raw); - return (pools); -} - -static int -name_or_guid_exists(zpool_handle_t *zhp, void *data) -{ - importargs_t *import = data; - int found = 0; - - if (import->poolname != NULL) { - char *pool_name; - - verify(nvlist_lookup_string(zhp->zpool_config, - ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); - if (strcmp(pool_name, import->poolname) == 0) - found = 1; - } else { - uint64_t pool_guid; - - verify(nvlist_lookup_uint64(zhp->zpool_config, - ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); - if (pool_guid == import->guid) - found = 1; - } - - zpool_close(zhp); - return (found); -} - -nvlist_t * -zpool_search_import(libzfs_handle_t *hdl, importargs_t *import) -{ - nvlist_t *pools = NULL; - - 
verify(import->poolname == NULL || import->guid == 0); - - if (import->unique) - import->exists = zpool_iter(hdl, name_or_guid_exists, import); - - if (import->cachefile != NULL) - pools = zpool_find_import_cached(hdl, import->cachefile, - import->poolname, import->guid); - else - pools = zpool_find_import_impl(hdl, import); - - return (pools); -} - -static boolean_t -pool_match(nvlist_t *cfg, char *tgt) -{ - uint64_t v, guid = strtoull(tgt, NULL, 0); - char *s; - - if (guid != 0) { - if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) - return (v == guid); - } else { - if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) - return (strcmp(s, tgt) == 0); - } - return (B_FALSE); -} - -int -zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp, - importargs_t *args) -{ - nvlist_t *pools; - nvlist_t *match = NULL; - nvlist_t *config = NULL; - char *sepp = NULL; - int count = 0; - char *targetdup = strdup(target); - - *configp = NULL; - - if ((sepp = strpbrk(targetdup, "/@")) != NULL) { - *sepp = '\0'; - } - - pools = zpool_search_import(hdl, args); - - if (pools != NULL) { - nvpair_t *elem = NULL; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { - VERIFY0(nvpair_value_nvlist(elem, &config)); - if (pool_match(config, targetdup)) { - count++; - if (match != NULL) { - /* multiple matches found */ - continue; - } else { - match = config; - } - } - } - } - - if (count == 0) { - free(targetdup); - return (ENOENT); - } - - if (count > 1) { - free(targetdup); - return (EINVAL); - } - - *configp = match; - free(targetdup); - - return (0); -} - -boolean_t -find_guid(nvlist_t *nv, uint64_t guid) -{ - uint64_t tmp; - nvlist_t **child; - uint_t c, children; - - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0); - if (tmp == guid) - return (B_TRUE); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if (find_guid(child[c], guid)) - return 
(B_TRUE); - } - - return (B_FALSE); -} - -typedef struct aux_cbdata { - const char *cb_type; - uint64_t cb_guid; - zpool_handle_t *cb_zhp; -} aux_cbdata_t; - -static int -find_aux(zpool_handle_t *zhp, void *data) -{ - aux_cbdata_t *cbp = data; - nvlist_t **list; - uint_t i, count; - uint64_t guid; - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type, - &list, &count) == 0) { - for (i = 0; i < count; i++) { - verify(nvlist_lookup_uint64(list[i], - ZPOOL_CONFIG_GUID, &guid) == 0); - if (guid == cbp->cb_guid) { - cbp->cb_zhp = zhp; - return (1); - } - } - } - - zpool_close(zhp); - return (0); -} - -/* - * Determines if the pool is in use. If so, it returns true and the state of - * the pool as well as the name of the pool. Both strings are allocated and - * must be freed by the caller. - */ -int -zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, - boolean_t *inuse) -{ - nvlist_t *config; - char *name; - boolean_t ret; - uint64_t guid, vdev_guid; - zpool_handle_t *zhp; - nvlist_t *pool_config; - uint64_t stateval, isspare; - aux_cbdata_t cb = { 0 }; - boolean_t isactive; - - *inuse = B_FALSE; - - if (zpool_read_label(fd, &config) != 0 && errno == ENOMEM) { - (void) no_memory(hdl); - return (-1); - } - - if (config == NULL) - return (0); - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &stateval) == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, - &vdev_guid) == 0); - - if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) { - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &guid) == 0); - } - - switch (stateval) { - case POOL_STATE_EXPORTED: - /* - * A pool with an exported state may in fact be imported - * read-only, so check the in-core state to see if it's - * active and imported 
read-only. If it is, set - * its state to active. - */ - if (pool_active(hdl, name, guid, &isactive) == 0 && isactive && - (zhp = zpool_open_canfail(hdl, name)) != NULL) { - if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL)) - stateval = POOL_STATE_ACTIVE; - - /* - * All we needed the zpool handle for is the - * readonly prop check. - */ - zpool_close(zhp); - } - - ret = B_TRUE; - break; - - case POOL_STATE_ACTIVE: - /* - * For an active pool, we have to determine if it's really part - * of a currently active pool (in which case the pool will exist - * and the guid will be the same), or whether it's part of an - * active pool that was disconnected without being explicitly - * exported. - */ - if (pool_active(hdl, name, guid, &isactive) != 0) { - nvlist_free(config); - return (-1); - } - - if (isactive) { - /* - * Because the device may have been removed while - * offlined, we only report it as active if the vdev is - * still present in the config. Otherwise, pretend like - * it's not in use. - */ - if ((zhp = zpool_open_canfail(hdl, name)) != NULL && - (pool_config = zpool_get_config(zhp, NULL)) - != NULL) { - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(pool_config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - ret = find_guid(nvroot, vdev_guid); - } else { - ret = B_FALSE; - } - - /* - * If this is an active spare within another pool, we - * treat it like an unused hot spare. This allows the - * user to create a pool with a hot spare that currently - * in use within another pool. Since we return B_TRUE, - * libdiskmgt will continue to prevent generic consumers - * from using the device. - */ - if (ret && nvlist_lookup_uint64(config, - ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare) - stateval = POOL_STATE_SPARE; - - if (zhp != NULL) - zpool_close(zhp); - } else { - stateval = POOL_STATE_POTENTIALLY_ACTIVE; - ret = B_TRUE; - } - break; - - case POOL_STATE_SPARE: - /* - * For a hot spare, it can be either definitively in use, or - * potentially active. 
To determine if it's in use, we iterate - * over all pools in the system and search for one with a spare - * with a matching guid. - * - * Due to the shared nature of spares, we don't actually report - * the potentially active case as in use. This means the user - * can freely create pools on the hot spares of exported pools, - * but to do otherwise makes the resulting code complicated, and - * we end up having to deal with this case anyway. - */ - cb.cb_zhp = NULL; - cb.cb_guid = vdev_guid; - cb.cb_type = ZPOOL_CONFIG_SPARES; - if (zpool_iter(hdl, find_aux, &cb) == 1) { - name = (char *)zpool_get_name(cb.cb_zhp); - ret = B_TRUE; - } else { - ret = B_FALSE; - } - break; - - case POOL_STATE_L2CACHE: - - /* - * Check if any pool is currently using this l2cache device. - */ - cb.cb_zhp = NULL; - cb.cb_guid = vdev_guid; - cb.cb_type = ZPOOL_CONFIG_L2CACHE; - if (zpool_iter(hdl, find_aux, &cb) == 1) { - name = (char *)zpool_get_name(cb.cb_zhp); - ret = B_TRUE; - } else { - ret = B_FALSE; - } - break; - - default: - ret = B_FALSE; - } - - - if (ret) { - if ((*namestr = zfs_strdup(hdl, name)) == NULL) { - if (cb.cb_zhp) - zpool_close(cb.cb_zhp); - nvlist_free(config); - return (-1); - } - *state = (pool_state_t)stateval; - } - - if (cb.cb_zhp) - zpool_close(cb.cb_zhp); - - nvlist_free(config); - *inuse = ret; - return (0); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_iter.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_iter.c deleted file mode 100644 index 36138676e7db..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_iter.c +++ /dev/null @@ -1,546 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright 2014 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2019 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "libzfs_impl.h" - -int -zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data) -{ - nvlist_t *nvl = zfs_get_clones_nvl(zhp); - nvpair_t *pair; - - if (nvl == NULL) - return (0); - - for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; - pair = nvlist_next_nvpair(nvl, pair)) { - zfs_handle_t *clone = zfs_open(zhp->zfs_hdl, nvpair_name(pair), - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (clone != NULL) { - int err = func(clone, data); - if (err != 0) - return (err); - } - } - return (0); -} - -static int -zfs_do_list_ioctl(zfs_handle_t *zhp, unsigned long arg, zfs_cmd_t *zc) -{ - int rc; - uint64_t orig_cookie; - - orig_cookie = zc->zc_cookie; -top: - (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); - rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); - - if (rc == -1) { - switch (errno) { - case ENOMEM: - /* expand nvlist memory and try again */ - if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { - zcmd_free_nvlists(zc); - return (-1); - } - zc->zc_cookie = orig_cookie; - goto top; - /* - * An errno value of ESRCH indicates normal completion. 
- * If ENOENT is returned, then the underlying dataset - * has been removed since we obtained the handle. - */ - case ESRCH: - case ENOENT: - rc = 1; - break; - default: - rc = zfs_standard_error(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, - "cannot iterate filesystems")); - break; - } - } - return (rc); -} - -/* - * Iterate over all child filesystems - */ -int -zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) -{ - zfs_cmd_t zc = { 0 }; - zfs_handle_t *nzhp; - int ret; - - if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) - return (0); - - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - - while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, - &zc)) == 0) { - /* - * Silently ignore errors, as the only plausible explanation is - * that the pool has since been removed. - */ - if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, - &zc)) == NULL) { - continue; - } - - if ((ret = func(nzhp, data)) != 0) { - zcmd_free_nvlists(&zc); - return (ret); - } - } - zcmd_free_nvlists(&zc); - return ((ret < 0) ? 
ret : 0); -} - -/* - * Iterate over all snapshots - */ -int -zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func, - void *data, uint64_t min_txg, uint64_t max_txg) -{ - zfs_cmd_t zc = { 0 }; - zfs_handle_t *nzhp; - int ret; - nvlist_t *range_nvl = NULL; - - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT || - zhp->zfs_type == ZFS_TYPE_BOOKMARK) - return (0); - - zc.zc_simple = simple; - - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - - if (min_txg != 0) { - range_nvl = fnvlist_alloc(); - fnvlist_add_uint64(range_nvl, SNAP_ITER_MIN_TXG, min_txg); - } - if (max_txg != 0) { - if (range_nvl == NULL) - range_nvl = fnvlist_alloc(); - fnvlist_add_uint64(range_nvl, SNAP_ITER_MAX_TXG, max_txg); - } - - if (range_nvl != NULL && - zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, range_nvl) != 0) { - zcmd_free_nvlists(&zc); - fnvlist_free(range_nvl); - return (-1); - } - - while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, - &zc)) == 0) { - - if (simple) - nzhp = make_dataset_simple_handle_zc(zhp, &zc); - else - nzhp = make_dataset_handle_zc(zhp->zfs_hdl, &zc); - if (nzhp == NULL) - continue; - - if ((ret = func(nzhp, data)) != 0) { - zcmd_free_nvlists(&zc); - fnvlist_free(range_nvl); - return (ret); - } - } - zcmd_free_nvlists(&zc); - fnvlist_free(range_nvl); - return ((ret < 0) ? ret : 0); -} - -/* - * Iterate over all bookmarks - */ -int -zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data) -{ - zfs_handle_t *nzhp; - nvlist_t *props = NULL; - nvlist_t *bmarks = NULL; - int err; - - if ((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) != 0) - return (0); - - /* Setup the requested properties nvlist. 
*/ - props = fnvlist_alloc(); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID)); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG)); - fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATION)); - - if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0) - goto out; - - for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { - char name[ZFS_MAX_DATASET_NAME_LEN]; - char *bmark_name; - nvlist_t *bmark_props; - - bmark_name = nvpair_name(pair); - bmark_props = fnvpair_value_nvlist(pair); - - (void) snprintf(name, sizeof (name), "%s#%s", zhp->zfs_name, - bmark_name); - - nzhp = make_bookmark_handle(zhp, name, bmark_props); - if (nzhp == NULL) - continue; - - if ((err = func(nzhp, data)) != 0) - goto out; - } - -out: - fnvlist_free(props); - fnvlist_free(bmarks); - - return (err); -} - -/* - * Routines for dealing with the sorted snapshot functionality - */ -typedef struct zfs_node { - zfs_handle_t *zn_handle; - avl_node_t zn_avlnode; -} zfs_node_t; - -static int -zfs_sort_snaps(zfs_handle_t *zhp, void *data) -{ - avl_tree_t *avl = data; - zfs_node_t *node; - zfs_node_t search; - - search.zn_handle = zhp; - node = avl_find(avl, &search, NULL); - if (node) { - /* - * If this snapshot was renamed while we were creating the - * AVL tree, it's possible that we already inserted it under - * its old name. Remove the old handle before adding the new - * one. - */ - zfs_close(node->zn_handle); - avl_remove(avl, node); - free(node); - } - - node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); - node->zn_handle = zhp; - avl_add(avl, node); - - return (0); -} - -static int -zfs_snapshot_compare(const void *larg, const void *rarg) -{ - zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; - zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; - uint64_t lcreate, rcreate; - - /* - * Sort them according to creation time. 
We use the hidden - * CREATETXG property to get an absolute ordering of snapshots. - */ - lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); - rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - - return (AVL_CMP(lcreate, rcreate)); -} - -int -zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data, - uint64_t min_txg, uint64_t max_txg) -{ - int ret = 0; - zfs_node_t *node; - avl_tree_t avl; - void *cookie = NULL; - - avl_create(&avl, zfs_snapshot_compare, - sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); - - ret = zfs_iter_snapshots(zhp, B_FALSE, zfs_sort_snaps, &avl, min_txg, - max_txg); - - for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) - ret |= callback(node->zn_handle, data); - - while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL) - free(node); - - avl_destroy(&avl); - - return (ret); -} - -typedef struct { - char *ssa_first; - char *ssa_last; - boolean_t ssa_seenfirst; - boolean_t ssa_seenlast; - zfs_iter_f ssa_func; - void *ssa_arg; -} snapspec_arg_t; - -static int -snapspec_cb(zfs_handle_t *zhp, void *arg) -{ - snapspec_arg_t *ssa = arg; - const char *shortsnapname; - int err = 0; - - if (ssa->ssa_seenlast) - return (0); - - shortsnapname = strchr(zfs_get_name(zhp), '@') + 1; - if (!ssa->ssa_seenfirst && strcmp(shortsnapname, ssa->ssa_first) == 0) - ssa->ssa_seenfirst = B_TRUE; - if (strcmp(shortsnapname, ssa->ssa_last) == 0) - ssa->ssa_seenlast = B_TRUE; - - if (ssa->ssa_seenfirst) { - err = ssa->ssa_func(zhp, ssa->ssa_arg); - } else { - zfs_close(zhp); - } - - return (err); -} - -/* - * spec is a string like "A,B%C,D" - * - * , where can be: - * (single snapshot) - * % (range of snapshots, inclusive) - * % (range of snapshots, starting with earliest) - * % (range of snapshots, ending with last) - * % (all snapshots) - * [,...] (comma separated list of the above) - * - * If a snapshot can not be opened, continue trying to open the others, but - * return ENOENT at the end. 
- */ -int -zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, - zfs_iter_f func, void *arg) -{ - char *buf, *comma_separated, *cp; - int err = 0; - int ret = 0; - - buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig); - cp = buf; - - while ((comma_separated = strsep(&cp, ",")) != NULL) { - char *pct = strchr(comma_separated, '%'); - if (pct != NULL) { - snapspec_arg_t ssa = { 0 }; - ssa.ssa_func = func; - ssa.ssa_arg = arg; - - if (pct == comma_separated) - ssa.ssa_seenfirst = B_TRUE; - else - ssa.ssa_first = comma_separated; - *pct = '\0'; - ssa.ssa_last = pct + 1; - - /* - * If there is a lastname specified, make sure it - * exists. - */ - if (ssa.ssa_last[0] != '\0') { - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - (void) snprintf(snapname, sizeof (snapname), - "%s@%s", zfs_get_name(fs_zhp), - ssa.ssa_last); - if (!zfs_dataset_exists(fs_zhp->zfs_hdl, - snapname, ZFS_TYPE_SNAPSHOT)) { - ret = ENOENT; - continue; - } - } - - err = zfs_iter_snapshots_sorted(fs_zhp, - snapspec_cb, &ssa, 0, 0); - if (ret == 0) - ret = err; - if (ret == 0 && (!ssa.ssa_seenfirst || - (ssa.ssa_last[0] != '\0' && !ssa.ssa_seenlast))) { - ret = ENOENT; - } - } else { - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - zfs_handle_t *snap_zhp; - (void) snprintf(snapname, sizeof (snapname), "%s@%s", - zfs_get_name(fs_zhp), comma_separated); - snap_zhp = make_dataset_handle(fs_zhp->zfs_hdl, - snapname); - if (snap_zhp == NULL) { - ret = ENOENT; - continue; - } - err = func(snap_zhp, arg); - if (ret == 0) - ret = err; - } - } - - free(buf); - return (ret); -} - -/* - * Iterate over all children, snapshots and filesystems - * Process snapshots before filesystems because they are nearer the input - * handle: this is extremely important when used with zfs_iter_f functions - * looking for data, following the logic that we would like to find it as soon - * and as close as possible. 
- */ -int -zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) -{ - int ret; - - if ((ret = zfs_iter_snapshots(zhp, B_FALSE, func, data, 0, 0)) != 0) - return (ret); - - return (zfs_iter_filesystems(zhp, func, data)); -} - - -typedef struct iter_stack_frame { - struct iter_stack_frame *next; - zfs_handle_t *zhp; -} iter_stack_frame_t; - -typedef struct iter_dependents_arg { - boolean_t first; - boolean_t allowrecursion; - iter_stack_frame_t *stack; - zfs_iter_f func; - void *data; -} iter_dependents_arg_t; - -static int -iter_dependents_cb(zfs_handle_t *zhp, void *arg) -{ - iter_dependents_arg_t *ida = arg; - int err = 0; - boolean_t first = ida->first; - ida->first = B_FALSE; - - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { - err = zfs_iter_clones(zhp, iter_dependents_cb, ida); - } else if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) { - iter_stack_frame_t isf; - iter_stack_frame_t *f; - - /* - * check if there is a cycle by seeing if this fs is already - * on the stack. - */ - for (f = ida->stack; f != NULL; f = f->next) { - if (f->zhp->zfs_dmustats.dds_guid == - zhp->zfs_dmustats.dds_guid) { - if (ida->allowrecursion) { - zfs_close(zhp); - return (0); - } else { - zfs_error_aux(zhp->zfs_hdl, - dgettext(TEXT_DOMAIN, - "recursive dependency at '%s'"), - zfs_get_name(zhp)); - err = zfs_error(zhp->zfs_hdl, - EZFS_RECURSIVE, - dgettext(TEXT_DOMAIN, - "cannot determine dependent " - "datasets")); - zfs_close(zhp); - return (err); - } - } - } - - isf.zhp = zhp; - isf.next = ida->stack; - ida->stack = &isf; - err = zfs_iter_filesystems(zhp, iter_dependents_cb, ida); - if (err == 0) { - err = zfs_iter_snapshots(zhp, B_FALSE, - iter_dependents_cb, ida, 0, 0); - } - ida->stack = isf.next; - } - - if (!first && err == 0) - err = ida->func(zhp, ida->data); - else - zfs_close(zhp); - - return (err); -} - -int -zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, - zfs_iter_f func, void *data) -{ - iter_dependents_arg_t ida; - ida.allowrecursion = 
allowrecursion; - ida.stack = NULL; - ida.func = func; - ida.data = data; - ida.first = B_TRUE; - return (iter_dependents_cb(zfs_handle_dup(zhp), &ida)); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c deleted file mode 100644 index 9d4948cc7173..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c +++ /dev/null @@ -1,1734 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright 2016 Igor Kozhukhov - * Copyright 2017 Joyent, Inc. - * Copyright 2017 RackTop Systems. - * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. - */ - -/* - * Routines to manage ZFS mounts. We separate all the nasty routines that have - * to deal with the OS. The following functions are the main entry points -- - * they are used by mount and unmount and when changing a filesystem's - * mountpoint. 
- * - * zfs_is_mounted() - * zfs_mount() - * zfs_unmount() - * zfs_unmountall() - * - * This file also contains the functions used to manage sharing filesystems via - * NFS and iSCSI: - * - * zfs_is_shared() - * zfs_share() - * zfs_unshare() - * - * zfs_is_shared_nfs() - * zfs_is_shared_smb() - * zfs_share_proto() - * zfs_shareall(); - * zfs_unshare_nfs() - * zfs_unshare_smb() - * zfs_unshareall_nfs() - * zfs_unshareall_smb() - * zfs_unshareall() - * zfs_unshareall_bypath() - * - * The following functions are available for pool consumers, and will - * mount/unmount and share/unshare all datasets within pool: - * - * zpool_enable_datasets() - * zpool_disable_datasets() - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "libzfs_impl.h" -#include - -#include -#define MAXISALEN 257 /* based on sysinfo(2) man page */ - -static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ - -static void zfs_mount_task(void *); -static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); -zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, - zfs_share_proto_t); - -/* - * The share protocols table must be in the same order as the zfs_share_proto_t - * enum in libzfs_impl.h - */ -typedef struct { - zfs_prop_t p_prop; - char *p_name; - int p_share_err; - int p_unshare_err; -} proto_table_t; - -proto_table_t proto_table[PROTO_END] = { - {ZFS_PROP_SHARENFS, "nfs", EZFS_SHARENFSFAILED, EZFS_UNSHARENFSFAILED}, - {ZFS_PROP_SHARESMB, "smb", EZFS_SHARESMBFAILED, EZFS_UNSHARESMBFAILED}, -}; - -zfs_share_proto_t nfs_only[] = { - PROTO_NFS, - PROTO_END -}; - -zfs_share_proto_t smb_only[] = { - PROTO_SMB, - PROTO_END -}; -zfs_share_proto_t share_all_proto[] = { - PROTO_NFS, - PROTO_SMB, - PROTO_END -}; - -/* - * Search the sharetab for the given mountpoint and protocol, returning - * a zfs_share_type_t value. 
- */ -static zfs_share_type_t -is_shared(libzfs_handle_t *hdl, const char *mountpoint, zfs_share_proto_t proto) -{ - char buf[MAXPATHLEN], *tab; - char *ptr; - - if (hdl->libzfs_sharetab == NULL) - return (SHARED_NOT_SHARED); - - (void) fseek(hdl->libzfs_sharetab, 0, SEEK_SET); - - while (fgets(buf, sizeof (buf), hdl->libzfs_sharetab) != NULL) { - - /* the mountpoint is the first entry on each line */ - if ((tab = strchr(buf, '\t')) == NULL) - continue; - - *tab = '\0'; - if (strcmp(buf, mountpoint) == 0) { -#ifdef illumos - /* - * the protocol field is the third field - * skip over second field - */ - ptr = ++tab; - if ((tab = strchr(ptr, '\t')) == NULL) - continue; - ptr = ++tab; - if ((tab = strchr(ptr, '\t')) == NULL) - continue; - *tab = '\0'; - if (strcmp(ptr, - proto_table[proto].p_name) == 0) { - switch (proto) { - case PROTO_NFS: - return (SHARED_NFS); - case PROTO_SMB: - return (SHARED_SMB); - default: - return (0); - } - } -#else - if (proto == PROTO_NFS) - return (SHARED_NFS); -#endif - } - } - - return (SHARED_NOT_SHARED); -} - -#ifdef illumos -static boolean_t -dir_is_empty_stat(const char *dirname) -{ - struct stat st; - - /* - * We only want to return false if the given path is a non empty - * directory, all other errors are handled elsewhere. - */ - if (stat(dirname, &st) < 0 || !S_ISDIR(st.st_mode)) { - return (B_TRUE); - } - - /* - * An empty directory will still have two entries in it, one - * entry for each of "." and "..". 
- */ - if (st.st_size > 2) { - return (B_FALSE); - } - - return (B_TRUE); -} - -static boolean_t -dir_is_empty_readdir(const char *dirname) -{ - DIR *dirp; - struct dirent64 *dp; - int dirfd; - - if ((dirfd = openat(AT_FDCWD, dirname, - O_RDONLY | O_NDELAY | O_LARGEFILE | O_CLOEXEC, 0)) < 0) { - return (B_TRUE); - } - - if ((dirp = fdopendir(dirfd)) == NULL) { - (void) close(dirfd); - return (B_TRUE); - } - - while ((dp = readdir64(dirp)) != NULL) { - - if (strcmp(dp->d_name, ".") == 0 || - strcmp(dp->d_name, "..") == 0) - continue; - - (void) closedir(dirp); - return (B_FALSE); - } - - (void) closedir(dirp); - return (B_TRUE); -} - -/* - * Returns true if the specified directory is empty. If we can't open the - * directory at all, return true so that the mount can fail with a more - * informative error message. - */ -static boolean_t -dir_is_empty(const char *dirname) -{ - struct statvfs64 st; - - /* - * If the statvfs call fails or the filesystem is not a ZFS - * filesystem, fall back to the slow path which uses readdir. - */ - if ((statvfs64(dirname, &st) != 0) || - (strcmp(st.f_basetype, "zfs") != 0)) { - return (dir_is_empty_readdir(dirname)); - } - - /* - * At this point, we know the provided path is on a ZFS - * filesystem, so we can use stat instead of readdir to - * determine if the directory is empty or not. We try to avoid - * using readdir because that requires opening "dirname"; this - * open file descriptor can potentially end up in a child - * process if there's a concurrent fork, thus preventing the - * zfs_mount() from otherwise succeeding (the open file - * descriptor inherited by the child process will cause the - * parent's mount to fail with EBUSY). The performance - * implications of replacing the open, read, and close with a - * single stat is nice; but is not the main motivation for the - * added complexity. - */ - return (dir_is_empty_stat(dirname)); -} -#endif - -/* - * Checks to see if the mount is active. 
If the filesystem is mounted, we fill - * in 'where' with the current mountpoint, and return 1. Otherwise, we return - * 0. - */ -boolean_t -is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where) -{ - struct mnttab entry; - - if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0) - return (B_FALSE); - - if (where != NULL) - *where = zfs_strdup(zfs_hdl, entry.mnt_mountp); - - return (B_TRUE); -} - -boolean_t -zfs_is_mounted(zfs_handle_t *zhp, char **where) -{ - return (is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where)); -} - -static boolean_t -zfs_is_mountable_internal(zfs_handle_t *zhp, const char *mountpoint) -{ - - if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) && - getzoneid() == GLOBAL_ZONEID) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Returns true if the given dataset is mountable, false otherwise. Returns the - * mountpoint in 'buf'. - */ -static boolean_t -zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, - zprop_source_t *source) -{ - char sourceloc[MAXNAMELEN]; - zprop_source_t sourcetype; - - if (!zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type)) - return (B_FALSE); - - verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, buf, buflen, - &sourcetype, sourceloc, sizeof (sourceloc), B_FALSE) == 0); - - if (strcmp(buf, ZFS_MOUNTPOINT_NONE) == 0 || - strcmp(buf, ZFS_MOUNTPOINT_LEGACY) == 0) - return (B_FALSE); - - if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_OFF) - return (B_FALSE); - - if (!zfs_is_mountable_internal(zhp, buf)) - return (B_FALSE); - - if (source) - *source = sourcetype; - - return (B_TRUE); -} - -/* - * Mount the given filesystem. 
- */ -int -zfs_mount(zfs_handle_t *zhp, const char *options, int flags) -{ - char mountpoint[ZFS_MAXPROPLEN]; - - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) - return (0); - - return (zfs_mount_at(zhp, options, flags, mountpoint)); -} - -int -zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags, - const char *mountpoint) -{ - struct stat buf; - char mntopts[MNT_LINE_MAX]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - - if (options == NULL) - mntopts[0] = '\0'; - else - (void) strlcpy(mntopts, options, sizeof (mntopts)); - - /* - * If the pool is imported read-only then all mounts must be read-only - */ - if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL)) - flags |= MS_RDONLY; - - if (!zfs_is_mountable_internal(zhp, mountpoint)) - return (B_FALSE); - - /* Create the directory if it doesn't already exist */ - if (lstat(mountpoint, &buf) != 0) { - if (mkdirp(mountpoint, 0755) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "failed to create mountpoint")); - return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, - dgettext(TEXT_DOMAIN, "cannot mount '%s'"), - mountpoint)); - } - } - -#ifdef illumos /* FreeBSD: overlay mounts are not checked. */ - /* - * Determine if the mountpoint is empty. If so, refuse to perform the - * mount. We don't perform this check if MS_OVERLAY is specified, which - * would defeat the point. We also avoid this check if 'remount' is - * specified. - */ - if ((flags & MS_OVERLAY) == 0 && - strstr(mntopts, MNTOPT_REMOUNT) == NULL && - !dir_is_empty(mountpoint)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "directory is not empty")); - return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, - dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); - } -#endif - - /* perform the mount */ - if (zmount(zfs_get_name(zhp), mountpoint, flags, - MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) { - /* - * Generic errors are nasty, but there are just way too many - * from mount(), and they're well-understood. 
We pick a few - * common ones to improve upon. - */ - if (errno == EBUSY) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "mountpoint or dataset is busy")); - } else if (errno == EPERM) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Insufficient privileges")); - } else if (errno == ENOTSUP) { - char buf[256]; - int spa_version; - - VERIFY(zfs_spa_version(zhp, &spa_version) == 0); - (void) snprintf(buf, sizeof (buf), - dgettext(TEXT_DOMAIN, "Can't mount a version %lld " - "file system on a version %d pool. Pool must be" - " upgraded to mount this file system."), - (u_longlong_t)zfs_prop_get_int(zhp, - ZFS_PROP_VERSION), spa_version); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf)); - } else { - zfs_error_aux(hdl, strerror(errno)); - } - return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, - dgettext(TEXT_DOMAIN, "cannot mount '%s'"), - zhp->zfs_name)); - } - - /* add the mounted entry into our cache */ - libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint, - mntopts); - return (0); -} - -/* - * Unmount a single filesystem. - */ -static int -unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) -{ - if (umount2(mountpoint, flags) != 0) { - zfs_error_aux(hdl, strerror(errno)); - return (zfs_error_fmt(hdl, EZFS_UMOUNTFAILED, - dgettext(TEXT_DOMAIN, "cannot unmount '%s'"), - mountpoint)); - } - - return (0); -} - -/* - * Unmount the given filesystem. - */ -int -zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - struct mnttab entry; - char *mntpt = NULL; - - /* check to see if we need to unmount the filesystem */ - if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && - libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) { - /* - * mountpoint may have come from a call to - * getmnt/getmntany if it isn't NULL. If it is NULL, - * we know it comes from libzfs_mnttab_find which can - * then get freed later. We strdup it to play it safe. 
- */ - if (mountpoint == NULL) - mntpt = zfs_strdup(hdl, entry.mnt_mountp); - else - mntpt = zfs_strdup(hdl, mountpoint); - - /* - * Unshare and unmount the filesystem - */ - if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0) - return (-1); - - if (unmount_one(hdl, mntpt, flags) != 0) { - free(mntpt); - (void) zfs_shareall(zhp); - return (-1); - } - libzfs_mnttab_remove(hdl, zhp->zfs_name); - free(mntpt); - } - - return (0); -} - -/* - * Unmount this filesystem and any children inheriting the mountpoint property. - * To do this, just act like we're changing the mountpoint property, but don't - * remount the filesystems afterwards. - */ -int -zfs_unmountall(zfs_handle_t *zhp, int flags) -{ - prop_changelist_t *clp; - int ret; - - clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags); - if (clp == NULL) - return (-1); - - ret = changelist_prefix(clp); - changelist_free(clp); - - return (ret); -} - -boolean_t -zfs_is_shared(zfs_handle_t *zhp) -{ - zfs_share_type_t rc = 0; - zfs_share_proto_t *curr_proto; - - if (ZFS_IS_VOLUME(zhp)) - return (B_FALSE); - - for (curr_proto = share_all_proto; *curr_proto != PROTO_END; - curr_proto++) - rc |= zfs_is_shared_proto(zhp, NULL, *curr_proto); - - return (rc ? B_TRUE : B_FALSE); -} - -int -zfs_share(zfs_handle_t *zhp) -{ - assert(!ZFS_IS_VOLUME(zhp)); - return (zfs_share_proto(zhp, share_all_proto)); -} - -int -zfs_unshare(zfs_handle_t *zhp) -{ - assert(!ZFS_IS_VOLUME(zhp)); - return (zfs_unshareall(zhp)); -} - -/* - * Check to see if the filesystem is currently shared. 
- */ -zfs_share_type_t -zfs_is_shared_proto(zfs_handle_t *zhp, char **where, zfs_share_proto_t proto) -{ - char *mountpoint; - zfs_share_type_t rc; - - if (!zfs_is_mounted(zhp, &mountpoint)) - return (SHARED_NOT_SHARED); - - if ((rc = is_shared(zhp->zfs_hdl, mountpoint, proto)) - != SHARED_NOT_SHARED) { - if (where != NULL) - *where = mountpoint; - else - free(mountpoint); - return (rc); - } else { - free(mountpoint); - return (SHARED_NOT_SHARED); - } -} - -boolean_t -zfs_is_shared_nfs(zfs_handle_t *zhp, char **where) -{ - return (zfs_is_shared_proto(zhp, where, - PROTO_NFS) != SHARED_NOT_SHARED); -} - -boolean_t -zfs_is_shared_smb(zfs_handle_t *zhp, char **where) -{ - return (zfs_is_shared_proto(zhp, where, - PROTO_SMB) != SHARED_NOT_SHARED); -} - -/* - * Make sure things will work if libshare isn't installed by using - * wrapper functions that check to see that the pointers to functions - * initialized in _zfs_init_libshare() are actually present. - */ - -#ifdef illumos -static sa_handle_t (*_sa_init)(int); -static sa_handle_t (*_sa_init_arg)(int, void *); -static void (*_sa_fini)(sa_handle_t); -static sa_share_t (*_sa_find_share)(sa_handle_t, char *); -static int (*_sa_enable_share)(sa_share_t, char *); -static int (*_sa_disable_share)(sa_share_t, char *); -static char *(*_sa_errorstr)(int); -static int (*_sa_parse_legacy_options)(sa_group_t, char *, char *); -static boolean_t (*_sa_needs_refresh)(sa_handle_t *); -static libzfs_handle_t *(*_sa_get_zfs_handle)(sa_handle_t); -static int (*_sa_zfs_process_share)(sa_handle_t, sa_group_t, sa_share_t, - char *, char *, zprop_source_t, char *, char *, char *); -static void (*_sa_update_sharetab_ts)(sa_handle_t); -#endif - -/* - * _zfs_init_libshare() - * - * Find the libshare.so.1 entry points that we use here and save the - * values to be used later. This is triggered by the runtime loader. - * Make sure the correct ISA version is loaded. 
- */ - -#pragma init(_zfs_init_libshare) -static void -_zfs_init_libshare(void) -{ -#ifdef illumos - void *libshare; - char path[MAXPATHLEN]; - char isa[MAXISALEN]; - -#if defined(_LP64) - if (sysinfo(SI_ARCHITECTURE_64, isa, MAXISALEN) == -1) - isa[0] = '\0'; -#else - isa[0] = '\0'; -#endif - (void) snprintf(path, MAXPATHLEN, - "/usr/lib/%s/libshare.so.1", isa); - - if ((libshare = dlopen(path, RTLD_LAZY | RTLD_GLOBAL)) != NULL) { - _sa_init = (sa_handle_t (*)(int))dlsym(libshare, "sa_init"); - _sa_init_arg = (sa_handle_t (*)(int, void *))dlsym(libshare, - "sa_init_arg"); - _sa_fini = (void (*)(sa_handle_t))dlsym(libshare, "sa_fini"); - _sa_find_share = (sa_share_t (*)(sa_handle_t, char *)) - dlsym(libshare, "sa_find_share"); - _sa_enable_share = (int (*)(sa_share_t, char *))dlsym(libshare, - "sa_enable_share"); - _sa_disable_share = (int (*)(sa_share_t, char *))dlsym(libshare, - "sa_disable_share"); - _sa_errorstr = (char *(*)(int))dlsym(libshare, "sa_errorstr"); - _sa_parse_legacy_options = (int (*)(sa_group_t, char *, char *)) - dlsym(libshare, "sa_parse_legacy_options"); - _sa_needs_refresh = (boolean_t (*)(sa_handle_t *)) - dlsym(libshare, "sa_needs_refresh"); - _sa_get_zfs_handle = (libzfs_handle_t *(*)(sa_handle_t)) - dlsym(libshare, "sa_get_zfs_handle"); - _sa_zfs_process_share = (int (*)(sa_handle_t, sa_group_t, - sa_share_t, char *, char *, zprop_source_t, char *, - char *, char *))dlsym(libshare, "sa_zfs_process_share"); - _sa_update_sharetab_ts = (void (*)(sa_handle_t)) - dlsym(libshare, "sa_update_sharetab_ts"); - if (_sa_init == NULL || _sa_init_arg == NULL || - _sa_fini == NULL || _sa_find_share == NULL || - _sa_enable_share == NULL || _sa_disable_share == NULL || - _sa_errorstr == NULL || _sa_parse_legacy_options == NULL || - _sa_needs_refresh == NULL || _sa_get_zfs_handle == NULL || - _sa_zfs_process_share == NULL || - _sa_update_sharetab_ts == NULL) { - _sa_init = NULL; - _sa_init_arg = NULL; - _sa_fini = NULL; - _sa_disable_share = NULL; - 
_sa_enable_share = NULL; - _sa_errorstr = NULL; - _sa_parse_legacy_options = NULL; - (void) dlclose(libshare); - _sa_needs_refresh = NULL; - _sa_get_zfs_handle = NULL; - _sa_zfs_process_share = NULL; - _sa_update_sharetab_ts = NULL; - } - } -#endif -} - -/* - * zfs_init_libshare(zhandle, service) - * - * Initialize the libshare API if it hasn't already been initialized. - * In all cases it returns 0 if it succeeded and an error if not. The - * service value is which part(s) of the API to initialize and is a - * direct map to the libshare sa_init(service) interface. - */ -static int -zfs_init_libshare_impl(libzfs_handle_t *zhandle, int service, void *arg) -{ -#ifdef illumos - /* - * libshare is either not installed or we're in a branded zone. The - * rest of the wrapper functions around the libshare calls already - * handle NULL function pointers, but we don't want the callers of - * zfs_init_libshare() to fail prematurely if libshare is not available. - */ - if (_sa_init == NULL) - return (SA_OK); - - /* - * Attempt to refresh libshare. This is necessary if there was a cache - * miss for a new ZFS dataset that was just created, or if state of the - * sharetab file has changed since libshare was last initialized. We - * want to make sure so check timestamps to see if a different process - * has updated any of the configuration. If there was some non-ZFS - * change, we need to re-initialize the internal cache. 
- */ - if (_sa_needs_refresh != NULL && - _sa_needs_refresh(zhandle->libzfs_sharehdl)) { - zfs_uninit_libshare(zhandle); - zhandle->libzfs_sharehdl = _sa_init_arg(service, arg); - } - - if (zhandle && zhandle->libzfs_sharehdl == NULL) - zhandle->libzfs_sharehdl = _sa_init_arg(service, arg); - - if (zhandle->libzfs_sharehdl == NULL) - return (SA_NO_MEMORY); -#endif - - return (SA_OK); -} -int -zfs_init_libshare(libzfs_handle_t *zhandle, int service) -{ - return (zfs_init_libshare_impl(zhandle, service, NULL)); -} - -int -zfs_init_libshare_arg(libzfs_handle_t *zhandle, int service, void *arg) -{ - return (zfs_init_libshare_impl(zhandle, service, arg)); -} - - -/* - * zfs_uninit_libshare(zhandle) - * - * Uninitialize the libshare API if it hasn't already been - * uninitialized. It is OK to call multiple times. - */ -void -zfs_uninit_libshare(libzfs_handle_t *zhandle) -{ - if (zhandle != NULL && zhandle->libzfs_sharehdl != NULL) { -#ifdef illumos - if (_sa_fini != NULL) - _sa_fini(zhandle->libzfs_sharehdl); -#endif - zhandle->libzfs_sharehdl = NULL; - } -} - -/* - * zfs_parse_options(options, proto) - * - * Call the legacy parse interface to get the protocol specific - * options using the NULL arg to indicate that this is a "parse" only. - */ -int -zfs_parse_options(char *options, zfs_share_proto_t proto) -{ -#ifdef illumos - if (_sa_parse_legacy_options != NULL) { - return (_sa_parse_legacy_options(NULL, options, - proto_table[proto].p_name)); - } - return (SA_CONFIG_ERR); -#else - return (SA_OK); -#endif -} - -#ifdef illumos -/* - * zfs_sa_find_share(handle, path) - * - * wrapper around sa_find_share to find a share path in the - * configuration. - */ -static sa_share_t -zfs_sa_find_share(sa_handle_t handle, char *path) -{ - if (_sa_find_share != NULL) - return (_sa_find_share(handle, path)); - return (NULL); -} - -/* - * zfs_sa_enable_share(share, proto) - * - * Wrapper for sa_enable_share which enables a share for a specified - * protocol. 
- */ -static int -zfs_sa_enable_share(sa_share_t share, char *proto) -{ - if (_sa_enable_share != NULL) - return (_sa_enable_share(share, proto)); - return (SA_CONFIG_ERR); -} - -/* - * zfs_sa_disable_share(share, proto) - * - * Wrapper for sa_enable_share which disables a share for a specified - * protocol. - */ -static int -zfs_sa_disable_share(sa_share_t share, char *proto) -{ - if (_sa_disable_share != NULL) - return (_sa_disable_share(share, proto)); - return (SA_CONFIG_ERR); -} -#endif /* illumos */ - -/* - * Share the given filesystem according to the options in the specified - * protocol specific properties (sharenfs, sharesmb). We rely - * on "libshare" to the dirty work for us. - */ -static int -zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) -{ - char mountpoint[ZFS_MAXPROPLEN]; - char shareopts[ZFS_MAXPROPLEN]; - char sourcestr[ZFS_MAXPROPLEN]; - libzfs_handle_t *hdl = zhp->zfs_hdl; - zfs_share_proto_t *curr_proto; - zprop_source_t sourcetype; - int error, ret; - - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) - return (0); - - for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { - /* - * Return success if there are no share options. - */ - if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop, - shareopts, sizeof (shareopts), &sourcetype, sourcestr, - ZFS_MAXPROPLEN, B_FALSE) != 0 || - strcmp(shareopts, "off") == 0) - continue; -#ifdef illumos - ret = zfs_init_libshare_arg(hdl, SA_INIT_ONE_SHARE_FROM_HANDLE, - zhp); - if (ret != SA_OK) { - (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, - dgettext(TEXT_DOMAIN, "cannot share '%s': %s"), - zfs_get_name(zhp), _sa_errorstr != NULL ? - _sa_errorstr(ret) : ""); - return (-1); - } -#endif - - /* - * If the 'zoned' property is set, then zfs_is_mountable() - * will have already bailed out if we are in the global zone. - * But local zones cannot be NFS servers, so we ignore it for - * local zones as well. 
- */ - if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) - continue; - -#ifdef illumos - share = zfs_sa_find_share(hdl->libzfs_sharehdl, mountpoint); - if (share == NULL) { - /* - * This may be a new file system that was just - * created so isn't in the internal cache - * (second time through). Rather than - * reloading the entire configuration, we can - * assume ZFS has done the checking and it is - * safe to add this to the internal - * configuration. - */ - if (_sa_zfs_process_share(hdl->libzfs_sharehdl, - NULL, NULL, mountpoint, - proto_table[*curr_proto].p_name, sourcetype, - shareopts, sourcestr, zhp->zfs_name) != SA_OK) { - (void) zfs_error_fmt(hdl, - proto_table[*curr_proto].p_share_err, - dgettext(TEXT_DOMAIN, "cannot share '%s'"), - zfs_get_name(zhp)); - return (-1); - } - share = zfs_sa_find_share(hdl->libzfs_sharehdl, - mountpoint); - } - if (share != NULL) { - int err; - err = zfs_sa_enable_share(share, - proto_table[*curr_proto].p_name); - if (err != SA_OK) { - (void) zfs_error_fmt(hdl, - proto_table[*curr_proto].p_share_err, - dgettext(TEXT_DOMAIN, "cannot share '%s'"), - zfs_get_name(zhp)); - return (-1); - } - } else -#else - if (*curr_proto != PROTO_NFS) { - fprintf(stderr, "Unsupported share protocol: %d.\n", - *curr_proto); - continue; - } - - if (strcmp(shareopts, "on") == 0) - error = fsshare(ZFS_EXPORTS_PATH, mountpoint, ""); - else - error = fsshare(ZFS_EXPORTS_PATH, mountpoint, shareopts); - if (error != 0) -#endif - { - (void) zfs_error_fmt(hdl, - proto_table[*curr_proto].p_share_err, - dgettext(TEXT_DOMAIN, "cannot share '%s'"), - zfs_get_name(zhp)); - return (-1); - } - - } - return (0); -} - - -int -zfs_share_nfs(zfs_handle_t *zhp) -{ - return (zfs_share_proto(zhp, nfs_only)); -} - -int -zfs_share_smb(zfs_handle_t *zhp) -{ - return (zfs_share_proto(zhp, smb_only)); -} - -int -zfs_shareall(zfs_handle_t *zhp) -{ - return (zfs_share_proto(zhp, share_all_proto)); -} - -/* - * Unshare a filesystem by mountpoint. 
- */ -static int -unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, - zfs_share_proto_t proto) -{ -#ifdef illumos - sa_share_t share; - int err; - char *mntpt; - - /* - * Mountpoint could get trashed if libshare calls getmntany - * which it does during API initialization, so strdup the - * value. - */ - mntpt = zfs_strdup(hdl, mountpoint); - - /* - * make sure libshare initialized, initialize everything because we - * don't know what other unsharing may happen later. Functions up the - * stack are allowed to initialize instead a subset of shares at the - * time the set is known. - */ - if ((err = zfs_init_libshare_arg(hdl, SA_INIT_ONE_SHARE_FROM_NAME, - (void *)name)) != SA_OK) { - free(mntpt); /* don't need the copy anymore */ - return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, - dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), - name, _sa_errorstr(err))); - } - - share = zfs_sa_find_share(hdl->libzfs_sharehdl, mntpt); - free(mntpt); /* don't need the copy anymore */ - - if (share != NULL) { - err = zfs_sa_disable_share(share, proto_table[proto].p_name); - if (err != SA_OK) { - return (zfs_error_fmt(hdl, - proto_table[proto].p_unshare_err, - dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), - name, _sa_errorstr(err))); - } - } else { - return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err, - dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"), - name)); - } -#else - char buf[MAXPATHLEN]; - FILE *fp; - int err; - - if (proto != PROTO_NFS) { - fprintf(stderr, "No SMB support in FreeBSD yet.\n"); - return (EOPNOTSUPP); - } - - err = fsunshare(ZFS_EXPORTS_PATH, mountpoint); - if (err != 0) { - zfs_error_aux(hdl, "%s", strerror(err)); - return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED, - dgettext(TEXT_DOMAIN, - "cannot unshare '%s'"), name)); - } -#endif - return (0); -} - -/* - * Unshare the given filesystem. 
- */ -int -zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, - zfs_share_proto_t *proto) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - struct mnttab entry; - char *mntpt = NULL; - - /* check to see if need to unmount the filesystem */ - rewind(zhp->zfs_hdl->libzfs_mnttab); - if (mountpoint != NULL) - mountpoint = mntpt = zfs_strdup(hdl, mountpoint); - - if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && - libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) { - zfs_share_proto_t *curr_proto; - - if (mountpoint == NULL) - mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp); - - for (curr_proto = proto; *curr_proto != PROTO_END; - curr_proto++) { - - if (is_shared(hdl, mntpt, *curr_proto) && - unshare_one(hdl, zhp->zfs_name, - mntpt, *curr_proto) != 0) { - if (mntpt != NULL) - free(mntpt); - return (-1); - } - } - } - if (mntpt != NULL) - free(mntpt); - - return (0); -} - -int -zfs_unshare_nfs(zfs_handle_t *zhp, const char *mountpoint) -{ - return (zfs_unshare_proto(zhp, mountpoint, nfs_only)); -} - -int -zfs_unshare_smb(zfs_handle_t *zhp, const char *mountpoint) -{ - return (zfs_unshare_proto(zhp, mountpoint, smb_only)); -} - -/* - * Same as zfs_unmountall(), but for NFS and SMB unshares. 
- */ -int -zfs_unshareall_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) -{ - prop_changelist_t *clp; - int ret; - - clp = changelist_gather(zhp, ZFS_PROP_SHARENFS, 0, 0); - if (clp == NULL) - return (-1); - - ret = changelist_unshare(clp, proto); - changelist_free(clp); - - return (ret); -} - -int -zfs_unshareall_nfs(zfs_handle_t *zhp) -{ - return (zfs_unshareall_proto(zhp, nfs_only)); -} - -int -zfs_unshareall_smb(zfs_handle_t *zhp) -{ - return (zfs_unshareall_proto(zhp, smb_only)); -} - -int -zfs_unshareall(zfs_handle_t *zhp) -{ - return (zfs_unshareall_proto(zhp, share_all_proto)); -} - -int -zfs_unshareall_bypath(zfs_handle_t *zhp, const char *mountpoint) -{ - return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); -} - -/* - * Remove the mountpoint associated with the current dataset, if necessary. - * We only remove the underlying directory if: - * - * - The mountpoint is not 'none' or 'legacy' - * - The mountpoint is non-empty - * - The mountpoint is the default or inherited - * - The 'zoned' property is set, or we're in a local zone - * - * Any other directories we leave alone. - */ -void -remove_mountpoint(zfs_handle_t *zhp) -{ - char mountpoint[ZFS_MAXPROPLEN]; - zprop_source_t source; - - if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), - &source)) - return; - - if (source == ZPROP_SRC_DEFAULT || - source == ZPROP_SRC_INHERITED) { - /* - * Try to remove the directory, silently ignoring any errors. - * The filesystem may have since been removed or moved around, - * and this error isn't really useful to the administrator in - * any way. - */ - (void) rmdir(mountpoint); - } -} - -/* - * Add the given zfs handle to the cb_handles array, dynamically reallocating - * the array if it is out of space - */ -void -libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp) -{ - if (cbp->cb_alloc == cbp->cb_used) { - size_t newsz; - zfs_handle_t **newhandles; - - newsz = cbp->cb_alloc != 0 ? 
cbp->cb_alloc * 2 : 64; - newhandles = zfs_realloc(zhp->zfs_hdl, - cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *), - newsz * sizeof (zfs_handle_t *)); - cbp->cb_handles = newhandles; - cbp->cb_alloc = newsz; - } - cbp->cb_handles[cbp->cb_used++] = zhp; -} - -/* - * Recursive helper function used during file system enumeration - */ -static int -zfs_iter_cb(zfs_handle_t *zhp, void *data) -{ - get_all_cb_t *cbp = data; - - if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) { - zfs_close(zhp); - return (0); - } - - if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) { - zfs_close(zhp); - return (0); - } - - /* - * If this filesystem is inconsistent and has a receive resume - * token, we can not mount it. - */ - if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && - zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, - NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { - zfs_close(zhp); - return (0); - } - - libzfs_add_handle(cbp, zhp); - if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) { - zfs_close(zhp); - return (-1); - } - return (0); -} - -/* - * Sort comparator that compares two mountpoint paths. We sort these paths so - * that subdirectories immediately follow their parents. This means that we - * effectively treat the '/' character as the lowest value non-nul char. - * Since filesystems from non-global zones can have the same mountpoint - * as other filesystems, the comparator sorts global zone filesystems to - * the top of the list. This means that the global zone will traverse the - * filesystem list in the correct order and can stop when it sees the - * first zoned filesystem. In a non-global zone, only the delegated - * filesystems are seen. - * - * An example sorted list using this comparator would look like: - * - * /foo - * /foo/bar - * /foo/bar/baz - * /foo/baz - * /foo.bar - * /foo (NGZ1) - * /foo (NGZ2) - * - * The mount code depend on this ordering to deterministically iterate - * over filesystems in order to spawn parallel mount tasks. 
- */ -static int -mountpoint_cmp(const void *arga, const void *argb) -{ - zfs_handle_t *const *zap = arga; - zfs_handle_t *za = *zap; - zfs_handle_t *const *zbp = argb; - zfs_handle_t *zb = *zbp; - char mounta[MAXPATHLEN]; - char mountb[MAXPATHLEN]; - const char *a = mounta; - const char *b = mountb; - boolean_t gota, gotb; - uint64_t zoneda, zonedb; - - zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED); - zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED); - if (zoneda && !zonedb) - return (1); - if (!zoneda && zonedb) - return (-1); - gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM); - if (gota) - verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta, - sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); - gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM); - if (gotb) - verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb, - sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); - - if (gota && gotb) { - while (*a != '\0' && (*a == *b)) { - a++; - b++; - } - if (*a == *b) - return (0); - if (*a == '\0') - return (-1); - if (*b == '\0') - return (1); - if (*a == '/') - return (-1); - if (*b == '/') - return (1); - return (*a < *b ? -1 : *a > *b); - } - - if (gota) - return (-1); - if (gotb) - return (1); - - /* - * If neither filesystem has a mountpoint, revert to sorting by - * datset name. - */ - return (strcmp(zfs_get_name(za), zfs_get_name(zb))); -} - -/* - * Return true if path2 is a child of path1 or path2 equals path1 or - * path1 is "/" (path2 is always a child of "/"). 
- */ -static boolean_t -libzfs_path_contains(const char *path1, const char *path2) -{ - return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 || - (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/')); -} - - -static int -non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx) -{ - char parent[ZFS_MAXPROPLEN]; - char child[ZFS_MAXPROPLEN]; - int i; - - verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent, - sizeof (parent), NULL, NULL, 0, B_FALSE) == 0); - - for (i = idx + 1; i < num_handles; i++) { - verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child, - sizeof (child), NULL, NULL, 0, B_FALSE) == 0); - if (!libzfs_path_contains(parent, child)) - break; - } - return (i); -} - -typedef struct mnt_param { - libzfs_handle_t *mnt_hdl; - tpool_t *mnt_tp; - zfs_handle_t **mnt_zhps; /* filesystems to mount */ - size_t mnt_num_handles; - int mnt_idx; /* Index of selected entry to mount */ - zfs_iter_f mnt_func; - void *mnt_data; -} mnt_param_t; - -/* - * Allocate and populate the parameter struct for mount function, and - * schedule mounting of the entry selected by idx. - */ -static void -zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles, - size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp) -{ - mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t)); - - mnt_param->mnt_hdl = hdl; - mnt_param->mnt_tp = tp; - mnt_param->mnt_zhps = handles; - mnt_param->mnt_num_handles = num_handles; - mnt_param->mnt_idx = idx; - mnt_param->mnt_func = func; - mnt_param->mnt_data = data; - - (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param); -} - -/* - * This is the structure used to keep state of mounting or sharing operations - * during a call to zpool_enable_datasets(). - */ -typedef struct mount_state { - /* - * ms_mntstatus is set to -1 if any mount fails. While multiple threads - * could update this variable concurrently, no synchronization is - * needed as it's only ever set to -1. 
- */ - int ms_mntstatus; - int ms_mntflags; - const char *ms_mntopts; -} mount_state_t; - -static int -zfs_mount_one(zfs_handle_t *zhp, void *arg) -{ - mount_state_t *ms = arg; - int ret = 0; - - if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0) - ret = ms->ms_mntstatus = -1; - return (ret); -} - -static int -zfs_share_one(zfs_handle_t *zhp, void *arg) -{ - mount_state_t *ms = arg; - int ret = 0; - - if (zfs_share(zhp) != 0) - ret = ms->ms_mntstatus = -1; - return (ret); -} - -/* - * Thread pool function to mount one file system. On completion, it finds and - * schedules its children to be mounted. This depends on the sorting done in - * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries - * each descending from the previous) will have no parallelism since we always - * have to wait for the parent to finish mounting before we can schedule - * its children. - */ -static void -zfs_mount_task(void *arg) -{ - mnt_param_t *mp = arg; - int idx = mp->mnt_idx; - zfs_handle_t **handles = mp->mnt_zhps; - size_t num_handles = mp->mnt_num_handles; - char mountpoint[ZFS_MAXPROPLEN]; - - verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint, - sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); - - if (mp->mnt_func(handles[idx], mp->mnt_data) != 0) - return; - - /* - * We dispatch tasks to mount filesystems with mountpoints underneath - * this one. We do this by dispatching the next filesystem with a - * descendant mountpoint of the one we just mounted, then skip all of - * its descendants, dispatch the next descendant mountpoint, and so on. - * The non_descendant_idx() function skips over filesystems that are - * descendants of the filesystem we just dispatched. 
- */ - for (int i = idx + 1; i < num_handles; - i = non_descendant_idx(handles, num_handles, i)) { - char child[ZFS_MAXPROPLEN]; - verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, - child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0); - - if (!libzfs_path_contains(mountpoint, child)) - break; /* not a descendant, return */ - zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i, - mp->mnt_func, mp->mnt_data, mp->mnt_tp); - } - free(mp); -} - -/* - * Issue the func callback for each ZFS handle contained in the handles - * array. This function is used to mount all datasets, and so this function - * guarantees that filesystems for parent mountpoints are called before their - * children. As such, before issuing any callbacks, we first sort the array - * of handles by mountpoint. - * - * Callbacks are issued in one of two ways: - * - * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT - * environment variable is set, then we issue callbacks sequentially. - * - * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT - * environment variable is not set, then we use a tpool to dispatch threads - * to mount filesystems in parallel. This function dispatches tasks to mount - * the filesystems at the top-level mountpoints, and these tasks in turn - * are responsible for recursively mounting filesystems in their children - * mountpoints. - */ -void -zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, - size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel) -{ - zoneid_t zoneid = getzoneid(); - - /* - * The ZFS_SERIAL_MOUNT environment variable is an undocumented - * variable that can be used as a convenience to do a/b comparison - * of serial vs. parallel mounting. - */ - boolean_t serial_mount = !parallel || - (getenv("ZFS_SERIAL_MOUNT") != NULL); - - /* - * Sort the datasets by mountpoint. See mountpoint_cmp for details - * of how these are sorted. 
- */ - qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp); - - if (serial_mount) { - for (int i = 0; i < num_handles; i++) { - func(handles[i], data); - } - return; - } - - /* - * Issue the callback function for each dataset using a parallel - * algorithm that uses a thread pool to manage threads. - */ - tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL); - - /* - * There may be multiple "top level" mountpoints outside of the pool's - * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of - * these. - */ - for (int i = 0; i < num_handles; - i = non_descendant_idx(handles, num_handles, i)) { - /* - * Since the mountpoints have been sorted so that the zoned - * filesystems are at the end, a zoned filesystem seen from - * the global zone means that we're done. - */ - if (zoneid == GLOBAL_ZONEID && - zfs_prop_get_int(handles[i], ZFS_PROP_ZONED)) - break; - zfs_dispatch_mount(hdl, handles, num_handles, i, func, data, - tp); - } - - tpool_wait(tp); /* wait for all scheduled mounts to complete */ - tpool_destroy(tp); -} - -/* - * Mount and share all datasets within the given pool. This assumes that no - * datasets within the pool are currently mounted. - */ -#pragma weak zpool_mount_datasets = zpool_enable_datasets -int -zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) -{ - get_all_cb_t cb = { 0 }; - mount_state_t ms = { 0 }; - zfs_handle_t *zfsp; - int ret = 0; - - if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, - ZFS_TYPE_DATASET)) == NULL) - goto out; - - /* - * Gather all non-snapshot datasets within the pool. Start by adding - * the root filesystem for this pool to the list, and then iterate - * over all child filesystems. 
- */ - libzfs_add_handle(&cb, zfsp); - if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0) - goto out; - - /* - * Mount all filesystems - */ - ms.ms_mntopts = mntopts; - ms.ms_mntflags = flags; - zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, - zfs_mount_one, &ms, B_TRUE); - if (ms.ms_mntstatus != 0) - ret = ms.ms_mntstatus; - - /* - * Share all filesystems that need to be shared. This needs to be - * a separate pass because libshare is not mt-safe, and so we need - * to share serially. - */ - ms.ms_mntstatus = 0; - zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, - zfs_share_one, &ms, B_FALSE); - if (ms.ms_mntstatus != 0) - ret = ms.ms_mntstatus; - -out: - for (int i = 0; i < cb.cb_used; i++) - zfs_close(cb.cb_handles[i]); - free(cb.cb_handles); - - return (ret); -} - -static int -mountpoint_compare(const void *a, const void *b) -{ - const char *mounta = *((char **)a); - const char *mountb = *((char **)b); - - return (strcmp(mountb, mounta)); -} - -/* alias for 2002/240 */ -#pragma weak zpool_unmount_datasets = zpool_disable_datasets -/* - * Unshare and unmount all datasets within the given pool. We don't want to - * rely on traversing the DSL to discover the filesystems within the pool, - * because this may be expensive (if not all of them are mounted), and can fail - * arbitrarily (on I/O error, for example). Instead, we walk /etc/mnttab and - * gather all the filesystems that are currently mounted. - */ -int -zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) -{ - int used, alloc; - struct mnttab entry; - size_t namelen; - char **mountpoints = NULL; - zfs_handle_t **datasets = NULL; - libzfs_handle_t *hdl = zhp->zpool_hdl; - int i; - int ret = -1; - int flags = (force ? 
MS_FORCE : 0); -#ifdef illumos - sa_init_selective_arg_t sharearg; -#endif - - namelen = strlen(zhp->zpool_name); - - rewind(hdl->libzfs_mnttab); - used = alloc = 0; - while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { - /* - * Ignore non-ZFS entries. - */ - if (entry.mnt_fstype == NULL || - strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) - continue; - - /* - * Ignore filesystems not within this pool. - */ - if (entry.mnt_mountp == NULL || - strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 || - (entry.mnt_special[namelen] != '/' && - entry.mnt_special[namelen] != '\0')) - continue; - - /* - * At this point we've found a filesystem within our pool. Add - * it to our growing list. - */ - if (used == alloc) { - if (alloc == 0) { - if ((mountpoints = zfs_alloc(hdl, - 8 * sizeof (void *))) == NULL) - goto out; - - if ((datasets = zfs_alloc(hdl, - 8 * sizeof (void *))) == NULL) - goto out; - - alloc = 8; - } else { - void *ptr; - - if ((ptr = zfs_realloc(hdl, mountpoints, - alloc * sizeof (void *), - alloc * 2 * sizeof (void *))) == NULL) - goto out; - mountpoints = ptr; - - if ((ptr = zfs_realloc(hdl, datasets, - alloc * sizeof (void *), - alloc * 2 * sizeof (void *))) == NULL) - goto out; - datasets = ptr; - - alloc *= 2; - } - } - - if ((mountpoints[used] = zfs_strdup(hdl, - entry.mnt_mountp)) == NULL) - goto out; - - /* - * This is allowed to fail, in case there is some I/O error. It - * is only used to determine if we need to remove the underlying - * mountpoint, so failure is not fatal. - */ - datasets[used] = make_dataset_handle(hdl, entry.mnt_special); - - used++; - } - - /* - * At this point, we have the entire list of filesystems, so sort it by - * mountpoint. 
- */ -#ifdef illumos - sharearg.zhandle_arr = datasets; - sharearg.zhandle_len = used; - ret = zfs_init_libshare_arg(hdl, SA_INIT_SHARE_API_SELECTIVE, - &sharearg); - if (ret != 0) - goto out; -#endif - qsort(mountpoints, used, sizeof (char *), mountpoint_compare); - - /* - * Walk through and first unshare everything. - */ - for (i = 0; i < used; i++) { - zfs_share_proto_t *curr_proto; - for (curr_proto = share_all_proto; *curr_proto != PROTO_END; - curr_proto++) { - if (is_shared(hdl, mountpoints[i], *curr_proto) && - unshare_one(hdl, mountpoints[i], - mountpoints[i], *curr_proto) != 0) - goto out; - } - } - - /* - * Now unmount everything, removing the underlying directories as - * appropriate. - */ - for (i = 0; i < used; i++) { - if (unmount_one(hdl, mountpoints[i], flags) != 0) - goto out; - } - - for (i = 0; i < used; i++) { - if (datasets[i]) - remove_mountpoint(datasets[i]); - } - - ret = 0; -out: - for (i = 0; i < used; i++) { - if (datasets[i]) - zfs_close(datasets[i]); - free(mountpoints[i]); - } - free(datasets); - free(mountpoints); - - return (ret); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c deleted file mode 100644 index 434f77e27da9..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c +++ /dev/null @@ -1,4669 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. - * Copyright 2016 Igor Kozhukhov - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_namecheck.h" -#include "zfs_prop.h" -#include "libzfs_impl.h" -#include "zfs_comutil.h" -#include "zfeature_common.h" - -static int read_efi_label(nvlist_t *, diskaddr_t *, boolean_t *); -static boolean_t zpool_vdev_is_interior(const char *name); - -#define BACKUP_SLICE "s2" - -typedef struct prop_flags { - int create:1; /* Validate property on creation */ - int import:1; /* Validate property on import */ -} prop_flags_t; - -/* - * ==================================================================== - * zpool property functions - * ==================================================================== - */ - -static int -zpool_get_all_props(zpool_handle_t *zhp) -{ - zfs_cmd_t zc = { 0 }; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - - if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) - return (-1); - - while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { - if (errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - } else { - zcmd_free_nvlists(&zc); - return (-1); - } - } - - if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - - 
zcmd_free_nvlists(&zc); - - return (0); -} - -static int -zpool_props_refresh(zpool_handle_t *zhp) -{ - nvlist_t *old_props; - - old_props = zhp->zpool_props; - - if (zpool_get_all_props(zhp) != 0) - return (-1); - - nvlist_free(old_props); - return (0); -} - -static char * -zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop, - zprop_source_t *src) -{ - nvlist_t *nv, *nvl; - uint64_t ival; - char *value; - zprop_source_t source; - - nvl = zhp->zpool_props; - if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { - verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0); - source = ival; - verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); - } else { - source = ZPROP_SRC_DEFAULT; - if ((value = (char *)zpool_prop_default_string(prop)) == NULL) - value = "-"; - } - - if (src) - *src = source; - - return (value); -} - -uint64_t -zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) -{ - nvlist_t *nv, *nvl; - uint64_t value; - zprop_source_t source; - - if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { - /* - * zpool_get_all_props() has most likely failed because - * the pool is faulted, but if all we need is the top level - * vdev's guid then get it from the zhp config nvlist. - */ - if ((prop == ZPOOL_PROP_GUID) && - (nvlist_lookup_nvlist(zhp->zpool_config, - ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && - (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) - == 0)) { - return (value); - } - return (zpool_prop_default_numeric(prop)); - } - - nvl = zhp->zpool_props; - if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { - verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0); - source = value; - verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); - } else { - source = ZPROP_SRC_DEFAULT; - value = zpool_prop_default_numeric(prop); - } - - if (src) - *src = source; - - return (value); -} - -/* - * Map VDEV STATE to printed strings. 
- */ -const char * -zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) -{ - switch (state) { - case VDEV_STATE_CLOSED: - case VDEV_STATE_OFFLINE: - return (gettext("OFFLINE")); - case VDEV_STATE_REMOVED: - return (gettext("REMOVED")); - case VDEV_STATE_CANT_OPEN: - if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) - return (gettext("FAULTED")); - else if (aux == VDEV_AUX_SPLIT_POOL) - return (gettext("SPLIT")); - else - return (gettext("UNAVAIL")); - case VDEV_STATE_FAULTED: - return (gettext("FAULTED")); - case VDEV_STATE_DEGRADED: - return (gettext("DEGRADED")); - case VDEV_STATE_HEALTHY: - return (gettext("ONLINE")); - - default: - break; - } - - return (gettext("UNKNOWN")); -} - -/* - * Map POOL STATE to printed strings. - */ -const char * -zpool_pool_state_to_name(pool_state_t state) -{ - switch (state) { - case POOL_STATE_ACTIVE: - return (gettext("ACTIVE")); - case POOL_STATE_EXPORTED: - return (gettext("EXPORTED")); - case POOL_STATE_DESTROYED: - return (gettext("DESTROYED")); - case POOL_STATE_SPARE: - return (gettext("SPARE")); - case POOL_STATE_L2CACHE: - return (gettext("L2CACHE")); - case POOL_STATE_UNINITIALIZED: - return (gettext("UNINITIALIZED")); - case POOL_STATE_UNAVAIL: - return (gettext("UNAVAIL")); - case POOL_STATE_POTENTIALLY_ACTIVE: - return (gettext("POTENTIALLY_ACTIVE")); - } - - return (gettext("UNKNOWN")); -} - -/* - * Get a zpool property value for 'prop' and return the value in - * a pre-allocated buffer. 
- */ -int -zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, - zprop_source_t *srctype, boolean_t literal) -{ - uint64_t intval; - const char *strval; - zprop_source_t src = ZPROP_SRC_NONE; - nvlist_t *nvroot; - vdev_stat_t *vs; - uint_t vsc; - - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - switch (prop) { - case ZPOOL_PROP_NAME: - (void) strlcpy(buf, zpool_get_name(zhp), len); - break; - - case ZPOOL_PROP_HEALTH: - (void) strlcpy(buf, - zpool_pool_state_to_name(POOL_STATE_UNAVAIL), len); - break; - - case ZPOOL_PROP_GUID: - intval = zpool_get_prop_int(zhp, prop, &src); - (void) snprintf(buf, len, "%llu", intval); - break; - - case ZPOOL_PROP_ALTROOT: - case ZPOOL_PROP_CACHEFILE: - case ZPOOL_PROP_COMMENT: - if (zhp->zpool_props != NULL || - zpool_get_all_props(zhp) == 0) { - (void) strlcpy(buf, - zpool_get_prop_string(zhp, prop, &src), - len); - break; - } - /* FALLTHROUGH */ - default: - (void) strlcpy(buf, "-", len); - break; - } - - if (srctype != NULL) - *srctype = src; - return (0); - } - - if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && - prop != ZPOOL_PROP_NAME) - return (-1); - - switch (zpool_prop_get_type(prop)) { - case PROP_TYPE_STRING: - (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), - len); - break; - - case PROP_TYPE_NUMBER: - intval = zpool_get_prop_int(zhp, prop, &src); - - switch (prop) { - case ZPOOL_PROP_SIZE: - case ZPOOL_PROP_ALLOCATED: - case ZPOOL_PROP_FREE: - case ZPOOL_PROP_FREEING: - case ZPOOL_PROP_LEAKED: - if (literal) { - (void) snprintf(buf, len, "%llu", - (u_longlong_t)intval); - } else { - (void) zfs_nicenum(intval, buf, len); - } - break; - case ZPOOL_PROP_BOOTSIZE: - case ZPOOL_PROP_EXPANDSZ: - case ZPOOL_PROP_CHECKPOINT: - if (intval == 0) { - (void) strlcpy(buf, "-", len); - } else if (literal) { - (void) snprintf(buf, len, "%llu", - (u_longlong_t)intval); - } else { - (void) zfs_nicenum(intval, buf, len); - } - break; - case ZPOOL_PROP_CAPACITY: - if (literal) { - 
(void) snprintf(buf, len, "%llu", - (u_longlong_t)intval); - } else { - (void) snprintf(buf, len, "%llu%%", - (u_longlong_t)intval); - } - break; - case ZPOOL_PROP_FRAGMENTATION: - if (intval == UINT64_MAX) { - (void) strlcpy(buf, "-", len); - } else { - (void) snprintf(buf, len, "%llu%%", - (u_longlong_t)intval); - } - break; - case ZPOOL_PROP_DEDUPRATIO: - (void) snprintf(buf, len, "%llu.%02llux", - (u_longlong_t)(intval / 100), - (u_longlong_t)(intval % 100)); - break; - case ZPOOL_PROP_HEALTH: - verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) - == 0); - - (void) strlcpy(buf, zpool_state_to_name(intval, - vs->vs_aux), len); - break; - case ZPOOL_PROP_VERSION: - if (intval >= SPA_VERSION_FEATURES) { - (void) snprintf(buf, len, "-"); - break; - } - /* FALLTHROUGH */ - default: - (void) snprintf(buf, len, "%llu", intval); - } - break; - - case PROP_TYPE_INDEX: - intval = zpool_get_prop_int(zhp, prop, &src); - if (zpool_prop_index_to_string(prop, intval, &strval) - != 0) - return (-1); - (void) strlcpy(buf, strval, len); - break; - - default: - abort(); - } - - if (srctype) - *srctype = src; - - return (0); -} - -/* - * Check if the bootfs name has the same pool name as it is set to. - * Assuming bootfs is a valid dataset name. 
- */ -static boolean_t -bootfs_name_valid(const char *pool, const char *bootfs) -{ - int len = strlen(pool); - - if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) - return (B_FALSE); - - if (strncmp(pool, bootfs, len) == 0 && - (bootfs[len] == '/' || bootfs[len] == '\0')) - return (B_TRUE); - - return (B_FALSE); -} - -boolean_t -zpool_is_bootable(zpool_handle_t *zhp) -{ - char bootfs[ZFS_MAX_DATASET_NAME_LEN]; - - return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs, - sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-", - sizeof (bootfs)) != 0); -} - - -/* - * Given an nvlist of zpool properties to be set, validate that they are - * correct, and parse any numeric properties (index, boolean, etc) if they are - * specified as strings. - */ -static nvlist_t * -zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, - nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) -{ - nvpair_t *elem; - nvlist_t *retprops; - zpool_prop_t prop; - char *strval; - uint64_t intval; - char *slash, *check; - struct stat64 statbuf; - zpool_handle_t *zhp; - - if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { - (void) no_memory(hdl); - return (NULL); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - const char *propname = nvpair_name(elem); - - prop = zpool_name_to_prop(propname); - if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) { - int err; - char *fname = strchr(propname, '@') + 1; - - err = zfeature_lookup_name(fname, NULL); - if (err != 0) { - ASSERT3U(err, ==, ENOENT); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid feature '%s'"), fname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (nvpair_type(elem) != DATA_TYPE_STRING) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be a string"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - (void) nvpair_value_string(elem, &strval); - if (strcmp(strval, 
ZFS_FEATURE_ENABLED) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' can only be set to " - "'enabled'"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (nvlist_add_uint64(retprops, propname, 0) != 0) { - (void) no_memory(hdl); - goto error; - } - continue; - } - - /* - * Make sure this property is valid and applies to this type. - */ - if (prop == ZPOOL_PROP_INVAL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property '%s'"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (zpool_prop_readonly(prop)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " - "is readonly"), propname); - (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); - goto error; - } - - if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, - &strval, &intval, errbuf) != 0) - goto error; - - /* - * Perform additional checking for specific properties. - */ - switch (prop) { - case ZPOOL_PROP_VERSION: - if (intval < version || - !SPA_VERSION_IS_SUPPORTED(intval)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' number %d is invalid."), - propname, intval); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - goto error; - } - break; - - case ZPOOL_PROP_BOOTSIZE: - if (!flags.create) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' can only be set during pool " - "creation"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - - case ZPOOL_PROP_BOOTFS: - if (flags.create || flags.import) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' cannot be set at creation " - "or import time"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (version < SPA_VERSION_BOOTFS) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded to support " - "'%s' property"), propname); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - goto error; - } - - /* - * bootfs property value has to be a 
dataset name and - * the dataset has to be in the same pool as it sets to. - */ - if (strval[0] != '\0' && !bootfs_name_valid(poolname, - strval)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " - "is an invalid name"), strval); - (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); - goto error; - } - - if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "could not open pool '%s'"), poolname); - (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); - goto error; - } - zpool_close(zhp); - break; - - case ZPOOL_PROP_ALTROOT: - if (!flags.create && !flags.import) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' can only be set during pool " - "creation or import"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - - if (strval[0] != '/') { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "bad alternate root '%s'"), strval); - (void) zfs_error(hdl, EZFS_BADPATH, errbuf); - goto error; - } - break; - - case ZPOOL_PROP_CACHEFILE: - if (strval[0] == '\0') - break; - - if (strcmp(strval, "none") == 0) - break; - - if (strval[0] != '/') { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' must be empty, an " - "absolute path, or 'none'"), propname); - (void) zfs_error(hdl, EZFS_BADPATH, errbuf); - goto error; - } - - slash = strrchr(strval, '/'); - - if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || - strcmp(slash, "/..") == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' is not a valid file"), strval); - (void) zfs_error(hdl, EZFS_BADPATH, errbuf); - goto error; - } - - *slash = '\0'; - - if (strval[0] != '\0' && - (stat64(strval, &statbuf) != 0 || - !S_ISDIR(statbuf.st_mode))) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' is not a valid directory"), - strval); - (void) zfs_error(hdl, EZFS_BADPATH, errbuf); - goto error; - } - - *slash = '/'; - break; - - case ZPOOL_PROP_COMMENT: - for (check = strval; *check != '\0'; check++) { - if (!isprint(*check)) { - 
zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, - "comment may only have printable " - "characters")); - (void) zfs_error(hdl, EZFS_BADPROP, - errbuf); - goto error; - } - } - if (strlen(strval) > ZPROP_MAX_COMMENT) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "comment must not exceed %d characters"), - ZPROP_MAX_COMMENT); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - - case ZPOOL_PROP_READONLY: - if (!flags.import) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' can only be set at " - "import time"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - - case ZPOOL_PROP_TNAME: - if (!flags.create) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' can only be set at " - "creation time"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - - case ZPOOL_PROP_MULTIHOST: - if (get_system_hostid() == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "requires a non-zero system hostid")); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - break; - - default: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s'(%d) not defined"), propname, prop); - break; - } - } - - return (retprops); -error: - nvlist_free(retprops); - return (NULL); -} - -/* - * Set zpool property : propname=propval. 
- */ -int -zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) -{ - zfs_cmd_t zc = { 0 }; - int ret = -1; - char errbuf[1024]; - nvlist_t *nvl = NULL; - nvlist_t *realprops; - uint64_t version; - prop_flags_t flags = { 0 }; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), - zhp->zpool_name); - - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) - return (no_memory(zhp->zpool_hdl)); - - if (nvlist_add_string(nvl, propname, propval) != 0) { - nvlist_free(nvl); - return (no_memory(zhp->zpool_hdl)); - } - - version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); - if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, - zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { - nvlist_free(nvl); - return (-1); - } - - nvlist_free(nvl); - nvl = realprops; - - /* - * Execute the corresponding ioctl() to set this property. - */ - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - - if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) { - nvlist_free(nvl); - return (-1); - } - - ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); - - zcmd_free_nvlists(&zc); - nvlist_free(nvl); - - if (ret) - (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); - else - (void) zpool_props_refresh(zhp); - - return (ret); -} - -int -zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) -{ - libzfs_handle_t *hdl = zhp->zpool_hdl; - zprop_list_t *entry; - char buf[ZFS_MAXPROPLEN]; - nvlist_t *features = NULL; - zprop_list_t **last; - boolean_t firstexpand = (NULL == *plp); - - if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) - return (-1); - - last = plp; - while (*last != NULL) - last = &(*last)->pl_next; - - if ((*plp)->pl_all) - features = zpool_get_features(zhp); - - if ((*plp)->pl_all && firstexpand) { - for (int i = 0; i < SPA_FEATURES; i++) { - zprop_list_t *entry = zfs_alloc(hdl, - sizeof (zprop_list_t)); - entry->pl_prop = ZPROP_INVAL; - 
entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", - spa_feature_table[i].fi_uname); - entry->pl_width = strlen(entry->pl_user_prop); - entry->pl_all = B_TRUE; - - *last = entry; - last = &entry->pl_next; - } - } - - /* add any unsupported features */ - for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL); - nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { - char *propname; - boolean_t found; - zprop_list_t *entry; - - if (zfeature_is_supported(nvpair_name(nvp))) - continue; - - propname = zfs_asprintf(hdl, "unsupported@%s", - nvpair_name(nvp)); - - /* - * Before adding the property to the list make sure that no - * other pool already added the same property. - */ - found = B_FALSE; - entry = *plp; - while (entry != NULL) { - if (entry->pl_user_prop != NULL && - strcmp(propname, entry->pl_user_prop) == 0) { - found = B_TRUE; - break; - } - entry = entry->pl_next; - } - if (found) { - free(propname); - continue; - } - - entry = zfs_alloc(hdl, sizeof (zprop_list_t)); - entry->pl_prop = ZPROP_INVAL; - entry->pl_user_prop = propname; - entry->pl_width = strlen(entry->pl_user_prop); - entry->pl_all = B_TRUE; - - *last = entry; - last = &entry->pl_next; - } - - for (entry = *plp; entry != NULL; entry = entry->pl_next) { - - if (entry->pl_fixed) - continue; - - if (entry->pl_prop != ZPROP_INVAL && - zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), - NULL, B_FALSE) == 0) { - if (strlen(buf) > entry->pl_width) - entry->pl_width = strlen(buf); - } - } - - return (0); -} - -/* - * Get the state for the given feature on the given ZFS pool. 
- */ -int -zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, - size_t len) -{ - uint64_t refcount; - boolean_t found = B_FALSE; - nvlist_t *features = zpool_get_features(zhp); - boolean_t supported; - const char *feature = strchr(propname, '@') + 1; - - supported = zpool_prop_feature(propname); - ASSERT(supported || zpool_prop_unsupported(propname)); - - /* - * Convert from feature name to feature guid. This conversion is - * unecessary for unsupported@... properties because they already - * use guids. - */ - if (supported) { - int ret; - spa_feature_t fid; - - ret = zfeature_lookup_name(feature, &fid); - if (ret != 0) { - (void) strlcpy(buf, "-", len); - return (ENOTSUP); - } - feature = spa_feature_table[fid].fi_guid; - } - - if (nvlist_lookup_uint64(features, feature, &refcount) == 0) - found = B_TRUE; - - if (supported) { - if (!found) { - (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); - } else { - if (refcount == 0) - (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); - else - (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); - } - } else { - if (found) { - if (refcount == 0) { - (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); - } else { - (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); - } - } else { - (void) strlcpy(buf, "-", len); - return (ENOTSUP); - } - } - - return (0); -} - -/* - * Don't start the slice at the default block of 34; many storage - * devices will use a stripe width of 128k, so start there instead. - */ -#define NEW_START_BLOCK 256 - -/* - * Validate the given pool name, optionally putting an extended error message in - * 'buf'. - */ -boolean_t -zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) -{ - namecheck_err_t why; - char what; - int ret; - - ret = pool_namecheck(pool, &why, &what); - - /* - * The rules for reserved pool names were extended at a later point. - * But we need to support users with existing pools that may now be - * invalid. 
So we only check for this expanded set of names during a - * create (or import), and only in userland. - */ - if (ret == 0 && !isopen && - (strncmp(pool, "mirror", 6) == 0 || - strncmp(pool, "raidz", 5) == 0 || - strncmp(pool, "spare", 5) == 0 || - strcmp(pool, "log") == 0)) { - if (hdl != NULL) - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "name is reserved")); - return (B_FALSE); - } - - - if (ret != 0) { - if (hdl != NULL) { - switch (why) { - case NAME_ERR_TOOLONG: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "name is too long")); - break; - - case NAME_ERR_INVALCHAR: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "invalid character " - "'%c' in pool name"), what); - break; - - case NAME_ERR_NOLETTER: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "name must begin with a letter")); - break; - - case NAME_ERR_RESERVED: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "name is reserved")); - break; - - case NAME_ERR_DISKLIKE: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool name is reserved")); - break; - - case NAME_ERR_LEADING_SLASH: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "leading slash in name")); - break; - - case NAME_ERR_EMPTY_COMPONENT: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "empty component in name")); - break; - - case NAME_ERR_TRAILING_SLASH: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "trailing slash in name")); - break; - - case NAME_ERR_MULTIPLE_DELIMITERS: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "multiple '@' and/or '#' delimiters in " - "name")); - break; - - default: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "(%d) not defined"), why); - break; - } - } - return (B_FALSE); - } - - return (B_TRUE); -} - -/* - * Open a handle to the given pool, even if the pool is currently in the FAULTED - * state. - */ -zpool_handle_t * -zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) -{ - zpool_handle_t *zhp; - boolean_t missing; - - /* - * Make sure the pool name is valid. 
- */ - if (!zpool_name_valid(hdl, B_TRUE, pool)) { - (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, - dgettext(TEXT_DOMAIN, "cannot open '%s'"), - pool); - return (NULL); - } - - if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) - return (NULL); - - zhp->zpool_hdl = hdl; - (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); - - if (zpool_refresh_stats(zhp, &missing) != 0) { - zpool_close(zhp); - return (NULL); - } - - if (missing) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); - (void) zfs_error_fmt(hdl, EZFS_NOENT, - dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); - zpool_close(zhp); - return (NULL); - } - - return (zhp); -} - -/* - * Like the above, but silent on error. Used when iterating over pools (because - * the configuration cache may be out of date). - */ -int -zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) -{ - zpool_handle_t *zhp; - boolean_t missing; - - if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) - return (-1); - - zhp->zpool_hdl = hdl; - (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); - - if (zpool_refresh_stats(zhp, &missing) != 0) { - zpool_close(zhp); - return (-1); - } - - if (missing) { - zpool_close(zhp); - *ret = NULL; - return (0); - } - - *ret = zhp; - return (0); -} - -/* - * Similar to zpool_open_canfail(), but refuses to open pools in the faulted - * state. - */ -zpool_handle_t * -zpool_open(libzfs_handle_t *hdl, const char *pool) -{ - zpool_handle_t *zhp; - - if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) - return (NULL); - - if (zhp->zpool_state == POOL_STATE_UNAVAIL) { - (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, - dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); - zpool_close(zhp); - return (NULL); - } - - return (zhp); -} - -/* - * Close the handle. Simply frees the memory associated with the handle. 
- */ -void -zpool_close(zpool_handle_t *zhp) -{ - nvlist_free(zhp->zpool_config); - nvlist_free(zhp->zpool_old_config); - nvlist_free(zhp->zpool_props); - free(zhp); -} - -/* - * Return the name of the pool. - */ -const char * -zpool_get_name(zpool_handle_t *zhp) -{ - return (zhp->zpool_name); -} - - -/* - * Return the state of the pool (ACTIVE or UNAVAILABLE) - */ -int -zpool_get_state(zpool_handle_t *zhp) -{ - return (zhp->zpool_state); -} - -/* - * Check if vdev list contains a special vdev - */ -static boolean_t -zpool_has_special_vdev(nvlist_t *nvroot) -{ - nvlist_t **child; - uint_t children; - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0) { - for (uint_t c = 0; c < children; c++) { - char *bias; - - if (nvlist_lookup_string(child[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 && - strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) { - return (B_TRUE); - } - } - } - return (B_FALSE); -} - -/* - * Create the named pool, using the provided vdev list. It is assumed - * that the consumer has already validated the contents of the nvlist, so we - * don't have to worry about error semantics. 
- */ -int -zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - nvlist_t *props, nvlist_t *fsprops) -{ - zfs_cmd_t zc = { 0 }; - nvlist_t *zc_fsprops = NULL; - nvlist_t *zc_props = NULL; - char msg[1024]; - int ret = -1; - - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot create '%s'"), pool); - - if (!zpool_name_valid(hdl, B_FALSE, pool)) - return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); - - if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) - return (-1); - - if (props) { - prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; - - if ((zc_props = zpool_valid_proplist(hdl, pool, props, - SPA_VERSION_1, flags, msg)) == NULL) { - goto create_failed; - } - } - - if (fsprops) { - uint64_t zoned; - char *zonestr; - - zoned = ((nvlist_lookup_string(fsprops, - zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && - strcmp(zonestr, "on") == 0); - - if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, - fsprops, zoned, NULL, NULL, msg)) == NULL) { - goto create_failed; - } - - if (nvlist_exists(zc_fsprops, - zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) && - !zpool_has_special_vdev(nvroot)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "%s property requires a special vdev"), - zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)); - (void) zfs_error(hdl, EZFS_BADPROP, msg); - goto create_failed; - } - - if (!zc_props && - (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { - goto create_failed; - } - if (nvlist_add_nvlist(zc_props, - ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { - goto create_failed; - } - } - - if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) - goto create_failed; - - (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); - - if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { - - zcmd_free_nvlists(&zc); - nvlist_free(zc_props); - nvlist_free(zc_fsprops); - - switch (errno) { - case EBUSY: - /* - * This can happen if the user has specified the same - * device multiple times. 
We can't reliably detect this - * until we try to add it and see we already have a - * label. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more vdevs refer to the same device")); - return (zfs_error(hdl, EZFS_BADDEV, msg)); - - case ERANGE: - /* - * This happens if the record size is smaller or larger - * than the allowed size range, or not a power of 2. - * - * NOTE: although zfs_valid_proplist is called earlier, - * this case may have slipped through since the - * pool does not exist yet and it is therefore - * impossible to read properties e.g. max blocksize - * from the pool. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "record size invalid")); - return (zfs_error(hdl, EZFS_BADPROP, msg)); - - case EOVERFLOW: - /* - * This occurs when one of the devices is below - * SPA_MINDEVSIZE. Unfortunately, we can't detect which - * device was the problem device since there's no - * reliable way to determine device size from userland. - */ - { - char buf[64]; - - zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); - - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more devices is less than the " - "minimum size (%s)"), buf); - } - return (zfs_error(hdl, EZFS_BADDEV, msg)); - - case ENOSPC: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more devices is out of space")); - return (zfs_error(hdl, EZFS_BADDEV, msg)); - - case ENOTBLK: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cache device must be a disk or disk slice")); - return (zfs_error(hdl, EZFS_BADDEV, msg)); - - default: - return (zpool_standard_error(hdl, errno, msg)); - } - } - -create_failed: - zcmd_free_nvlists(&zc); - nvlist_free(zc_props); - nvlist_free(zc_fsprops); - return (ret); -} - -/* - * Destroy the given pool. It is up to the caller to ensure that there are no - * datasets left in the pool. 
- */ -int -zpool_destroy(zpool_handle_t *zhp, const char *log_str) -{ - zfs_cmd_t zc = { 0 }; - zfs_handle_t *zfp = NULL; - libzfs_handle_t *hdl = zhp->zpool_hdl; - char msg[1024]; - - if (zhp->zpool_state == POOL_STATE_ACTIVE && - (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) - return (-1); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_history = (uint64_t)(uintptr_t)log_str; - - if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot destroy '%s'"), zhp->zpool_name); - - if (errno == EROFS) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more devices is read only")); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - } else { - (void) zpool_standard_error(hdl, errno, msg); - } - - if (zfp) - zfs_close(zfp); - return (-1); - } - - if (zfp) { - remove_mountpoint(zfp); - zfs_close(zfp); - } - - return (0); -} - -/* - * Create a checkpoint in the given pool. - */ -int -zpool_checkpoint(zpool_handle_t *zhp) -{ - libzfs_handle_t *hdl = zhp->zpool_hdl; - char msg[1024]; - int error; - - error = lzc_pool_checkpoint(zhp->zpool_name); - if (error != 0) { - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot checkpoint '%s'"), zhp->zpool_name); - (void) zpool_standard_error(hdl, error, msg); - return (-1); - } - - return (0); -} - -/* - * Discard the checkpoint from the given pool. - */ -int -zpool_discard_checkpoint(zpool_handle_t *zhp) -{ - libzfs_handle_t *hdl = zhp->zpool_hdl; - char msg[1024]; - int error; - - error = lzc_pool_checkpoint_discard(zhp->zpool_name); - if (error != 0) { - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot discard checkpoint in '%s'"), zhp->zpool_name); - (void) zpool_standard_error(hdl, error, msg); - return (-1); - } - - return (0); -} - -/* - * Add the given vdevs to the pool. 
The caller must have already performed the - * necessary verification to ensure that the vdev specification is well-formed. - */ -int -zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) -{ - zfs_cmd_t zc = { 0 }; - int ret; - libzfs_handle_t *hdl = zhp->zpool_hdl; - char msg[1024]; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot add to '%s'"), zhp->zpool_name); - - if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < - SPA_VERSION_SPARES && - nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " - "upgraded to add hot spares")); - return (zfs_error(hdl, EZFS_BADVERSION, msg)); - } - - if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < - SPA_VERSION_L2CACHE && - nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " - "upgraded to add cache devices")); - return (zfs_error(hdl, EZFS_BADVERSION, msg)); - } - - if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) - return (-1); - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - - if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { - switch (errno) { - case EBUSY: - /* - * This can happen if the user has specified the same - * device multiple times. We can't reliably detect this - * until we try to add it and see we already have a - * label. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more vdevs refer to the same device")); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - - case EINVAL: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid config; a pool with removing/removed " - "vdevs does not support adding raidz vdevs")); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - - case EOVERFLOW: - /* - * This occurrs when one of the devices is below - * SPA_MINDEVSIZE. 
Unfortunately, we can't detect which - * device was the problem device since there's no - * reliable way to determine device size from userland. - */ - { - char buf[64]; - - zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); - - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "device is less than the minimum " - "size (%s)"), buf); - } - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded to add these vdevs")); - (void) zfs_error(hdl, EZFS_BADVERSION, msg); - break; - - case EDOM: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "root pool can not have multiple vdevs" - " or separate logs")); - (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg); - break; - - case ENOTBLK: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cache device must be a disk or disk slice")); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - - default: - (void) zpool_standard_error(hdl, errno, msg); - } - - ret = -1; - } else { - ret = 0; - } - - zcmd_free_nvlists(&zc); - - return (ret); -} - -/* - * Exports the pool from the system. The caller must ensure that there are no - * mounted datasets in the pool. 
- */ -static int -zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, - const char *log_str) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot export '%s'"), zhp->zpool_name); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_cookie = force; - zc.zc_guid = hardforce; - zc.zc_history = (uint64_t)(uintptr_t)log_str; - - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { - switch (errno) { - case EXDEV: - zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, - "use '-f' to override the following errors:\n" - "'%s' has an active shared spare which could be" - " used by other pools once '%s' is exported."), - zhp->zpool_name, zhp->zpool_name); - return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, - msg)); - default: - return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, - msg)); - } - } - - return (0); -} - -int -zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) -{ - return (zpool_export_common(zhp, force, B_FALSE, log_str)); -} - -int -zpool_export_force(zpool_handle_t *zhp, const char *log_str) -{ - return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); -} - -static void -zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, - nvlist_t *config) -{ - nvlist_t *nv = NULL; - uint64_t rewindto; - int64_t loss = -1; - struct tm t; - char timestr[128]; - - if (!hdl->libzfs_printerr || config == NULL) - return; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || - nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { - return; - } - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) - return; - (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); - - if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, 0, &t) != 0) { - if (dryrun) { - (void) printf(dgettext(TEXT_DOMAIN, - "Would be able to return %s " - 
"to its state as of %s.\n"), - name, timestr); - } else { - (void) printf(dgettext(TEXT_DOMAIN, - "Pool %s returned to its state as of %s.\n"), - name, timestr); - } - if (loss > 120) { - (void) printf(dgettext(TEXT_DOMAIN, - "%s approximately %lld "), - dryrun ? "Would discard" : "Discarded", - (loss + 30) / 60); - (void) printf(dgettext(TEXT_DOMAIN, - "minutes of transactions.\n")); - } else if (loss > 0) { - (void) printf(dgettext(TEXT_DOMAIN, - "%s approximately %lld "), - dryrun ? "Would discard" : "Discarded", loss); - (void) printf(dgettext(TEXT_DOMAIN, - "seconds of transactions.\n")); - } - } -} - -void -zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, - nvlist_t *config) -{ - nvlist_t *nv = NULL; - int64_t loss = -1; - uint64_t edata = UINT64_MAX; - uint64_t rewindto; - struct tm t; - char timestr[128]; - - if (!hdl->libzfs_printerr) - return; - - if (reason >= 0) - (void) printf(dgettext(TEXT_DOMAIN, "action: ")); - else - (void) printf(dgettext(TEXT_DOMAIN, "\t")); - - /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || - nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) - goto no_info; - - (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, - &edata); - - (void) printf(dgettext(TEXT_DOMAIN, - "Recovery is possible, but will result in some data loss.\n")); - - if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, 0, &t) != 0) { - (void) printf(dgettext(TEXT_DOMAIN, - "\tReturning the pool to its state as of %s\n" - "\tshould correct the problem. 
"), - timestr); - } else { - (void) printf(dgettext(TEXT_DOMAIN, - "\tReverting the pool to an earlier state " - "should correct the problem.\n\t")); - } - - if (loss > 120) { - (void) printf(dgettext(TEXT_DOMAIN, - "Approximately %lld minutes of data\n" - "\tmust be discarded, irreversibly. "), (loss + 30) / 60); - } else if (loss > 0) { - (void) printf(dgettext(TEXT_DOMAIN, - "Approximately %lld seconds of data\n" - "\tmust be discarded, irreversibly. "), loss); - } - if (edata != 0 && edata != UINT64_MAX) { - if (edata == 1) { - (void) printf(dgettext(TEXT_DOMAIN, - "After rewind, at least\n" - "\tone persistent user-data error will remain. ")); - } else { - (void) printf(dgettext(TEXT_DOMAIN, - "After rewind, several\n" - "\tpersistent user-data errors will remain. ")); - } - } - (void) printf(dgettext(TEXT_DOMAIN, - "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), - reason >= 0 ? "clear" : "import", name); - - (void) printf(dgettext(TEXT_DOMAIN, - "A scrub of the pool\n" - "\tis strongly recommended after recovery.\n")); - return; - -no_info: - (void) printf(dgettext(TEXT_DOMAIN, - "Destroy and re-create the pool from\n\ta backup source.\n")); -} - -/* - * zpool_import() is a contracted interface. Should be kept the same - * if possible. - * - * Applications should use zpool_import_props() to import a pool with - * new properties value to be set. 
- */ -int -zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, - char *altroot) -{ - nvlist_t *props = NULL; - int ret; - - if (altroot != NULL) { - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { - return (zfs_error_fmt(hdl, EZFS_NOMEM, - dgettext(TEXT_DOMAIN, "cannot import '%s'"), - newname)); - } - - if (nvlist_add_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || - nvlist_add_string(props, - zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { - nvlist_free(props); - return (zfs_error_fmt(hdl, EZFS_NOMEM, - dgettext(TEXT_DOMAIN, "cannot import '%s'"), - newname)); - } - } - - ret = zpool_import_props(hdl, config, newname, props, - ZFS_IMPORT_NORMAL); - nvlist_free(props); - return (ret); -} - -static void -print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, - int indent) -{ - nvlist_t **child; - uint_t c, children; - char *vname; - uint64_t is_log = 0; - - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, - &is_log); - - if (name != NULL) - (void) printf("\t%*s%s%s\n", indent, "", name, - is_log ? 
" [log]" : ""); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - return; - - for (c = 0; c < children; c++) { - vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID); - print_vdev_tree(hdl, vname, child[c], indent + 2); - free(vname); - } -} - -void -zpool_print_unsup_feat(nvlist_t *config) -{ - nvlist_t *nvinfo, *unsup_feat; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == - 0); - verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT, - &unsup_feat) == 0); - - for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; - nvp = nvlist_next_nvpair(unsup_feat, nvp)) { - char *desc; - - verify(nvpair_type(nvp) == DATA_TYPE_STRING); - verify(nvpair_value_string(nvp, &desc) == 0); - - if (strlen(desc) > 0) - (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); - else - (void) printf("\t%s\n", nvpair_name(nvp)); - } -} - -/* - * Import the given pool using the known configuration and a list of - * properties to be set. The configuration should have come from - * zpool_find_import(). The 'newname' parameters control whether the pool - * is imported with a different name. 
- */ -int -zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, - nvlist_t *props, int flags) -{ - zfs_cmd_t zc = { 0 }; - zpool_load_policy_t policy; - nvlist_t *nv = NULL; - nvlist_t *nvinfo = NULL; - nvlist_t *missing = NULL; - char *thename; - char *origname; - int ret; - int error = 0; - char errbuf[1024]; - - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &origname) == 0); - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot import pool '%s'"), origname); - - if (newname != NULL) { - if (!zpool_name_valid(hdl, B_FALSE, newname)) - return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, - dgettext(TEXT_DOMAIN, "cannot import '%s'"), - newname)); - thename = (char *)newname; - } else { - thename = origname; - } - - if (props != NULL) { - uint64_t version; - prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &version) == 0); - - if ((props = zpool_valid_proplist(hdl, origname, - props, version, flags, errbuf)) == NULL) - return (-1); - if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { - nvlist_free(props); - return (-1); - } - nvlist_free(props); - } - - (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &zc.zc_guid) == 0); - - if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - - zc.zc_cookie = flags; - while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && - errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - } - if (ret != 0) - error = errno; - - (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); - - zcmd_free_nvlists(&zc); - - zpool_get_load_policy(config, &policy); - - if (error) { - char desc[1024]; - char aux[256]; - - /* - * 
Dry-run failed, but we print out what success - * looks like if we found a best txg - */ - if (policy.zlp_rewind & ZPOOL_TRY_REWIND) { - zpool_rewind_exclaim(hdl, newname ? origname : thename, - B_TRUE, nv); - nvlist_free(nv); - return (-1); - } - - if (newname == NULL) - (void) snprintf(desc, sizeof (desc), - dgettext(TEXT_DOMAIN, "cannot import '%s'"), - thename); - else - (void) snprintf(desc, sizeof (desc), - dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), - origname, thename); - - switch (error) { - case ENOTSUP: - if (nv != NULL && nvlist_lookup_nvlist(nv, - ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && - nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { - (void) printf(dgettext(TEXT_DOMAIN, "This " - "pool uses the following feature(s) not " - "supported by this system:\n")); - zpool_print_unsup_feat(nv); - if (nvlist_exists(nvinfo, - ZPOOL_CONFIG_CAN_RDONLY)) { - (void) printf(dgettext(TEXT_DOMAIN, - "All unsupported features are only " - "required for writing to the pool." - "\nThe pool can be imported using " - "'-o readonly=on'.\n")); - } - } - /* - * Unsupported version. 
- */ - (void) zfs_error(hdl, EZFS_BADVERSION, desc); - break; - - case EREMOTEIO: - if (nv != NULL && nvlist_lookup_nvlist(nv, - ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) { - char *hostname = ""; - uint64_t hostid = 0; - mmp_state_t mmp_state; - - mmp_state = fnvlist_lookup_uint64(nvinfo, - ZPOOL_CONFIG_MMP_STATE); - - if (nvlist_exists(nvinfo, - ZPOOL_CONFIG_MMP_HOSTNAME)) - hostname = fnvlist_lookup_string(nvinfo, - ZPOOL_CONFIG_MMP_HOSTNAME); - - if (nvlist_exists(nvinfo, - ZPOOL_CONFIG_MMP_HOSTID)) - hostid = fnvlist_lookup_uint64(nvinfo, - ZPOOL_CONFIG_MMP_HOSTID); - - if (mmp_state == MMP_STATE_ACTIVE) { - (void) snprintf(aux, sizeof (aux), - dgettext(TEXT_DOMAIN, "pool is imp" - "orted on host '%s' (hostid=%lx).\n" - "Export the pool on the other " - "system, then run 'zpool import'."), - hostname, (unsigned long) hostid); - } else if (mmp_state == MMP_STATE_NO_HOSTID) { - (void) snprintf(aux, sizeof (aux), - dgettext(TEXT_DOMAIN, "pool has " - "the multihost property on and " - "the\nsystem's hostid is not " - "set.\n")); - } - - (void) zfs_error_aux(hdl, aux); - } - (void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc); - break; - - case EINVAL: - (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); - break; - - case EROFS: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more devices is read only")); - (void) zfs_error(hdl, EZFS_BADDEV, desc); - break; - - case ENXIO: - if (nv && nvlist_lookup_nvlist(nv, - ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && - nvlist_lookup_nvlist(nvinfo, - ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { - (void) printf(dgettext(TEXT_DOMAIN, - "The devices below are missing or " - "corrupted, use '-m' to import the pool " - "anyway:\n")); - print_vdev_tree(hdl, NULL, missing, 2); - (void) printf("\n"); - } - (void) zpool_standard_error(hdl, error, desc); - break; - - case EEXIST: - (void) zpool_standard_error(hdl, error, desc); - break; - case ENAMETOOLONG: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "new name of at least one dataset is longer 
than " - "the maximum allowable length")); - (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc); - break; - default: - (void) zpool_standard_error(hdl, error, desc); - zpool_explain_recover(hdl, - newname ? origname : thename, -error, nv); - break; - } - - nvlist_free(nv); - ret = -1; - } else { - zpool_handle_t *zhp; - - /* - * This should never fail, but play it safe anyway. - */ - if (zpool_open_silent(hdl, thename, &zhp) != 0) - ret = -1; - else if (zhp != NULL) - zpool_close(zhp); - if (policy.zlp_rewind & - (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { - zpool_rewind_exclaim(hdl, newname ? origname : thename, - ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv); - } - nvlist_free(nv); - return (0); - } - - return (ret); -} - -/* - * Scan the pool. - */ -int -zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - int err; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_cookie = func; - zc.zc_flags = cmd; - - if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) - return (0); - - err = errno; - - /* ECANCELED on a scrub means we resumed a paused scrub */ - if (err == ECANCELED && func == POOL_SCAN_SCRUB && - cmd == POOL_SCRUB_NORMAL) - return (0); - - if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL) - return (0); - - if (func == POOL_SCAN_SCRUB) { - if (cmd == POOL_SCRUB_PAUSE) { - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot pause scrubbing %s"), zc.zc_name); - } else { - assert(cmd == POOL_SCRUB_NORMAL); - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot scrub %s"), zc.zc_name); - } - } else if (func == POOL_SCAN_NONE) { - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), - zc.zc_name); - } else { - assert(!"unexpected result"); - } - - if (err == EBUSY) { - nvlist_t *nvroot; - pool_scan_stat_t *ps = NULL; - uint_t psc; - - 
verify(nvlist_lookup_nvlist(zhp->zpool_config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); - if (ps && ps->pss_func == POOL_SCAN_SCRUB) { - if (cmd == POOL_SCRUB_PAUSE) - return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg)); - else - return (zfs_error(hdl, EZFS_SCRUBBING, msg)); - } else { - return (zfs_error(hdl, EZFS_RESILVERING, msg)); - } - } else if (err == ENOENT) { - return (zfs_error(hdl, EZFS_NO_SCRUB, msg)); - } else { - return (zpool_standard_error(hdl, err, msg)); - } -} - -static int -xlate_init_err(int err) -{ - switch (err) { - case ENODEV: - return (EZFS_NODEVICE); - case EINVAL: - case EROFS: - return (EZFS_BADDEV); - case EBUSY: - return (EZFS_INITIALIZING); - case ESRCH: - return (EZFS_NO_INITIALIZE); - } - return (err); -} - -/* - * Begin, suspend, or cancel the initialization (initializing of all free - * blocks) for the given vdevs in the given pool. - */ -int -zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, - nvlist_t *vds) -{ - char msg[1024]; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - nvlist_t *errlist; - - /* translate vdev names to guids */ - nvlist_t *vdev_guids = fnvlist_alloc(); - nvlist_t *guids_to_paths = fnvlist_alloc(); - boolean_t spare, cache; - nvlist_t *tgt; - nvpair_t *elem; - - for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL; - elem = nvlist_next_nvpair(vds, elem)) { - char *vd_path = nvpair_name(elem); - tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL); - - if ((tgt == NULL) || cache || spare) { - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot initialize '%s'"), - vd_path); - int err = (tgt == NULL) ? EZFS_NODEVICE : - (spare ? 
EZFS_ISSPARE : EZFS_ISL2CACHE); - fnvlist_free(vdev_guids); - fnvlist_free(guids_to_paths); - return (zfs_error(hdl, err, msg)); - } - - uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); - fnvlist_add_uint64(vdev_guids, vd_path, guid); - - (void) snprintf(msg, sizeof (msg), "%llu", guid); - fnvlist_add_string(guids_to_paths, msg, vd_path); - } - - int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids, - &errlist); - fnvlist_free(vdev_guids); - - if (err == 0) { - fnvlist_free(guids_to_paths); - return (0); - } - - nvlist_t *vd_errlist = NULL; - if (errlist != NULL) { - vd_errlist = fnvlist_lookup_nvlist(errlist, - ZPOOL_INITIALIZE_VDEVS); - } - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "operation failed")); - - for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; - elem = nvlist_next_nvpair(vd_errlist, elem)) { - int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); - char *path = fnvlist_lookup_string(guids_to_paths, - nvpair_name(elem)); - (void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'", - path); - } - - fnvlist_free(guids_to_paths); - if (vd_errlist != NULL) - return (-1); - - return (zpool_standard_error(hdl, err, msg)); -} - -#ifdef illumos -/* - * This provides a very minimal check whether a given string is likely a - * c#t#d# style string. Users of this are expected to do their own - * verification of the s# part. - */ -#define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1])) - -/* - * More elaborate version for ones which may start with "/dev/dsk/" - * and the like. - */ -static int -ctd_check_path(char *str) -{ - /* - * If it starts with a slash, check the last component. - */ - if (str && str[0] == '/') { - char *tmp = strrchr(str, '/'); - - /* - * If it ends in "/old", check the second-to-last - * component of the string instead. 
- */ - if (tmp != str && strcmp(tmp, "/old") == 0) { - for (tmp--; *tmp != '/'; tmp--) - ; - } - str = tmp + 1; - } - return (CTD_CHECK(str)); -} -#endif - -/* - * Find a vdev that matches the search criteria specified. We use the - * the nvpair name to determine how we should look for the device. - * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL - * spare; but FALSE if its an INUSE spare. - */ -static nvlist_t * -vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, - boolean_t *l2cache, boolean_t *log) -{ - uint_t c, children; - nvlist_t **child; - nvlist_t *ret; - uint64_t is_log; - char *srchkey; - nvpair_t *pair = nvlist_next_nvpair(search, NULL); - - /* Nothing to look for */ - if (search == NULL || pair == NULL) - return (NULL); - - /* Obtain the key we will use to search */ - srchkey = nvpair_name(pair); - - switch (nvpair_type(pair)) { - case DATA_TYPE_UINT64: - if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { - uint64_t srchval, theguid; - - verify(nvpair_value_uint64(pair, &srchval) == 0); - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &theguid) == 0); - if (theguid == srchval) - return (nv); - } - break; - - case DATA_TYPE_STRING: { - char *srchval, *val; - - verify(nvpair_value_string(pair, &srchval) == 0); - if (nvlist_lookup_string(nv, srchkey, &val) != 0) - break; - - /* - * Search for the requested value. Special cases: - * - * - ZPOOL_CONFIG_PATH for whole disk entries. To support - * UEFI boot, these end in "s0" or "s0/old" or "s1" or - * "s1/old". The "s0" or "s1" part is hidden from the user, - * but included in the string, so this matches around it. - * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). - * - * Otherwise, all other searches are simple string compares. 
- */ -#ifdef illumos - if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && - ctd_check_path(val)) { - uint64_t wholedisk = 0; - - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk); - if (wholedisk) { - int slen = strlen(srchval); - int vlen = strlen(val); - - if (slen != vlen - 2) - break; - - /* - * make_leaf_vdev() should only set - * wholedisk for ZPOOL_CONFIG_PATHs which - * will include "/dev/dsk/", giving plenty of - * room for the indices used next. - */ - ASSERT(vlen >= 6); - - /* - * strings identical except trailing "s0" - */ - if ((strcmp(&val[vlen - 2], "s0") == 0 || - strcmp(&val[vlen - 2], "s1") == 0) && - strncmp(srchval, val, slen) == 0) - return (nv); - - /* - * strings identical except trailing "s0/old" - */ - if ((strcmp(&val[vlen - 6], "s0/old") == 0 || - strcmp(&val[vlen - 6], "s1/old") == 0) && - strcmp(&srchval[slen - 4], "/old") == 0 && - strncmp(srchval, val, slen - 4) == 0) - return (nv); - - break; - } - } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { -#else - if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { -#endif - char *type, *idx, *end, *p; - uint64_t id, vdev_id; - - /* - * Determine our vdev type, keeping in mind - * that the srchval is composed of a type and - * vdev id pair (i.e. mirror-4). - */ - if ((type = strdup(srchval)) == NULL) - return (NULL); - - if ((p = strrchr(type, '-')) == NULL) { - free(type); - break; - } - idx = p + 1; - *p = '\0'; - - /* - * If the types don't match then keep looking. - */ - if (strncmp(val, type, strlen(val)) != 0) { - free(type); - break; - } - - verify(zpool_vdev_is_interior(type)); - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, - &id) == 0); - - errno = 0; - vdev_id = strtoull(idx, &end, 10); - - free(type); - if (errno != 0) - return (NULL); - - /* - * Now verify that we have the correct vdev id. 
- */ - if (vdev_id == id) - return (nv); - } - - /* - * Common case - */ - if (strcmp(srchval, val) == 0) - return (nv); - break; - } - - default: - break; - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - return (NULL); - - for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, - avail_spare, l2cache, NULL)) != NULL) { - /* - * The 'is_log' value is only set for the toplevel - * vdev, not the leaf vdevs. So we always lookup the - * log device from the root of the vdev tree (where - * 'log' is non-NULL). - */ - if (log != NULL && - nvlist_lookup_uint64(child[c], - ZPOOL_CONFIG_IS_LOG, &is_log) == 0 && - is_log) { - *log = B_TRUE; - } - return (ret); - } - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, - avail_spare, l2cache, NULL)) != NULL) { - *avail_spare = B_TRUE; - return (ret); - } - } - } - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, - avail_spare, l2cache, NULL)) != NULL) { - *l2cache = B_TRUE; - return (ret); - } - } - } - - return (NULL); -} - -/* - * Given a physical path (minus the "/devices" prefix), find the - * associated vdev. 
- */ -nvlist_t * -zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, - boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) -{ - nvlist_t *search, *nvroot, *ret; - - verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); - verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0); - - verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - *avail_spare = B_FALSE; - *l2cache = B_FALSE; - if (log != NULL) - *log = B_FALSE; - ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); - nvlist_free(search); - - return (ret); -} - -/* - * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). - */ -static boolean_t -zpool_vdev_is_interior(const char *name) -{ - if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || - strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || - strncmp(name, - VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || - strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) - return (B_TRUE); - return (B_FALSE); -} - -nvlist_t * -zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, - boolean_t *l2cache, boolean_t *log) -{ - char buf[MAXPATHLEN]; - char *end; - nvlist_t *nvroot, *search, *ret; - uint64_t guid; - - verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - guid = strtoull(path, &end, 10); - if (guid != 0 && *end == '\0') { - verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0); - } else if (zpool_vdev_is_interior(path)) { - verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0); - } else if (path[0] != '/') { - (void) snprintf(buf, sizeof (buf), "%s%s", _PATH_DEV, path); - verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); - } else { - verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); - } - - verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - *avail_spare = B_FALSE; - 
*l2cache = B_FALSE; - if (log != NULL) - *log = B_FALSE; - ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); - nvlist_free(search); - - return (ret); -} - -static int -vdev_is_online(nvlist_t *nv) -{ - uint64_t ival; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) - return (0); - - return (1); -} - -/* - * Helper function for zpool_get_physpaths(). - */ -static int -vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size, - size_t *bytes_written) -{ - size_t bytes_left, pos, rsz; - char *tmppath; - const char *format; - - if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH, - &tmppath) != 0) - return (EZFS_NODEVICE); - - pos = *bytes_written; - bytes_left = physpath_size - pos; - format = (pos == 0) ? "%s" : " %s"; - - rsz = snprintf(physpath + pos, bytes_left, format, tmppath); - *bytes_written += rsz; - - if (rsz >= bytes_left) { - /* if physpath was not copied properly, clear it */ - if (bytes_left != 0) { - physpath[pos] = 0; - } - return (EZFS_NOSPC); - } - return (0); -} - -static int -vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size, - size_t *rsz, boolean_t is_spare) -{ - char *type; - int ret; - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) - return (EZFS_INVALCONFIG); - - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - /* - * An active spare device has ZPOOL_CONFIG_IS_SPARE set. - * For a spare vdev, we only want to boot from the active - * spare device. 
- */ - if (is_spare) { - uint64_t spare = 0; - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, - &spare); - if (!spare) - return (EZFS_INVALCONFIG); - } - - if (vdev_is_online(nv)) { - if ((ret = vdev_get_one_physpath(nv, physpath, - phypath_size, rsz)) != 0) - return (ret); - } - } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 || - strcmp(type, VDEV_TYPE_RAIDZ) == 0 || - strcmp(type, VDEV_TYPE_REPLACING) == 0 || - (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { - nvlist_t **child; - uint_t count; - int i, ret; - - if (nvlist_lookup_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) - return (EZFS_INVALCONFIG); - - for (i = 0; i < count; i++) { - ret = vdev_get_physpaths(child[i], physpath, - phypath_size, rsz, is_spare); - if (ret == EZFS_NOSPC) - return (ret); - } - } - - return (EZFS_POOL_INVALARG); -} - -/* - * Get phys_path for a root pool config. - * Return 0 on success; non-zero on failure. - */ -static int -zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size) -{ - size_t rsz; - nvlist_t *vdev_root; - nvlist_t **child; - uint_t count; - char *type; - - rsz = 0; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &vdev_root) != 0) - return (EZFS_INVALCONFIG); - - if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 || - nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, - &child, &count) != 0) - return (EZFS_INVALCONFIG); - - /* - * root pool can only have a single top-level vdev. - */ - if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1) - return (EZFS_POOL_INVALARG); - - (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, - B_FALSE); - - /* No online devices */ - if (rsz == 0) - return (EZFS_NODEVICE); - - return (0); -} - -/* - * Get phys_path for a root pool - * Return 0 on success; non-zero on failure. 
- */ -int -zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) -{ - return (zpool_get_config_physpath(zhp->zpool_config, physpath, - phypath_size)); -} - -/* - * If the device has being dynamically expanded then we need to relabel - * the disk to use the new unallocated space. - */ -static int -zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) -{ -#ifdef illumos - char path[MAXPATHLEN]; - char errbuf[1024]; - int fd, error; - int (*_efi_use_whole_disk)(int); - - if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT, - "efi_use_whole_disk")) == NULL) - return (-1); - - (void) snprintf(path, sizeof (path), "%s/%s", ZFS_RDISK_ROOT, name); - - if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "relabel '%s': unable to open device"), name); - return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); - } - - /* - * It's possible that we might encounter an error if the device - * does not have any unallocated space left. If so, we simply - * ignore that error and continue on. - */ - error = _efi_use_whole_disk(fd); - (void) close(fd); - if (error && error != VT_ENOSPC) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "relabel '%s': unable to read disk capacity"), name); - return (zfs_error(hdl, EZFS_NOCAP, errbuf)); - } -#endif /* illumos */ - return (0); -} - -/* - * Bring the specified vdev online. The 'flags' parameter is a set of the - * ZFS_ONLINE_* flags. 
- */ -int -zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, - vdev_state_t *newstate) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - char *pathname; - nvlist_t *tgt; - boolean_t avail_spare, l2cache, islog; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - if (flags & ZFS_ONLINE_EXPAND) { - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot expand %s"), path); - } else { - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot online %s"), path); - } - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - &islog)) == NULL) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - - if (avail_spare) - return (zfs_error(hdl, EZFS_ISSPARE, msg)); - - if ((flags & ZFS_ONLINE_EXPAND || - zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) && - nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) { - uint64_t wholedisk = 0; - - (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk); - - /* - * XXX - L2ARC 1.0 devices can't support expansion. - */ - if (l2cache) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot expand cache devices")); - return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg)); - } - - if (wholedisk) { - pathname += strlen(ZFS_DISK_ROOT) + 1; - (void) zpool_relabel_disk(hdl, pathname); - } - } - - zc.zc_cookie = VDEV_STATE_ONLINE; - zc.zc_obj = flags; - - if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { - if (errno == EINVAL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " - "from this pool into a new one. 
Use '%s' " - "instead"), "zpool detach"); - return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg)); - } - return (zpool_standard_error(hdl, errno, msg)); - } - - *newstate = zc.zc_cookie; - return (0); -} - -/* - * Take the specified vdev offline - */ -int -zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - nvlist_t *tgt; - boolean_t avail_spare, l2cache; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot offline %s"), path); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == NULL) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - - if (avail_spare) - return (zfs_error(hdl, EZFS_ISSPARE, msg)); - - zc.zc_cookie = VDEV_STATE_OFFLINE; - zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; - - if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) - return (0); - - switch (errno) { - case EBUSY: - - /* - * There are no other replicas of this device. - */ - return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); - - case EEXIST: - /* - * The log device has unplayed logs - */ - return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg)); - - default: - return (zpool_standard_error(hdl, errno, msg)); - } -} - -/* - * Mark the given vdev faulted. 
- */ -int -zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_guid = guid; - zc.zc_cookie = VDEV_STATE_FAULTED; - zc.zc_obj = aux; - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) - return (0); - - switch (errno) { - case EBUSY: - - /* - * There are no other replicas of this device. - */ - return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); - - default: - return (zpool_standard_error(hdl, errno, msg)); - } - -} - -/* - * Mark the given vdev degraded. - */ -int -zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_guid = guid; - zc.zc_cookie = VDEV_STATE_DEGRADED; - zc.zc_obj = aux; - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) - return (0); - - return (zpool_standard_error(hdl, errno, msg)); -} - -/* - * Returns TRUE if the given nvlist is a vdev that was originally swapped in as - * a hot spare. 
- */ -static boolean_t -is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) -{ - nvlist_t **child; - uint_t c, children; - char *type; - - if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0) { - verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE, - &type) == 0); - - if (strcmp(type, VDEV_TYPE_SPARE) == 0 && - children == 2 && child[which] == tgt) - return (B_TRUE); - - for (c = 0; c < children; c++) - if (is_replacing_spare(child[c], tgt, which)) - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * Attach new_disk (fully described by nvroot) to old_disk. - * If 'replacing' is specified, the new disk will replace the old one. - */ -int -zpool_vdev_attach(zpool_handle_t *zhp, - const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - int ret; - nvlist_t *tgt; - boolean_t avail_spare, l2cache, islog; - uint64_t val; - char *newname; - nvlist_t **child; - uint_t children; - nvlist_t *config_root; - libzfs_handle_t *hdl = zhp->zpool_hdl; - boolean_t rootpool = zpool_is_bootable(zhp); - - if (replacing) - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot replace %s with %s"), old_disk, new_disk); - else - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot attach %s to %s"), new_disk, old_disk); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, - &islog)) == NULL) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - if (avail_spare) - return (zfs_error(hdl, EZFS_ISSPARE, msg)); - - if (l2cache) - return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); - - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - zc.zc_cookie = replacing; - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0 || children != 1) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "new device must be a single disk")); 
- return (zfs_error(hdl, EZFS_INVALCONFIG, msg)); - } - - verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), - ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0); - - if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL) - return (-1); - - /* - * If the target is a hot spare that has been swapped in, we can only - * replace it with another hot spare. - */ - if (replacing && - nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && - (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, - NULL) == NULL || !avail_spare) && - is_replacing_spare(config_root, tgt, 1)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "can only be replaced by another hot spare")); - free(newname); - return (zfs_error(hdl, EZFS_BADTARGET, msg)); - } - - free(newname); - - if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) - return (-1); - - ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); - - zcmd_free_nvlists(&zc); - - if (ret == 0) { - if (rootpool) { - /* - * XXX need a better way to prevent user from - * booting up a half-baked vdev. - */ - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " - "sure to wait until resilver is done " - "before rebooting.\n")); - (void) fprintf(stderr, "\n"); - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "If " - "you boot from pool '%s', you may need to update\n" - "boot code on newly attached disk '%s'.\n\n" - "Assuming you use GPT partitioning and 'da0' is " - "your new boot disk\n" - "you may use the following command:\n\n" - "\tgpart bootcode -b /boot/pmbr -p " - "/boot/gptzfsboot -i 1 da0\n\n"), - zhp->zpool_name, new_disk); - } - return (0); - } - - switch (errno) { - case ENOTSUP: - /* - * Can't attach to or replace this type of vdev. 
- */ - if (replacing) { - uint64_t version = zpool_get_prop_int(zhp, - ZPOOL_PROP_VERSION, NULL); - - if (islog) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot replace a log with a spare")); - else if (version >= SPA_VERSION_MULTI_REPLACE) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "already in replacing/spare config; wait " - "for completion or use 'zpool detach'")); - else - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot replace a replacing device")); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "can only attach to mirrors and top-level " - "disks")); - } - (void) zfs_error(hdl, EZFS_BADTARGET, msg); - break; - - case EINVAL: - /* - * The new device must be a single disk. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "new device must be a single disk")); - (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); - break; - - case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " - "or device removal is in progress"), - new_disk); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - - case EOVERFLOW: - /* - * The new device is too small. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "device is too small")); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - - case EDOM: - /* - * The new device has a different alignment requirement. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "devices have different sector alignment")); - (void) zfs_error(hdl, EZFS_BADDEV, msg); - break; - - case ENAMETOOLONG: - /* - * The resulting top-level vdev spec won't fit in the label. - */ - (void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg); - break; - - default: - (void) zpool_standard_error(hdl, errno, msg); - } - - return (-1); -} - -/* - * Detach the specified device. 
- */ -int -zpool_vdev_detach(zpool_handle_t *zhp, const char *path) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - nvlist_t *tgt; - boolean_t avail_spare, l2cache; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot detach %s"), path); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == NULL) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - if (avail_spare) - return (zfs_error(hdl, EZFS_ISSPARE, msg)); - - if (l2cache) - return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); - - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - - if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) - return (0); - - switch (errno) { - - case ENOTSUP: - /* - * Can't detach from this type of vdev. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " - "applicable to mirror and replacing vdevs")); - (void) zfs_error(hdl, EZFS_BADTARGET, msg); - break; - - case EBUSY: - /* - * There are no other replicas of this device. - */ - (void) zfs_error(hdl, EZFS_NOREPLICAS, msg); - break; - - default: - (void) zpool_standard_error(hdl, errno, msg); - } - - return (-1); -} - -/* - * Find a mirror vdev in the source nvlist. - * - * The mchild array contains a list of disks in one of the top-level mirrors - * of the source pool. The schild array contains a list of disks that the - * user specified on the command line. We loop over the mchild array to - * see if any entry in the schild array matches. - * - * If a disk in the mchild array is found in the schild array, we return - * the index of that entry. Otherwise we return -1. 
- */ -static int -find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, - nvlist_t **schild, uint_t schildren) -{ - uint_t mc; - - for (mc = 0; mc < mchildren; mc++) { - uint_t sc; - char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, - mchild[mc], 0); - - for (sc = 0; sc < schildren; sc++) { - char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, - schild[sc], 0); - boolean_t result = (strcmp(mpath, spath) == 0); - - free(spath); - if (result) { - free(mpath); - return (mc); - } - } - - free(mpath); - } - - return (-1); -} - -/* - * Split a mirror pool. If newroot points to null, then a new nvlist - * is generated and it is the responsibility of the caller to free it. - */ -int -zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, - nvlist_t *props, splitflags_t flags) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; - nvlist_t **varray = NULL, *zc_props = NULL; - uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; - libzfs_handle_t *hdl = zhp->zpool_hdl; - uint64_t vers; - boolean_t freelist = B_FALSE, memory_err = B_TRUE; - int retval = 0; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); - - if (!zpool_name_valid(hdl, B_FALSE, newname)) - return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); - - if ((config = zpool_get_config(zhp, NULL)) == NULL) { - (void) fprintf(stderr, gettext("Internal error: unable to " - "retrieve pool configuration\n")); - return (-1); - } - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) - == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); - - if (props) { - prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; - if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, - props, vers, flags, msg)) == NULL) - return (-1); - } - - if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, - &children) != 0) { - 
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Source pool is missing vdev tree")); - nvlist_free(zc_props); - return (-1); - } - - varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); - vcount = 0; - - if (*newroot == NULL || - nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, - &newchild, &newchildren) != 0) - newchildren = 0; - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE, is_hole = B_FALSE; - char *type; - nvlist_t **mchild, *vdev; - uint_t mchildren; - int entry; - - /* - * Unlike cache & spares, slogs are stored in the - * ZPOOL_CONFIG_CHILDREN array. We filter them out here. - */ - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, - &is_hole); - if (is_log || is_hole) { - /* - * Create a hole vdev and put it in the config. - */ - if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) - goto out; - if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_HOLE) != 0) - goto out; - if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, - 1) != 0) - goto out; - if (lastlog == 0) - lastlog = vcount; - varray[vcount++] = vdev; - continue; - } - lastlog = 0; - verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) - == 0); - if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Source pool must be composed only of mirrors\n")); - retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); - goto out; - } - - verify(nvlist_lookup_nvlist_array(child[c], - ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); - - /* find or add an entry for this top-level vdev */ - if (newchildren > 0 && - (entry = find_vdev_entry(zhp, mchild, mchildren, - newchild, newchildren)) >= 0) { - /* We found a disk that the user specified. */ - vdev = mchild[entry]; - ++found; - } else { - /* User didn't specify a disk for this vdev. 
*/ - vdev = mchild[mchildren - 1]; - } - - if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) - goto out; - } - - /* did we find every disk the user specified? */ - if (found != newchildren) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " - "include at most one disk from each mirror")); - retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); - goto out; - } - - /* Prepare the nvlist for populating. */ - if (*newroot == NULL) { - if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) - goto out; - freelist = B_TRUE; - if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) != 0) - goto out; - } else { - verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); - } - - /* Add all the children we found */ - if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray, - lastlog == 0 ? vcount : lastlog) != 0) - goto out; - - /* - * If we're just doing a dry run, exit now with success. - */ - if (flags.dryrun) { - memory_err = B_FALSE; - freelist = B_FALSE; - goto out; - } - - /* now build up the config list & call the ioctl */ - if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) - goto out; - - if (nvlist_add_nvlist(newconfig, - ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || - nvlist_add_string(newconfig, - ZPOOL_CONFIG_POOL_NAME, newname) != 0 || - nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) - goto out; - - /* - * The new pool is automatically part of the namespace unless we - * explicitly export it. 
- */ - if (!flags.import) - zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); - if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0) - goto out; - if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) - goto out; - - if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { - retval = zpool_standard_error(hdl, errno, msg); - goto out; - } - - freelist = B_FALSE; - memory_err = B_FALSE; - -out: - if (varray != NULL) { - int v; - - for (v = 0; v < vcount; v++) - nvlist_free(varray[v]); - free(varray); - } - zcmd_free_nvlists(&zc); - nvlist_free(zc_props); - nvlist_free(newconfig); - if (freelist) { - nvlist_free(*newroot); - *newroot = NULL; - } - - if (retval != 0) - return (retval); - - if (memory_err) - return (no_memory(hdl)); - - return (0); -} - -/* - * Remove the given device. - */ -int -zpool_vdev_remove(zpool_handle_t *zhp, const char *path) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - nvlist_t *tgt; - boolean_t avail_spare, l2cache, islog; - libzfs_handle_t *hdl = zhp->zpool_hdl; - uint64_t version; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot remove %s"), path); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - &islog)) == NULL) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); - if (islog && version < SPA_VERSION_HOLES) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded to support log removal")); - return (zfs_error(hdl, EZFS_BADVERSION, msg)); - } - - zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); - - if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) - return (0); - - switch (errno) { - - case EINVAL: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid config; all top-level vdevs must " - "have the same sector 
size and not be raidz.")); - (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); - break; - - case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Pool busy; removal may already be in progress")); - (void) zfs_error(hdl, EZFS_BUSY, msg); - break; - - default: - (void) zpool_standard_error(hdl, errno, msg); - } - return (-1); -} - -int -zpool_vdev_remove_cancel(zpool_handle_t *zhp) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot cancel removal")); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_cookie = 1; - - if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) - return (0); - - return (zpool_standard_error(hdl, errno, msg)); -} - -int -zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, - uint64_t *sizep) -{ - char msg[1024]; - nvlist_t *tgt; - boolean_t avail_spare, l2cache, islog; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), - path); - - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - &islog)) == NULL) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - if (avail_spare || l2cache || islog) { - *sizep = 0; - return (0); - } - - if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "indirect size not available")); - return (zfs_error(hdl, EINVAL, msg)); - } - return (0); -} - -/* - * Clear the errors for the pool, or the particular device if specified. 
- */ -int -zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - nvlist_t *tgt; - zpool_load_policy_t policy; - boolean_t avail_spare, l2cache; - libzfs_handle_t *hdl = zhp->zpool_hdl; - nvlist_t *nvi = NULL; - int error; - - if (path) - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), - path); - else - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), - zhp->zpool_name); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if (path) { - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, - &l2cache, NULL)) == NULL) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - /* - * Don't allow error clearing for hot spares. Do allow - * error clearing for l2cache devices. - */ - if (avail_spare) - return (zfs_error(hdl, EZFS_ISSPARE, msg)); - - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, - &zc.zc_guid) == 0); - } - - zpool_get_load_policy(rewindnvl, &policy); - zc.zc_cookie = policy.zlp_rewind; - - if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0) - return (-1); - - if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0) - return (-1); - - while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && - errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - } - - if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) && - errno != EPERM && errno != EACCES)) { - if (policy.zlp_rewind & - (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { - (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); - zpool_rewind_exclaim(hdl, zc.zc_name, - ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), - nvi); - nvlist_free(nvi); - } - zcmd_free_nvlists(&zc); - return (0); - } - - zcmd_free_nvlists(&zc); - return (zpool_standard_error(hdl, errno, msg)); -} - -/* - * Similar to zpool_clear(), but takes a GUID (used by fmd). 
- */ -int -zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), - guid); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_guid = guid; - zc.zc_cookie = ZPOOL_NO_REWIND; - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0) - return (0); - - return (zpool_standard_error(hdl, errno, msg)); -} - -/* - * Change the GUID for a pool. - */ -int -zpool_reguid(zpool_handle_t *zhp) -{ - char msg[1024]; - libzfs_handle_t *hdl = zhp->zpool_hdl; - zfs_cmd_t zc = { 0 }; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) - return (0); - - return (zpool_standard_error(hdl, errno, msg)); -} - -/* - * Reopen the pool. - */ -int -zpool_reopen(zpool_handle_t *zhp) -{ - zfs_cmd_t zc = { 0 }; - char msg[1024]; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), - zhp->zpool_name); - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0) - return (0); - return (zpool_standard_error(hdl, errno, msg)); -} - -/* call into libzfs_core to execute the sync IOCTL per pool */ -int -zpool_sync_one(zpool_handle_t *zhp, void *data) -{ - int ret; - libzfs_handle_t *hdl = zpool_get_handle(zhp); - const char *pool_name = zpool_get_name(zhp); - boolean_t *force = data; - nvlist_t *innvl = fnvlist_alloc(); - - fnvlist_add_boolean_value(innvl, "force", *force); - if ((ret = lzc_sync(pool_name, innvl, NULL)) != 0) { - nvlist_free(innvl); - return (zpool_standard_error_fmt(hdl, ret, - dgettext(TEXT_DOMAIN, "sync '%s' failed"), pool_name)); - } - nvlist_free(innvl); - - return (0); -} - 
-/* - * Convert from a devid string to a path. - */ -static char * -devid_to_path(char *devid_str) -{ - ddi_devid_t devid; - char *minor; - char *path; - devid_nmlist_t *list = NULL; - int ret; - - if (devid_str_decode(devid_str, &devid, &minor) != 0) - return (NULL); - - ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list); - - devid_str_free(minor); - devid_free(devid); - - if (ret != 0) - return (NULL); - - /* - * In a case the strdup() fails, we will just return NULL below. - */ - path = strdup(list[0].devname); - - devid_free_nmlist(list); - - return (path); -} - -/* - * Convert from a path to a devid string. - */ -static char * -path_to_devid(const char *path) -{ -#ifdef have_devid - int fd; - ddi_devid_t devid; - char *minor, *ret; - - if ((fd = open(path, O_RDONLY)) < 0) - return (NULL); - - minor = NULL; - ret = NULL; - if (devid_get(fd, &devid) == 0) { - if (devid_get_minor_name(fd, &minor) == 0) - ret = devid_str_encode(devid, minor); - if (minor != NULL) - devid_str_free(minor); - devid_free(devid); - } - (void) close(fd); - - return (ret); -#else - return (NULL); -#endif -} - -/* - * Issue the necessary ioctl() to update the stored path value for the vdev. We - * ignore any failure here, since a common case is for an unprivileged user to - * type 'zpool status', and we'll display the correct information anyway. - */ -static void -set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) -{ - zfs_cmd_t zc = { 0 }; - - (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - (void) strncpy(zc.zc_value, path, sizeof (zc.zc_value)); - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &zc.zc_guid) == 0); - - (void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc); -} - -/* - * Given a vdev, return the name to display in iostat. If the vdev has a path, - * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type. 
- * We also check if this is a whole disk, in which case we strip off the - * trailing 's0' slice name. - * - * This routine is also responsible for identifying when disks have been - * reconfigured in a new location. The kernel will have opened the device by - * devid, but the path will still refer to the old location. To catch this, we - * first do a path -> devid translation (which is fast for the common case). If - * the devid matches, we're done. If not, we do a reverse devid -> path - * translation and issue the appropriate ioctl() to update the path of the vdev. - * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any - * of these checks. - */ -char * -zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, - int name_flags) -{ - char *path, *devid, *env; - uint64_t value; - char buf[64]; - vdev_stat_t *vs; - uint_t vsc; - int have_stats; - int have_path; - - env = getenv("ZPOOL_VDEV_NAME_PATH"); - if (env && (strtoul(env, NULL, 0) > 0 || - !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) - name_flags |= VDEV_NAME_PATH; - - env = getenv("ZPOOL_VDEV_NAME_GUID"); - if (env && (strtoul(env, NULL, 0) > 0 || - !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) - name_flags |= VDEV_NAME_GUID; - - env = getenv("ZPOOL_VDEV_NAME_FOLLOW_LINKS"); - if (env && (strtoul(env, NULL, 0) > 0 || - !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) - name_flags |= VDEV_NAME_FOLLOW_LINKS; - - have_stats = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0; - have_path = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0; - - /* - * If the device is not currently present, assume it will not - * come back at the same device path. Display the device by GUID. 
- */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 || - (name_flags & VDEV_NAME_GUID) != 0 || - have_path && have_stats && vs->vs_state <= VDEV_STATE_CANT_OPEN) { - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value); - (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); - path = buf; - } else if (have_path) { - - /* - * If the device is dead (faulted, offline, etc) then don't - * bother opening it. Otherwise we may be forcing the user to - * open a misbehaving device, which can have undesirable - * effects. - */ - if ((have_stats == 0 || - vs->vs_state >= VDEV_STATE_DEGRADED) && - zhp != NULL && - nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) { - /* - * Determine if the current path is correct. - */ - char *newdevid = path_to_devid(path); - - if (newdevid == NULL || - strcmp(devid, newdevid) != 0) { - char *newpath; - - if ((newpath = devid_to_path(devid)) != NULL) { - /* - * Update the path appropriately. - */ - set_path(zhp, nv, newpath); - if (nvlist_add_string(nv, - ZPOOL_CONFIG_PATH, newpath) == 0) - verify(nvlist_lookup_string(nv, - ZPOOL_CONFIG_PATH, - &path) == 0); - free(newpath); - } - } - - if (newdevid) - devid_str_free(newdevid); - } - -#ifdef illumos - if (name_flags & VDEV_NAME_FOLLOW_LINKS) { - char *rp = realpath(path, NULL); - if (rp) { - strlcpy(buf, rp, sizeof (buf)); - path = buf; - free(rp); - } - } - - if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) - path += strlen(ZFS_DISK_ROOTD); - - /* - * Remove the partition from the path it this is a whole disk. - */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) - == 0 && value && !(name_flags & VDEV_NAME_PATH)) { - int pathlen = strlen(path); - char *tmp = zfs_strdup(hdl, path); - - /* - * If it starts with c#, and ends with "s0" or "s1", - * chop the slice off, or if it ends with "s0/old" or - * "s1/old", remove the slice from the middle. 
- */ - if (CTD_CHECK(tmp)) { - if (strcmp(&tmp[pathlen - 2], "s0") == 0 || - strcmp(&tmp[pathlen - 2], "s1") == 0) { - tmp[pathlen - 2] = '\0'; - } else if (pathlen > 6 && - (strcmp(&tmp[pathlen - 6], "s0/old") == 0 || - strcmp(&tmp[pathlen - 6], "s1/old") == 0)) { - (void) strcpy(&tmp[pathlen - 6], - "/old"); - } - } - return (tmp); - } -#else /* !illumos */ - if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) - path += sizeof(_PATH_DEV) - 1; -#endif /* illumos */ - } else { - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0); - - /* - * If it's a raidz device, we need to stick in the parity level. - */ - if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &value) == 0); - (void) snprintf(buf, sizeof (buf), "%s%llu", path, - (u_longlong_t)value); - path = buf; - } - - /* - * We identify each top-level vdev by using a - * naming convention. - */ - if (name_flags & VDEV_NAME_TYPE_ID) { - uint64_t id; - - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, - &id) == 0); - (void) snprintf(buf, sizeof (buf), "%s-%llu", path, - (u_longlong_t)id); - path = buf; - } - } - - return (zfs_strdup(hdl, path)); -} - -static int -zbookmark_mem_compare(const void *a, const void *b) -{ - return (memcmp(a, b, sizeof (zbookmark_phys_t))); -} - -/* - * Retrieve the persistent error log, uniquify the members, and return to the - * caller. - */ -int -zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) -{ - zfs_cmd_t zc = { 0 }; - uint64_t count; - zbookmark_phys_t *zb = NULL; - int i; - - /* - * Retrieve the raw error list from the kernel. If the number of errors - * has increased, allocate more space and continue until we get the - * entire list. 
- */ - verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT, - &count) == 0); - if (count == 0) - return (0); - if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl, - count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL) - return (-1); - zc.zc_nvlist_dst_size = count; - (void) strcpy(zc.zc_name, zhp->zpool_name); - for (;;) { - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG, - &zc) != 0) { - free((void *)(uintptr_t)zc.zc_nvlist_dst); - if (errno == ENOMEM) { - void *dst; - - count = zc.zc_nvlist_dst_size; - dst = zfs_alloc(zhp->zpool_hdl, count * - sizeof (zbookmark_phys_t)); - if (dst == NULL) - return (-1); - zc.zc_nvlist_dst = (uintptr_t)dst; - } else { - return (-1); - } - } else { - break; - } - } - - /* - * Sort the resulting bookmarks. This is a little confusing due to the - * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last - * to first, and 'zc_nvlist_dst_size' indicates the number of boomarks - * _not_ copied as part of the process. So we point the start of our - * array appropriate and decrement the total number of elements. - */ - zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) + - zc.zc_nvlist_dst_size; - count -= zc.zc_nvlist_dst_size; - - qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); - - verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); - - /* - * Fill in the nverrlistp with nvlist's of dataset and object numbers. 
- */ - for (i = 0; i < count; i++) { - nvlist_t *nv; - - /* ignoring zb_blkid and zb_level for now */ - if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && - zb[i-1].zb_object == zb[i].zb_object) - continue; - - if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) - goto nomem; - if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, - zb[i].zb_objset) != 0) { - nvlist_free(nv); - goto nomem; - } - if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, - zb[i].zb_object) != 0) { - nvlist_free(nv); - goto nomem; - } - if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { - nvlist_free(nv); - goto nomem; - } - nvlist_free(nv); - } - - free((void *)(uintptr_t)zc.zc_nvlist_dst); - return (0); - -nomem: - free((void *)(uintptr_t)zc.zc_nvlist_dst); - return (no_memory(zhp->zpool_hdl)); -} - -/* - * Upgrade a ZFS pool to the latest on-disk version. - */ -int -zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) -{ - zfs_cmd_t zc = { 0 }; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) strcpy(zc.zc_name, zhp->zpool_name); - zc.zc_cookie = new_version; - - if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0) - return (zpool_standard_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"), - zhp->zpool_name)); - return (0); -} - -void -zfs_save_arguments(int argc, char **argv, char *string, int len) -{ - (void) strlcpy(string, basename(argv[0]), len); - for (int i = 1; i < argc; i++) { - (void) strlcat(string, " ", len); - (void) strlcat(string, argv[i], len); - } -} - -int -zpool_log_history(libzfs_handle_t *hdl, const char *message) -{ - zfs_cmd_t zc = { 0 }; - nvlist_t *args; - int err; - - args = fnvlist_alloc(); - fnvlist_add_string(args, "message", message); - err = zcmd_write_src_nvlist(hdl, &zc, args); - if (err == 0) - err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); - nvlist_free(args); - zcmd_free_nvlists(&zc); - return (err); -} - -/* - * Perform ioctl to get some command history of a pool. - * - * 'buf' is the buffer to fill up to 'len' bytes. 
'off' is the - * logical offset of the history buffer to start reading from. - * - * Upon return, 'off' is the next logical offset to read from and - * 'len' is the actual amount of bytes read into 'buf'. - */ -static int -get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) -{ - zfs_cmd_t zc = { 0 }; - libzfs_handle_t *hdl = zhp->zpool_hdl; - - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - - zc.zc_history = (uint64_t)(uintptr_t)buf; - zc.zc_history_len = *len; - zc.zc_history_offset = *off; - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { - switch (errno) { - case EPERM: - return (zfs_error_fmt(hdl, EZFS_PERM, - dgettext(TEXT_DOMAIN, - "cannot show history for pool '%s'"), - zhp->zpool_name)); - case ENOENT: - return (zfs_error_fmt(hdl, EZFS_NOHISTORY, - dgettext(TEXT_DOMAIN, "cannot get history for pool " - "'%s'"), zhp->zpool_name)); - case ENOTSUP: - return (zfs_error_fmt(hdl, EZFS_BADVERSION, - dgettext(TEXT_DOMAIN, "cannot get history for pool " - "'%s', pool must be upgraded"), zhp->zpool_name)); - default: - return (zpool_standard_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, - "cannot get history for '%s'"), zhp->zpool_name)); - } - } - - *len = zc.zc_history_len; - *off = zc.zc_history_offset; - - return (0); -} - -/* - * Process the buffer of nvlists, unpacking and storing each nvlist record - * into 'records'. 'leftover' is set to the number of bytes that weren't - * processed as there wasn't a complete record. 
- */ -int -zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, - nvlist_t ***records, uint_t *numrecords) -{ - uint64_t reclen; - nvlist_t *nv; - int i; - - while (bytes_read > sizeof (reclen)) { - - /* get length of packed record (stored as little endian) */ - for (i = 0, reclen = 0; i < sizeof (reclen); i++) - reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i); - - if (bytes_read < sizeof (reclen) + reclen) - break; - - /* unpack record */ - if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0) - return (ENOMEM); - bytes_read -= sizeof (reclen) + reclen; - buf += sizeof (reclen) + reclen; - - /* add record to nvlist array */ - (*numrecords)++; - if (ISP2(*numrecords + 1)) { - *records = realloc(*records, - *numrecords * 2 * sizeof (nvlist_t *)); - } - (*records)[*numrecords - 1] = nv; - } - - *leftover = bytes_read; - return (0); -} - -/* from spa_history.c: spa_history_create_obj() */ -#define HIS_BUF_LEN_DEF (128 << 10) -#define HIS_BUF_LEN_MAX (1 << 30) - -/* - * Retrieve the command history of a pool. - */ -int -zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off, - boolean_t *eof) -{ - char *buf; - uint64_t buflen = HIS_BUF_LEN_DEF; - nvlist_t **records = NULL; - uint_t numrecords = 0; - int err, i; - uint64_t start = *off; - - buf = malloc(buflen); - if (buf == NULL) - return (ENOMEM); - /* process about 1MB at a time */ - while (*off - start < 1024 * 1024) { - uint64_t bytes_read = buflen; - uint64_t leftover; - - if ((err = get_history(zhp, buf, off, &bytes_read)) != 0) - break; - - /* if nothing else was read in, we're at EOF, just return */ - if (bytes_read == 0) { - *eof = B_TRUE; - break; - } - - if ((err = zpool_history_unpack(buf, bytes_read, - &leftover, &records, &numrecords)) != 0) - break; - *off -= leftover; - if (leftover == bytes_read) { - /* - * no progress made, because buffer is not big enough - * to hold this record; resize and retry. 
- */ - buflen *= 2; - free(buf); - buf = NULL; - if ((buflen >= HIS_BUF_LEN_MAX) || - ((buf = malloc(buflen)) == NULL)) { - err = ENOMEM; - break; - } - } - } - - free(buf); - - if (!err) { - verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0); - verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD, - records, numrecords) == 0); - } - for (i = 0; i < numrecords; i++) - nvlist_free(records[i]); - free(records); - - return (err); -} - -void -zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - char *pathname, size_t len) -{ - zfs_cmd_t zc = { 0 }; - boolean_t mounted = B_FALSE; - char *mntpnt = NULL; - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - - if (dsobj == 0) { - /* special case for the MOS */ - (void) snprintf(pathname, len, ":<0x%llx>", obj); - return; - } - - /* get the dataset's name */ - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_obj = dsobj; - if (ioctl(zhp->zpool_hdl->libzfs_fd, - ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { - /* just write out a path of two object numbers */ - (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", - dsobj, obj); - return; - } - (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); - - /* find out if the dataset is mounted */ - mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt); - - /* get the corrupted object's path */ - (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); - zc.zc_obj = obj; - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH, - &zc) == 0) { - if (mounted) { - (void) snprintf(pathname, len, "%s%s", mntpnt, - zc.zc_value); - } else { - (void) snprintf(pathname, len, "%s:%s", - dsname, zc.zc_value); - } - } else { - (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, obj); - } - free(mntpnt); -} - -int -zpool_set_bootenv(zpool_handle_t *zhp, const char *envmap) -{ - int error = lzc_set_bootenv(zhp->zpool_name, envmap); - if (error != 0) { - (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, - dgettext(TEXT_DOMAIN, - "error setting bootenv in pool 
'%s'"), zhp->zpool_name); - } - - return (error); -} - -int -zpool_get_bootenv(zpool_handle_t *zhp, char *outbuf, size_t size, off_t offset) -{ - nvlist_t *nvl; - int error = lzc_get_bootenv(zhp->zpool_name, &nvl);; - if (error != 0) { - (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, - dgettext(TEXT_DOMAIN, - "error getting bootenv in pool '%s'"), zhp->zpool_name); - return (-1); - } - char *envmap = fnvlist_lookup_string(nvl, "envmap"); - if (offset >= strlen(envmap)) { - fnvlist_free(nvl); - return (0); - } - - strlcpy(outbuf, envmap + offset, size); - int bytes = MIN(strlen(envmap + offset), size); - fnvlist_free(nvl); - return (bytes); -} - -#ifdef illumos -/* - * Read the EFI label from the config, if a label does not exist then - * pass back the error to the caller. If the caller has passed a non-NULL - * diskaddr argument then we set it to the starting address of the EFI - * partition. If the caller has passed a non-NULL boolean argument, then - * we set it to indicate if the disk does have efi system partition. 
- */ -static int -read_efi_label(nvlist_t *config, diskaddr_t *sb, boolean_t *system) -{ - char *path; - int fd; - char diskname[MAXPATHLEN]; - boolean_t boot = B_FALSE; - int err = -1; - int slice; - - if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) - return (err); - - (void) snprintf(diskname, sizeof (diskname), "%s%s", ZFS_RDISK_ROOT, - strrchr(path, '/')); - if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) { - struct dk_gpt *vtoc; - - if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { - for (slice = 0; slice < vtoc->efi_nparts; slice++) { - if (vtoc->efi_parts[slice].p_tag == V_SYSTEM) - boot = B_TRUE; - if (vtoc->efi_parts[slice].p_tag == V_USR) - break; - } - if (sb != NULL && vtoc->efi_parts[slice].p_tag == V_USR) - *sb = vtoc->efi_parts[slice].p_start; - if (system != NULL) - *system = boot; - efi_free(vtoc); - } - (void) close(fd); - } - return (err); -} - -/* - * determine where a partition starts on a disk in the current - * configuration - */ -static diskaddr_t -find_start_block(nvlist_t *config) -{ - nvlist_t **child; - uint_t c, children; - diskaddr_t sb = MAXOFFSET_T; - uint64_t wholedisk; - - if (nvlist_lookup_nvlist_array(config, - ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { - if (nvlist_lookup_uint64(config, - ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk) != 0 || !wholedisk) { - return (MAXOFFSET_T); - } - if (read_efi_label(config, &sb, NULL) < 0) - sb = MAXOFFSET_T; - return (sb); - } - - for (c = 0; c < children; c++) { - sb = find_start_block(child[c]); - if (sb != MAXOFFSET_T) { - return (sb); - } - } - return (MAXOFFSET_T); -} -#endif /* illumos */ - -/* - * Label an individual disk. The name provided is the short name, - * stripped of any leading /dev path. 
- */ -int -zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name, - zpool_boot_label_t boot_type, uint64_t boot_size, int *slice) -{ -#ifdef illumos - char path[MAXPATHLEN]; - struct dk_gpt *vtoc; - int fd; - size_t resv = EFI_MIN_RESV_SIZE; - uint64_t slice_size; - diskaddr_t start_block; - char errbuf[1024]; - - /* prepare an error message just in case */ - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); - - if (zhp) { - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(zhp->zpool_config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - - if (zhp->zpool_start_block == 0) - start_block = find_start_block(nvroot); - else - start_block = zhp->zpool_start_block; - zhp->zpool_start_block = start_block; - } else { - /* new pool */ - start_block = NEW_START_BLOCK; - } - - (void) snprintf(path, sizeof (path), "%s/%s%s", ZFS_RDISK_ROOT, name, - BACKUP_SLICE); - - if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { - /* - * This shouldn't happen. We've long since verified that this - * is a valid device. - */ - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "unable to open device")); - return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); - } - - if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { - /* - * The only way this can fail is if we run out of memory, or we - * were unable to read the disk's capacity - */ - if (errno == ENOMEM) - (void) no_memory(hdl); - - (void) close(fd); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "unable to read disk capacity"), name); - - return (zfs_error(hdl, EZFS_NOCAP, errbuf)); - } - - /* - * Why we use V_USR: V_BACKUP confuses users, and is considered - * disposable by some EFI utilities (since EFI doesn't have a backup - * slice). V_UNASSIGNED is supposed to be used only for zero size - * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, - * etc. were all pretty specific. V_USR is as close to reality as we - * can get, in the absence of V_OTHER. 
- */ - /* first fix the partition start block */ - if (start_block == MAXOFFSET_T) - start_block = NEW_START_BLOCK; - - /* - * EFI System partition is using slice 0. - * ZFS is on slice 1 and slice 8 is reserved. - * We assume the GPT partition table without system - * partition has zfs p_start == NEW_START_BLOCK. - * If start_block != NEW_START_BLOCK, it means we have - * system partition. Correct solution would be to query/cache vtoc - * from existing vdev member. - */ - if (boot_type == ZPOOL_CREATE_BOOT_LABEL) { - if (boot_size % vtoc->efi_lbasize != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "boot partition size must be a multiple of %d"), - vtoc->efi_lbasize); - (void) close(fd); - efi_free(vtoc); - return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); - } - /* - * System partition size checks. - * Note the 1MB is quite arbitrary value, since we - * are creating dedicated pool, it should be enough - * to hold fat + efi bootloader. May need to be - * adjusted if the bootloader size will grow. - */ - if (boot_size < 1024 * 1024) { - char buf[64]; - zfs_nicenum(boot_size, buf, sizeof (buf)); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Specified size %s for EFI System partition is too " - "small, the minimum size is 1MB."), buf); - (void) close(fd); - efi_free(vtoc); - return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); - } - /* 33MB is tested with mkfs -F pcfs */ - if (hdl->libzfs_printerr && - ((vtoc->efi_lbasize == 512 && - boot_size < 33 * 1024 * 1024) || - (vtoc->efi_lbasize == 4096 && - boot_size < 256 * 1024 * 1024))) { - char buf[64]; - zfs_nicenum(boot_size, buf, sizeof (buf)); - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "Warning: EFI System partition size %s is " - "not allowing to create FAT32 file\nsystem, which " - "may result in unbootable system.\n"), buf); - } - /* Adjust zfs partition start by size of system partition. 
*/ - start_block += boot_size / vtoc->efi_lbasize; - } - - if (start_block == NEW_START_BLOCK) { - /* - * Use default layout. - * ZFS is on slice 0 and slice 8 is reserved. - */ - slice_size = vtoc->efi_last_u_lba + 1; - slice_size -= EFI_MIN_RESV_SIZE; - slice_size -= start_block; - if (slice != NULL) - *slice = 0; - - vtoc->efi_parts[0].p_start = start_block; - vtoc->efi_parts[0].p_size = slice_size; - - vtoc->efi_parts[0].p_tag = V_USR; - (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); - - vtoc->efi_parts[8].p_start = slice_size + start_block; - vtoc->efi_parts[8].p_size = resv; - vtoc->efi_parts[8].p_tag = V_RESERVED; - } else { - slice_size = start_block - NEW_START_BLOCK; - vtoc->efi_parts[0].p_start = NEW_START_BLOCK; - vtoc->efi_parts[0].p_size = slice_size; - vtoc->efi_parts[0].p_tag = V_SYSTEM; - (void) strcpy(vtoc->efi_parts[0].p_name, "loader"); - if (slice != NULL) - *slice = 1; - /* prepare slice 1 */ - slice_size = vtoc->efi_last_u_lba + 1 - slice_size; - slice_size -= resv; - slice_size -= NEW_START_BLOCK; - vtoc->efi_parts[1].p_start = start_block; - vtoc->efi_parts[1].p_size = slice_size; - vtoc->efi_parts[1].p_tag = V_USR; - (void) strcpy(vtoc->efi_parts[1].p_name, "zfs"); - - vtoc->efi_parts[8].p_start = slice_size + start_block; - vtoc->efi_parts[8].p_size = resv; - vtoc->efi_parts[8].p_tag = V_RESERVED; - } - - if (efi_write(fd, vtoc) != 0) { - /* - * Some block drivers (like pcata) may not support EFI - * GPT labels. Print out a helpful error message dir- - * ecting the user to manually label the disk and give - * a specific slice. 
- */ - (void) close(fd); - efi_free(vtoc); - - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "try using fdisk(1M) and then provide a specific slice")); - return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); - } - - (void) close(fd); - efi_free(vtoc); -#endif /* illumos */ - return (0); -} - -static boolean_t -supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf) -{ - char *type; - nvlist_t **child; - uint_t children, c; - - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_FILE) == 0 || - strcmp(type, VDEV_TYPE_HOLE) == 0 || - strcmp(type, VDEV_TYPE_MISSING) == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "vdev type '%s' is not supported"), type); - (void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf); - return (B_FALSE); - } - if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - if (!supported_dump_vdev_type(hdl, child[c], errbuf)) - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* - * Check if this zvol is allowable for use as a dump device; zero if - * it is, > 0 if it isn't, < 0 if it isn't a zvol. - * - * Allowable storage configurations include mirrors, all raidz variants, and - * pools with log, cache, and spare devices. Pools which are backed by files or - * have missing/hole vdevs are not suitable. 
- */ -int -zvol_check_dump_config(char *arg) -{ - zpool_handle_t *zhp = NULL; - nvlist_t *config, *nvroot; - char *p, *volname; - nvlist_t **top; - uint_t toplevels; - libzfs_handle_t *hdl; - char errbuf[1024]; - char poolname[ZFS_MAX_DATASET_NAME_LEN]; - int pathlen = strlen(ZVOL_FULL_DEV_DIR); - int ret = 1; - - if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) { - return (-1); - } - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "dump is not supported on device '%s'"), arg); - - if ((hdl = libzfs_init()) == NULL) - return (1); - libzfs_print_on_error(hdl, B_TRUE); - - volname = arg + pathlen; - - /* check the configuration of the pool */ - if ((p = strchr(volname, '/')) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "malformed dataset name")); - (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); - return (1); - } else if (p - volname >= ZFS_MAX_DATASET_NAME_LEN) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset name is too long")); - (void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf); - return (1); - } else { - (void) strncpy(poolname, volname, p - volname); - poolname[p - volname] = '\0'; - } - - if ((zhp = zpool_open(hdl, poolname)) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "could not open pool '%s'"), poolname); - (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); - goto out; - } - config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "could not obtain vdev configuration for '%s'"), poolname); - (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); - goto out; - } - - verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &top, &toplevels) == 0); - - if (!supported_dump_vdev_type(hdl, top[0], errbuf)) { - goto out; - } - ret = 0; - -out: - if (zhp) - zpool_close(zhp); - libzfs_fini(hdl); - return (ret); -} - -int -zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid, - const char *command) 
-{ - zfs_cmd_t zc = { 0 }; - nvlist_t *args; - char *packed; - size_t size; - int error; - - args = fnvlist_alloc(); - fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid); - fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid); - fnvlist_add_string(args, "command", command); - error = zcmd_write_src_nvlist(hdl, &zc, args); - if (error == 0) - error = ioctl(hdl->libzfs_fd, ZFS_IOC_NEXTBOOT, &zc); - zcmd_free_nvlists(&zc); - nvlist_free(args); - return (error); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c deleted file mode 100644 index 2e2e1020ad8a..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c +++ /dev/null @@ -1,3924 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright 2015, OmniTI Computer Consulting, Inc. 
All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Igor Kozhukhov - * Copyright (c) 2018, loli10K . All rights reserved. - * Copyright (c) 2019 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "zfs_namecheck.h" -#include "zfs_prop.h" -#include "zfs_fletcher.h" -#include "libzfs_impl.h" -#include -#include -#include -#include - -#ifdef __FreeBSD__ -extern int zfs_ioctl_version; -#endif - -/* in libzfs_dataset.c */ -extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); -/* We need to use something for ENODATA. */ -#define ENODATA EIDRM - -static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, - recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, - uint64_t *, const char *); -static int guid_to_name(libzfs_handle_t *, const char *, - uint64_t, boolean_t, char *); - -static const zio_cksum_t zero_cksum = { 0 }; - -typedef struct dedup_arg { - int inputfd; - int outputfd; - libzfs_handle_t *dedup_hdl; -} dedup_arg_t; - -typedef struct progress_arg { - zfs_handle_t *pa_zhp; - int pa_fd; - boolean_t pa_parsable; - boolean_t pa_astitle; - uint64_t pa_size; -} progress_arg_t; - -typedef struct dataref { - uint64_t ref_guid; - uint64_t ref_object; - uint64_t ref_offset; -} dataref_t; - -typedef struct dedup_entry { - struct dedup_entry *dde_next; - zio_cksum_t dde_chksum; - uint64_t dde_prop; - dataref_t dde_ref; -} dedup_entry_t; - -#define MAX_DDT_PHYSMEM_PERCENT 20 -#define SMALLEST_POSSIBLE_MAX_DDT_MB 128 - -typedef struct dedup_table { - dedup_entry_t **dedup_hash_array; - umem_cache_t *ddecache; - uint64_t max_ddt_size; /* max dedup table size in bytes */ - uint64_t cur_ddt_size; /* current dedup table size in bytes */ - uint64_t ddt_count; - int numhashbits; - boolean_t ddt_full; -} dedup_table_t; - -static int 
-high_order_bit(uint64_t n) -{ - int count; - - for (count = 0; n != 0; count++) - n >>= 1; - return (count); -} - -static size_t -ssread(void *buf, size_t len, FILE *stream) -{ - size_t outlen; - - if ((outlen = fread(buf, len, 1, stream)) == 0) - return (0); - - return (outlen); -} - -static void -ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp, - zio_cksum_t *cs, uint64_t prop, dataref_t *dr) -{ - dedup_entry_t *dde; - - if (ddt->cur_ddt_size >= ddt->max_ddt_size) { - if (ddt->ddt_full == B_FALSE) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Dedup table full. Deduplication will continue " - "with existing table entries")); - ddt->ddt_full = B_TRUE; - } - return; - } - - if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT)) - != NULL) { - assert(*ddepp == NULL); - dde->dde_next = NULL; - dde->dde_chksum = *cs; - dde->dde_prop = prop; - dde->dde_ref = *dr; - *ddepp = dde; - ddt->cur_ddt_size += sizeof (dedup_entry_t); - ddt->ddt_count++; - } -} - -/* - * Using the specified dedup table, do a lookup for an entry with - * the checksum cs. If found, return the block's reference info - * in *dr. Otherwise, insert a new entry in the dedup table, using - * the reference information specified by *dr. 
- * - * return value: true - entry was found - * false - entry was not found - */ -static boolean_t -ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, - uint64_t prop, dataref_t *dr) -{ - uint32_t hashcode; - dedup_entry_t **ddepp; - - hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits); - - for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL; - ddepp = &((*ddepp)->dde_next)) { - if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) && - (*ddepp)->dde_prop == prop) { - *dr = (*ddepp)->dde_ref; - return (B_TRUE); - } - } - ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr); - return (B_FALSE); -} - -static int -dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, - zio_cksum_t *zc, int outfd) -{ - ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - (void) fletcher_4_incremental_native(drr, - offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); - if (drr->drr_type != DRR_BEGIN) { - ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. - drr_checksum.drr_checksum)); - drr->drr_u.drr_checksum.drr_checksum = *zc; - } - (void) fletcher_4_incremental_native( - &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); - if (write(outfd, drr, sizeof (*drr)) == -1) - return (errno); - if (payload_len != 0) { - (void) fletcher_4_incremental_native(payload, payload_len, zc); - if (write(outfd, payload, payload_len) == -1) - return (errno); - } - return (0); -} - -/* - * This function is started in a separate thread when the dedup option - * has been requested. The main send thread determines the list of - * snapshots to be included in the send stream and makes the ioctl calls - * for each one. But instead of having the ioctl send the output to the - * the output fd specified by the caller of zfs_send()), the - * ioctl is told to direct the output to a pipe, which is read by the - * alternate thread running THIS function. 
This function does the - * dedup'ing by: - * 1. building a dedup table (the DDT) - * 2. doing checksums on each data block and inserting a record in the DDT - * 3. looking for matching checksums, and - * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever - * a duplicate block is found. - * The output of this function then goes to the output fd requested - * by the caller of zfs_send(). - */ -static void * -cksummer(void *arg) -{ - dedup_arg_t *dda = arg; - char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE); - dmu_replay_record_t thedrr; - dmu_replay_record_t *drr = &thedrr; - FILE *ofp; - int outfd; - dedup_table_t ddt; - zio_cksum_t stream_cksum; - uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); - uint64_t numbuckets; - - ddt.max_ddt_size = - MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100, - SMALLEST_POSSIBLE_MAX_DDT_MB << 20); - - numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t)); - - /* - * numbuckets must be a power of 2. Increase number to - * a power of 2 if necessary. - */ - if (!ISP2(numbuckets)) - numbuckets = 1 << high_order_bit(numbuckets); - - ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); - ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, - NULL, NULL, NULL, NULL, NULL, 0); - ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *); - ddt.numhashbits = high_order_bit(numbuckets) - 1; - ddt.ddt_full = B_FALSE; - - outfd = dda->outputfd; - ofp = fdopen(dda->inputfd, "r"); - while (ssread(drr, sizeof (*drr), ofp) != 0) { - - /* - * kernel filled in checksum, we are going to write same - * record, but need to regenerate checksum. 
- */ - if (drr->drr_type != DRR_BEGIN) { - bzero(&drr->drr_u.drr_checksum.drr_checksum, - sizeof (drr->drr_u.drr_checksum.drr_checksum)); - } - - switch (drr->drr_type) { - case DRR_BEGIN: - { - struct drr_begin *drrb = &drr->drr_u.drr_begin; - int fflags; - int sz = 0; - ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); - - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - - /* set the DEDUP feature flag for this stream */ - fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - fflags |= (DMU_BACKUP_FEATURE_DEDUP | - DMU_BACKUP_FEATURE_DEDUPPROPS); - DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); - - if (drr->drr_payloadlen != 0) { - sz = drr->drr_payloadlen; - - if (sz > SPA_MAXBLOCKSIZE) { - buf = zfs_realloc(dda->dedup_hdl, buf, - SPA_MAXBLOCKSIZE, sz); - } - (void) ssread(buf, sz, ofp); - if (ferror(stdin)) - perror("fread"); - } - if (dump_record(drr, buf, sz, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - case DRR_END: - { - struct drr_end *drre = &drr->drr_u.drr_end; - /* use the recalculated checksum */ - drre->drr_checksum = stream_cksum; - if (dump_record(drr, NULL, 0, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - case DRR_OBJECT: - { - struct drr_object *drro = &drr->drr_u.drr_object; - if (drro->drr_bonuslen > 0) { - (void) ssread(buf, - P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), - ofp); - } - if (dump_record(drr, buf, - P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), - &stream_cksum, outfd) != 0) - goto out; - break; - } - - case DRR_SPILL: - { - struct drr_spill *drrs = &drr->drr_u.drr_spill; - (void) ssread(buf, drrs->drr_length, ofp); - if (dump_record(drr, buf, drrs->drr_length, - &stream_cksum, outfd) != 0) - goto out; - break; - } - - case DRR_FREEOBJECTS: - { - if (dump_record(drr, NULL, 0, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - case DRR_WRITE: - { - struct drr_write *drrw = &drr->drr_u.drr_write; - dataref_t dataref; - uint64_t payload_size; - - payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); - (void) 
ssread(buf, payload_size, ofp); - - /* - * Use the existing checksum if it's dedup-capable, - * else calculate a SHA256 checksum for it. - */ - - if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, - zero_cksum) || - !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { - SHA256_CTX ctx; - zio_cksum_t tmpsha256; - - SHA256Init(&ctx); - SHA256Update(&ctx, buf, payload_size); - SHA256Final(&tmpsha256, &ctx); - drrw->drr_key.ddk_cksum.zc_word[0] = - BE_64(tmpsha256.zc_word[0]); - drrw->drr_key.ddk_cksum.zc_word[1] = - BE_64(tmpsha256.zc_word[1]); - drrw->drr_key.ddk_cksum.zc_word[2] = - BE_64(tmpsha256.zc_word[2]); - drrw->drr_key.ddk_cksum.zc_word[3] = - BE_64(tmpsha256.zc_word[3]); - drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256; - drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP; - } - - dataref.ref_guid = drrw->drr_toguid; - dataref.ref_object = drrw->drr_object; - dataref.ref_offset = drrw->drr_offset; - - if (ddt_update(dda->dedup_hdl, &ddt, - &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, - &dataref)) { - dmu_replay_record_t wbr_drr = {0}; - struct drr_write_byref *wbr_drrr = - &wbr_drr.drr_u.drr_write_byref; - - /* block already present in stream */ - wbr_drr.drr_type = DRR_WRITE_BYREF; - - wbr_drrr->drr_object = drrw->drr_object; - wbr_drrr->drr_offset = drrw->drr_offset; - wbr_drrr->drr_length = drrw->drr_logical_size; - wbr_drrr->drr_toguid = drrw->drr_toguid; - wbr_drrr->drr_refguid = dataref.ref_guid; - wbr_drrr->drr_refobject = - dataref.ref_object; - wbr_drrr->drr_refoffset = - dataref.ref_offset; - - wbr_drrr->drr_checksumtype = - drrw->drr_checksumtype; - wbr_drrr->drr_checksumflags = - drrw->drr_checksumtype; - wbr_drrr->drr_key.ddk_cksum = - drrw->drr_key.ddk_cksum; - wbr_drrr->drr_key.ddk_prop = - drrw->drr_key.ddk_prop; - - if (dump_record(&wbr_drr, NULL, 0, - &stream_cksum, outfd) != 0) - goto out; - } else { - /* block not previously seen */ - if (dump_record(drr, buf, payload_size, - &stream_cksum, outfd) != 0) - goto out; - } - break; - } - - case 
DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe = - &drr->drr_u.drr_write_embedded; - (void) ssread(buf, - P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); - if (dump_record(drr, buf, - P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), - &stream_cksum, outfd) != 0) - goto out; - break; - } - - case DRR_FREE: - { - if (dump_record(drr, NULL, 0, &stream_cksum, - outfd) != 0) - goto out; - break; - } - - default: - (void) fprintf(stderr, "INVALID record type 0x%x\n", - drr->drr_type); - /* should never happen, so assert */ - assert(B_FALSE); - } - } -out: - umem_cache_destroy(ddt.ddecache); - free(ddt.dedup_hash_array); - free(buf); - (void) fclose(ofp); - - return (NULL); -} - -/* - * Routines for dealing with the AVL tree of fs-nvlists - */ -typedef struct fsavl_node { - avl_node_t fn_node; - nvlist_t *fn_nvfs; - char *fn_snapname; - uint64_t fn_guid; -} fsavl_node_t; - -static int -fsavl_compare(const void *arg1, const void *arg2) -{ - const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; - const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; - - return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); -} - -/* - * Given the GUID of a snapshot, find its containing filesystem and - * (optionally) name. 
- */ -static nvlist_t * -fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) -{ - fsavl_node_t fn_find; - fsavl_node_t *fn; - - fn_find.fn_guid = snapguid; - - fn = avl_find(avl, &fn_find, NULL); - if (fn) { - if (snapname) - *snapname = fn->fn_snapname; - return (fn->fn_nvfs); - } - return (NULL); -} - -static void -fsavl_destroy(avl_tree_t *avl) -{ - fsavl_node_t *fn; - void *cookie; - - if (avl == NULL) - return; - - cookie = NULL; - while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) - free(fn); - avl_destroy(avl); - free(avl); -} - -/* - * Given an nvlist, produce an avl tree of snapshots, ordered by guid - */ -static avl_tree_t * -fsavl_create(nvlist_t *fss) -{ - avl_tree_t *fsavl; - nvpair_t *fselem = NULL; - - if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) - return (NULL); - - avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), - offsetof(fsavl_node_t, fn_node)); - - while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { - nvlist_t *nvfs, *snaps; - nvpair_t *snapelem = NULL; - - VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); - VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); - - while ((snapelem = - nvlist_next_nvpair(snaps, snapelem)) != NULL) { - fsavl_node_t *fn; - uint64_t guid; - - VERIFY(0 == nvpair_value_uint64(snapelem, &guid)); - if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { - fsavl_destroy(fsavl); - return (NULL); - } - fn->fn_nvfs = nvfs; - fn->fn_snapname = nvpair_name(snapelem); - fn->fn_guid = guid; - - /* - * Note: if there are multiple snaps with the - * same GUID, we ignore all but one. - */ - if (avl_find(fsavl, fn, NULL) == NULL) - avl_add(fsavl, fn); - else - free(fn); - } - } - - return (fsavl); -} - -/* - * Routines for dealing with the giant nvlist of fs-nvlists, etc. 
- */ -typedef struct send_data { - /* - * assigned inside every recursive call, - * restored from *_save on return: - * - * guid of fromsnap snapshot in parent dataset - * txg of fromsnap snapshot in current dataset - * txg of tosnap snapshot in current dataset - */ - - uint64_t parent_fromsnap_guid; - uint64_t fromsnap_txg; - uint64_t tosnap_txg; - - /* the nvlists get accumulated during depth-first traversal */ - nvlist_t *parent_snaps; - nvlist_t *fss; - nvlist_t *snapprops; - - /* send-receive configuration, does not change during traversal */ - const char *fsname; - const char *fromsnap; - const char *tosnap; - boolean_t recursive; - boolean_t verbose; - boolean_t replicate; - - /* - * The header nvlist is of the following format: - * { - * "tosnap" -> string - * "fromsnap" -> string (if incremental) - * "fss" -> { - * id -> { - * - * "name" -> string (full name; for debugging) - * "parentfromsnap" -> number (guid of fromsnap in parent) - * - * "props" -> { name -> value (only if set here) } - * "snaps" -> { name (lastname) -> number (guid) } - * "snapprops" -> { name (lastname) -> { name -> value } } - * - * "origin" -> number (guid) (if clone) - * "sent" -> boolean (not on-disk) - * } - * } - * } - * - */ -} send_data_t; - -static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv); - -static int -send_iterate_snap(zfs_handle_t *zhp, void *arg) -{ - send_data_t *sd = arg; - uint64_t guid = zhp->zfs_dmustats.dds_guid; - uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; - char *snapname; - nvlist_t *nv; - - snapname = strrchr(zhp->zfs_name, '@')+1; - - if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { - if (sd->verbose) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "skipping snapshot %s because it was created " - "after the destination snapshot (%s)\n"), - zhp->zfs_name, sd->tosnap); - } - zfs_close(zhp); - return (0); - } - - VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); - /* - * NB: if there is no fromsnap here (it's a newly 
created fs in - * an incremental replication), we will substitute the tosnap. - */ - if ((sd->fromsnap && strcmp(snapname, sd->fromsnap) == 0) || - (sd->parent_fromsnap_guid == 0 && sd->tosnap && - strcmp(snapname, sd->tosnap) == 0)) { - sd->parent_fromsnap_guid = guid; - } - - VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); - send_iterate_prop(zhp, nv); - VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); - nvlist_free(nv); - - zfs_close(zhp); - return (0); -} - -static void -send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) -{ - nvpair_t *elem = NULL; - - while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) { - char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); - nvlist_t *propnv; - - if (!zfs_prop_user(propname)) { - /* - * Realistically, this should never happen. However, - * we want the ability to add DSL properties without - * needing to make incompatible version changes. We - * need to ignore unknown properties to allow older - * software to still send datasets containing these - * properties, with the unknown properties elided. - */ - if (prop == ZPROP_INVAL) - continue; - - if (zfs_prop_readonly(prop)) - continue; - } - - verify(nvpair_value_nvlist(elem, &propnv) == 0); - if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || - prop == ZFS_PROP_REFQUOTA || - prop == ZFS_PROP_REFRESERVATION) { - char *source; - uint64_t value; - verify(nvlist_lookup_uint64(propnv, - ZPROP_VALUE, &value) == 0); - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) - continue; - /* - * May have no source before SPA_VERSION_RECVD_PROPS, - * but is still modifiable. 
- */ - if (nvlist_lookup_string(propnv, - ZPROP_SOURCE, &source) == 0) { - if ((strcmp(source, zhp->zfs_name) != 0) && - (strcmp(source, - ZPROP_SOURCE_VAL_RECVD) != 0)) - continue; - } - } else { - char *source; - if (nvlist_lookup_string(propnv, - ZPROP_SOURCE, &source) != 0) - continue; - if ((strcmp(source, zhp->zfs_name) != 0) && - (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) - continue; - } - - if (zfs_prop_user(propname) || - zfs_prop_get_type(prop) == PROP_TYPE_STRING) { - char *value; - verify(nvlist_lookup_string(propnv, - ZPROP_VALUE, &value) == 0); - VERIFY(0 == nvlist_add_string(nv, propname, value)); - } else { - uint64_t value; - verify(nvlist_lookup_uint64(propnv, - ZPROP_VALUE, &value) == 0); - VERIFY(0 == nvlist_add_uint64(nv, propname, value)); - } - } -} - -/* - * returns snapshot creation txg - * and returns 0 if the snapshot does not exist - */ -static uint64_t -get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) -{ - char name[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t txg = 0; - - if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') - return (txg); - - (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); - if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { - zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); - if (zhp != NULL) { - txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); - zfs_close(zhp); - } - } - - return (txg); -} - -/* - * recursively generate nvlists describing datasets. See comment - * for the data structure send_data_t above for description of contents - * of the nvlist. 
- */ -static int -send_iterate_fs(zfs_handle_t *zhp, void *arg) -{ - send_data_t *sd = arg; - nvlist_t *nvfs, *nv; - int rv = 0; - uint64_t min_txg = 0, max_txg = 0; - uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; - uint64_t fromsnap_txg_save = sd->fromsnap_txg; - uint64_t tosnap_txg_save = sd->tosnap_txg; - uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; - uint64_t guid = zhp->zfs_dmustats.dds_guid; - uint64_t fromsnap_txg, tosnap_txg; - char guidstring[64]; - - fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); - if (fromsnap_txg != 0) - sd->fromsnap_txg = fromsnap_txg; - - tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); - if (tosnap_txg != 0) - sd->tosnap_txg = tosnap_txg; - - /* - * on the send side, if the current dataset does not have tosnap, - * perform two additional checks: - * - * - skip sending the current dataset if it was created later than - * the parent tosnap - * - return error if the current dataset was created earlier than - * the parent tosnap - */ - if (sd->tosnap != NULL && tosnap_txg == 0) { - if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { - if (sd->verbose) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "skipping dataset %s: snapshot %s does " - "not exist\n"), zhp->zfs_name, sd->tosnap); - } - } else { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "cannot send %s@%s%s: snapshot %s@%s does not " - "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? 
- dgettext(TEXT_DOMAIN, " recursively") : "", - zhp->zfs_name, sd->tosnap); - rv = -1; - } - goto out; - } - - nvfs = fnvlist_alloc(); - fnvlist_add_string(nvfs, "name", zhp->zfs_name); - fnvlist_add_uint64(nvfs, "parentfromsnap", - sd->parent_fromsnap_guid); - - if (zhp->zfs_dmustats.dds_origin[0]) { - zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, - zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); - if (origin == NULL) { - rv = -1; - goto out; - } - VERIFY(0 == nvlist_add_uint64(nvfs, "origin", - origin->zfs_dmustats.dds_guid)); - } - - /* iterate over props */ - VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); - send_iterate_prop(zhp, nv); - VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv)); - nvlist_free(nv); - - /* iterate over snaps, and set sd->parent_fromsnap_guid */ - if (!sd->replicate && fromsnap_txg != 0) - min_txg = fromsnap_txg; - if (!sd->replicate && tosnap_txg != 0) - max_txg = tosnap_txg; - sd->parent_fromsnap_guid = 0; - VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0)); - VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0)); - (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd, - min_txg, max_txg); - VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps)); - VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops)); - fnvlist_free(sd->parent_snaps); - fnvlist_free(sd->snapprops); - - /* add this fs to nvlist */ - (void) snprintf(guidstring, sizeof (guidstring), - "0x%llx", (longlong_t)guid); - VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs)); - nvlist_free(nvfs); - - /* iterate over children */ - if (sd->recursive) - rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); - -out: - sd->parent_fromsnap_guid = parent_fromsnap_guid_save; - sd->fromsnap_txg = fromsnap_txg_save; - sd->tosnap_txg = tosnap_txg_save; - - zfs_close(zhp); - return (rv); -} - -static int -gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, - const char *tosnap, boolean_t recursive, boolean_t 
verbose, - boolean_t replicate, nvlist_t **nvlp, avl_tree_t **avlp) -{ - zfs_handle_t *zhp; - int error; - uint64_t min_txg = 0, max_txg = 0; - send_data_t sd = { 0 }; - - zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - return (EZFS_BADTYPE); - - VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); - sd.fsname = fsname; - sd.fromsnap = fromsnap; - sd.tosnap = tosnap; - sd.recursive = recursive; - sd.verbose = verbose; - sd.replicate = replicate; - - if ((error = send_iterate_fs(zhp, &sd)) != 0) { - nvlist_free(sd.fss); - if (avlp != NULL) - *avlp = NULL; - *nvlp = NULL; - return (error); - } - - if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { - nvlist_free(sd.fss); - *nvlp = NULL; - return (EZFS_NOMEM); - } - - *nvlp = sd.fss; - return (0); -} - -/* - * Routines specific to "zfs send" - */ -typedef struct send_dump_data { - /* these are all just the short snapname (the part after the @) */ - const char *fromsnap; - const char *tosnap; - char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t prevsnap_obj; - boolean_t seenfrom, seento, replicate, doall, fromorigin; - boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; - boolean_t progressastitle; - boolean_t large_block, compress; - int outfd; - boolean_t err; - nvlist_t *fss; - nvlist_t *snapholds; - avl_tree_t *fsavl; - snapfilter_cb_t *filter_cb; - void *filter_cb_arg; - nvlist_t *debugnv; - char holdtag[ZFS_MAX_DATASET_NAME_LEN]; - int cleanup_fd; - uint64_t size; -} send_dump_data_t; - -static int -zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from, - enum lzc_send_flags flags, uint64_t *spacep) -{ - libzfs_handle_t *hdl = zhp->zfs_hdl; - int error; - - assert(snapname != NULL); - error = lzc_send_space(snapname, from, flags, spacep); - - if (error != 0) { - char errbuf[1024]; - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "warning: cannot estimate space for '%s'"), snapname); - - switch (error) { - 
case EXDEV: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "not an earlier snapshot from the same fs")); - return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); - - case ENOENT: - if (zfs_dataset_exists(hdl, snapname, - ZFS_TYPE_SNAPSHOT)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "incremental source (%s) does not exist"), - snapname); - } - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - - case EDQUOT: - case EFBIG: - case EIO: - case ENOLINK: - case ENOSPC: - case ENXIO: - case EPIPE: - case ERANGE: - case EFAULT: - case EROFS: - case EINVAL: - zfs_error_aux(hdl, strerror(error)); - return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - - default: - return (zfs_standard_error(hdl, error, errbuf)); - } - } - - return (0); -} - -/* - * Dumps a backup of the given snapshot (incremental from fromsnap if it's not - * NULL) to the file descriptor specified by outfd. - */ -static int -dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, - boolean_t fromorigin, int outfd, enum lzc_send_flags flags, - nvlist_t *debugnv) -{ - zfs_cmd_t zc = { 0 }; - libzfs_handle_t *hdl = zhp->zfs_hdl; - nvlist_t *thisdbg; - - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - assert(fromsnap_obj == 0 || !fromorigin); - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - zc.zc_cookie = outfd; - zc.zc_obj = fromorigin; - zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); - zc.zc_fromobj = fromsnap_obj; - zc.zc_flags = flags; - - VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); - if (fromsnap && fromsnap[0] != '\0') { - VERIFY(0 == nvlist_add_string(thisdbg, - "fromsnap", fromsnap)); - } - - if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { - char errbuf[1024]; - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "warning: cannot send '%s'"), zhp->zfs_name); - - VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); - if (debugnv) { - VERIFY(0 == nvlist_add_nvlist(debugnv, - zhp->zfs_name, thisdbg)); - } - nvlist_free(thisdbg); - 
- switch (errno) { - case EXDEV: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "not an earlier snapshot from the same fs")); - return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); - - case ENOENT: - if (zfs_dataset_exists(hdl, zc.zc_name, - ZFS_TYPE_SNAPSHOT)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "incremental source (@%s) does not exist"), - zc.zc_value); - } - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - - case EDQUOT: - case EFBIG: - case EIO: - case ENOLINK: - case ENOSPC: -#ifdef illumos - case ENOSTR: -#endif - case ENXIO: - case EPIPE: - case ERANGE: - case EFAULT: - case EROFS: - zfs_error_aux(hdl, strerror(errno)); - return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - - default: - return (zfs_standard_error(hdl, errno, errbuf)); - } - } - - if (debugnv) - VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); - nvlist_free(thisdbg); - - return (0); -} - -static void -gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) -{ - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - - /* - * zfs_send() only sets snapholds for sends that need them, - * e.g. replication and doall. - */ - if (sdd->snapholds == NULL) - return; - - fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); -} - -static void * -send_progress_thread(void *arg) -{ - progress_arg_t *pa = arg; - zfs_cmd_t zc = { 0 }; - zfs_handle_t *zhp = pa->pa_zhp; - libzfs_handle_t *hdl = zhp->zfs_hdl; - unsigned long long bytes, total; - char buf[16]; - time_t t; - struct tm *tm; - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (!pa->pa_parsable && !pa->pa_astitle) - (void) fprintf(stderr, "TIME SENT SNAPSHOT\n"); - - /* - * Print the progress from ZFS_IOC_SEND_PROGRESS every second. 
- */ - for (;;) { - (void) sleep(1); - - zc.zc_cookie = pa->pa_fd; - if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) - return ((void *)-1); - - (void) time(&t); - tm = localtime(&t); - bytes = zc.zc_cookie; - - if (pa->pa_astitle) { - int pct; - if (pa->pa_size > bytes) - pct = 100 * bytes / pa->pa_size; - else - pct = 100; - - setproctitle("sending %s (%d%%: %llu/%llu)", - zhp->zfs_name, pct, bytes, pa->pa_size); - } else if (pa->pa_parsable) { - (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", - tm->tm_hour, tm->tm_min, tm->tm_sec, - bytes, zhp->zfs_name); - } else { - zfs_nicenum(bytes, buf, sizeof (buf)); - (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", - tm->tm_hour, tm->tm_min, tm->tm_sec, - buf, zhp->zfs_name); - } - } -} - -static void -send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, - uint64_t size, boolean_t parsable) -{ - if (parsable) { - if (fromsnap != NULL) { - (void) fprintf(fout, "incremental\t%s\t%s", - fromsnap, tosnap); - } else { - (void) fprintf(fout, "full\t%s", - tosnap); - } - } else { - if (fromsnap != NULL) { - if (strchr(fromsnap, '@') == NULL && - strchr(fromsnap, '#') == NULL) { - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "send from @%s to %s"), - fromsnap, tosnap); - } else { - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "send from %s to %s"), - fromsnap, tosnap); - } - } else { - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "full send of %s"), - tosnap); - } - } - - if (parsable) { - (void) fprintf(fout, "\t%llu", - (longlong_t)size); - } else if (size != 0) { - char buf[16]; - zfs_nicenum(size, buf, sizeof (buf)); - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - " estimated size is %s"), buf); - } - (void) fprintf(fout, "\n"); -} - -static int -dump_snapshot(zfs_handle_t *zhp, void *arg) -{ - send_dump_data_t *sdd = arg; - progress_arg_t pa = { 0 }; - pthread_t tid; - char *thissnap; - enum lzc_send_flags flags = 0; - int err; - boolean_t isfromsnap, istosnap, fromorigin; - boolean_t exclude = 
B_FALSE; - FILE *fout = sdd->std_out ? stdout : stderr; - - err = 0; - thissnap = strchr(zhp->zfs_name, '@') + 1; - isfromsnap = (sdd->fromsnap != NULL && - strcmp(sdd->fromsnap, thissnap) == 0); - - if (!sdd->seenfrom && isfromsnap) { - gather_holds(zhp, sdd); - sdd->seenfrom = B_TRUE; - (void) strcpy(sdd->prevsnap, thissnap); - sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); - zfs_close(zhp); - return (0); - } - - if (sdd->seento || !sdd->seenfrom) { - zfs_close(zhp); - return (0); - } - - istosnap = (strcmp(sdd->tosnap, thissnap) == 0); - if (istosnap) - sdd->seento = B_TRUE; - - if (sdd->large_block) - flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (sdd->embed_data) - flags |= LZC_SEND_FLAG_EMBED_DATA; - if (sdd->compress) - flags |= LZC_SEND_FLAG_COMPRESS; - - if (!sdd->doall && !isfromsnap && !istosnap) { - if (sdd->replicate) { - char *snapname; - nvlist_t *snapprops; - /* - * Filter out all intermediate snapshots except origin - * snapshots needed to replicate clones. - */ - nvlist_t *nvfs = fsavl_find(sdd->fsavl, - zhp->zfs_dmustats.dds_guid, &snapname); - - VERIFY(0 == nvlist_lookup_nvlist(nvfs, - "snapprops", &snapprops)); - VERIFY(0 == nvlist_lookup_nvlist(snapprops, - thissnap, &snapprops)); - exclude = !nvlist_exists(snapprops, "is_clone_origin"); - } else { - exclude = B_TRUE; - } - } - - /* - * If a filter function exists, call it to determine whether - * this snapshot will be sent. - */ - if (exclude || (sdd->filter_cb != NULL && - sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { - /* - * This snapshot is filtered out. Don't send it, and don't - * set prevsnap_obj, so it will be as if this snapshot didn't - * exist, and the next accepted snapshot will be sent as - * an incremental from the last accepted one, or as the - * first (and full) snapshot in the case of a replication, - * non-incremental send. 
- */ - zfs_close(zhp); - return (0); - } - - gather_holds(zhp, sdd); - fromorigin = sdd->prevsnap[0] == '\0' && - (sdd->fromorigin || sdd->replicate); - - if (sdd->verbose || sdd->progress) { - uint64_t size = 0; - char fromds[ZFS_MAX_DATASET_NAME_LEN]; - - if (sdd->prevsnap[0] != '\0') { - (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); - *(strchr(fromds, '@') + 1) = '\0'; - (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); - } - if (zfs_send_space(zhp, zhp->zfs_name, - sdd->prevsnap[0] ? fromds : NULL, flags, &size) != 0) { - size = 0; /* cannot estimate send space */ - } else { - send_print_verbose(fout, zhp->zfs_name, - sdd->prevsnap[0] ? sdd->prevsnap : NULL, - size, sdd->parsable); - } - sdd->size += size; - } - - if (!sdd->dryrun) { - /* - * If progress reporting is requested, spawn a new thread to - * poll ZFS_IOC_SEND_PROGRESS at a regular interval. - */ - if (sdd->progress) { - pa.pa_zhp = zhp; - pa.pa_fd = sdd->outfd; - pa.pa_parsable = sdd->parsable; - pa.pa_size = sdd->size; - pa.pa_astitle = sdd->progressastitle; - - if ((err = pthread_create(&tid, NULL, - send_progress_thread, &pa)) != 0) { - zfs_close(zhp); - return (err); - } - } - - err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, - fromorigin, sdd->outfd, flags, sdd->debugnv); - - if (sdd->progress) { - (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); - } - } - - (void) strcpy(sdd->prevsnap, thissnap); - sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); - zfs_close(zhp); - return (err); -} - -static int -dump_filesystem(zfs_handle_t *zhp, void *arg) -{ - int rv = 0; - uint64_t min_txg = 0, max_txg = 0; - send_dump_data_t *sdd = arg; - boolean_t missingfrom = B_FALSE; - zfs_cmd_t zc = { 0 }; - - (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", - zhp->zfs_name, sdd->tosnap); - if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "WARNING: could not send %s@%s: does not exist\n"), 
- zhp->zfs_name, sdd->tosnap); - sdd->err = B_TRUE; - return (0); - } - - if (sdd->replicate && sdd->fromsnap) { - /* - * If this fs does not have fromsnap, and we're doing - * recursive, we need to send a full stream from the - * beginning (or an incremental from the origin if this - * is a clone). If we're doing non-recursive, then let - * them get the error. - */ - (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", - zhp->zfs_name, sdd->fromsnap); - if (ioctl(zhp->zfs_hdl->libzfs_fd, - ZFS_IOC_OBJSET_STATS, &zc) != 0) { - missingfrom = B_TRUE; - } - } - - sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; - sdd->prevsnap_obj = 0; - if (sdd->fromsnap == NULL || missingfrom) - sdd->seenfrom = B_TRUE; - - if (!sdd->replicate && sdd->fromsnap != NULL) - min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, - sdd->fromsnap); - if (!sdd->replicate && sdd->tosnap != NULL) - max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, - sdd->tosnap); - - rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg, - min_txg, max_txg); - if (!sdd->seenfrom) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) does not exist\n"), - zhp->zfs_name, sdd->tosnap, - zhp->zfs_name, sdd->fromsnap); - sdd->err = B_TRUE; - } else if (!sdd->seento) { - if (sdd->fromsnap) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) " - "is not earlier than it\n"), - zhp->zfs_name, sdd->tosnap, - zhp->zfs_name, sdd->fromsnap); - } else { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "WARNING: " - "could not send %s@%s: does not exist\n"), - zhp->zfs_name, sdd->tosnap); - } - sdd->err = B_TRUE; - } - - return (rv); -} - -static int -dump_filesystems(zfs_handle_t *rzhp, void *arg) -{ - send_dump_data_t *sdd = arg; - nvpair_t *fspair; - boolean_t needagain, progress; - - if (!sdd->replicate) - return (dump_filesystem(rzhp, sdd)); - - /* Mark the clone origin snapshots. 
*/ - for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; - fspair = nvlist_next_nvpair(sdd->fss, fspair)) { - nvlist_t *nvfs; - uint64_t origin_guid = 0; - - VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); - (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); - if (origin_guid != 0) { - char *snapname; - nvlist_t *origin_nv = fsavl_find(sdd->fsavl, - origin_guid, &snapname); - if (origin_nv != NULL) { - nvlist_t *snapprops; - VERIFY(0 == nvlist_lookup_nvlist(origin_nv, - "snapprops", &snapprops)); - VERIFY(0 == nvlist_lookup_nvlist(snapprops, - snapname, &snapprops)); - VERIFY(0 == nvlist_add_boolean( - snapprops, "is_clone_origin")); - } - } - } -again: - needagain = progress = B_FALSE; - for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; - fspair = nvlist_next_nvpair(sdd->fss, fspair)) { - nvlist_t *fslist, *parent_nv; - char *fsname; - zfs_handle_t *zhp; - int err; - uint64_t origin_guid = 0; - uint64_t parent_guid = 0; - - VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); - if (nvlist_lookup_boolean(fslist, "sent") == 0) - continue; - - VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); - (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); - (void) nvlist_lookup_uint64(fslist, "parentfromsnap", - &parent_guid); - - if (parent_guid != 0) { - parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); - if (!nvlist_exists(parent_nv, "sent")) { - /* parent has not been sent; skip this one */ - needagain = B_TRUE; - continue; - } - } - - if (origin_guid != 0) { - nvlist_t *origin_nv = fsavl_find(sdd->fsavl, - origin_guid, NULL); - if (origin_nv != NULL && - !nvlist_exists(origin_nv, "sent")) { - /* - * origin has not been sent yet; - * skip this clone. 
- */ - needagain = B_TRUE; - continue; - } - } - - zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); - if (zhp == NULL) - return (-1); - err = dump_filesystem(zhp, sdd); - VERIFY(nvlist_add_boolean(fslist, "sent") == 0); - progress = B_TRUE; - zfs_close(zhp); - if (err) - return (err); - } - if (needagain) { - assert(progress); - goto again; - } - - /* clean out the sent flags in case we reuse this fss */ - for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; - fspair = nvlist_next_nvpair(sdd->fss, fspair)) { - nvlist_t *fslist; - - VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); - (void) nvlist_remove_all(fslist, "sent"); - } - - return (0); -} - -nvlist_t * -zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) -{ - unsigned int version; - int nread; - unsigned long long checksum, packed_len; - - /* - * Decode token header, which is: - * -- - * Note that the only supported token version is 1. - */ - nread = sscanf(token, "%u-%llx-%llx-", - &version, &checksum, &packed_len); - if (nread != 3) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "resume token is corrupt (invalid format)")); - return (NULL); - } - - if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "resume token is corrupt (invalid version %u)"), - version); - return (NULL); - } - - /* convert hexadecimal representation to binary */ - token = strrchr(token, '-') + 1; - int len = strlen(token) / 2; - unsigned char *compressed = zfs_alloc(hdl, len); - for (int i = 0; i < len; i++) { - nread = sscanf(token + i * 2, "%2hhx", compressed + i); - if (nread != 1) { - free(compressed); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "resume token is corrupt " - "(payload is not hex-encoded)")); - return (NULL); - } - } - - /* verify checksum */ - zio_cksum_t cksum; - fletcher_4_native(compressed, len, NULL, &cksum); - if (cksum.zc_word[0] != checksum) { - free(compressed); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "resume token is corrupt 
(incorrect checksum)")); - return (NULL); - } - - /* uncompress */ - void *packed = zfs_alloc(hdl, packed_len); - uLongf packed_len_long = packed_len; - if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || - packed_len_long != packed_len) { - free(packed); - free(compressed); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "resume token is corrupt (decompression failed)")); - return (NULL); - } - - /* unpack nvlist */ - nvlist_t *nv; - int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); - free(packed); - free(compressed); - if (error != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "resume token is corrupt (nvlist_unpack failed)")); - return (NULL); - } - return (nv); -} - -int -zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, - const char *resume_token) -{ - char errbuf[1024]; - char *toname; - char *fromname = NULL; - uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; - zfs_handle_t *zhp; - int error = 0; - char name[ZFS_MAX_DATASET_NAME_LEN]; - enum lzc_send_flags lzc_flags = 0; - uint64_t size = 0; - FILE *fout = (flags->verbose && flags->dryrun) ? 
stdout : stderr; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot resume send")); - - nvlist_t *resume_nvl = - zfs_send_resume_token_to_nvlist(hdl, resume_token); - if (resume_nvl == NULL) { - /* - * zfs_error_aux has already been set by - * zfs_send_resume_token_to_nvlist - */ - return (zfs_error(hdl, EZFS_FAULT, errbuf)); - } - if (flags->verbose) { - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "resume token contents:\n")); - nvlist_print(fout, resume_nvl); - } - - if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || - nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || - nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || - nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || - nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "resume token is corrupt")); - return (zfs_error(hdl, EZFS_FAULT, errbuf)); - } - fromguid = 0; - (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); - - if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) - lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) - lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; - if (flags->compress || nvlist_exists(resume_nvl, "compressok")) - lzc_flags |= LZC_SEND_FLAG_COMPRESS; - - if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { - if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' is no longer the same snapshot used in " - "the initial send"), toname); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' used in the initial send no longer exists"), - toname); - } - return (zfs_error(hdl, EZFS_BADPATH, errbuf)); - } - zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); - if (zhp == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "unable to access '%s'"), name); - return (zfs_error(hdl, EZFS_BADPATH, errbuf)); - } - - if (fromguid != 0) { - 
if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "incremental source %#llx no longer exists"), - (longlong_t)fromguid); - return (zfs_error(hdl, EZFS_BADPATH, errbuf)); - } - fromname = name; - } - - if (flags->progress || flags->verbose) { - error = lzc_send_space(zhp->zfs_name, fromname, - lzc_flags, &size); - if (error == 0) - size = MAX(0, (int64_t)(size - bytes)); - } - if (flags->verbose) { - send_print_verbose(fout, zhp->zfs_name, fromname, - size, flags->parsable); - } - - if (!flags->dryrun) { - progress_arg_t pa = { 0 }; - pthread_t tid; - /* - * If progress reporting is requested, spawn a new thread to - * poll ZFS_IOC_SEND_PROGRESS at a regular interval. - */ - if (flags->progress) { - pa.pa_zhp = zhp; - pa.pa_fd = outfd; - pa.pa_parsable = flags->parsable; - pa.pa_size = size; - pa.pa_astitle = flags->progressastitle; - - error = pthread_create(&tid, NULL, - send_progress_thread, &pa); - if (error != 0) { - zfs_close(zhp); - return (error); - } - } - - error = lzc_send_resume(zhp->zfs_name, fromname, outfd, - lzc_flags, resumeobj, resumeoff); - - if (flags->progress) { - (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); - } - - char errbuf[1024]; - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "warning: cannot send '%s'"), zhp->zfs_name); - - zfs_close(zhp); - - switch (error) { - case 0: - return (0); - case EXDEV: - case ENOENT: - case EDQUOT: - case EFBIG: - case EIO: - case ENOLINK: - case ENOSPC: -#ifdef illumos - case ENOSTR: -#endif - case ENXIO: - case EPIPE: - case ERANGE: - case EFAULT: - case EROFS: - zfs_error_aux(hdl, strerror(errno)); - return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - - default: - return (zfs_standard_error(hdl, errno, errbuf)); - } - } - - - zfs_close(zhp); - - return (error); -} - -/* - * Generate a send stream for the dataset identified by the argument zhp. 
- * - * The content of the send stream is the snapshot identified by - * 'tosnap'. Incremental streams are requested in two ways: - * - from the snapshot identified by "fromsnap" (if non-null) or - * - from the origin of the dataset identified by zhp, which must - * be a clone. In this case, "fromsnap" is null and "fromorigin" - * is TRUE. - * - * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and - * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) - * if "replicate" is set. If "doall" is set, dump all the intermediate - * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" - * case too. If "props" is set, send properties. - */ -int -zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, - void *cb_arg, nvlist_t **debugnvp) -{ - char errbuf[1024]; - send_dump_data_t sdd = { 0 }; - int err = 0; - nvlist_t *fss = NULL; - avl_tree_t *fsavl = NULL; - static uint64_t holdseq; - int spa_version; - pthread_t tid = 0; - int pipefd[2]; - dedup_arg_t dda = { 0 }; - int featureflags = 0; - FILE *fout; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot send '%s'"), zhp->zfs_name); - - if (fromsnap && fromsnap[0] == '\0') { - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "zero-length incremental source")); - return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); - } - - if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { - uint64_t version; - version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - if (version >= ZPL_VERSION_SA) { - featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; - } - } - - if (flags->dedup && !flags->dryrun) { - featureflags |= (DMU_BACKUP_FEATURE_DEDUP | - DMU_BACKUP_FEATURE_DEDUPPROPS); - if ((err = pipe(pipefd)) != 0) { - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); - return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, - errbuf)); - } - dda.outputfd = outfd; - dda.inputfd = pipefd[1]; - dda.dedup_hdl = zhp->zfs_hdl; 
- if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) { - (void) close(pipefd[0]); - (void) close(pipefd[1]); - zfs_error_aux(zhp->zfs_hdl, strerror(errno)); - return (zfs_error(zhp->zfs_hdl, - EZFS_THREADCREATEFAILED, errbuf)); - } - } - - if (flags->replicate || flags->doall || flags->props) { - dmu_replay_record_t drr = { 0 }; - char *packbuf = NULL; - size_t buflen = 0; - zio_cksum_t zc = { 0 }; - - if (flags->replicate || flags->props) { - nvlist_t *hdrnv; - - VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); - if (fromsnap) { - VERIFY(0 == nvlist_add_string(hdrnv, - "fromsnap", fromsnap)); - } - VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); - if (!flags->replicate) { - VERIFY(0 == nvlist_add_boolean(hdrnv, - "not_recursive")); - } - - err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, - fromsnap, tosnap, flags->replicate, flags->verbose, - flags->replicate, &fss, &fsavl); - if (err) - goto err_out; - VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); - err = nvlist_pack(hdrnv, &packbuf, &buflen, - NV_ENCODE_XDR, 0); - if (debugnvp) - *debugnvp = hdrnv; - else - nvlist_free(hdrnv); - if (err) - goto stderr_out; - } - - if (!flags->dryrun) { - /* write first begin record */ - drr.drr_type = DRR_BEGIN; - drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. - drr_versioninfo, DMU_COMPOUNDSTREAM); - DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. 
- drr_versioninfo, featureflags); - (void) snprintf(drr.drr_u.drr_begin.drr_toname, - sizeof (drr.drr_u.drr_begin.drr_toname), - "%s@%s", zhp->zfs_name, tosnap); - drr.drr_payloadlen = buflen; - - err = dump_record(&drr, packbuf, buflen, &zc, outfd); - free(packbuf); - if (err != 0) - goto stderr_out; - - /* write end record */ - bzero(&drr, sizeof (drr)); - drr.drr_type = DRR_END; - drr.drr_u.drr_end.drr_checksum = zc; - err = write(outfd, &drr, sizeof (drr)); - if (err == -1) { - err = errno; - goto stderr_out; - } - - err = 0; - } - } - - /* dump each stream */ - sdd.fromsnap = fromsnap; - sdd.tosnap = tosnap; - if (tid != 0) - sdd.outfd = pipefd[0]; - else - sdd.outfd = outfd; - sdd.replicate = flags->replicate; - sdd.doall = flags->doall; - sdd.fromorigin = flags->fromorigin; - sdd.fss = fss; - sdd.fsavl = fsavl; - sdd.verbose = flags->verbose; - sdd.parsable = flags->parsable; - sdd.progress = flags->progress; - sdd.progressastitle = flags->progressastitle; - sdd.dryrun = flags->dryrun; - sdd.large_block = flags->largeblock; - sdd.embed_data = flags->embed_data; - sdd.compress = flags->compress; - sdd.filter_cb = filter_func; - sdd.filter_cb_arg = cb_arg; - if (debugnvp) - sdd.debugnv = *debugnvp; - if (sdd.verbose && sdd.dryrun) - sdd.std_out = B_TRUE; - fout = sdd.std_out ? stdout : stderr; - - /* - * Some flags require that we place user holds on the datasets that are - * being sent so they don't get destroyed during the send. We can skip - * this step if the pool is imported read-only since the datasets cannot - * be destroyed. 
- */ - if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), - ZPOOL_PROP_READONLY, NULL) && - zfs_spa_version(zhp, &spa_version) == 0 && - spa_version >= SPA_VERSION_USERREFS && - (flags->doall || flags->replicate)) { - ++holdseq; - (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), - ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); - sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); - if (sdd.cleanup_fd < 0) { - err = errno; - goto stderr_out; - } - sdd.snapholds = fnvlist_alloc(); - } else { - sdd.cleanup_fd = -1; - sdd.snapholds = NULL; - } - if (flags->progress || flags->verbose || sdd.snapholds != NULL) { - /* - * Do a verbose no-op dry run to get all the verbose output - * or to gather snapshot hold's before generating any data, - * then do a non-verbose real run to generate the streams. - */ - sdd.dryrun = B_TRUE; - err = dump_filesystems(zhp, &sdd); - - if (err != 0) - goto stderr_out; - - if (flags->verbose) { - if (flags->parsable) { - (void) fprintf(fout, "size\t%llu\n", - (longlong_t)sdd.size); - } else { - char buf[16]; - zfs_nicenum(sdd.size, buf, sizeof (buf)); - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "total estimated size is %s\n"), buf); - } - } - - /* Ensure no snaps found is treated as an error. */ - if (!sdd.seento) { - err = ENOENT; - goto err_out; - } - - /* Skip the second run if dryrun was requested. */ - if (flags->dryrun) - goto err_out; - - if (sdd.snapholds != NULL) { - err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); - if (err != 0) - goto stderr_out; - - fnvlist_free(sdd.snapholds); - sdd.snapholds = NULL; - } - - sdd.dryrun = B_FALSE; - sdd.verbose = B_FALSE; - } - - err = dump_filesystems(zhp, &sdd); - fsavl_destroy(fsavl); - nvlist_free(fss); - - /* Ensure no snaps found is treated as an error. 
*/ - if (err == 0 && !sdd.seento) - err = ENOENT; - - if (tid != 0) { - if (err != 0) - (void) pthread_cancel(tid); - (void) close(pipefd[0]); - (void) pthread_join(tid, NULL); - } - - if (sdd.cleanup_fd != -1) { - VERIFY(0 == close(sdd.cleanup_fd)); - sdd.cleanup_fd = -1; - } - - if (!flags->dryrun && (flags->replicate || flags->doall || - flags->props)) { - /* - * write final end record. NB: want to do this even if - * there was some error, because it might not be totally - * failed. - */ - dmu_replay_record_t drr = { 0 }; - drr.drr_type = DRR_END; - if (write(outfd, &drr, sizeof (drr)) == -1) { - return (zfs_standard_error(zhp->zfs_hdl, - errno, errbuf)); - } - } - - return (err || sdd.err); - -stderr_out: - err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); -err_out: - fsavl_destroy(fsavl); - nvlist_free(fss); - fnvlist_free(sdd.snapholds); - - if (sdd.cleanup_fd != -1) - VERIFY(0 == close(sdd.cleanup_fd)); - if (tid != 0) { - (void) pthread_cancel(tid); - (void) close(pipefd[0]); - (void) pthread_join(tid, NULL); - } - return (err); -} - -int -zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) -{ - int err = 0; - libzfs_handle_t *hdl = zhp->zfs_hdl; - enum lzc_send_flags lzc_flags = 0; - FILE *fout = (flags.verbose && flags.dryrun) ? 
stdout : stderr; - char errbuf[1024]; - - if (flags.largeblock) - lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (flags.embed_data) - lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; - if (flags.compress) - lzc_flags |= LZC_SEND_FLAG_COMPRESS; - - if (flags.verbose) { - uint64_t size = 0; - err = lzc_send_space(zhp->zfs_name, from, lzc_flags, &size); - if (err == 0) { - send_print_verbose(fout, zhp->zfs_name, from, size, - flags.parsable); - if (flags.parsable) { - (void) fprintf(fout, "size\t%llu\n", - (longlong_t)size); - } else { - char buf[16]; - zfs_nicenum(size, buf, sizeof (buf)); - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - "total estimated size is %s\n"), buf); - } - } else { - (void) fprintf(stderr, "Cannot estimate send size: " - "%s\n", strerror(errno)); - } - } - - if (flags.dryrun) - return (err); - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "warning: cannot send '%s'"), zhp->zfs_name); - - err = lzc_send(zhp->zfs_name, from, fd, lzc_flags); - if (err != 0) { - switch (errno) { - case EXDEV: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "not an earlier snapshot from the same fs")); - return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); - - case ENOENT: - case ESRCH: - if (lzc_exists(zhp->zfs_name)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "incremental source (%s) does not exist"), - from); - } - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - - case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "target is busy; if a filesystem, " - "it must not be mounted")); - return (zfs_error(hdl, EZFS_BUSY, errbuf)); - - case EDQUOT: - case EFBIG: - case EIO: - case ENOLINK: - case ENOSPC: -#ifdef illumos - case ENOSTR: -#endif - case ENXIO: - case EPIPE: - case ERANGE: - case EFAULT: - case EROFS: - zfs_error_aux(hdl, strerror(errno)); - return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - - default: - return (zfs_standard_error(hdl, errno, errbuf)); - } - } - return (err != 0); -} - -/* - * Routines specific to "zfs recv" - */ - -static int 
-recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, - boolean_t byteswap, zio_cksum_t *zc) -{ - char *cp = buf; - int rv; - int len = ilen; - - assert(ilen <= SPA_MAXBLOCKSIZE); - - do { - rv = read(fd, cp, len); - cp += rv; - len -= rv; - } while (rv > 0); - - if (rv < 0 || len != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "failed to read from stream")); - return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, - "cannot receive"))); - } - - if (zc) { - if (byteswap) - (void) fletcher_4_incremental_byteswap(buf, ilen, zc); - else - (void) fletcher_4_incremental_native(buf, ilen, zc); - } - return (0); -} - -static int -recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, - boolean_t byteswap, zio_cksum_t *zc) -{ - char *buf; - int err; - - buf = zfs_alloc(hdl, len); - if (buf == NULL) - return (ENOMEM); - - err = recv_read(hdl, fd, buf, len, byteswap, zc); - if (err != 0) { - free(buf); - return (err); - } - - err = nvlist_unpack(buf, len, nvp, 0); - free(buf); - if (err != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "stream (malformed nvlist)")); - return (EINVAL); - } - return (0); -} - -static int -recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, - int baselen, char *newname, recvflags_t *flags) -{ - static int seq; - int err; - prop_changelist_t *clp; - zfs_handle_t *zhp; - - zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); - if (zhp == NULL) - return (-1); - clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - flags->force ? 
MS_FORCE : 0); - zfs_close(zhp); - if (clp == NULL) - return (-1); - err = changelist_prefix(clp); - if (err) - return (err); - - if (tryname) { - (void) strcpy(newname, tryname); - if (flags->verbose) { - (void) printf("attempting rename %s to %s\n", - name, newname); - } - err = lzc_rename(name, newname); - if (err == 0) - changelist_rename(clp, name, tryname); - } else { - err = ENOENT; - } - - if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { - seq++; - - (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, - "%.*srecv-%u-%u", baselen, name, getpid(), seq); - if (flags->verbose) { - (void) printf("failed - trying rename %s to %s\n", - name, newname); - } - err = lzc_rename(name, newname); - if (err == 0) - changelist_rename(clp, name, newname); - if (err && flags->verbose) { - (void) printf("failed (%u) - " - "will try again on next pass\n", errno); - } - err = EAGAIN; - } else if (flags->verbose) { - if (err == 0) - (void) printf("success\n"); - else - (void) printf("failed (%u)\n", errno); - } - - (void) changelist_postfix(clp); - changelist_free(clp); - - return (err); -} - -static int -recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, - char *newname, recvflags_t *flags) -{ - int err = 0; - prop_changelist_t *clp; - zfs_handle_t *zhp; - boolean_t defer = B_FALSE; - int spa_version; - - zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); - if (zhp == NULL) - return (-1); - clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - flags->force ? 
MS_FORCE : 0); - if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && - zfs_spa_version(zhp, &spa_version) == 0 && - spa_version >= SPA_VERSION_USERREFS) - defer = B_TRUE; - zfs_close(zhp); - if (clp == NULL) - return (-1); - err = changelist_prefix(clp); - if (err) - return (err); - - if (flags->verbose) - (void) printf("attempting destroy %s\n", name); - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { - nvlist_t *nv = fnvlist_alloc(); - fnvlist_add_boolean(nv, name); - err = lzc_destroy_snaps(nv, defer, NULL); - fnvlist_free(nv); - } else { - err = lzc_destroy(name); - } - if (err == 0) { - if (flags->verbose) - (void) printf("success\n"); - changelist_remove(clp, name); - } - - (void) changelist_postfix(clp); - changelist_free(clp); - - /* - * Deferred destroy might destroy the snapshot or only mark it to be - * destroyed later, and it returns success in either case. - */ - if (err != 0 || (defer && zfs_dataset_exists(hdl, name, - ZFS_TYPE_SNAPSHOT))) { - err = recv_rename(hdl, name, NULL, baselen, newname, flags); - } - - return (err); -} - -typedef struct guid_to_name_data { - uint64_t guid; - boolean_t bookmark_ok; - char *name; - char *skip; -} guid_to_name_data_t; - -static int -guid_to_name_cb(zfs_handle_t *zhp, void *arg) -{ - guid_to_name_data_t *gtnd = arg; - const char *slash; - int err; - - if (gtnd->skip != NULL && - (slash = strrchr(zhp->zfs_name, '/')) != NULL && - strcmp(slash + 1, gtnd->skip) == 0) { - zfs_close(zhp); - return (0); - } - - if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { - (void) strcpy(gtnd->name, zhp->zfs_name); - zfs_close(zhp); - return (EEXIST); - } - - err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); - if (err != EEXIST && gtnd->bookmark_ok) - err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); - zfs_close(zhp); - return (err); -} - -/* - * Attempt to find the local dataset associated with this guid. 
In the case of - * multiple matches, we attempt to find the "best" match by searching - * progressively larger portions of the hierarchy. This allows one to send a - * tree of datasets individually and guarantee that we will find the source - * guid within that hierarchy, even if there are multiple matches elsewhere. - */ -static int -guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, - boolean_t bookmark_ok, char *name) -{ - char pname[ZFS_MAX_DATASET_NAME_LEN]; - guid_to_name_data_t gtnd; - - gtnd.guid = guid; - gtnd.bookmark_ok = bookmark_ok; - gtnd.name = name; - gtnd.skip = NULL; - - /* - * Search progressively larger portions of the hierarchy, starting - * with the filesystem specified by 'parent'. This will - * select the "most local" version of the origin snapshot in the case - * that there are multiple matching snapshots in the system. - */ - (void) strlcpy(pname, parent, sizeof (pname)); - char *cp = strrchr(pname, '@'); - if (cp == NULL) - cp = strchr(pname, '\0'); - for (; cp != NULL; cp = strrchr(pname, '/')) { - /* Chop off the last component and open the parent */ - *cp = '\0'; - zfs_handle_t *zhp = make_dataset_handle(hdl, pname); - - if (zhp == NULL) - continue; - int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); - if (err != EEXIST) - err = zfs_iter_children(zhp, guid_to_name_cb, >nd); - if (err != EEXIST && bookmark_ok) - err = zfs_iter_bookmarks(zhp, guid_to_name_cb, >nd); - zfs_close(zhp); - if (err == EEXIST) - return (0); - - /* - * Remember the last portion of the dataset so we skip it next - * time through (as we've already searched that portion of the - * hierarchy). - */ - gtnd.skip = strrchr(pname, '/') + 1; - } - - return (ENOENT); -} - -/* - * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if - * guid1 is after guid2. 
- */ -static int -created_before(libzfs_handle_t *hdl, avl_tree_t *avl, - uint64_t guid1, uint64_t guid2) -{ - nvlist_t *nvfs; - char *fsname, *snapname; - char buf[ZFS_MAX_DATASET_NAME_LEN]; - int rv; - zfs_handle_t *guid1hdl, *guid2hdl; - uint64_t create1, create2; - - if (guid2 == 0) - return (0); - if (guid1 == 0) - return (1); - - nvfs = fsavl_find(avl, guid1, &snapname); - VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); - (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); - guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); - if (guid1hdl == NULL) - return (-1); - - nvfs = fsavl_find(avl, guid2, &snapname); - VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); - (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); - guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); - if (guid2hdl == NULL) { - zfs_close(guid1hdl); - return (-1); - } - - create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); - create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); - - if (create1 < create2) - rv = -1; - else if (create1 > create2) - rv = +1; - else - rv = 0; - - zfs_close(guid1hdl); - zfs_close(guid2hdl); - - return (rv); -} - -static int -recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, - recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, - nvlist_t *renamed) -{ - nvlist_t *local_nv, *deleted = NULL; - avl_tree_t *local_avl; - nvpair_t *fselem, *nextfselem; - char *fromsnap; - char newname[ZFS_MAX_DATASET_NAME_LEN]; - char guidname[32]; - int error; - boolean_t needagain, progress, recursive; - char *s1, *s2; - - VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); - - recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == - ENOENT); - - if (flags->dryrun) - return (0); - -again: - needagain = progress = B_FALSE; - - VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); - - if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, - recursive, B_FALSE, B_FALSE, &local_nv, 
&local_avl)) != 0) - return (error); - - /* - * Process deletes and renames - */ - for (fselem = nvlist_next_nvpair(local_nv, NULL); - fselem; fselem = nextfselem) { - nvlist_t *nvfs, *snaps; - nvlist_t *stream_nvfs = NULL; - nvpair_t *snapelem, *nextsnapelem; - uint64_t fromguid = 0; - uint64_t originguid = 0; - uint64_t stream_originguid = 0; - uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; - char *fsname, *stream_fsname; - - nextfselem = nvlist_next_nvpair(local_nv, fselem); - - VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); - VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); - VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); - VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", - &parent_fromsnap_guid)); - (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); - - /* - * First find the stream's fs, so we can check for - * a different origin (due to "zfs promote") - */ - for (snapelem = nvlist_next_nvpair(snaps, NULL); - snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { - uint64_t thisguid; - - VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); - stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); - - if (stream_nvfs != NULL) - break; - } - - /* check for promote */ - (void) nvlist_lookup_uint64(stream_nvfs, "origin", - &stream_originguid); - if (stream_nvfs && originguid != stream_originguid) { - switch (created_before(hdl, local_avl, - stream_originguid, originguid)) { - case 1: { - /* promote it! 
*/ - zfs_cmd_t zc = { 0 }; - nvlist_t *origin_nvfs; - char *origin_fsname; - - if (flags->verbose) - (void) printf("promoting %s\n", fsname); - - origin_nvfs = fsavl_find(local_avl, originguid, - NULL); - VERIFY(0 == nvlist_lookup_string(origin_nvfs, - "name", &origin_fsname)); - (void) strlcpy(zc.zc_value, origin_fsname, - sizeof (zc.zc_value)); - (void) strlcpy(zc.zc_name, fsname, - sizeof (zc.zc_name)); - error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); - if (error == 0) - progress = B_TRUE; - break; - } - default: - break; - case -1: - fsavl_destroy(local_avl); - nvlist_free(local_nv); - return (-1); - } - /* - * We had/have the wrong origin, therefore our - * list of snapshots is wrong. Need to handle - * them on the next pass. - */ - needagain = B_TRUE; - continue; - } - - for (snapelem = nvlist_next_nvpair(snaps, NULL); - snapelem; snapelem = nextsnapelem) { - uint64_t thisguid; - char *stream_snapname; - nvlist_t *found, *props; - - nextsnapelem = nvlist_next_nvpair(snaps, snapelem); - - VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); - found = fsavl_find(stream_avl, thisguid, - &stream_snapname); - - /* check for delete */ - if (found == NULL) { - char name[ZFS_MAX_DATASET_NAME_LEN]; - - if (!flags->force) - continue; - - (void) snprintf(name, sizeof (name), "%s@%s", - fsname, nvpair_name(snapelem)); - - error = recv_destroy(hdl, name, - strlen(fsname)+1, newname, flags); - if (error) - needagain = B_TRUE; - else - progress = B_TRUE; - sprintf(guidname, "%" PRIu64, thisguid); - nvlist_add_boolean(deleted, guidname); - continue; - } - - stream_nvfs = found; - - if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", - &props) && 0 == nvlist_lookup_nvlist(props, - stream_snapname, &props)) { - zfs_cmd_t zc = { 0 }; - - zc.zc_cookie = B_TRUE; /* received */ - (void) snprintf(zc.zc_name, sizeof (zc.zc_name), - "%s@%s", fsname, nvpair_name(snapelem)); - if (zcmd_write_src_nvlist(hdl, &zc, - props) == 0) { - (void) zfs_ioctl(hdl, - ZFS_IOC_SET_PROP, &zc); - 
zcmd_free_nvlists(&zc); - } - } - - /* check for different snapname */ - if (strcmp(nvpair_name(snapelem), - stream_snapname) != 0) { - char name[ZFS_MAX_DATASET_NAME_LEN]; - char tryname[ZFS_MAX_DATASET_NAME_LEN]; - - (void) snprintf(name, sizeof (name), "%s@%s", - fsname, nvpair_name(snapelem)); - (void) snprintf(tryname, sizeof (name), "%s@%s", - fsname, stream_snapname); - - error = recv_rename(hdl, name, tryname, - strlen(fsname)+1, newname, flags); - if (error) - needagain = B_TRUE; - else - progress = B_TRUE; - } - - if (strcmp(stream_snapname, fromsnap) == 0) - fromguid = thisguid; - } - - /* check for delete */ - if (stream_nvfs == NULL) { - if (!flags->force) - continue; - - error = recv_destroy(hdl, fsname, strlen(tofs)+1, - newname, flags); - if (error) - needagain = B_TRUE; - else - progress = B_TRUE; - sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); - nvlist_add_boolean(deleted, guidname); - continue; - } - - if (fromguid == 0) { - if (flags->verbose) { - (void) printf("local fs %s does not have " - "fromsnap (%s in stream); must have " - "been deleted locally; ignoring\n", - fsname, fromsnap); - } - continue; - } - - VERIFY(0 == nvlist_lookup_string(stream_nvfs, - "name", &stream_fsname)); - VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, - "parentfromsnap", &stream_parent_fromsnap_guid)); - - s1 = strrchr(fsname, '/'); - s2 = strrchr(stream_fsname, '/'); - - /* - * Check if we're going to rename based on parent guid change - * and the current parent guid was also deleted. If it was then - * rename will fail and is likely unneeded, so avoid this and - * force an early retry to determine the new - * parent_fromsnap_guid. - */ - if (stream_parent_fromsnap_guid != 0 && - parent_fromsnap_guid != 0 && - stream_parent_fromsnap_guid != parent_fromsnap_guid) { - sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); - if (nvlist_exists(deleted, guidname)) { - progress = B_TRUE; - needagain = B_TRUE; - goto doagain; - } - } - - /* - * Check for rename. 
If the exact receive path is specified, it - * does not count as a rename, but we still need to check the - * datasets beneath it. - */ - if ((stream_parent_fromsnap_guid != 0 && - parent_fromsnap_guid != 0 && - stream_parent_fromsnap_guid != parent_fromsnap_guid) || - ((flags->isprefix || strcmp(tofs, fsname) != 0) && - (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { - nvlist_t *parent; - char tryname[ZFS_MAX_DATASET_NAME_LEN]; - - parent = fsavl_find(local_avl, - stream_parent_fromsnap_guid, NULL); - /* - * NB: parent might not be found if we used the - * tosnap for stream_parent_fromsnap_guid, - * because the parent is a newly-created fs; - * we'll be able to rename it after we recv the - * new fs. - */ - if (parent != NULL) { - char *pname; - - VERIFY(0 == nvlist_lookup_string(parent, "name", - &pname)); - (void) snprintf(tryname, sizeof (tryname), - "%s%s", pname, strrchr(stream_fsname, '/')); - } else { - tryname[0] = '\0'; - if (flags->verbose) { - (void) printf("local fs %s new parent " - "not found\n", fsname); - } - } - - newname[0] = '\0'; - - error = recv_rename(hdl, fsname, tryname, - strlen(tofs)+1, newname, flags); - - if (renamed != NULL && newname[0] != '\0') { - VERIFY(0 == nvlist_add_boolean(renamed, - newname)); - } - - if (error) - needagain = B_TRUE; - else - progress = B_TRUE; - } - } - -doagain: - fsavl_destroy(local_avl); - nvlist_free(local_nv); - nvlist_free(deleted); - - if (needagain && progress) { - /* do another pass to fix up temporary names */ - if (flags->verbose) - (void) printf("another pass:\n"); - goto again; - } - - return (needagain); -} - -static int -zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, - recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, - char **top_zfs, int cleanup_fd, uint64_t *action_handlep) -{ - nvlist_t *stream_nv = NULL; - avl_tree_t *stream_avl = NULL; - char *fromsnap = NULL; - char *sendsnap = NULL; - char *cp; - char tofs[ZFS_MAX_DATASET_NAME_LEN]; - char 
sendfs[ZFS_MAX_DATASET_NAME_LEN]; - char errbuf[1024]; - dmu_replay_record_t drre; - int error; - boolean_t anyerr = B_FALSE; - boolean_t softerr = B_FALSE; - boolean_t recursive; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive")); - - assert(drr->drr_type == DRR_BEGIN); - assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); - assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == - DMU_COMPOUNDSTREAM); - - /* - * Read in the nvlist from the stream. - */ - if (drr->drr_payloadlen != 0) { - error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, - &stream_nv, flags->byteswap, zc); - if (error) { - error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); - goto out; - } - } - - recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == - ENOENT); - - if (recursive && strchr(destname, '@')) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot specify snapshot name for multi-snapshot stream")); - error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); - goto out; - } - - /* - * Read in the end record and verify checksum. 
- */ - if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), - flags->byteswap, NULL))) - goto out; - if (flags->byteswap) { - drre.drr_type = BSWAP_32(drre.drr_type); - drre.drr_u.drr_end.drr_checksum.zc_word[0] = - BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); - drre.drr_u.drr_end.drr_checksum.zc_word[1] = - BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); - drre.drr_u.drr_end.drr_checksum.zc_word[2] = - BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); - drre.drr_u.drr_end.drr_checksum.zc_word[3] = - BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); - } - if (drre.drr_type != DRR_END) { - error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); - goto out; - } - if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "incorrect header checksum")); - error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); - goto out; - } - - (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); - - if (drr->drr_payloadlen != 0) { - nvlist_t *stream_fss; - - VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", - &stream_fss)); - if ((stream_avl = fsavl_create(stream_fss)) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "couldn't allocate avl tree")); - error = zfs_error(hdl, EZFS_NOMEM, errbuf); - goto out; - } - - if (fromsnap != NULL && recursive) { - nvlist_t *renamed = NULL; - nvpair_t *pair = NULL; - - (void) strlcpy(tofs, destname, sizeof (tofs)); - if (flags->isprefix) { - struct drr_begin *drrb = &drr->drr_u.drr_begin; - int i; - - if (flags->istail) { - cp = strrchr(drrb->drr_toname, '/'); - if (cp == NULL) { - (void) strlcat(tofs, "/", - sizeof (tofs)); - i = 0; - } else { - i = (cp - drrb->drr_toname); - } - } else { - i = strcspn(drrb->drr_toname, "/@"); - } - /* zfs_receive_one() will create_parents() */ - (void) strlcat(tofs, &drrb->drr_toname[i], - sizeof (tofs)); - *strchr(tofs, '@') = '\0'; - } - - if (!flags->dryrun && !flags->nomount) { - VERIFY(0 == nvlist_alloc(&renamed, - 
NV_UNIQUE_NAME, 0)); - } - - softerr = recv_incremental_replication(hdl, tofs, flags, - stream_nv, stream_avl, renamed); - - /* Unmount renamed filesystems before receiving. */ - while ((pair = nvlist_next_nvpair(renamed, - pair)) != NULL) { - zfs_handle_t *zhp; - prop_changelist_t *clp = NULL; - - zhp = zfs_open(hdl, nvpair_name(pair), - ZFS_TYPE_FILESYSTEM); - if (zhp != NULL) { - clp = changelist_gather(zhp, - ZFS_PROP_MOUNTPOINT, 0, - flags->forceunmount ? MS_FORCE : 0); - zfs_close(zhp); - if (clp != NULL) { - softerr |= - changelist_prefix(clp); - changelist_free(clp); - } - } - } - - nvlist_free(renamed); - } - } - - /* - * Get the fs specified by the first path in the stream (the top level - * specified by 'zfs send') and pass it to each invocation of - * zfs_receive_one(). - */ - (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, - sizeof (sendfs)); - if ((cp = strchr(sendfs, '@')) != NULL) { - *cp = '\0'; - /* - * Find the "sendsnap", the final snapshot in a replication - * stream. zfs_receive_one() handles certain errors - * differently, depending on if the contained stream is the - * last one or not. - */ - sendsnap = (cp + 1); - } - - /* Finally, receive each contained stream */ - do { - /* - * we should figure out if it has a recoverable - * error, in which case do a recv_skip() and drive on. - * Note, if we fail due to already having this guid, - * zfs_receive_one() will take care of it (ie, - * recv_skip() and return 0). - */ - error = zfs_receive_impl(hdl, destname, NULL, flags, fd, - sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, - action_handlep, sendsnap); - if (error == ENODATA) { - error = 0; - break; - } - anyerr |= error; - } while (error == 0); - - if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { - /* - * Now that we have the fs's they sent us, try the - * renames again. 
- */ - softerr = recv_incremental_replication(hdl, tofs, flags, - stream_nv, stream_avl, NULL); - } - -out: - fsavl_destroy(stream_avl); - nvlist_free(stream_nv); - if (softerr) - error = -2; - if (anyerr) - error = -1; - return (error); -} - -static void -trunc_prop_errs(int truncated) -{ - ASSERT(truncated != 0); - - if (truncated == 1) - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "1 more property could not be set\n")); - else - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "%d more properties could not be set\n"), truncated); -} - -static int -recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) -{ - dmu_replay_record_t *drr; - void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive:")); - - /* XXX would be great to use lseek if possible... */ - drr = buf; - - while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), - byteswap, NULL) == 0) { - if (byteswap) - drr->drr_type = BSWAP_32(drr->drr_type); - - switch (drr->drr_type) { - case DRR_BEGIN: - if (drr->drr_payloadlen != 0) { - (void) recv_read(hdl, fd, buf, - drr->drr_payloadlen, B_FALSE, NULL); - } - break; - - case DRR_END: - free(buf); - return (0); - - case DRR_OBJECT: - if (byteswap) { - drr->drr_u.drr_object.drr_bonuslen = - BSWAP_32(drr->drr_u.drr_object. 
- drr_bonuslen); - } - (void) recv_read(hdl, fd, buf, - P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8), - B_FALSE, NULL); - break; - - case DRR_WRITE: - if (byteswap) { - drr->drr_u.drr_write.drr_logical_size = - BSWAP_64( - drr->drr_u.drr_write.drr_logical_size); - drr->drr_u.drr_write.drr_compressed_size = - BSWAP_64( - drr->drr_u.drr_write.drr_compressed_size); - } - uint64_t payload_size = - DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); - (void) recv_read(hdl, fd, buf, - payload_size, B_FALSE, NULL); - break; - case DRR_SPILL: - if (byteswap) { - drr->drr_u.drr_spill.drr_length = - BSWAP_64(drr->drr_u.drr_spill.drr_length); - } - (void) recv_read(hdl, fd, buf, - drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); - break; - case DRR_WRITE_EMBEDDED: - if (byteswap) { - drr->drr_u.drr_write_embedded.drr_psize = - BSWAP_32(drr->drr_u.drr_write_embedded. - drr_psize); - } - (void) recv_read(hdl, fd, buf, - P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, - 8), B_FALSE, NULL); - break; - case DRR_WRITE_BYREF: - case DRR_FREEOBJECTS: - case DRR_FREE: - break; - - default: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid record type")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - } - - free(buf); - return (-1); -} - -static void -recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, - boolean_t resumable) -{ - char target_fs[ZFS_MAX_DATASET_NAME_LEN]; - - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "checksum mismatch or incomplete stream")); - - if (!resumable) - return; - (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); - *strchr(target_fs, '@') = '\0'; - zfs_handle_t *zhp = zfs_open(hdl, target_fs, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) - return; - - char token_buf[ZFS_MAXPROPLEN]; - int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, - token_buf, sizeof (token_buf), - NULL, NULL, 0, B_TRUE); - if (error == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "checksum mismatch or incomplete 
stream.\n" - "Partially received snapshot is saved.\n" - "A resuming stream can be generated on the sending " - "system by running:\n" - " zfs send -t %s"), - token_buf); - } - zfs_close(zhp); -} - -/* - * Restores a backup of tosnap from the file descriptor specified by infd. - */ -static int -zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, - dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, - avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, - uint64_t *action_handlep, const char *finalsnap) -{ - zfs_cmd_t zc = { 0 }; - time_t begin_time; - int ioctl_err, ioctl_errno, err; - char *cp; - struct drr_begin *drrb = &drr->drr_u.drr_begin; - char errbuf[1024]; - char prop_errbuf[1024]; - const char *chopprefix; - boolean_t newfs = B_FALSE; - boolean_t stream_wantsnewfs; - uint64_t parent_snapguid = 0; - prop_changelist_t *clp = NULL; - nvlist_t *snapprops_nvlist = NULL; - zprop_errflags_t prop_errflags; - boolean_t recursive; - char *snapname = NULL; - - begin_time = time(NULL); - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive")); - - recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == - ENOENT); - - if (stream_avl != NULL) { - nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, - &snapname); - nvlist_t *props; - int ret; - - (void) nvlist_lookup_uint64(fs, "parentfromsnap", - &parent_snapguid); - err = nvlist_lookup_nvlist(fs, "props", &props); - if (err) - VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); - - if (flags->canmountoff) { - VERIFY(0 == nvlist_add_uint64(props, - zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); - } - ret = zcmd_write_src_nvlist(hdl, &zc, props); - if (err) - nvlist_free(props); - - if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) { - VERIFY(0 == nvlist_lookup_nvlist(props, - snapname, &snapprops_nvlist)); - } - - if (ret != 0) - return (-1); - } - - cp = NULL; - 
- /* - * Determine how much of the snapshot name stored in the stream - * we are going to tack on to the name they specified on the - * command line, and how much we are going to chop off. - * - * If they specified a snapshot, chop the entire name stored in - * the stream. - */ - if (flags->istail) { - /* - * A filesystem was specified with -e. We want to tack on only - * the tail of the sent snapshot path. - */ - if (strchr(tosnap, '@')) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "argument - snapshot not allowed with -e")); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - - chopprefix = strrchr(sendfs, '/'); - - if (chopprefix == NULL) { - /* - * The tail is the poolname, so we need to - * prepend a path separator. - */ - int len = strlen(drrb->drr_toname); - cp = malloc(len + 2); - cp[0] = '/'; - (void) strcpy(&cp[1], drrb->drr_toname); - chopprefix = cp; - } else { - chopprefix = drrb->drr_toname + (chopprefix - sendfs); - } - } else if (flags->isprefix) { - /* - * A filesystem was specified with -d. We want to tack on - * everything but the first element of the sent snapshot path - * (all but the pool name). - */ - if (strchr(tosnap, '@')) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "argument - snapshot not allowed with -d")); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - - chopprefix = strchr(drrb->drr_toname, '/'); - if (chopprefix == NULL) - chopprefix = strchr(drrb->drr_toname, '@'); - } else if (strchr(tosnap, '@') == NULL) { - /* - * If a filesystem was specified without -d or -e, we want to - * tack on everything after the fs specified by 'zfs send'. - */ - chopprefix = drrb->drr_toname + strlen(sendfs); - } else { - /* A snapshot was specified as an exact path (no -d or -e). 
*/ - if (recursive) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "cannot specify snapshot name for multi-snapshot " - "stream")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); - } - - ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); - ASSERT(chopprefix > drrb->drr_toname); - ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname)); - ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || - chopprefix[0] == '\0'); - - /* - * Determine name of destination snapshot, store in zc_value. - */ - (void) strcpy(zc.zc_value, tosnap); - (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value)); -#ifdef __FreeBSD__ - if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) - zfs_ioctl_version = get_zfs_ioctl_version(); - /* - * For forward compatibility hide tosnap in zc_value - */ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) - (void) strcpy(zc.zc_value + strlen(zc.zc_value) + 1, tosnap); -#endif - free(cp); - if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) { - zcmd_free_nvlists(&zc); - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } - - /* - * Determine the name of the origin snapshot, store in zc_string. 
- */ - if (originsnap) { - (void) strncpy(zc.zc_string, originsnap, sizeof (zc.zc_string)); - if (flags->verbose) - (void) printf("using provided clone origin %s\n", - zc.zc_string); - } else if (drrb->drr_flags & DRR_FLAG_CLONE) { - if (guid_to_name(hdl, zc.zc_value, - drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) { - zcmd_free_nvlists(&zc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "local origin for clone %s does not exist"), - zc.zc_value); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - } - if (flags->verbose) - (void) printf("found clone origin %s\n", zc.zc_string); - } - - boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_RESUMING; - stream_wantsnewfs = (drrb->drr_fromguid == 0 || - (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; - - if (stream_wantsnewfs) { - /* - * if the parent fs does not exist, look for it based on - * the parent snap GUID - */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive new filesystem stream")); - - (void) strcpy(zc.zc_name, zc.zc_value); - cp = strrchr(zc.zc_name, '/'); - if (cp) - *cp = '\0'; - if (cp && - !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { - char suffix[ZFS_MAX_DATASET_NAME_LEN]; - (void) strcpy(suffix, strrchr(zc.zc_value, '/')); - if (guid_to_name(hdl, zc.zc_name, parent_snapguid, - B_FALSE, zc.zc_value) == 0) { - *strchr(zc.zc_value, '@') = '\0'; - (void) strcat(zc.zc_value, suffix); - } - } - } else { - /* - * If the fs does not exist, look for it based on the - * fromsnap GUID. 
- */ - if (resuming) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot receive resume stream")); - } else { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "cannot receive incremental stream")); - } - - (void) strcpy(zc.zc_name, zc.zc_value); - *strchr(zc.zc_name, '@') = '\0'; - - /* - * If the exact receive path was specified and this is the - * topmost path in the stream, then if the fs does not exist we - * should look no further. - */ - if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + - strlen(sendfs)) != '\0' && *chopprefix != '@')) && - !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { - char snap[ZFS_MAX_DATASET_NAME_LEN]; - (void) strcpy(snap, strchr(zc.zc_value, '@')); - if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, - B_FALSE, zc.zc_value) == 0) { - *strchr(zc.zc_value, '@') = '\0'; - (void) strcat(zc.zc_value, snap); - } - } - } - - (void) strcpy(zc.zc_name, zc.zc_value); - *strchr(zc.zc_name, '@') = '\0'; - - if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { - zfs_handle_t *zhp; - - /* - * Destination fs exists. It must be one of these cases: - * - an incremental send stream - * - the stream specifies a new fs (full stream or clone) - * and they want us to blow away the existing fs (and - * have therefore specified -F and removed any snapshots) - * - we are resuming a failed receive. - */ - if (stream_wantsnewfs) { - boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL; - if (!flags->force) { - zcmd_free_nvlists(&zc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination '%s' exists\n" - "must specify -F to overwrite it"), - zc.zc_name); - return (zfs_error(hdl, EZFS_EXISTS, errbuf)); - } - if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, - &zc) == 0) { - zcmd_free_nvlists(&zc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination has snapshots (eg. 
%s)\n" - "must destroy them to overwrite it"), - zc.zc_name); - return (zfs_error(hdl, EZFS_EXISTS, errbuf)); - } - if (is_volume && strrchr(zc.zc_name, '/') == NULL) { - zcmd_free_nvlists(&zc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination '%s' is the root dataset\n" - "cannot overwrite with a ZVOL"), - zc.zc_name); - return (zfs_error(hdl, EZFS_EXISTS, errbuf)); - } - if (is_volume && - ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, - &zc) == 0) { - zcmd_free_nvlists(&zc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination has children (eg. %s)\n" - "cannot overwrite with a ZVOL"), - zc.zc_name); - return (zfs_error(hdl, EZFS_WRONG_PARENT, - errbuf)); - } - } - - if ((zhp = zfs_open(hdl, zc.zc_name, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { - zcmd_free_nvlists(&zc); - return (-1); - } - - if (stream_wantsnewfs && - zhp->zfs_dmustats.dds_origin[0]) { - zcmd_free_nvlists(&zc); - zfs_close(zhp); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination '%s' is a clone\n" - "must destroy it to overwrite it"), - zc.zc_name); - return (zfs_error(hdl, EZFS_EXISTS, errbuf)); - } - - if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && - (stream_wantsnewfs || resuming)) { - /* We can't do online recv in this case */ - clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - flags->forceunmount ? MS_FORCE : 0); - if (clp == NULL) { - zfs_close(zhp); - zcmd_free_nvlists(&zc); - return (-1); - } - if (changelist_prefix(clp) != 0) { - changelist_free(clp); - zfs_close(zhp); - zcmd_free_nvlists(&zc); - return (-1); - } - } - - /* - * If we are resuming a newfs, set newfs here so that we will - * mount it if the recv succeeds this time. We can tell - * that it was a newfs on the first recv because the fs - * itself will be inconsistent (if the fs existed when we - * did the first recv, we would have received it into - * .../%recv). 
- */ - if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) - newfs = B_TRUE; - - zfs_close(zhp); - } else { - zfs_handle_t *zhp; - - /* - * Destination filesystem does not exist. Therefore we better - * be creating a new filesystem (either from a full backup, or - * a clone). It would therefore be invalid if the user - * specified only the pool name (i.e. if the destination name - * contained no slash character). - */ - if (!stream_wantsnewfs || - (cp = strrchr(zc.zc_name, '/')) == NULL) { - zcmd_free_nvlists(&zc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination '%s' does not exist"), zc.zc_name); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - } - - /* - * Trim off the final dataset component so we perform the - * recvbackup ioctl to the filesystems's parent. - */ - *cp = '\0'; - - if (flags->isprefix && !flags->istail && !flags->dryrun && - create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) { - zcmd_free_nvlists(&zc); - return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); - } - - /* validate parent */ - zhp = zfs_open(hdl, zc.zc_name, ZFS_TYPE_DATASET); - if (zhp == NULL) { - zcmd_free_nvlists(&zc); - return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); - } - if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { - zcmd_free_nvlists(&zc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "parent '%s' is not a filesystem"), zc.zc_name); - zfs_close(zhp); - return (zfs_error(hdl, EZFS_WRONG_PARENT, errbuf)); - } - zfs_close(zhp); - - newfs = B_TRUE; - } - - zc.zc_begin_record = *drr_noswap; - zc.zc_cookie = infd; - zc.zc_guid = flags->force; - zc.zc_resumable = flags->resumable; - if (flags->verbose) { - (void) printf("%s %s stream of %s into %s\n", - flags->dryrun ? "would receive" : "receiving", - drrb->drr_fromguid ? 
"incremental" : "full", - drrb->drr_toname, zc.zc_value); - (void) fflush(stdout); - } - - if (flags->dryrun) { - zcmd_free_nvlists(&zc); - return (recv_skip(hdl, infd, flags->byteswap)); - } - - zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; - zc.zc_nvlist_dst_size = sizeof (prop_errbuf); - zc.zc_cleanup_fd = cleanup_fd; - zc.zc_action_handle = *action_handlep; - - err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); - ioctl_errno = errno; - prop_errflags = (zprop_errflags_t)zc.zc_obj; - - if (err == 0) { - nvlist_t *prop_errors; - VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, - zc.zc_nvlist_dst_size, &prop_errors, 0)); - - nvpair_t *prop_err = NULL; - - while ((prop_err = nvlist_next_nvpair(prop_errors, - prop_err)) != NULL) { - char tbuf[1024]; - zfs_prop_t prop; - int intval; - - prop = zfs_name_to_prop(nvpair_name(prop_err)); - (void) nvpair_value_int32(prop_err, &intval); - if (strcmp(nvpair_name(prop_err), - ZPROP_N_MORE_ERRORS) == 0) { - trunc_prop_errs(intval); - break; - } else if (snapname == NULL || finalsnap == NULL || - strcmp(finalsnap, snapname) == 0 || - strcmp(nvpair_name(prop_err), - zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { - /* - * Skip the special case of, for example, - * "refquota", errors on intermediate - * snapshots leading up to a final one. - * That's why we have all of the checks above. - * - * See zfs_ioctl.c's extract_delay_props() for - * a list of props which can fail on - * intermediate snapshots, but shouldn't - * affect the overall receive. 
- */ - (void) snprintf(tbuf, sizeof (tbuf), - dgettext(TEXT_DOMAIN, - "cannot receive %s property on %s"), - nvpair_name(prop_err), zc.zc_name); - zfs_setprop_error(hdl, prop, intval, tbuf); - } - } - nvlist_free(prop_errors); - } - - zc.zc_nvlist_dst = 0; - zc.zc_nvlist_dst_size = 0; - zcmd_free_nvlists(&zc); - - if (err == 0 && snapprops_nvlist) { - zfs_cmd_t zc2 = { 0 }; - - (void) strcpy(zc2.zc_name, zc.zc_value); - zc2.zc_cookie = B_TRUE; /* received */ - if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) { - (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2); - zcmd_free_nvlists(&zc2); - } - } - - if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { - /* - * It may be that this snapshot already exists, - * in which case we want to consume & ignore it - * rather than failing. - */ - avl_tree_t *local_avl; - nvlist_t *local_nv, *fs; - cp = strchr(zc.zc_value, '@'); - - /* - * XXX Do this faster by just iterating over snaps in - * this fs. Also if zc_value does not exist, we will - * get a strange "does not exist" error message. 
- */ - *cp = '\0'; - if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, - B_FALSE, B_FALSE, &local_nv, &local_avl) == 0) { - *cp = '@'; - fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); - fsavl_destroy(local_avl); - nvlist_free(local_nv); - - if (fs != NULL) { - if (flags->verbose) { - (void) printf("snap %s already exists; " - "ignoring\n", zc.zc_value); - } - err = ioctl_err = recv_skip(hdl, infd, - flags->byteswap); - } - } - *cp = '@'; - } - - if (ioctl_err != 0) { - switch (ioctl_errno) { - case ENODEV: - cp = strchr(zc.zc_value, '@'); - *cp = '\0'; - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "most recent snapshot of %s does not\n" - "match incremental source"), zc.zc_value); - (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); - *cp = '@'; - break; - case ETXTBSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination %s has been modified\n" - "since most recent snapshot"), zc.zc_name); - (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); - break; - case EEXIST: - cp = strchr(zc.zc_value, '@'); - if (newfs) { - /* it's the containing fs that exists */ - *cp = '\0'; - } - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination already exists")); - (void) zfs_error_fmt(hdl, EZFS_EXISTS, - dgettext(TEXT_DOMAIN, "cannot restore to %s"), - zc.zc_value); - *cp = '@'; - break; - case EINVAL: - (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); - break; - case ECKSUM: - recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable); - (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); - break; - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded to receive this stream.")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case EDQUOT: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "destination %s space quota exceeded"), zc.zc_name); - (void) zfs_error(hdl, EZFS_NOSPC, errbuf); - break; - default: - (void) zfs_standard_error(hdl, ioctl_errno, errbuf); - } - } - - /* - * Mount the target filesystem (if created). 
Also mount any - * children of the target filesystem if we did a replication - * receive (indicated by stream_avl being non-NULL). - */ - cp = strchr(zc.zc_value, '@'); - if (cp && (ioctl_err == 0 || !newfs)) { - zfs_handle_t *h; - - *cp = '\0'; - h = zfs_open(hdl, zc.zc_value, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (h != NULL) { - if (h->zfs_type == ZFS_TYPE_VOLUME) { - *cp = '@'; - } else if (newfs || stream_avl) { - /* - * Track the first/top of hierarchy fs, - * for mounting and sharing later. - */ - if (top_zfs && *top_zfs == NULL) - *top_zfs = zfs_strdup(hdl, zc.zc_value); - } - zfs_close(h); - } - *cp = '@'; - } - - if (clp) { - if (!flags->nomount) - err |= changelist_postfix(clp); - changelist_free(clp); - } - - if (prop_errflags & ZPROP_ERR_NOCLEAR) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " - "failed to clear unreceived properties on %s"), - zc.zc_name); - (void) fprintf(stderr, "\n"); - } - if (prop_errflags & ZPROP_ERR_NORESTORE) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " - "failed to restore original properties on %s"), - zc.zc_name); - (void) fprintf(stderr, "\n"); - } - - if (err || ioctl_err) - return (-1); - - *action_handlep = zc.zc_action_handle; - - if (flags->verbose) { - char buf1[64]; - char buf2[64]; - uint64_t bytes = zc.zc_cookie; - time_t delta = time(NULL) - begin_time; - if (delta == 0) - delta = 1; - zfs_nicenum(bytes, buf1, sizeof (buf1)); - zfs_nicenum(bytes/delta, buf2, sizeof (buf1)); - - (void) printf("received %sB stream in %lu seconds (%sB/sec)\n", - buf1, delta, buf2); - } - - return (0); -} - -static int -zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, - const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, - nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, - uint64_t *action_handlep, const char *finalsnap) -{ - int err; - dmu_replay_record_t drr, drr_noswap; - struct drr_begin *drrb = &drr.drr_u.drr_begin; - char 
errbuf[1024]; - zio_cksum_t zcksum = { 0 }; - uint64_t featureflags; - int hdrtype; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot receive")); - - if (flags->isprefix && - !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " - "(%s) does not exist"), tosnap); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - } - if (originsnap && - !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " - "(%s) does not exist"), originsnap); - return (zfs_error(hdl, EZFS_NOENT, errbuf)); - } - - /* read in the BEGIN record */ - if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, - &zcksum))) - return (err); - - if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { - /* It's the double end record at the end of a package */ - return (ENODATA); - } - - /* the kernel needs the non-byteswapped begin record */ - drr_noswap = drr; - - flags->byteswap = B_FALSE; - if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { - /* - * We computed the checksum in the wrong byteorder in - * recv_read() above; do it again correctly. 
- */ - bzero(&zcksum, sizeof (zio_cksum_t)); - (void) fletcher_4_incremental_byteswap(&drr, - sizeof (drr), &zcksum); - flags->byteswap = B_TRUE; - - drr.drr_type = BSWAP_32(drr.drr_type); - drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_flags = BSWAP_32(drrb->drr_flags); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); - } - - if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "stream (bad magic number)")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - - featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); - - if (!DMU_STREAM_SUPPORTED(featureflags) || - (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "stream has unsupported feature, feature flags = %lx"), - featureflags); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - - if (strchr(drrb->drr_toname, '@') == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "stream (bad snapshot name)")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { - char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; - if (sendfs == NULL) { - /* - * We were not called from zfs_receive_package(). Get - * the fs specified by 'zfs send'. 
- */ - char *cp; - (void) strlcpy(nonpackage_sendfs, - drr.drr_u.drr_begin.drr_toname, - sizeof (nonpackage_sendfs)); - if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) - *cp = '\0'; - sendfs = nonpackage_sendfs; - VERIFY(finalsnap == NULL); - } - return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, - &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, - cleanup_fd, action_handlep, finalsnap)); - } else { - assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM); - return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, - &zcksum, top_zfs, cleanup_fd, action_handlep)); - } -} - -/* - * Restores a backup of tosnap from the file descriptor specified by infd. - * Return 0 on total success, -2 if some things couldn't be - * destroyed/renamed/promoted, -1 if some things couldn't be received. - * (-1 will override -2, if -1 and the resumable flag was specified the - * transfer can be resumed if the sending side supports it). - */ -int -zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, - recvflags_t *flags, int infd, avl_tree_t *stream_avl) -{ - char *top_zfs = NULL; - int err; - int cleanup_fd; - uint64_t action_handle = 0; - char *originsnap = NULL; - - if (props) { - err = nvlist_lookup_string(props, "origin", &originsnap); - if (err && err != ENOENT) - return (err); - } - - cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); - VERIFY(cleanup_fd >= 0); - - err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, - stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL); - - VERIFY(0 == close(cleanup_fd)); - - if (err == 0 && !flags->nomount && top_zfs) { - zfs_handle_t *zhp; - prop_changelist_t *clp; - - zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM); - if (zhp != NULL) { - clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, - CL_GATHER_MOUNT_ALWAYS, - flags->forceunmount ? 
MS_FORCE : 0); - zfs_close(zhp); - if (clp != NULL) { - /* mount and share received datasets */ - err = changelist_postfix(clp); - changelist_free(clp); - } - } - if (zhp == NULL || clp == NULL || err) - err = -1; - } - if (top_zfs) - free(top_zfs); - - return (err); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c deleted file mode 100644 index d32662022cf5..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c +++ /dev/null @@ -1,511 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ - -/* - * This file contains the functions which analyze the status of a pool. This - * include both the status of an active pool, as well as the status exported - * pools. Returns one of the ZPOOL_STATUS_* defines describing the status of - * the pool. This status is independent (to a certain degree) from the state of - * the pool. 
A pool's state describes only whether or not it is capable of - * providing the necessary fault tolerance for data. The status describes the - * overall status of devices. A pool that is online can still have a device - * that is experiencing errors. - * - * Only a subset of the possible faults can be detected using 'zpool status', - * and not all possible errors correspond to a FMA message ID. The explanation - * is left up to the caller, depending on whether it is a live pool or an - * import. - */ - -#include -#include -#include -#include "libzfs_impl.h" -#include "zfeature_common.h" - -/* - * Message ID table. This must be kept in sync with the ZPOOL_STATUS_* defines - * in libzfs.h. Note that there are some status results which go past the end - * of this table, and hence have no associated message ID. - */ -static char *zfs_msgid_table[] = { - "ZFS-8000-14", /* ZPOOL_STATUS_CORRUPT_CACHE */ - "ZFS-8000-2Q", /* ZPOOL_STATUS_MISSING_DEV_R */ - "ZFS-8000-3C", /* ZPOOL_STATUS_MISSING_DEV_NR */ - "ZFS-8000-4J", /* ZPOOL_STATUS_CORRUPT_LABEL_R */ - "ZFS-8000-5E", /* ZPOOL_STATUS_CORRUPT_LABEL_NR */ - "ZFS-8000-6X", /* ZPOOL_STATUS_BAD_GUID_SUM */ - "ZFS-8000-72", /* ZPOOL_STATUS_CORRUPT_POOL */ - "ZFS-8000-8A", /* ZPOOL_STATUS_CORRUPT_DATA */ - "ZFS-8000-9P", /* ZPOOL_STATUS_FAILING_DEV */ - "ZFS-8000-A5", /* ZPOOL_STATUS_VERSION_NEWER */ - "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_MISMATCH */ - "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_ACTIVE */ - "ZFS-8000-EY", /* ZPOOL_STATUS_HOSTID_REQUIRED */ - "ZFS-8000-HC", /* ZPOOL_STATUS_IO_FAILURE_WAIT */ - "ZFS-8000-JQ", /* ZPOOL_STATUS_IO_FAILURE_CONTINUE */ - "ZFS-8000-MM", /* ZPOOL_STATUS_IO_FAILURE_MMP */ - "ZFS-8000-K4", /* ZPOOL_STATUS_BAD_LOG */ - /* - * The following results have no message ID. 
- * ZPOOL_STATUS_UNSUP_FEAT_READ - * ZPOOL_STATUS_UNSUP_FEAT_WRITE - * ZPOOL_STATUS_FAULTED_DEV_R - * ZPOOL_STATUS_FAULTED_DEV_NR - * ZPOOL_STATUS_VERSION_OLDER - * ZPOOL_STATUS_FEAT_DISABLED - * ZPOOL_STATUS_RESILVERING - * ZPOOL_STATUS_OFFLINE_DEV - * ZPOOL_STATUS_REMOVED_DEV - * ZPOOL_STATUS_OK - */ -}; - -#define NMSGID (sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0])) - -/* ARGSUSED */ -static int -vdev_missing(vdev_stat_t *vs, uint_t vsc) -{ - return (vs->vs_state == VDEV_STATE_CANT_OPEN && - vs->vs_aux == VDEV_AUX_OPEN_FAILED); -} - -/* ARGSUSED */ -static int -vdev_faulted(vdev_stat_t *vs, uint_t vsc) -{ - return (vs->vs_state == VDEV_STATE_FAULTED); -} - -/* ARGSUSED */ -static int -vdev_errors(vdev_stat_t *vs, uint_t vsc) -{ - return (vs->vs_state == VDEV_STATE_DEGRADED || - vs->vs_read_errors != 0 || vs->vs_write_errors != 0 || - vs->vs_checksum_errors != 0); -} - -/* ARGSUSED */ -static int -vdev_broken(vdev_stat_t *vs, uint_t vsc) -{ - return (vs->vs_state == VDEV_STATE_CANT_OPEN); -} - -/* ARGSUSED */ -static int -vdev_offlined(vdev_stat_t *vs, uint_t vsc) -{ - return (vs->vs_state == VDEV_STATE_OFFLINE); -} - -/* ARGSUSED */ -static int -vdev_removed(vdev_stat_t *vs, uint_t vsc) -{ - return (vs->vs_state == VDEV_STATE_REMOVED); -} - -static int -vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) -{ - return (VDEV_STAT_VALID(vs_physical_ashift, vsc) && - vs->vs_configured_ashift < vs->vs_physical_ashift); -} - -/* - * Detect if any leaf devices that have seen errors or could not be opened. - */ -static boolean_t -find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), - boolean_t ignore_replacing) -{ - nvlist_t **child; - vdev_stat_t *vs; - uint_t c, vsc, children; - - /* - * Ignore problems within a 'replacing' vdev, since we're presumably in - * the process of repairing any such errors, and don't want to call them - * out again. We'll pick up the fact that a resilver is happening - * later. 
- */ - if (ignore_replacing == B_TRUE) { - char *type; - - verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, - &type) == 0); - if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - return (B_FALSE); - } - - if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func, ignore_replacing)) - return (B_TRUE); - } else { - verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0); - - if (func(vs, vsc) != 0) - return (B_TRUE); - } - - /* - * Check any L2 cache devs - */ - if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, - &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func, ignore_replacing)) - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * Active pool health status. - * - * To determine the status for a pool, we make several passes over the config, - * picking the most egregious error we find. In order of importance, we do the - * following: - * - * - Check for a complete and valid configuration - * - Look for any faulted or missing devices in a non-replicated config - * - Check for any data errors - * - Check for any faulted or missing devices in a replicated config - * - Look for any devices showing errors - * - Check for any resilvering devices - * - * There can obviously be multiple errors within a single pool, so this routine - * only picks the most damaging of all the current errors to report. 
- */ -static zpool_status_t -check_status(nvlist_t *config, boolean_t isimport) -{ - nvlist_t *nvroot; - vdev_stat_t *vs; - pool_scan_stat_t *ps = NULL; - uint_t vsc, psc; - uint64_t nerr; - uint64_t version; - uint64_t stateval; - uint64_t suspended; - uint64_t hostid = 0; - unsigned long system_hostid = get_system_hostid(); - - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &version) == 0); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0); - verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &stateval) == 0); - - /* - * Currently resilvering a vdev - */ - (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, - (uint64_t **)&ps, &psc); - if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER && - ps->pss_state == DSS_SCANNING) - return (ZPOOL_STATUS_RESILVERING); - - /* - * The multihost property is set and the pool may be active. - */ - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - vs->vs_aux == VDEV_AUX_ACTIVE) { - mmp_state_t mmp_state; - nvlist_t *nvinfo; - - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); - mmp_state = fnvlist_lookup_uint64(nvinfo, - ZPOOL_CONFIG_MMP_STATE); - - if (mmp_state == MMP_STATE_ACTIVE) - return (ZPOOL_STATUS_HOSTID_ACTIVE); - else if (mmp_state == MMP_STATE_NO_HOSTID) - return (ZPOOL_STATUS_HOSTID_REQUIRED); - else - return (ZPOOL_STATUS_HOSTID_MISMATCH); - } - - /* - * Pool last accessed by another system. - */ - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); - if (hostid != 0 && (unsigned long)hostid != system_hostid && - stateval == POOL_STATE_ACTIVE) - return (ZPOOL_STATUS_HOSTID_MISMATCH); - - /* - * Newer on-disk version. - */ - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - vs->vs_aux == VDEV_AUX_VERSION_NEWER) - return (ZPOOL_STATUS_VERSION_NEWER); - - /* - * Unsupported feature(s). 
- */ - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - vs->vs_aux == VDEV_AUX_UNSUP_FEAT) { - nvlist_t *nvinfo; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, - &nvinfo) == 0); - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) - return (ZPOOL_STATUS_UNSUP_FEAT_WRITE); - return (ZPOOL_STATUS_UNSUP_FEAT_READ); - } - - /* - * Check that the config is complete. - */ - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - vs->vs_aux == VDEV_AUX_BAD_GUID_SUM) - return (ZPOOL_STATUS_BAD_GUID_SUM); - - /* - * Check whether the pool has suspended. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED, - &suspended) == 0) { - uint64_t reason; - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED_REASON, - &reason) == 0 && reason == ZIO_SUSPEND_MMP) - return (ZPOOL_STATUS_IO_FAILURE_MMP); - - if (suspended == ZIO_FAILURE_MODE_CONTINUE) - return (ZPOOL_STATUS_IO_FAILURE_CONTINUE); - return (ZPOOL_STATUS_IO_FAILURE_WAIT); - } - - /* - * Could not read a log. - */ - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - vs->vs_aux == VDEV_AUX_BAD_LOG) { - return (ZPOOL_STATUS_BAD_LOG); - } - - /* - * Bad devices in non-replicated config. - */ - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) - return (ZPOOL_STATUS_FAULTED_DEV_NR); - - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_missing, B_TRUE)) - return (ZPOOL_STATUS_MISSING_DEV_NR); - - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_broken, B_TRUE)) - return (ZPOOL_STATUS_CORRUPT_LABEL_NR); - - /* - * Corrupted pool metadata - */ - if (vs->vs_state == VDEV_STATE_CANT_OPEN && - vs->vs_aux == VDEV_AUX_CORRUPT_DATA) - return (ZPOOL_STATUS_CORRUPT_POOL); - - /* - * Persistent data errors. - */ - if (!isimport) { - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, - &nerr) == 0 && nerr != 0) - return (ZPOOL_STATUS_CORRUPT_DATA); - } - - /* - * Missing devices in a replicated config. 
- */ - if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) - return (ZPOOL_STATUS_FAULTED_DEV_R); - if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) - return (ZPOOL_STATUS_MISSING_DEV_R); - if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) - return (ZPOOL_STATUS_CORRUPT_LABEL_R); - - /* - * Devices with errors - */ - if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) - return (ZPOOL_STATUS_FAILING_DEV); - - /* - * Offlined devices - */ - if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) - return (ZPOOL_STATUS_OFFLINE_DEV); - - /* - * Removed device - */ - if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) - return (ZPOOL_STATUS_REMOVED_DEV); - - /* - * Suboptimal, but usable, ashift configuration. - */ - if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) - return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); - - /* - * Outdated, but usable, version - */ - if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) - return (ZPOOL_STATUS_VERSION_OLDER); - - /* - * Usable pool with disabled features - */ - if (version >= SPA_VERSION_FEATURES) { - int i; - nvlist_t *feat; - - if (isimport) { - feat = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_LOAD_INFO); - if (nvlist_exists(feat, ZPOOL_CONFIG_ENABLED_FEAT)) - feat = fnvlist_lookup_nvlist(feat, - ZPOOL_CONFIG_ENABLED_FEAT); - } else { - feat = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_FEATURE_STATS); - } - - for (i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t *fi = &spa_feature_table[i]; - if (!nvlist_exists(feat, fi->fi_guid)) - return (ZPOOL_STATUS_FEAT_DISABLED); - } - } - - return (ZPOOL_STATUS_OK); -} - -zpool_status_t -zpool_get_status(zpool_handle_t *zhp, char **msgid) -{ - zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE); - - if (ret >= NMSGID) - *msgid = NULL; - else - *msgid = zfs_msgid_table[ret]; - - return (ret); -} - -zpool_status_t -zpool_import_status(nvlist_t *config, char **msgid) -{ - zpool_status_t ret = check_status(config, B_TRUE); - - if (ret 
>= NMSGID) - *msgid = NULL; - else - *msgid = zfs_msgid_table[ret]; - - return (ret); -} - -static void -dump_ddt_stat(const ddt_stat_t *dds, int h) -{ - char refcnt[6]; - char blocks[6], lsize[6], psize[6], dsize[6]; - char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6]; - - if (dds == NULL || dds->dds_blocks == 0) - return; - - if (h == -1) - (void) strcpy(refcnt, "Total"); - else - zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt)); - - zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks)); - zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize)); - zfs_nicenum(dds->dds_psize, psize, sizeof (psize)); - zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize)); - zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks)); - zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize)); - zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize)); - zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize)); - - (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", - refcnt, - blocks, lsize, psize, dsize, - ref_blocks, ref_lsize, ref_psize, ref_dsize); -} - -/* - * Print the DDT histogram and the column totals. 
- */ -void -zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh) -{ - int h; - - (void) printf("\n"); - - (void) printf("bucket " - " allocated " - " referenced \n"); - (void) printf("______ " - "______________________________ " - "______________________________\n"); - - (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", - "refcnt", - "blocks", "LSIZE", "PSIZE", "DSIZE", - "blocks", "LSIZE", "PSIZE", "DSIZE"); - - (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", - "------", - "------", "-----", "-----", "-----", - "------", "-----", "-----", "-----"); - - for (h = 0; h < 64; h++) - dump_ddt_stat(&ddh->ddh_stat[h], h); - - dump_ddt_stat(dds_total, -1); - - (void) printf("\n"); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c deleted file mode 100644 index 4439bcbbee57..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c +++ /dev/null @@ -1,1661 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2019 Joyent, Inc. - * Copyright (c) 2011, 2018 by Delphix. 
All rights reserved. - * Copyright 2016 Igor Kozhukhov - * Copyright (c) 2017 Datto Inc. - */ - -/* - * Internal utility routines for the ZFS library. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "libzfs_impl.h" -#include "zfs_prop.h" -#include "zfs_comutil.h" -#include "zfeature_common.h" - - -int -libzfs_errno(libzfs_handle_t *hdl) -{ - return (hdl->libzfs_error); -} - -const char * -libzfs_error_action(libzfs_handle_t *hdl) -{ - return (hdl->libzfs_action); -} - -const char * -libzfs_error_description(libzfs_handle_t *hdl) -{ - if (hdl->libzfs_desc[0] != '\0') - return (hdl->libzfs_desc); - - switch (hdl->libzfs_error) { - case EZFS_NOMEM: - return (dgettext(TEXT_DOMAIN, "out of memory")); - case EZFS_BADPROP: - return (dgettext(TEXT_DOMAIN, "invalid property value")); - case EZFS_PROPREADONLY: - return (dgettext(TEXT_DOMAIN, "read-only property")); - case EZFS_PROPTYPE: - return (dgettext(TEXT_DOMAIN, "property doesn't apply to " - "datasets of this type")); - case EZFS_PROPNONINHERIT: - return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); - case EZFS_PROPSPACE: - return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); - case EZFS_BADTYPE: - return (dgettext(TEXT_DOMAIN, "operation not applicable to " - "datasets of this type")); - case EZFS_BUSY: - return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); - case EZFS_EXISTS: - return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); - case EZFS_NOENT: - return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); - case EZFS_BADSTREAM: - return (dgettext(TEXT_DOMAIN, "invalid backup stream")); - case EZFS_DSREADONLY: - return (dgettext(TEXT_DOMAIN, "dataset is read-only")); - case EZFS_VOLTOOBIG: - return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " - "this system")); - case EZFS_INVALIDNAME: - return 
(dgettext(TEXT_DOMAIN, "invalid name")); - case EZFS_BADRESTORE: - return (dgettext(TEXT_DOMAIN, "unable to restore to " - "destination")); - case EZFS_BADBACKUP: - return (dgettext(TEXT_DOMAIN, "backup failed")); - case EZFS_BADTARGET: - return (dgettext(TEXT_DOMAIN, "invalid target vdev")); - case EZFS_NODEVICE: - return (dgettext(TEXT_DOMAIN, "no such device in pool")); - case EZFS_BADDEV: - return (dgettext(TEXT_DOMAIN, "invalid device")); - case EZFS_NOREPLICAS: - return (dgettext(TEXT_DOMAIN, "no valid replicas")); - case EZFS_RESILVERING: - return (dgettext(TEXT_DOMAIN, "currently resilvering")); - case EZFS_BADVERSION: - return (dgettext(TEXT_DOMAIN, "unsupported version or " - "feature")); - case EZFS_POOLUNAVAIL: - return (dgettext(TEXT_DOMAIN, "pool is unavailable")); - case EZFS_DEVOVERFLOW: - return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); - case EZFS_BADPATH: - return (dgettext(TEXT_DOMAIN, "must be an absolute path")); - case EZFS_CROSSTARGET: - return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " - "pools")); - case EZFS_ZONED: - return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); - case EZFS_MOUNTFAILED: - return (dgettext(TEXT_DOMAIN, "mount failed")); - case EZFS_UMOUNTFAILED: - return (dgettext(TEXT_DOMAIN, "umount failed")); - case EZFS_UNSHARENFSFAILED: - return (dgettext(TEXT_DOMAIN, "unshare(1M) failed")); - case EZFS_SHARENFSFAILED: - return (dgettext(TEXT_DOMAIN, "share(1M) failed")); - case EZFS_UNSHARESMBFAILED: - return (dgettext(TEXT_DOMAIN, "smb remove share failed")); - case EZFS_SHARESMBFAILED: - return (dgettext(TEXT_DOMAIN, "smb add share failed")); - case EZFS_PERM: - return (dgettext(TEXT_DOMAIN, "permission denied")); - case EZFS_NOSPC: - return (dgettext(TEXT_DOMAIN, "out of space")); - case EZFS_FAULT: - return (dgettext(TEXT_DOMAIN, "bad address")); - case EZFS_IO: - return (dgettext(TEXT_DOMAIN, "I/O error")); - case EZFS_INTR: - return (dgettext(TEXT_DOMAIN, "signal received")); - case 
EZFS_ISSPARE: - return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " - "spare")); - case EZFS_INVALCONFIG: - return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); - case EZFS_RECURSIVE: - return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); - case EZFS_NOHISTORY: - return (dgettext(TEXT_DOMAIN, "no history available")); - case EZFS_POOLPROPS: - return (dgettext(TEXT_DOMAIN, "failed to retrieve " - "pool properties")); - case EZFS_POOL_NOTSUP: - return (dgettext(TEXT_DOMAIN, "operation not supported " - "on this type of pool")); - case EZFS_POOL_INVALARG: - return (dgettext(TEXT_DOMAIN, "invalid argument for " - "this pool operation")); - case EZFS_NAMETOOLONG: - return (dgettext(TEXT_DOMAIN, "dataset name is too long")); - case EZFS_OPENFAILED: - return (dgettext(TEXT_DOMAIN, "open failed")); - case EZFS_NOCAP: - return (dgettext(TEXT_DOMAIN, - "disk capacity information could not be retrieved")); - case EZFS_LABELFAILED: - return (dgettext(TEXT_DOMAIN, "write of label failed")); - case EZFS_BADWHO: - return (dgettext(TEXT_DOMAIN, "invalid user/group")); - case EZFS_BADPERM: - return (dgettext(TEXT_DOMAIN, "invalid permission")); - case EZFS_BADPERMSET: - return (dgettext(TEXT_DOMAIN, "invalid permission set name")); - case EZFS_NODELEGATION: - return (dgettext(TEXT_DOMAIN, "delegated administration is " - "disabled on pool")); - case EZFS_BADCACHE: - return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); - case EZFS_ISL2CACHE: - return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); - case EZFS_VDEVNOTSUP: - return (dgettext(TEXT_DOMAIN, "vdev specification is not " - "supported")); - case EZFS_NOTSUP: - return (dgettext(TEXT_DOMAIN, "operation not supported " - "on this dataset")); - case EZFS_IOC_NOTSUPPORTED: - return (dgettext(TEXT_DOMAIN, "operation not supported by " - "zfs kernel module")); - case EZFS_ACTIVE_SPARE: - return (dgettext(TEXT_DOMAIN, "pool has active shared spare " - "device")); - case 
EZFS_UNPLAYED_LOGS: - return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " - "logs")); - case EZFS_REFTAG_RELE: - return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); - case EZFS_REFTAG_HOLD: - return (dgettext(TEXT_DOMAIN, "tag already exists on this " - "dataset")); - case EZFS_TAGTOOLONG: - return (dgettext(TEXT_DOMAIN, "tag too long")); - case EZFS_PIPEFAILED: - return (dgettext(TEXT_DOMAIN, "pipe create failed")); - case EZFS_THREADCREATEFAILED: - return (dgettext(TEXT_DOMAIN, "thread create failed")); - case EZFS_POSTSPLIT_ONLINE: - return (dgettext(TEXT_DOMAIN, "disk was split from this pool " - "into a new one")); - case EZFS_SCRUB_PAUSED: - return (dgettext(TEXT_DOMAIN, "scrub is paused; " - "use 'zpool scrub' to resume")); - case EZFS_SCRUBBING: - return (dgettext(TEXT_DOMAIN, "currently scrubbing; " - "use 'zpool scrub -s' to cancel current scrub")); - case EZFS_NO_SCRUB: - return (dgettext(TEXT_DOMAIN, "there is no active scrub")); - case EZFS_DIFF: - return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); - case EZFS_DIFFDATA: - return (dgettext(TEXT_DOMAIN, "invalid diff data")); - case EZFS_POOLREADONLY: - return (dgettext(TEXT_DOMAIN, "pool is read-only")); - case EZFS_NO_PENDING: - return (dgettext(TEXT_DOMAIN, "operation is not " - "in progress")); - case EZFS_CHECKPOINT_EXISTS: - return (dgettext(TEXT_DOMAIN, "checkpoint exists")); - case EZFS_DISCARDING_CHECKPOINT: - return (dgettext(TEXT_DOMAIN, "currently discarding " - "checkpoint")); - case EZFS_NO_CHECKPOINT: - return (dgettext(TEXT_DOMAIN, "checkpoint does not exist")); - case EZFS_DEVRM_IN_PROGRESS: - return (dgettext(TEXT_DOMAIN, "device removal in progress")); - case EZFS_VDEV_TOO_BIG: - return (dgettext(TEXT_DOMAIN, "device exceeds supported size")); - case EZFS_ACTIVE_POOL: - return (dgettext(TEXT_DOMAIN, "pool is imported on a " - "different host")); - case EZFS_TOOMANY: - return (dgettext(TEXT_DOMAIN, "argument list too long")); - case EZFS_INITIALIZING: - 
return (dgettext(TEXT_DOMAIN, "currently initializing")); - case EZFS_NO_INITIALIZE: - return (dgettext(TEXT_DOMAIN, "there is no active " - "initialization")); - case EZFS_WRONG_PARENT: - return (dgettext(TEXT_DOMAIN, "invalid parent dataset")); - case EZFS_UNKNOWN: - return (dgettext(TEXT_DOMAIN, "unknown error")); - default: - assert(hdl->libzfs_error == 0); - return (dgettext(TEXT_DOMAIN, "no error")); - } -} - -/*PRINTFLIKE2*/ -void -zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - - (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), - fmt, ap); - hdl->libzfs_desc_active = 1; - - va_end(ap); -} - -static void -zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) -{ - (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), - fmt, ap); - hdl->libzfs_error = error; - - if (hdl->libzfs_desc_active) - hdl->libzfs_desc_active = 0; - else - hdl->libzfs_desc[0] = '\0'; - - if (hdl->libzfs_printerr) { - if (error == EZFS_UNKNOWN) { - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " - "error: %s\n"), libzfs_error_description(hdl)); - abort(); - } - - (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, - libzfs_error_description(hdl)); - if (error == EZFS_NOMEM) - exit(1); - } -} - -int -zfs_error(libzfs_handle_t *hdl, int error, const char *msg) -{ - return (zfs_error_fmt(hdl, error, "%s", msg)); -} - -/*PRINTFLIKE3*/ -int -zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - - zfs_verror(hdl, error, fmt, ap); - - va_end(ap); - - return (-1); -} - -static int -zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, - va_list ap) -{ - switch (error) { - case EPERM: - case EACCES: - zfs_verror(hdl, EZFS_PERM, fmt, ap); - return (-1); - - case ECANCELED: - zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); - return (-1); - - case EIO: - zfs_verror(hdl, EZFS_IO, fmt, ap); - return (-1); - - case EFAULT: - zfs_verror(hdl, EZFS_FAULT, fmt, ap); - return (-1); - - case EINTR: - zfs_verror(hdl, EZFS_INTR, fmt, ap); - return (-1); - } - - return (0); -} - -int -zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) -{ - return (zfs_standard_error_fmt(hdl, error, "%s", msg)); -} - -/*PRINTFLIKE3*/ -int -zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - - if (zfs_common_error(hdl, error, fmt, ap) != 0) { - va_end(ap); - return (-1); - } - - switch (error) { - case ENXIO: - case ENODEV: - case EPIPE: - zfs_verror(hdl, EZFS_IO, fmt, ap); - break; - - case ENOENT: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset does not exist")); - zfs_verror(hdl, EZFS_NOENT, fmt, ap); - break; - - case ENOSPC: - case EDQUOT: - zfs_verror(hdl, EZFS_NOSPC, fmt, ap); - va_end(ap); - return (-1); - - case EEXIST: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset already exists")); - zfs_verror(hdl, EZFS_EXISTS, fmt, ap); - break; - - case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset is busy")); - zfs_verror(hdl, EZFS_BUSY, fmt, ap); - break; - case EROFS: - zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); - break; - case ENAMETOOLONG: - zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); - break; - case ENOTSUP: - zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); - break; - case EAGAIN: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool I/O is currently suspended")); - zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); - break; - case 
EREMOTEIO: - zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); - break; - case ZFS_ERR_IOC_CMD_UNAVAIL: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " - "module does not support this operation. A reboot may " - "be required to enable this operation.")); - zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); - break; - case ZFS_ERR_IOC_ARG_UNAVAIL: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " - "module does not support an option for this operation. " - "A reboot may be required to enable this option.")); - zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); - break; - case ZFS_ERR_IOC_ARG_REQUIRED: - case ZFS_ERR_IOC_ARG_BADTYPE: - zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); - break; - default: - zfs_error_aux(hdl, strerror(error)); - zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); - break; - } - - va_end(ap); - return (-1); -} - -int -zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) -{ - return (zpool_standard_error_fmt(hdl, error, "%s", msg)); -} - -/*PRINTFLIKE3*/ -int -zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - - if (zfs_common_error(hdl, error, fmt, ap) != 0) { - va_end(ap); - return (-1); - } - - switch (error) { - case ENODEV: - zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); - break; - - case ENOENT: - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "no such pool or dataset")); - zfs_verror(hdl, EZFS_NOENT, fmt, ap); - break; - - case EEXIST: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool already exists")); - zfs_verror(hdl, EZFS_EXISTS, fmt, ap); - break; - - case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); - zfs_verror(hdl, EZFS_BUSY, fmt, ap); - break; - - case ENXIO: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more devices is currently unavailable")); - zfs_verror(hdl, EZFS_BADDEV, fmt, ap); - break; - - case ENAMETOOLONG: - zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); - break; - - case ENOTSUP: - zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); - break; - - case EINVAL: - zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); - break; - - case ENOSPC: - case EDQUOT: - zfs_verror(hdl, EZFS_NOSPC, fmt, ap); - va_end(ap); - return (-1); - - case EAGAIN: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool I/O is currently suspended")); - zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); - break; - - case EROFS: - zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); - break; - /* There is no pending operation to cancel */ - case ESRCH: - zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); - break; - case EREMOTEIO: - zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); - break; - case ZFS_ERR_CHECKPOINT_EXISTS: - zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap); - break; - case ZFS_ERR_DISCARDING_CHECKPOINT: - zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap); - break; - case ZFS_ERR_NO_CHECKPOINT: - zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap); - break; - case ZFS_ERR_DEVRM_IN_PROGRESS: - zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap); - break; - case ZFS_ERR_VDEV_TOO_BIG: - zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); - break; - case 
ZFS_ERR_WRONG_PARENT: - zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap); - break; - case ZFS_ERR_IOC_CMD_UNAVAIL: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " - "module does not support this operation. A reboot may " - "be required to enable this operation.")); - zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); - break; - case ZFS_ERR_IOC_ARG_UNAVAIL: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " - "module does not support an option for this operation. " - "A reboot may be required to enable this option.")); - zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); - break; - case ZFS_ERR_IOC_ARG_REQUIRED: - case ZFS_ERR_IOC_ARG_BADTYPE: - zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); - break; - default: - zfs_error_aux(hdl, strerror(error)); - zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); - } - - va_end(ap); - return (-1); -} - -/* - * Display an out of memory error message and abort the current program. - */ -int -no_memory(libzfs_handle_t *hdl) -{ - return (zfs_error(hdl, EZFS_NOMEM, "internal error")); -} - -/* - * A safe form of malloc() which will die if the allocation fails. - */ -void * -zfs_alloc(libzfs_handle_t *hdl, size_t size) -{ - void *data; - - if ((data = calloc(1, size)) == NULL) - (void) no_memory(hdl); - - return (data); -} - -/* - * A safe form of asprintf() which will die if the allocation fails. - */ -/*PRINTFLIKE2*/ -char * -zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) -{ - va_list ap; - char *ret; - int err; - - va_start(ap, fmt); - - err = vasprintf(&ret, fmt, ap); - - va_end(ap); - - if (err < 0) - (void) no_memory(hdl); - - return (ret); -} - -/* - * A safe form of realloc(), which also zeroes newly allocated space. 
- */ -void * -zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) -{ - void *ret; - - if ((ret = realloc(ptr, newsize)) == NULL) { - (void) no_memory(hdl); - return (NULL); - } - - bzero((char *)ret + oldsize, (newsize - oldsize)); - return (ret); -} - -/* - * A safe form of strdup() which will die if the allocation fails. - */ -char * -zfs_strdup(libzfs_handle_t *hdl, const char *str) -{ - char *ret; - - if ((ret = strdup(str)) == NULL) - (void) no_memory(hdl); - - return (ret); -} - -/* - * Convert a number to an appropriately human-readable output. - */ -void -zfs_nicenum(uint64_t num, char *buf, size_t buflen) -{ - nicenum(num, buf, buflen); -} - -void -libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) -{ - hdl->libzfs_printerr = printerr; -} - -static int -libzfs_load(void) -{ - int error; - - if (modfind("zfs") < 0) { - /* Not present in kernel, try loading it. */ - if (kldload("zfs") < 0 || modfind("zfs") < 0) { - if (errno != EEXIST) - return (-1); - } - } - return (0); -} - -libzfs_handle_t * -libzfs_init(void) -{ - libzfs_handle_t *hdl; - - if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { - return (NULL); - } - - if (libzfs_load() < 0) { - free(hdl); - return (NULL); - } - - if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { - free(hdl); - return (NULL); - } - - if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) { - (void) close(hdl->libzfs_fd); - free(hdl); - return (NULL); - } - - hdl->libzfs_sharetab = fopen(ZFS_EXPORTS_PATH, "r"); - - if (libzfs_core_init() != 0) { - (void) close(hdl->libzfs_fd); - (void) fclose(hdl->libzfs_mnttab); - (void) fclose(hdl->libzfs_sharetab); - free(hdl); - return (NULL); - } - - zfs_prop_init(); - zpool_prop_init(); - zpool_feature_init(); - libzfs_mnttab_init(hdl); - - if (getenv("ZFS_PROP_DEBUG") != NULL) { - hdl->libzfs_prop_debug = B_TRUE; - } - - return (hdl); -} - -void -libzfs_fini(libzfs_handle_t *hdl) -{ - (void) close(hdl->libzfs_fd); - if 
(hdl->libzfs_mnttab) - (void) fclose(hdl->libzfs_mnttab); - if (hdl->libzfs_sharetab) - (void) fclose(hdl->libzfs_sharetab); - zfs_uninit_libshare(hdl); - zpool_free_handles(hdl); -#ifdef illumos - libzfs_fru_clear(hdl, B_TRUE); -#endif - namespace_clear(hdl); - libzfs_mnttab_fini(hdl); - libzfs_core_fini(); - free(hdl); -} - -libzfs_handle_t * -zpool_get_handle(zpool_handle_t *zhp) -{ - return (zhp->zpool_hdl); -} - -libzfs_handle_t * -zfs_get_handle(zfs_handle_t *zhp) -{ - return (zhp->zfs_hdl); -} - -zpool_handle_t * -zfs_get_pool_handle(const zfs_handle_t *zhp) -{ - return (zhp->zpool_hdl); -} - -/* - * Given a name, determine whether or not it's a valid path - * (starts with '/' or "./"). If so, walk the mnttab trying - * to match the device number. If not, treat the path as an - * fs/vol/snap/bkmark name. - */ -zfs_handle_t * -zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) -{ - struct stat64 statbuf; - struct extmnttab entry; - int ret; - - if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { - /* - * It's not a valid path, assume it's a name of type 'argtype'. 
- */ - return (zfs_open(hdl, path, argtype)); - } - - if (stat64(path, &statbuf) != 0) { - (void) fprintf(stderr, "%s: %s\n", path, strerror(errno)); - return (NULL); - } - -#ifdef illumos - rewind(hdl->libzfs_mnttab); - while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) { - if (makedevice(entry.mnt_major, entry.mnt_minor) == - statbuf.st_dev) { - break; - } - } -#else - { - struct statfs sfs; - - ret = statfs(path, &sfs); - if (ret == 0) - statfs2mnttab(&sfs, &entry); - else { - (void) fprintf(stderr, "%s: %s\n", path, - strerror(errno)); - } - } -#endif /* illumos */ - if (ret != 0) { - return (NULL); - } - - if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { - (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), - path); - return (NULL); - } - - return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); -} - -/* - * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from - * an ioctl(). - */ -int -zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) -{ - if (len == 0) - len = 16 * 1024; - zc->zc_nvlist_dst_size = len; - zc->zc_nvlist_dst = - (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); - if (zc->zc_nvlist_dst == 0) - return (-1); - - return (0); -} - -/* - * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will - * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was - * filled in by the kernel to indicate the actual required size. - */ -int -zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) -{ - free((void *)(uintptr_t)zc->zc_nvlist_dst); - zc->zc_nvlist_dst = - (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); - if (zc->zc_nvlist_dst == 0) - return (-1); - - return (0); -} - -/* - * Called to free the src and dst nvlists stored in the command structure. 
- */ -void -zcmd_free_nvlists(zfs_cmd_t *zc) -{ - free((void *)(uintptr_t)zc->zc_nvlist_conf); - free((void *)(uintptr_t)zc->zc_nvlist_src); - free((void *)(uintptr_t)zc->zc_nvlist_dst); - zc->zc_nvlist_conf = NULL; - zc->zc_nvlist_src = NULL; - zc->zc_nvlist_dst = NULL; -} - -static int -zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, - nvlist_t *nvl) -{ - char *packed; - size_t len; - - verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0); - - if ((packed = zfs_alloc(hdl, len)) == NULL) - return (-1); - - verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); - - *outnv = (uint64_t)(uintptr_t)packed; - *outlen = len; - - return (0); -} - -int -zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) -{ - return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, - &zc->zc_nvlist_conf_size, nvl)); -} - -int -zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) -{ - return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, - &zc->zc_nvlist_src_size, nvl)); -} - -/* - * Unpacks an nvlist from the ZFS ioctl command structure. - */ -int -zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) -{ - if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, nvlp, 0) != 0) - return (no_memory(hdl)); - - return (0); -} - -int -zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) -{ - return (ioctl(hdl->libzfs_fd, request, zc)); -} - -/* - * ================================================================ - * API shared by zfs and zpool property management - * ================================================================ - */ - -static void -zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) -{ - zprop_list_t *pl = cbp->cb_proplist; - int i; - char *title; - size_t len; - - cbp->cb_first = B_FALSE; - if (cbp->cb_scripted) - return; - - /* - * Start with the length of the column headers. 
- */ - cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); - cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, - "PROPERTY")); - cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, - "VALUE")); - cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, - "RECEIVED")); - cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, - "SOURCE")); - - /* first property is always NAME */ - assert(cbp->cb_proplist->pl_prop == - ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME)); - - /* - * Go through and calculate the widths for each column. For the - * 'source' column, we kludge it up by taking the worst-case scenario of - * inheriting from the longest name. This is acceptable because in the - * majority of cases 'SOURCE' is the last column displayed, and we don't - * use the width anyway. Note that the 'VALUE' column can be oversized, - * if the name of the property is much longer than any values we find. - */ - for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { - /* - * 'PROPERTY' column - */ - if (pl->pl_prop != ZPROP_INVAL) { - const char *propname = (type == ZFS_TYPE_POOL) ? - zpool_prop_to_name(pl->pl_prop) : - zfs_prop_to_name(pl->pl_prop); - - len = strlen(propname); - if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) - cbp->cb_colwidths[GET_COL_PROPERTY] = len; - } else { - len = strlen(pl->pl_user_prop); - if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) - cbp->cb_colwidths[GET_COL_PROPERTY] = len; - } - - /* - * 'VALUE' column. The first property is always the 'name' - * property that was tacked on either by /sbin/zfs's - * zfs_do_get() or when calling zprop_expand_list(), so we - * ignore its width. If the user specified the name property - * to display, then it will be later in the list in any case. - */ - if (pl != cbp->cb_proplist && - pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) - cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; - - /* 'RECEIVED' column. 
*/ - if (pl != cbp->cb_proplist && - pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) - cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; - - /* - * 'NAME' and 'SOURCE' columns - */ - if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME : - ZFS_PROP_NAME) && - pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { - cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; - cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + - strlen(dgettext(TEXT_DOMAIN, "inherited from")); - } - } - - /* - * Now go through and print the headers. - */ - for (i = 0; i < ZFS_GET_NCOLS; i++) { - switch (cbp->cb_columns[i]) { - case GET_COL_NAME: - title = dgettext(TEXT_DOMAIN, "NAME"); - break; - case GET_COL_PROPERTY: - title = dgettext(TEXT_DOMAIN, "PROPERTY"); - break; - case GET_COL_VALUE: - title = dgettext(TEXT_DOMAIN, "VALUE"); - break; - case GET_COL_RECVD: - title = dgettext(TEXT_DOMAIN, "RECEIVED"); - break; - case GET_COL_SOURCE: - title = dgettext(TEXT_DOMAIN, "SOURCE"); - break; - default: - title = NULL; - } - - if (title != NULL) { - if (i == (ZFS_GET_NCOLS - 1) || - cbp->cb_columns[i + 1] == GET_COL_NONE) - (void) printf("%s", title); - else - (void) printf("%-*s ", - cbp->cb_colwidths[cbp->cb_columns[i]], - title); - } - } - (void) printf("\n"); -} - -/* - * Display a single line of output, according to the settings in the callback - * structure. - */ -void -zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, - const char *propname, const char *value, zprop_source_t sourcetype, - const char *source, const char *recvd_value) -{ - int i; - const char *str = NULL; - char buf[128]; - - /* - * Ignore those source types that the user has chosen to ignore. 
- */ - if ((sourcetype & cbp->cb_sources) == 0) - return; - - if (cbp->cb_first) - zprop_print_headers(cbp, cbp->cb_type); - - for (i = 0; i < ZFS_GET_NCOLS; i++) { - switch (cbp->cb_columns[i]) { - case GET_COL_NAME: - str = name; - break; - - case GET_COL_PROPERTY: - str = propname; - break; - - case GET_COL_VALUE: - str = value; - break; - - case GET_COL_SOURCE: - switch (sourcetype) { - case ZPROP_SRC_NONE: - str = "-"; - break; - - case ZPROP_SRC_DEFAULT: - str = "default"; - break; - - case ZPROP_SRC_LOCAL: - str = "local"; - break; - - case ZPROP_SRC_TEMPORARY: - str = "temporary"; - break; - - case ZPROP_SRC_INHERITED: - (void) snprintf(buf, sizeof (buf), - "inherited from %s", source); - str = buf; - break; - case ZPROP_SRC_RECEIVED: - str = "received"; - break; - - default: - str = NULL; - assert(!"unhandled zprop_source_t"); - } - break; - - case GET_COL_RECVD: - str = (recvd_value == NULL ? "-" : recvd_value); - break; - - default: - continue; - } - - if (cbp->cb_columns[i + 1] == GET_COL_NONE) - (void) printf("%s", str); - else if (cbp->cb_scripted) - (void) printf("%s\t", str); - else - (void) printf("%-*s ", - cbp->cb_colwidths[cbp->cb_columns[i]], - str); - } - - (void) printf("\n"); -} - -/* - * Given a numeric suffix, convert the value into a number of bits that the - * resulting value must be shifted. - */ -static int -str2shift(libzfs_handle_t *hdl, const char *buf) -{ - const char *ends = "BKMGTPEZ"; - int i; - - if (buf[0] == '\0') - return (0); - for (i = 0; i < strlen(ends); i++) { - if (toupper(buf[0]) == ends[i]) - break; - } - if (i == strlen(ends)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid numeric suffix '%s'"), buf); - return (-1); - } - - /* - * We want to allow trailing 'b' characters for 'GB' or 'Mb'. But don't - * allow 'BB' - that's just weird. 
- */ - if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0' && - toupper(buf[0]) != 'B')) - return (10*i); - - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid numeric suffix '%s'"), buf); - return (-1); -} - -/* - * Convert a string of the form '100G' into a real number. Used when setting - * properties or creating a volume. 'buf' is used to place an extended error - * message for the caller to use. - */ -int -zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) -{ - char *end; - int shift; - - *num = 0; - - /* Check to see if this looks like a number. */ - if ((value[0] < '0' || value[0] > '9') && value[0] != '.') { - if (hdl) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "bad numeric value '%s'"), value); - return (-1); - } - - /* Rely on strtoull() to process the numeric portion. */ - errno = 0; - *num = strtoull(value, &end, 10); - - /* - * Check for ERANGE, which indicates that the value is too large to fit - * in a 64-bit value. - */ - if (errno == ERANGE) { - if (hdl) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "numeric value is too large")); - return (-1); - } - - /* - * If we have a decimal value, then do the computation with floating - * point arithmetic. Otherwise, use standard arithmetic. 
- */ - if (*end == '.') { - double fval = strtod(value, &end); - - if ((shift = str2shift(hdl, end)) == -1) - return (-1); - - fval *= pow(2, shift); - - if (fval > UINT64_MAX) { - if (hdl) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "numeric value is too large")); - return (-1); - } - - *num = (uint64_t)fval; - } else { - if ((shift = str2shift(hdl, end)) == -1) - return (-1); - - /* Check for overflow */ - if (shift >= 64 || (*num << shift) >> shift != *num) { - if (hdl) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "numeric value is too large")); - return (-1); - } - - *num <<= shift; - } - - return (0); -} - -/* - * Given a propname=value nvpair to set, parse any numeric properties - * (index, boolean, etc) if they are specified as strings and add the - * resulting nvpair to the returned nvlist. - * - * At the DSL layer, all properties are either 64-bit numbers or strings. - * We want the user to be able to ignore this fact and specify properties - * as native values (numbers, for example) or as strings (to simplify - * command line utilities). This also handles converting index types - * (compression, checksum, etc) from strings to their on-disk index. - */ -int -zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, - zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp, - const char *errbuf) -{ - data_type_t datatype = nvpair_type(elem); - zprop_type_t proptype; - const char *propname; - char *value; - boolean_t isnone = B_FALSE; - boolean_t isauto = B_FALSE; - - if (type == ZFS_TYPE_POOL) { - proptype = zpool_prop_get_type(prop); - propname = zpool_prop_to_name(prop); - } else { - proptype = zfs_prop_get_type(prop); - propname = zfs_prop_to_name(prop); - } - - /* - * Convert any properties to the internal DSL value types. 
- */ - *svalp = NULL; - *ivalp = 0; - - switch (proptype) { - case PROP_TYPE_STRING: - if (datatype != DATA_TYPE_STRING) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be a string"), nvpair_name(elem)); - goto error; - } - (void) nvpair_value_string(elem, svalp); - if (strlen(*svalp) >= ZFS_MAXPROPLEN) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' is too long"), nvpair_name(elem)); - goto error; - } - break; - - case PROP_TYPE_NUMBER: - if (datatype == DATA_TYPE_STRING) { - (void) nvpair_value_string(elem, &value); - if (strcmp(value, "none") == 0) { - isnone = B_TRUE; - } else if (strcmp(value, "auto") == 0) { - isauto = B_TRUE; - } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) { - goto error; - } - } else if (datatype == DATA_TYPE_UINT64) { - (void) nvpair_value_uint64(elem, ivalp); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be a number"), nvpair_name(elem)); - goto error; - } - - /* - * Quota special: force 'none' and don't allow 0. - */ - if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && - (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "use 'none' to disable quota/refquota")); - goto error; - } - - /* - * Special handling for "*_limit=none". In this case it's not - * 0 but UINT64_MAX. - */ - if ((type & ZFS_TYPE_DATASET) && isnone && - (prop == ZFS_PROP_FILESYSTEM_LIMIT || - prop == ZFS_PROP_SNAPSHOT_LIMIT)) { - *ivalp = UINT64_MAX; - } - - /* - * Special handling for setting 'refreservation' to 'auto'. Use - * UINT64_MAX to tell the caller to use zfs_fix_auto_resv(). - * 'auto' is only allowed on volumes. 
- */ - if (isauto) { - switch (prop) { - case ZFS_PROP_REFRESERVATION: - if ((type & ZFS_TYPE_VOLUME) == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s=auto' only allowed on " - "volumes"), nvpair_name(elem)); - goto error; - } - *ivalp = UINT64_MAX; - break; - default: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'auto' is invalid value for '%s'"), - nvpair_name(elem)); - goto error; - } - } - - break; - - case PROP_TYPE_INDEX: - if (datatype != DATA_TYPE_STRING) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be a string"), nvpair_name(elem)); - goto error; - } - - (void) nvpair_value_string(elem, &value); - - if (zprop_string_to_index(prop, value, ivalp, type) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be one of '%s'"), propname, - zprop_values(prop, type)); - goto error; - } - break; - - default: - abort(); - } - - /* - * Add the result to our return set of properties. - */ - if (*svalp != NULL) { - if (nvlist_add_string(ret, propname, *svalp) != 0) { - (void) no_memory(hdl); - return (-1); - } - } else { - if (nvlist_add_uint64(ret, propname, *ivalp) != 0) { - (void) no_memory(hdl); - return (-1); - } - } - - return (0); -error: - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - return (-1); -} - -static int -addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, - zfs_type_t type) -{ - int prop; - zprop_list_t *entry; - - prop = zprop_name_to_prop(propname, type); - - if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type)) - prop = ZPROP_INVAL; - - /* - * When no property table entry can be found, return failure if - * this is a pool property or if this isn't a user-defined - * dataset property, - */ - if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL && - !zpool_prop_feature(propname) && - !zpool_prop_unsupported(propname)) || - (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) && - !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - 
"invalid property '%s'"), propname); - return (zfs_error(hdl, EZFS_BADPROP, - dgettext(TEXT_DOMAIN, "bad property list"))); - } - - if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) - return (-1); - - entry->pl_prop = prop; - if (prop == ZPROP_INVAL) { - if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == - NULL) { - free(entry); - return (-1); - } - entry->pl_width = strlen(propname); - } else { - entry->pl_width = zprop_width(prop, &entry->pl_fixed, - type); - } - - *listp = entry; - - return (0); -} - -/* - * Given a comma-separated list of properties, construct a property list - * containing both user-defined and native properties. This function will - * return a NULL list if 'all' is specified, which can later be expanded - * by zprop_expand_list(). - */ -int -zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, - zfs_type_t type) -{ - *listp = NULL; - - /* - * If 'all' is specified, return a NULL list. - */ - if (strcmp(props, "all") == 0) - return (0); - - /* - * If no props were specified, return an error. - */ - if (props[0] == '\0') { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "no properties specified")); - return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, - "bad property list"))); - } - - /* - * It would be nice to use getsubopt() here, but the inclusion of column - * aliases makes this more effort than it's worth. - */ - while (*props != '\0') { - size_t len; - char *p; - char c; - - if ((p = strchr(props, ',')) == NULL) { - len = strlen(props); - p = props + len; - } else { - len = p - props; - } - - /* - * Check for empty options. - */ - if (len == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "empty property name")); - return (zfs_error(hdl, EZFS_BADPROP, - dgettext(TEXT_DOMAIN, "bad property list"))); - } - - /* - * Check all regular property names. 
- */ - c = props[len]; - props[len] = '\0'; - - if (strcmp(props, "space") == 0) { - static char *spaceprops[] = { - "name", "avail", "used", "usedbysnapshots", - "usedbydataset", "usedbyrefreservation", - "usedbychildren", NULL - }; - int i; - - for (i = 0; spaceprops[i]; i++) { - if (addlist(hdl, spaceprops[i], listp, type)) - return (-1); - listp = &(*listp)->pl_next; - } - } else { - if (addlist(hdl, props, listp, type)) - return (-1); - listp = &(*listp)->pl_next; - } - - props = p; - if (c == ',') - props++; - } - - return (0); -} - -void -zprop_free_list(zprop_list_t *pl) -{ - zprop_list_t *next; - - while (pl != NULL) { - next = pl->pl_next; - free(pl->pl_user_prop); - free(pl); - pl = next; - } -} - -typedef struct expand_data { - zprop_list_t **last; - libzfs_handle_t *hdl; - zfs_type_t type; -} expand_data_t; - -int -zprop_expand_list_cb(int prop, void *cb) -{ - zprop_list_t *entry; - expand_data_t *edp = cb; - - if ((entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t))) == NULL) - return (ZPROP_INVAL); - - entry->pl_prop = prop; - entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type); - entry->pl_all = B_TRUE; - - *(edp->last) = entry; - edp->last = &entry->pl_next; - - return (ZPROP_CONT); -} - -int -zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type) -{ - zprop_list_t *entry; - zprop_list_t **last; - expand_data_t exp; - - if (*plp == NULL) { - /* - * If this is the very first time we've been called for an 'all' - * specification, expand the list to include all native - * properties. - */ - last = plp; - - exp.last = last; - exp.hdl = hdl; - exp.type = type; - - if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE, - B_FALSE, type) == ZPROP_INVAL) - return (-1); - - /* - * Add 'name' to the beginning of the list, which is handled - * specially. - */ - if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) - return (-1); - - entry->pl_prop = (type == ZFS_TYPE_POOL) ? 
ZPOOL_PROP_NAME : - ZFS_PROP_NAME; - entry->pl_width = zprop_width(entry->pl_prop, - &entry->pl_fixed, type); - entry->pl_all = B_TRUE; - entry->pl_next = *plp; - *plp = entry; - } - return (0); -} - -int -zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, - zfs_type_t type) -{ - return (zprop_iter_common(func, cb, show_all, ordered, type)); -} - -ulong_t -get_system_hostid(void) -{ - char *env; - - /* - * Allow the hostid to be subverted for testing. - */ - env = getenv("ZFS_HOSTID"); - if (env) { - ulong_t hostid = strtoull(env, NULL, 16); - return (hostid & 0xFFFFFFFF); - } - - return (gethostid()); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c deleted file mode 100644 index 2a6b5cc5927c..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ /dev/null @@ -1,1234 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 RackTop Systems. 
- * Copyright (c) 2017 Datto Inc. - */ - -/* - * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. - * It has the following characteristics: - * - * - Thread Safe. libzfs_core is accessible concurrently from multiple - * threads. This is accomplished primarily by avoiding global data - * (e.g. caching). Since it's thread-safe, there is no reason for a - * process to have multiple libzfs "instances". Therefore, we store - * our few pieces of data (e.g. the file descriptor) in global - * variables. The fd is reference-counted so that the libzfs_core - * library can be "initialized" multiple times (e.g. by different - * consumers within the same process). - * - * - Committed Interface. The libzfs_core interface will be committed, - * therefore consumers can compile against it and be confident that - * their code will continue to work on future releases of this code. - * Currently, the interface is Evolving (not Committed), but we intend - * to commit to it once it is more complete and we determine that it - * meets the needs of all consumers. - * - * - Programatic Error Handling. libzfs_core communicates errors with - * defined error numbers, and doesn't print anything to stdout/stderr. - * - * - Thin Layer. libzfs_core is a thin layer, marshaling arguments - * to/from the kernel ioctls. There is generally a 1:1 correspondence - * between libzfs_core functions and ioctls to /dev/zfs. - * - * - Clear Atomicity. Because libzfs_core functions are generally 1:1 - * with kernel ioctls, and kernel ioctls are general atomic, each - * libzfs_core function is atomic. For example, creating multiple - * snapshots with a single call to lzc_snapshot() is atomic -- it - * can't fail with only some of the requested snapshots created, even - * in the event of power loss or system crash. - * - * - Continued libzfs Support. Some higher-level operations (e.g. - * support for "zfs send -R") are too complicated to fit the scope of - * libzfs_core. 
This functionality will continue to live in libzfs. - * Where appropriate, libzfs will use the underlying atomic operations - * of libzfs_core. For example, libzfs may implement "zfs send -R | - * zfs receive" by using individual "send one snapshot", rename, - * destroy, and "receive one snapshot" operations in libzfs_core. - * /sbin/zfs and /zbin/zpool will link with both libzfs and - * libzfs_core. Other consumers should aim to use only libzfs_core, - * since that will be the supported, stable interface going forwards. - */ - -#define _IN_LIBZFS_CORE_ - -#include -#include -#include -#include -#include -#ifdef ZFS_DEBUG -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include "libzfs_core_compat.h" -#include "libzfs_compat.h" - -#ifdef __FreeBSD__ -extern int zfs_ioctl_version; -#endif - -static int g_fd = -1; -static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; -static int g_refcount; - -#ifdef ZFS_DEBUG -static zfs_ioc_t fail_ioc_cmd; -static zfs_errno_t fail_ioc_err; - -static void -libzfs_core_debug_ioc(void) -{ - /* - * To test running newer user space binaries with kernel's - * that don't yet support an ioctl or a new ioctl arg we - * provide an override to intentionally fail an ioctl. - * - * USAGE: - * The override variable, ZFS_IOC_TEST, is of the form "cmd:err" - * - * For example, to fail a ZFS_IOC_POOL_CHECKPOINT with a - * ZFS_ERR_IOC_CMD_UNAVAIL, the string would be "0x5a4d:1029" - * - * $ sudo sh -c "ZFS_IOC_TEST=0x5a4d:1029 zpool checkpoint tank" - * cannot checkpoint 'tank': the loaded zfs module does not support - * this operation. A reboot may be required to enable this operation. 
- */ - if (fail_ioc_cmd == 0) { - char *ioc_test = getenv("ZFS_IOC_TEST"); - unsigned int ioc_num = 0, ioc_err = 0; - - if (ioc_test != NULL && - sscanf(ioc_test, "%i:%i", &ioc_num, &ioc_err) == 2 && - ioc_num < ZFS_IOC_LAST) { - fail_ioc_cmd = ioc_num; - fail_ioc_err = ioc_err; - } - } -} -#endif - -int -libzfs_core_init(void) -{ - (void) pthread_mutex_lock(&g_lock); - if (g_refcount == 0) { - g_fd = open("/dev/zfs", O_RDWR); - if (g_fd < 0) { - (void) pthread_mutex_unlock(&g_lock); - return (errno); - } - } - g_refcount++; - -#ifdef ZFS_DEBUG - libzfs_core_debug_ioc(); -#endif - (void) pthread_mutex_unlock(&g_lock); - - return (0); -} - -void -libzfs_core_fini(void) -{ - (void) pthread_mutex_lock(&g_lock); - ASSERT3S(g_refcount, >, 0); - - if (g_refcount > 0) - g_refcount--; - - if (g_refcount == 0 && g_fd != -1) { - (void) close(g_fd); - g_fd = -1; - } - (void) pthread_mutex_unlock(&g_lock); -} - -static int -lzc_ioctl(zfs_ioc_t ioc, const char *name, - nvlist_t *source, nvlist_t **resultp) -{ - zfs_cmd_t zc = { 0 }; - int error = 0; - char *packed = NULL; -#ifdef __FreeBSD__ - nvlist_t *oldsource; -#endif - size_t size = 0; - - ASSERT3S(g_refcount, >, 0); - VERIFY3S(g_fd, !=, -1); - -#ifdef ZFS_DEBUG - if (ioc == fail_ioc_cmd) - return (fail_ioc_err); -#endif - - if (name != NULL) - (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); - -#ifdef __FreeBSD__ - if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) - zfs_ioctl_version = get_zfs_ioctl_version(); - - if (zfs_ioctl_version < ZFS_IOCVER_LZC) { - oldsource = source; - error = lzc_compat_pre(&zc, &ioc, &source); - if (error) - return (error); - } -#endif - - if (source != NULL) { - packed = fnvlist_pack(source, &size); - zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; - zc.zc_nvlist_src_size = size; - } - - if (resultp != NULL) { - *resultp = NULL; - if (ioc == ZFS_IOC_CHANNEL_PROGRAM) { - zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source, - ZCP_ARG_MEMLIMIT); - } else { - zc.zc_nvlist_dst_size = MAX(size * 2, 
128 * 1024); - } - zc.zc_nvlist_dst = (uint64_t)(uintptr_t) - malloc(zc.zc_nvlist_dst_size); -#ifdef illumos - if (zc.zc_nvlist_dst == NULL) { -#else - if (zc.zc_nvlist_dst == 0) { -#endif - error = ENOMEM; - goto out; - } - } - - while (ioctl(g_fd, ioc, &zc) != 0) { - /* - * If ioctl exited with ENOMEM, we retry the ioctl after - * increasing the size of the destination nvlist. - * - * Channel programs that exit with ENOMEM ran over the - * lua memory sandbox; they should not be retried. - */ - if (errno == ENOMEM && resultp != NULL && - ioc != ZFS_IOC_CHANNEL_PROGRAM) { - free((void *)(uintptr_t)zc.zc_nvlist_dst); - zc.zc_nvlist_dst_size *= 2; - zc.zc_nvlist_dst = (uint64_t)(uintptr_t) - malloc(zc.zc_nvlist_dst_size); -#ifdef illumos - if (zc.zc_nvlist_dst == NULL) { -#else - if (zc.zc_nvlist_dst == 0) { -#endif - error = ENOMEM; - goto out; - } - } else { - error = errno; - break; - } - } - -#ifdef __FreeBSD__ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) - lzc_compat_post(&zc, ioc); -#endif - if (zc.zc_nvlist_dst_filled) { - *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, - zc.zc_nvlist_dst_size); - } -#ifdef __FreeBSD__ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) - lzc_compat_outnvl(&zc, ioc, resultp); -#endif -out: -#ifdef __FreeBSD__ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) { - if (source != oldsource) - nvlist_free(source); - source = oldsource; - } -#endif - fnvlist_pack_free(packed, size); - free((void *)(uintptr_t)zc.zc_nvlist_dst); - return (error); -} - -int -lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props) -{ - int error; - nvlist_t *args = fnvlist_alloc(); - fnvlist_add_int32(args, "type", (dmu_objset_type_t)type); - if (props != NULL) - fnvlist_add_nvlist(args, "props", props); - error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); - nvlist_free(args); - return (error); -} - -int -lzc_clone(const char *fsname, const char *origin, - nvlist_t *props) -{ - int error; - nvlist_t *args = fnvlist_alloc(); - 
fnvlist_add_string(args, "origin", origin); - if (props != NULL) - fnvlist_add_nvlist(args, "props", props); - error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL); - nvlist_free(args); - return (error); -} - -int -lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) -{ - /* - * The promote ioctl is still legacy, so we need to construct our - * own zfs_cmd_t rather than using lzc_ioctl(). - */ - zfs_cmd_t zc = { 0 }; - - ASSERT3S(g_refcount, >, 0); - VERIFY3S(g_fd, !=, -1); - - (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); - if (ioctl(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) { - int error = errno; - if (error == EEXIST && snapnamebuf != NULL) - (void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen); - return (error); - } - return (0); -} - -int -lzc_remap(const char *fsname) -{ - int error; - nvlist_t *args = fnvlist_alloc(); - error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL); - nvlist_free(args); - return (error); -} - -int -lzc_rename(const char *source, const char *target) -{ - zfs_cmd_t zc = { 0 }; - int error; - - ASSERT3S(g_refcount, >, 0); - VERIFY3S(g_fd, !=, -1); - - (void) strlcpy(zc.zc_name, source, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - error = ioctl(g_fd, ZFS_IOC_RENAME, &zc); - if (error != 0) - error = errno; - return (error); -} - -int -lzc_destroy(const char *fsname) -{ - int error; - - nvlist_t *args = fnvlist_alloc(); - error = lzc_ioctl(ZFS_IOC_DESTROY, fsname, args, NULL); - nvlist_free(args); - return (error); -} - -/* - * Creates snapshots. - * - * The keys in the snaps nvlist are the snapshots to be created. - * They must all be in the same pool. - * - * The props nvlist is properties to set. Currently only user properties - * are supported. { user:prop_name -> string value } - * - * The returned results nvlist will have an entry for each snapshot that failed. - * The value will be the (int32) error code. 
- * - * The return value will be 0 if all snapshots were created, otherwise it will - * be the errno of a (unspecified) snapshot that failed. - */ -int -lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) -{ - nvpair_t *elem; - nvlist_t *args; - int error; - char pool[ZFS_MAX_DATASET_NAME_LEN]; - - *errlist = NULL; - - /* determine the pool name */ - elem = nvlist_next_nvpair(snaps, NULL); - if (elem == NULL) - return (0); - (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); - pool[strcspn(pool, "/@")] = '\0'; - - args = fnvlist_alloc(); - fnvlist_add_nvlist(args, "snaps", snaps); - if (props != NULL) - fnvlist_add_nvlist(args, "props", props); - - error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); - nvlist_free(args); - - return (error); -} - -/* - * Destroys snapshots. - * - * The keys in the snaps nvlist are the snapshots to be destroyed. - * They must all be in the same pool. - * - * Snapshots that do not exist will be silently ignored. - * - * If 'defer' is not set, and a snapshot has user holds or clones, the - * destroy operation will fail and none of the snapshots will be - * destroyed. - * - * If 'defer' is set, and a snapshot has user holds or clones, it will be - * marked for deferred destruction, and will be destroyed when the last hold - * or clone is removed/destroyed. - * - * The return value will be 0 if all snapshots were destroyed (or marked for - * later destruction if 'defer' is set) or didn't exist to begin with. - * - * Otherwise the return value will be the errno of a (unspecified) snapshot - * that failed, no snapshots will be destroyed, and the errlist will have an - * entry for each snapshot that failed. The value in the errlist will be - * the (int32) error code. 
- */ -int -lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) -{ - nvpair_t *elem; - nvlist_t *args; - int error; - char pool[ZFS_MAX_DATASET_NAME_LEN]; - - /* determine the pool name */ - elem = nvlist_next_nvpair(snaps, NULL); - if (elem == NULL) - return (0); - (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); - pool[strcspn(pool, "/@")] = '\0'; - - args = fnvlist_alloc(); - fnvlist_add_nvlist(args, "snaps", snaps); - if (defer) - fnvlist_add_boolean(args, "defer"); - - error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); - nvlist_free(args); - - return (error); -} - -int -lzc_snaprange_space(const char *firstsnap, const char *lastsnap, - uint64_t *usedp) -{ - nvlist_t *args; - nvlist_t *result; - int err; - char fs[ZFS_MAX_DATASET_NAME_LEN]; - char *atp; - - /* determine the fs name */ - (void) strlcpy(fs, firstsnap, sizeof (fs)); - atp = strchr(fs, '@'); - if (atp == NULL) - return (EINVAL); - *atp = '\0'; - - args = fnvlist_alloc(); - fnvlist_add_string(args, "firstsnap", firstsnap); - - err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); - nvlist_free(args); - if (err == 0) - *usedp = fnvlist_lookup_uint64(result, "used"); - fnvlist_free(result); - - return (err); -} - -boolean_t -lzc_exists(const char *dataset) -{ - /* - * The objset_stats ioctl is still legacy, so we need to construct our - * own zfs_cmd_t rather than using lzc_ioctl(). - */ - zfs_cmd_t zc = { 0 }; - - ASSERT3S(g_refcount, >, 0); - VERIFY3S(g_fd, !=, -1); - - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); -} - -/* - * outnvl is unused. - * It was added to preserve the function signature in case it is - * needed in the future. - */ -/*ARGSUSED*/ -int -lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) -{ - return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL)); -} - -/* - * Create "user holds" on snapshots. 
If there is a hold on a snapshot, - * the snapshot can not be destroyed. (However, it can be marked for deletion - * by lzc_destroy_snaps(defer=B_TRUE).) - * - * The keys in the nvlist are snapshot names. - * The snapshots must all be in the same pool. - * The value is the name of the hold (string type). - * - * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL). - * In this case, when the cleanup_fd is closed (including on process - * termination), the holds will be released. If the system is shut down - * uncleanly, the holds will be released when the pool is next opened - * or imported. - * - * Holds for snapshots which don't exist will be skipped and have an entry - * added to errlist, but will not cause an overall failure. - * - * The return value will be 0 if all holds, for snapshots that existed, - * were succesfully created. - * - * Otherwise the return value will be the errno of a (unspecified) hold that - * failed and no holds will be created. - * - * In all cases the errlist will have an entry for each hold that failed - * (name = snapshot), with its value being the error code (int32). - */ -int -lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) -{ - char pool[ZFS_MAX_DATASET_NAME_LEN]; - nvlist_t *args; - nvpair_t *elem; - int error; - - /* determine the pool name */ - elem = nvlist_next_nvpair(holds, NULL); - if (elem == NULL) - return (0); - (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); - pool[strcspn(pool, "/@")] = '\0'; - - args = fnvlist_alloc(); - fnvlist_add_nvlist(args, "holds", holds); - if (cleanup_fd != -1) - fnvlist_add_int32(args, "cleanup_fd", cleanup_fd); - - error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist); - nvlist_free(args); - return (error); -} - -/* - * Release "user holds" on snapshots. 
If the snapshot has been marked for - * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have - * any clones, and all the user holds are removed, then the snapshot will be - * destroyed. - * - * The keys in the nvlist are snapshot names. - * The snapshots must all be in the same pool. - * The value is a nvlist whose keys are the holds to remove. - * - * Holds which failed to release because they didn't exist will have an entry - * added to errlist, but will not cause an overall failure. - * - * The return value will be 0 if the nvl holds was empty or all holds that - * existed, were successfully removed. - * - * Otherwise the return value will be the errno of a (unspecified) hold that - * failed to release and no holds will be released. - * - * In all cases the errlist will have an entry for each hold that failed to - * to release. - */ -int -lzc_release(nvlist_t *holds, nvlist_t **errlist) -{ - char pool[ZFS_MAX_DATASET_NAME_LEN]; - nvpair_t *elem; - - /* determine the pool name */ - elem = nvlist_next_nvpair(holds, NULL); - if (elem == NULL) - return (0); - (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); - pool[strcspn(pool, "/@")] = '\0'; - - return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist)); -} - -/* - * Retrieve list of user holds on the specified snapshot. - * - * On success, *holdsp will be set to a nvlist which the caller must free. - * The keys are the names of the holds, and the value is the creation time - * of the hold (uint64) in seconds since the epoch. - */ -int -lzc_get_holds(const char *snapname, nvlist_t **holdsp) -{ - return (lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, NULL, holdsp)); -} - -/* - * Generate a zfs send stream for the specified snapshot and write it to - * the specified file descriptor. - * - * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap") - * - * If "from" is NULL, a full (non-incremental) stream will be sent. 
- * If "from" is non-NULL, it must be the full name of a snapshot or - * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or - * "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or - * bookmark must represent an earlier point in the history of "snapname"). - * It can be an earlier snapshot in the same filesystem or zvol as "snapname", - * or it can be the origin of "snapname"'s filesystem, or an earlier - * snapshot in the origin, etc. - * - * "fd" is the file descriptor to write the send stream to. - * - * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted - * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT - * records with drr_blksz > 128K. - * - * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted - * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA, - * which the receiving system must support (as indicated by support - * for the "embedded_data" feature). - */ -int -lzc_send(const char *snapname, const char *from, int fd, - enum lzc_send_flags flags) -{ - return (lzc_send_resume(snapname, from, fd, flags, 0, 0)); -} - -int -lzc_send_resume(const char *snapname, const char *from, int fd, - enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) -{ - nvlist_t *args; - int err; - - args = fnvlist_alloc(); - fnvlist_add_int32(args, "fd", fd); - if (from != NULL) - fnvlist_add_string(args, "fromsnap", from); - if (flags & LZC_SEND_FLAG_LARGE_BLOCK) - fnvlist_add_boolean(args, "largeblockok"); - if (flags & LZC_SEND_FLAG_EMBED_DATA) - fnvlist_add_boolean(args, "embedok"); - if (flags & LZC_SEND_FLAG_COMPRESS) - fnvlist_add_boolean(args, "compressok"); - if (resumeobj != 0 || resumeoff != 0) { - fnvlist_add_uint64(args, "resume_object", resumeobj); - fnvlist_add_uint64(args, "resume_offset", resumeoff); - } - err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); - nvlist_free(args); - return (err); -} - -/* - * "from" can be NULL, a 
snapshot, or a bookmark. - * - * If from is NULL, a full (non-incremental) stream will be estimated. This - * is calculated very efficiently. - * - * If from is a snapshot, lzc_send_space uses the deadlists attached to - * each snapshot to efficiently estimate the stream size. - * - * If from is a bookmark, the indirect blocks in the destination snapshot - * are traversed, looking for blocks with a birth time since the creation TXG of - * the snapshot this bookmark was created from. This will result in - * significantly more I/O and be less efficient than a send space estimation on - * an equivalent snapshot. - */ -int -lzc_send_space(const char *snapname, const char *from, - enum lzc_send_flags flags, uint64_t *spacep) -{ - nvlist_t *args; - nvlist_t *result; - int err; - - args = fnvlist_alloc(); - if (from != NULL) - fnvlist_add_string(args, "from", from); - if (flags & LZC_SEND_FLAG_LARGE_BLOCK) - fnvlist_add_boolean(args, "largeblockok"); - if (flags & LZC_SEND_FLAG_EMBED_DATA) - fnvlist_add_boolean(args, "embedok"); - if (flags & LZC_SEND_FLAG_COMPRESS) - fnvlist_add_boolean(args, "compressok"); - err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); - nvlist_free(args); - if (err == 0) - *spacep = fnvlist_lookup_uint64(result, "space"); - nvlist_free(result); - return (err); -} - -static int -recv_read(int fd, void *buf, int ilen) -{ - char *cp = buf; - int rv; - int len = ilen; - - do { - rv = read(fd, cp, len); - cp += rv; - len -= rv; - } while (rv > 0); - - if (rv < 0 || len != 0) - return (EIO); - - return (0); -} - -static int -recv_impl(const char *snapname, nvlist_t *props, const char *origin, - boolean_t force, boolean_t resumable, int fd, - const dmu_replay_record_t *begin_record) -{ - /* - * The receive ioctl is still legacy, so we need to construct our own - * zfs_cmd_t rather than using zfsc_ioctl(). 
- */ - zfs_cmd_t zc = { 0 }; - char *atp; - char *packed = NULL; - size_t size; - int error; - - ASSERT3S(g_refcount, >, 0); - VERIFY3S(g_fd, !=, -1); - - /* zc_name is name of containing filesystem */ - (void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name)); - atp = strchr(zc.zc_name, '@'); - if (atp == NULL) - return (EINVAL); - *atp = '\0'; - - /* if the fs does not exist, try its parent. */ - if (!lzc_exists(zc.zc_name)) { - char *slashp = strrchr(zc.zc_name, '/'); - if (slashp == NULL) - return (ENOENT); - *slashp = '\0'; - - } - - /* zc_value is full name of the snapshot to create */ - (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); - - if (props != NULL) { - /* zc_nvlist_src is props to set */ - packed = fnvlist_pack(props, &size); - zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; - zc.zc_nvlist_src_size = size; - } - - /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */ - if (origin != NULL) - (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); - - /* zc_begin_record is non-byteswapped BEGIN record */ - if (begin_record == NULL) { - error = recv_read(fd, &zc.zc_begin_record, - sizeof (zc.zc_begin_record)); - if (error != 0) - goto out; - } else { - zc.zc_begin_record = *begin_record; - } - - /* zc_cookie is fd to read from */ - zc.zc_cookie = fd; - - /* zc guid is force flag */ - zc.zc_guid = force; - - zc.zc_resumable = resumable; - - /* zc_cleanup_fd is unused */ - zc.zc_cleanup_fd = -1; - - error = ioctl(g_fd, ZFS_IOC_RECV, &zc); - if (error != 0) - error = errno; - -out: - if (packed != NULL) - fnvlist_pack_free(packed, size); - free((void*)(uintptr_t)zc.zc_nvlist_dst); - return (error); -} - -/* - * The simplest receive case: receive from the specified fd, creating the - * specified snapshot. Apply the specified properties as "received" properties - * (which can be overridden by locally-set properties). If the stream is a - * clone, its origin snapshot must be specified by 'origin'. 
The 'force' - * flag will cause the target filesystem to be rolled back or destroyed if - * necessary to receive. - * - * Return 0 on success or an errno on failure. - * - * Note: this interface does not work on dedup'd streams - * (those with DMU_BACKUP_FEATURE_DEDUP). - */ -int -lzc_receive(const char *snapname, nvlist_t *props, const char *origin, - boolean_t force, int fd) -{ - return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL)); -} - -/* - * Like lzc_receive, but if the receive fails due to premature stream - * termination, the intermediate state will be preserved on disk. In this - * case, ECKSUM will be returned. The receive may subsequently be resumed - * with a resuming send stream generated by lzc_send_resume(). - */ -int -lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, - boolean_t force, int fd) -{ - return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL)); -} - -/* - * Like lzc_receive, but allows the caller to read the begin record and then to - * pass it in. That could be useful if the caller wants to derive, for example, - * the snapname or the origin parameters based on the information contained in - * the begin record. - * The begin record must be in its original form as read from the stream, - * in other words, it should not be byteswapped. - * - * The 'resumable' parameter allows to obtain the same behavior as with - * lzc_receive_resumable. - */ -int -lzc_receive_with_header(const char *snapname, nvlist_t *props, - const char *origin, boolean_t force, boolean_t resumable, int fd, - const dmu_replay_record_t *begin_record) -{ - if (begin_record == NULL) - return (EINVAL); - return (recv_impl(snapname, props, origin, force, resumable, fd, - begin_record)); -} - -/* - * Roll back this filesystem or volume to its most recent snapshot. - * If snapnamebuf is not NULL, it will be filled in with the name - * of the most recent snapshot. 
- * Note that the latest snapshot may change if a new one is concurrently - * created or the current one is destroyed. lzc_rollback_to can be used - * to roll back to a specific latest snapshot. - * - * Return 0 on success or an errno on failure. - */ -int -lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen) -{ - nvlist_t *args; - nvlist_t *result; - int err; - - args = fnvlist_alloc(); - err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); - nvlist_free(args); - if (err == 0 && snapnamebuf != NULL) { - const char *snapname = fnvlist_lookup_string(result, "target"); - (void) strlcpy(snapnamebuf, snapname, snapnamelen); - } - nvlist_free(result); - - return (err); -} - -/* - * Roll back this filesystem or volume to the specified snapshot, - * if possible. - * - * Return 0 on success or an errno on failure. - */ -int -lzc_rollback_to(const char *fsname, const char *snapname) -{ - nvlist_t *args; - nvlist_t *result; - int err; - - args = fnvlist_alloc(); - fnvlist_add_string(args, "target", snapname); - err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); - nvlist_free(args); - nvlist_free(result); - return (err); -} - -/* - * Creates bookmarks. - * - * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to - * the name of the snapshot (e.g. "pool/fs@snap"). All the bookmarks and - * snapshots must be in the same pool. - * - * The returned results nvlist will have an entry for each bookmark that failed. - * The value will be the (int32) error code. - * - * The return value will be 0 if all bookmarks were created, otherwise it will - * be the errno of a (undetermined) bookmarks that failed. 
- */ -int -lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) -{ - nvpair_t *elem; - int error; - char pool[ZFS_MAX_DATASET_NAME_LEN]; - - /* determine the pool name */ - elem = nvlist_next_nvpair(bookmarks, NULL); - if (elem == NULL) - return (0); - (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); - pool[strcspn(pool, "/#")] = '\0'; - - error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist); - - return (error); -} - -/* - * Retrieve bookmarks. - * - * Retrieve the list of bookmarks for the given file system. The props - * parameter is an nvlist of property names (with no values) that will be - * returned for each bookmark. - * - * The following are valid properties on bookmarks, all of which are numbers - * (represented as uint64 in the nvlist) - * - * "guid" - globally unique identifier of the snapshot it refers to - * "createtxg" - txg when the snapshot it refers to was created - * "creation" - timestamp when the snapshot it refers to was created - * - * The format of the returned nvlist as follows: - * -> { - * -> { - * "value" -> uint64 - * } - * } - */ -int -lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks) -{ - return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks)); -} - -/* - * Destroys bookmarks. - * - * The keys in the bmarks nvlist are the bookmarks to be destroyed. - * They must all be in the same pool. Bookmarks are specified as - * #. - * - * Bookmarks that do not exist will be silently ignored. - * - * The return value will be 0 if all bookmarks that existed were destroyed. - * - * Otherwise the return value will be the errno of a (undetermined) bookmark - * that failed, no bookmarks will be destroyed, and the errlist will have an - * entry for each bookmarks that failed. The value in the errlist will be - * the (int32) error code. 
- */ -int -lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist) -{ - nvpair_t *elem; - int error; - char pool[ZFS_MAX_DATASET_NAME_LEN]; - - /* determine the pool name */ - elem = nvlist_next_nvpair(bmarks, NULL); - if (elem == NULL) - return (0); - (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); - pool[strcspn(pool, "/#")] = '\0'; - - error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist); - - return (error); -} - -static int -lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync, - uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) -{ - int error; - nvlist_t *args; - - args = fnvlist_alloc(); - fnvlist_add_string(args, ZCP_ARG_PROGRAM, program); - fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl); - fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync); - fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit); - fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit); - error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl); - fnvlist_free(args); - - return (error); -} - -/* - * Executes a channel program. - * - * If this function returns 0 the channel program was successfully loaded and - * ran without failing. Note that individual commands the channel program ran - * may have failed and the channel program is responsible for reporting such - * errors through outnvl if they are important. - * - * This method may also return: - * - * EINVAL The program contains syntax errors, or an invalid memory or time - * limit was given. No part of the channel program was executed. - * If caused by syntax errors, 'outnvl' contains information about the - * errors. - * - * EDOM The program was executed, but encountered a runtime error, such as - * calling a function with incorrect arguments, invoking the error() - * function directly, failing an assert() command, etc. Some portion - * of the channel program may have executed and committed changes. 
- * Information about the failure can be found in 'outnvl'. - * - * ENOMEM The program fully executed, but the output buffer was not large - * enough to store the returned value. No output is returned through - * 'outnvl'. - * - * ENOSPC The program was terminated because it exceeded its memory usage - * limit. Some portion of the channel program may have executed and - * committed changes to disk. No output is returned through 'outnvl'. - * - * ETIMEDOUT The program was terminated because it exceeded its Lua instruction - * limit. Some portion of the channel program may have executed and - * committed changes to disk. No output is returned through 'outnvl'. - */ -int -lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit, - uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) -{ - return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit, - memlimit, argnvl, outnvl)); -} - -/* - * Creates a checkpoint for the specified pool. - * - * If this function returns 0 the pool was successfully checkpointed. - * - * This method may also return: - * - * ZFS_ERR_CHECKPOINT_EXISTS - * The pool already has a checkpoint. A pools can only have one - * checkpoint at most, at any given time. - * - * ZFS_ERR_DISCARDING_CHECKPOINT - * ZFS is in the middle of discarding a checkpoint for this pool. - * The pool can be checkpointed again once the discard is done. - * - * ZFS_DEVRM_IN_PROGRESS - * A vdev is currently being removed. The pool cannot be - * checkpointed until the device removal is done. - * - * ZFS_VDEV_TOO_BIG - * One or more top-level vdevs exceed the maximum vdev size - * supported for this feature. - */ -int -lzc_pool_checkpoint(const char *pool) -{ - int error; - - nvlist_t *result = NULL; - nvlist_t *args = fnvlist_alloc(); - - error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result); - - fnvlist_free(args); - fnvlist_free(result); - - return (error); -} - -/* - * Discard the checkpoint from the specified pool. 
- * - * If this function returns 0 the checkpoint was successfully discarded. - * - * This method may also return: - * - * ZFS_ERR_NO_CHECKPOINT - * The pool does not have a checkpoint. - * - * ZFS_ERR_DISCARDING_CHECKPOINT - * ZFS is already in the middle of discarding the checkpoint. - */ -int -lzc_pool_checkpoint_discard(const char *pool) -{ - int error; - - nvlist_t *result = NULL; - nvlist_t *args = fnvlist_alloc(); - - error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result); - - fnvlist_free(args); - fnvlist_free(result); - - return (error); -} - -/* - * Executes a read-only channel program. - * - * A read-only channel program works programmatically the same way as a - * normal channel program executed with lzc_channel_program(). The only - * difference is it runs exclusively in open-context and therefore can - * return faster. The downside to that, is that the program cannot change - * on-disk state by calling functions from the zfs.sync submodule. - * - * The return values of this function (and their meaning) are exactly the - * same as the ones described in lzc_channel_program(). - */ -int -lzc_channel_program_nosync(const char *pool, const char *program, - uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) -{ - return (lzc_channel_program_impl(pool, program, B_FALSE, timeout, - memlimit, argnvl, outnvl)); -} - -/* - * Changes initializing state. - * - * vdevs should be a list of (, guid) where guid is a uint64 vdev GUID. - * The key is ignored. - * - * If there are errors related to vdev arguments, per-vdev errors are returned - * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where - * guid is stringified with PRIu64, and errno is one of the following as - * an int64_t: - * - ENODEV if the device was not found - * - EINVAL if the devices is not a leaf or is not concrete (e.g. 
missing) - * - EROFS if the device is not writeable - * - EBUSY start requested but the device is already being initialized - * - ESRCH cancel/suspend requested but device is not being initialized - * - * If the errlist is empty, then return value will be: - * - EINVAL if one or more arguments was invalid - * - Other spa_open failures - * - 0 if the operation succeeded - */ -int -lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type, - nvlist_t *vdevs, nvlist_t **errlist) -{ - int error; - nvlist_t *args = fnvlist_alloc(); - fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type); - fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs); - - error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist); - - fnvlist_free(args); - - return (error); -} - -/* - * Set the bootenv contents for the given pool. - */ -int -lzc_set_bootenv(const char *pool, const char *env) -{ - nvlist_t *args = fnvlist_alloc(); - fnvlist_add_string(args, "envmap", env); - int error = lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, args, NULL); - fnvlist_free(args); - return (error); -} - -/* - * Get the contents of the bootenv of the given pool. - */ -int -lzc_get_bootenv(const char *pool, nvlist_t **outnvl) -{ - return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h deleted file mode 100644 index 76c4fa1bf6b4..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2013 by Martin Matuska . All rights reserved. - * Copyright 2017 RackTop Systems. - * Copyright (c) 2017 Datto Inc. - */ - -#ifndef _LIBZFS_CORE_H -#define _LIBZFS_CORE_H - -#include -#include -#include -#include - - -#ifdef __cplusplus -extern "C" { -#endif - -int libzfs_core_init(void); -void libzfs_core_fini(void); - -/* - * NB: this type should be kept binary compatible with dmu_objset_type_t. - */ -enum lzc_dataset_type { - LZC_DATSET_TYPE_ZFS = 2, - LZC_DATSET_TYPE_ZVOL -}; - -int lzc_remap(const char *fsname); -int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); -int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *); -int lzc_clone(const char *, const char *, nvlist_t *); -int lzc_promote(const char *, char *, int); -int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); -int lzc_bookmark(nvlist_t *, nvlist_t **); -int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); -int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); -int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, - nvlist_t **); - -int lzc_snaprange_space(const char *, const char *, uint64_t *); - -int lzc_hold(nvlist_t *, int, nvlist_t **); -int lzc_release(nvlist_t *, nvlist_t **); -int lzc_get_holds(const char *, nvlist_t **); - -enum lzc_send_flags { - LZC_SEND_FLAG_EMBED_DATA = 1 << 0, - LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1, - LZC_SEND_FLAG_COMPRESS = 1 << 2 -}; - -int lzc_send(const char 
*, const char *, int, enum lzc_send_flags); -int lzc_send_resume(const char *, const char *, int, - enum lzc_send_flags, uint64_t, uint64_t); -int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); - -struct dmu_replay_record; - -int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int); -int lzc_receive_resumable(const char *, nvlist_t *, const char *, - boolean_t, int); -int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t, - boolean_t, int, const struct dmu_replay_record *); - -boolean_t lzc_exists(const char *); - -int lzc_rollback(const char *, char *, int); -int lzc_rollback_to(const char *, const char *); - -int lzc_sync(const char *, nvlist_t *, nvlist_t **); - -int lzc_rename(const char *, const char *); -int lzc_destroy(const char *); - -int lzc_channel_program(const char *, const char *, uint64_t, - uint64_t, nvlist_t *, nvlist_t **); -int lzc_channel_program_nosync(const char *, const char *, uint64_t, - uint64_t, nvlist_t *, nvlist_t **); - -int lzc_pool_checkpoint(const char *); -int lzc_pool_checkpoint_discard(const char *); - -int lzc_set_bootenv(const char *, const char *); -int lzc_get_bootenv(const char *, nvlist_t **); -#ifdef __cplusplus -} -#endif - -#endif /* _LIBZFS_CORE_H */ diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c deleted file mode 100644 index a3b872ee29da..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 Martin Matuska . All rights reserved. - */ - -#include -#include -#include "libzfs_core_compat.h" - -extern int zfs_ioctl_version; - -int -lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) -{ - nvlist_t *nvl = NULL; - nvpair_t *pair, *hpair; - char *buf, *val; - zfs_ioc_t vecnum; - uint32_t type32; - int32_t cleanup_fd; - int error = 0; - int pos; - - if (zfs_ioctl_version >= ZFS_IOCVER_LZC) - return (0); - - vecnum = *ioc; - - switch (vecnum) { - case ZFS_IOC_CREATE: - type32 = fnvlist_lookup_int32(*source, "type"); - zc->zc_objset_type = (uint64_t)type32; - nvlist_lookup_nvlist(*source, "props", &nvl); - *source = nvl; - break; - case ZFS_IOC_CLONE: - buf = fnvlist_lookup_string(*source, "origin"); - strlcpy(zc->zc_value, buf, MAXPATHLEN); - nvlist_lookup_nvlist(*source, "props", &nvl); - *ioc = ZFS_IOC_CREATE; - *source = nvl; - break; - case ZFS_IOC_SNAPSHOT: - nvl = fnvlist_lookup_nvlist(*source, "snaps"); - pair = nvlist_next_nvpair(nvl, NULL); - if (pair != NULL) { - buf = nvpair_name(pair); - pos = strcspn(buf, "@"); - strlcpy(zc->zc_name, buf, pos + 1); - strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN); - } else - error = EINVAL; - /* old kernel cannot create multiple snapshots */ - if (!error && nvlist_next_nvpair(nvl, pair) != NULL) - error = EOPNOTSUPP; - nvlist_free(nvl); - nvl = NULL; - nvlist_lookup_nvlist(*source, "props", &nvl); - *source = nvl; - break; - case ZFS_IOC_SPACE_SNAPS: - buf = fnvlist_lookup_string(*source, 
"firstsnap"); - strlcpy(zc->zc_value, buf, MAXPATHLEN); - break; - case ZFS_IOC_DESTROY_SNAPS: - nvl = fnvlist_lookup_nvlist(*source, "snaps"); - pair = nvlist_next_nvpair(nvl, NULL); - if (pair != NULL) { - buf = nvpair_name(pair); - pos = strcspn(buf, "@"); - strlcpy(zc->zc_name, buf, pos + 1); - } else - error = EINVAL; - /* old kernel cannot atomically destroy multiple snaps */ - if (!error && nvlist_next_nvpair(nvl, pair) != NULL) - error = EOPNOTSUPP; - *source = nvl; - break; - case ZFS_IOC_HOLD: - nvl = fnvlist_lookup_nvlist(*source, "holds"); - pair = nvlist_next_nvpair(nvl, NULL); - if (pair != NULL) { - buf = nvpair_name(pair); - pos = strcspn(buf, "@"); - strlcpy(zc->zc_name, buf, pos + 1); - strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN); - if (nvpair_value_string(pair, &val) == 0) - strlcpy(zc->zc_string, val, MAXNAMELEN); - else - error = EINVAL; - } else - error = EINVAL; - /* old kernel cannot atomically create multiple holds */ - if (!error && nvlist_next_nvpair(nvl, pair) != NULL) - error = EOPNOTSUPP; - nvlist_free(nvl); - if (nvlist_lookup_int32(*source, "cleanup_fd", - &cleanup_fd) == 0) - zc->zc_cleanup_fd = cleanup_fd; - else - zc->zc_cleanup_fd = -1; - break; - case ZFS_IOC_RELEASE: - pair = nvlist_next_nvpair(*source, NULL); - if (pair != NULL) { - buf = nvpair_name(pair); - pos = strcspn(buf, "@"); - strlcpy(zc->zc_name, buf, pos + 1); - strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN); - if (nvpair_value_nvlist(pair, &nvl) == 0) { - hpair = nvlist_next_nvpair(nvl, NULL); - if (hpair != NULL) - strlcpy(zc->zc_string, - nvpair_name(hpair), MAXNAMELEN); - else - error = EINVAL; - if (!error && nvlist_next_nvpair(nvl, - hpair) != NULL) - error = EOPNOTSUPP; - } else - error = EINVAL; - } else - error = EINVAL; - /* old kernel cannot atomically release multiple holds */ - if (!error && nvlist_next_nvpair(nvl, pair) != NULL) - error = EOPNOTSUPP; - break; - } - - return (error); -} - -void -lzc_compat_post(zfs_cmd_t *zc, const zfs_ioc_t ioc) 
-{ - if (zfs_ioctl_version >= ZFS_IOCVER_LZC) - return; - - switch (ioc) { - case ZFS_IOC_CREATE: - case ZFS_IOC_CLONE: - case ZFS_IOC_SNAPSHOT: - case ZFS_IOC_SPACE_SNAPS: - case ZFS_IOC_DESTROY_SNAPS: - zc->zc_nvlist_dst_filled = B_FALSE; - break; - } -} - -int -lzc_compat_outnvl(zfs_cmd_t *zc, const zfs_ioc_t ioc, nvlist_t **outnvl) -{ - nvlist_t *nvl; - - if (zfs_ioctl_version >= ZFS_IOCVER_LZC) - return (0); - - switch (ioc) { - case ZFS_IOC_SPACE_SNAPS: - nvl = fnvlist_alloc(); - fnvlist_add_uint64(nvl, "used", zc->zc_cookie); - fnvlist_add_uint64(nvl, "compressed", zc->zc_objset_type); - fnvlist_add_uint64(nvl, "uncompressed", zc->zc_perm_action); - *outnvl = nvl; - break; - } - - return (0); -} diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h deleted file mode 100644 index 6527c4b2576f..000000000000 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 by Martin Matuska . All rights reserved. 
- */ - -#ifndef _LIBZFS_CORE_COMPAT_H -#define _LIBZFS_CORE_COMPAT_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -int lzc_compat_pre(zfs_cmd_t *, zfs_ioc_t *, nvlist_t **); -void lzc_compat_post(zfs_cmd_t *, const zfs_ioc_t); -int lzc_compat_outnvl(zfs_cmd_t *, const zfs_ioc_t, nvlist_t **); - -#ifdef __cplusplus -} -#endif - -#endif /* _LIBZFS_CORE_COMPAT_H */ diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c deleted file mode 100644 index 9b54e419705b..000000000000 --- a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c +++ /dev/null @@ -1,1238 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Emulation of kernel services in userland. 
- */ - -#ifndef __FreeBSD__ -int aok; -#endif -uint64_t physmem; -vnode_t *rootdir = (vnode_t *)0xabcd1234; -char hw_serial[HW_HOSTID_LEN]; -#ifdef illumos -kmutex_t cpu_lock; -#endif - -/* If set, all blocks read will be copied to the specified directory. */ -char *vn_dumpdir = NULL; - -struct utsname utsname = { - "userland", "libzpool", "1", "1", "na" -}; - -/* this only exists to have its address taken */ -struct proc p0; - -/* - * ========================================================================= - * threads - * ========================================================================= - */ -/*ARGSUSED*/ -kthread_t * -zk_thread_create(void (*func)(), void *arg) -{ - thread_t tid; - - VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED, - &tid) == 0); - - return ((void *)(uintptr_t)tid); -} - -/* - * ========================================================================= - * kstats - * ========================================================================= - */ -/*ARGSUSED*/ -kstat_t * -kstat_create(char *module, int instance, char *name, char *class, - uchar_t type, ulong_t ndata, uchar_t ks_flag) -{ - return (NULL); -} - -/*ARGSUSED*/ -void -kstat_named_init(kstat_named_t *knp, const char *name, uchar_t type) -{} - -/*ARGSUSED*/ -void -kstat_install(kstat_t *ksp) -{} - -/*ARGSUSED*/ -void -kstat_delete(kstat_t *ksp) -{} - -/* - * ========================================================================= - * mutexes - * ========================================================================= - */ -void -zmutex_init(kmutex_t *mp) -{ - mp->m_owner = NULL; - mp->initialized = B_TRUE; - (void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL); -} - -void -zmutex_destroy(kmutex_t *mp) -{ - ASSERT(mp->initialized == B_TRUE); - ASSERT(mp->m_owner == NULL); - (void) _mutex_destroy(&(mp)->m_lock); - mp->m_owner = (void *)-1UL; - mp->initialized = B_FALSE; -} - -int -zmutex_owned(kmutex_t *mp) -{ - ASSERT(mp->initialized == B_TRUE); - - return 
(mp->m_owner == curthread); -} - -void -mutex_enter(kmutex_t *mp) -{ - ASSERT(mp->initialized == B_TRUE); - ASSERT(mp->m_owner != (void *)-1UL); - ASSERT(mp->m_owner != curthread); - VERIFY(mutex_lock(&mp->m_lock) == 0); - ASSERT(mp->m_owner == NULL); - mp->m_owner = curthread; -} - -int -mutex_tryenter(kmutex_t *mp) -{ - ASSERT(mp->initialized == B_TRUE); - ASSERT(mp->m_owner != (void *)-1UL); - if (0 == mutex_trylock(&mp->m_lock)) { - ASSERT(mp->m_owner == NULL); - mp->m_owner = curthread; - return (1); - } else { - return (0); - } -} - -void -mutex_exit(kmutex_t *mp) -{ - ASSERT(mp->initialized == B_TRUE); - ASSERT(mutex_owner(mp) == curthread); - mp->m_owner = NULL; - VERIFY(mutex_unlock(&mp->m_lock) == 0); -} - -void * -mutex_owner(kmutex_t *mp) -{ - ASSERT(mp->initialized == B_TRUE); - return (mp->m_owner); -} - -/* - * ========================================================================= - * rwlocks - * ========================================================================= - */ -/*ARGSUSED*/ -void -rw_init(krwlock_t *rwlp, char *name, int type, void *arg) -{ - rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL); - rwlp->rw_owner = NULL; - rwlp->initialized = B_TRUE; - rwlp->rw_count = 0; -} - -void -rw_destroy(krwlock_t *rwlp) -{ - ASSERT(rwlp->rw_count == 0); - rwlock_destroy(&rwlp->rw_lock); - rwlp->rw_owner = (void *)-1UL; - rwlp->initialized = B_FALSE; -} - -void -rw_enter(krwlock_t *rwlp, krw_t rw) -{ - //ASSERT(!RW_LOCK_HELD(rwlp)); - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); - ASSERT(rwlp->rw_owner != curthread); - - if (rw == RW_READER) { - VERIFY(rw_rdlock(&rwlp->rw_lock) == 0); - ASSERT(rwlp->rw_count >= 0); - atomic_add_int(&rwlp->rw_count, 1); - } else { - VERIFY(rw_wrlock(&rwlp->rw_lock) == 0); - ASSERT(rwlp->rw_count == 0); - rwlp->rw_count = -1; - rwlp->rw_owner = curthread; - } -} - -void -rw_exit(krwlock_t *rwlp) -{ - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); - - if 
(rwlp->rw_owner == curthread) { - /* Write locked. */ - ASSERT(rwlp->rw_count == -1); - rwlp->rw_count = 0; - rwlp->rw_owner = NULL; - } else { - /* Read locked. */ - ASSERT(rwlp->rw_count > 0); - atomic_add_int(&rwlp->rw_count, -1); - } - VERIFY(rw_unlock(&rwlp->rw_lock) == 0); -} - -int -rw_tryenter(krwlock_t *rwlp, krw_t rw) -{ - int rv; - - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); - ASSERT(rwlp->rw_owner != curthread); - - if (rw == RW_READER) - rv = rw_tryrdlock(&rwlp->rw_lock); - else - rv = rw_trywrlock(&rwlp->rw_lock); - - if (rv == 0) { - ASSERT(rwlp->rw_owner == NULL); - if (rw == RW_READER) { - ASSERT(rwlp->rw_count >= 0); - atomic_add_int(&rwlp->rw_count, 1); - } else { - ASSERT(rwlp->rw_count == 0); - rwlp->rw_count = -1; - rwlp->rw_owner = curthread; - } - return (1); - } - - return (0); -} - -/*ARGSUSED*/ -int -rw_tryupgrade(krwlock_t *rwlp) -{ - ASSERT(rwlp->initialized == B_TRUE); - ASSERT(rwlp->rw_owner != (void *)-1UL); - - return (0); -} - -int -rw_lock_held(krwlock_t *rwlp) -{ - - return (rwlp->rw_count != 0); -} - -/* - * ========================================================================= - * condition variables - * ========================================================================= - */ -/*ARGSUSED*/ -void -cv_init(kcondvar_t *cv, char *name, int type, void *arg) -{ - VERIFY(cond_init(cv, name, NULL) == 0); -} - -void -cv_destroy(kcondvar_t *cv) -{ - VERIFY(cond_destroy(cv) == 0); -} - -void -cv_wait(kcondvar_t *cv, kmutex_t *mp) -{ - ASSERT(mutex_owner(mp) == curthread); - mp->m_owner = NULL; - int ret = cond_wait(cv, &mp->m_lock); - VERIFY(ret == 0 || ret == EINTR); - mp->m_owner = curthread; -} - -/* - * NB: this emulates FreeBSD cv_wait_sig(9), not the illumos one. - * Meanings of the return code are different. - * NB: this does not actually catch any signals. 
- */ -int -cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) -{ - cv_wait(cv, mp); - return (0); -} - -clock_t -cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) -{ - int error; - struct timespec ts; - struct timeval tv; - clock_t delta; - - abstime += ddi_get_lbolt(); -top: - delta = abstime - ddi_get_lbolt(); - if (delta <= 0) - return (-1); - - if (gettimeofday(&tv, NULL) != 0) - assert(!"gettimeofday() failed"); - - ts.tv_sec = tv.tv_sec + delta / hz; - ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz); - ASSERT(ts.tv_nsec >= 0); - - if (ts.tv_nsec >= NANOSEC) { - ts.tv_sec++; - ts.tv_nsec -= NANOSEC; - } - - ASSERT(mutex_owner(mp) == curthread); - mp->m_owner = NULL; - error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); - mp->m_owner = curthread; - - if (error == EINTR) - goto top; - - if (error == ETIMEDOUT) - return (-1); - - ASSERT(error == 0); - - return (1); -} - -/*ARGSUSED*/ -clock_t -cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, - int flag) -{ - int error; - timespec_t ts; - hrtime_t delta; - - ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); - -top: - delta = tim; - if (flag & CALLOUT_FLAG_ABSOLUTE) - delta -= gethrtime(); - - if (delta <= 0) - return (-1); - - clock_gettime(CLOCK_REALTIME, &ts); - ts.tv_sec += delta / NANOSEC; - ts.tv_nsec += delta % NANOSEC; - if (ts.tv_nsec >= NANOSEC) { - ts.tv_sec++; - ts.tv_nsec -= NANOSEC; - } - - ASSERT(mutex_owner(mp) == curthread); - mp->m_owner = NULL; - error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); - mp->m_owner = curthread; - - if (error == ETIMEDOUT) - return (-1); - - if (error == EINTR) - goto top; - - ASSERT(error == 0); - - return (1); -} - -void -cv_signal(kcondvar_t *cv) -{ - VERIFY(cond_signal(cv) == 0); -} - -void -cv_broadcast(kcondvar_t *cv) -{ - VERIFY(cond_broadcast(cv) == 0); -} - -/* - * ========================================================================= - * vnode operations - * 
========================================================================= - */ -/* - * Note: for the xxxat() versions of these functions, we assume that the - * starting vp is always rootdir (which is true for spa_directory.c, the only - * ZFS consumer of these interfaces). We assert this is true, and then emulate - * them by adding '/' in front of the path. - */ - -/*ARGSUSED*/ -int -vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) -{ - int fd; - int dump_fd; - vnode_t *vp; - int old_umask; - char realpath[MAXPATHLEN]; - struct stat64 st; - - /* - * If we're accessing a real disk from userland, we need to use - * the character interface to avoid caching. This is particularly - * important if we're trying to look at a real in-kernel storage - * pool from userland, e.g. via zdb, because otherwise we won't - * see the changes occurring under the segmap cache. - * On the other hand, the stupid character device returns zero - * for its size. So -- gag -- we open the block device to get - * its size, and remember it for subsequent VOP_GETATTR(). - */ - if (strncmp(path, "/dev/", 5) == 0) { - char *dsk; - fd = open64(path, O_RDONLY); - if (fd == -1) - return (errno); - if (fstat64(fd, &st) == -1) { - close(fd); - return (errno); - } - close(fd); - (void) sprintf(realpath, "%s", path); - dsk = strstr(path, "/dsk/"); - if (dsk != NULL) - (void) sprintf(realpath + (dsk - path) + 1, "r%s", - dsk + 1); - } else { - (void) sprintf(realpath, "%s", path); - if (!(flags & FCREAT) && stat64(realpath, &st) == -1) - return (errno); - } - - if (flags & FCREAT) - old_umask = umask(0); - - /* - * The construct 'flags - FREAD' conveniently maps combinations of - * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. 
- */ - fd = open64(realpath, flags - FREAD, mode); - - if (flags & FCREAT) - (void) umask(old_umask); - - if (vn_dumpdir != NULL) { - char dumppath[MAXPATHLEN]; - (void) snprintf(dumppath, sizeof (dumppath), - "%s/%s", vn_dumpdir, basename(realpath)); - dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); - if (dump_fd == -1) - return (errno); - } else { - dump_fd = -1; - } - - if (fd == -1) - return (errno); - - if (fstat64(fd, &st) == -1) { - close(fd); - return (errno); - } - - (void) fcntl(fd, F_SETFD, FD_CLOEXEC); - - *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); - - vp->v_fd = fd; - vp->v_size = st.st_size; - vp->v_path = spa_strdup(path); - vp->v_dump_fd = dump_fd; - - return (0); -} - -/*ARGSUSED*/ -int -vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, - int x3, vnode_t *startvp, int fd) -{ - char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); - int ret; - - ASSERT(startvp == rootdir); - (void) sprintf(realpath, "/%s", path); - - /* fd ignored for now, need if want to simulate nbmand support */ - ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); - - umem_free(realpath, strlen(path) + 2); - - return (ret); -} - -/*ARGSUSED*/ -int -vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, - int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) -{ - ssize_t iolen, split; - - if (uio == UIO_READ) { - iolen = pread64(vp->v_fd, addr, len, offset); - if (vp->v_dump_fd != -1) { - int status = - pwrite64(vp->v_dump_fd, addr, iolen, offset); - ASSERT(status != -1); - } - } else { - /* - * To simulate partial disk writes, we split writes into two - * system calls so that the process can be killed in between. - */ - int sectors = len >> SPA_MINBLOCKSHIFT; - split = (sectors > 0 ? 
rand() % sectors : 0) << - SPA_MINBLOCKSHIFT; - iolen = pwrite64(vp->v_fd, addr, split, offset); - iolen += pwrite64(vp->v_fd, (char *)addr + split, - len - split, offset + split); - } - - if (iolen == -1) - return (errno); - if (residp) - *residp = len - iolen; - else if (iolen != len) - return (EIO); - return (0); -} - -void -vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td) -{ - close(vp->v_fd); - if (vp->v_dump_fd != -1) - close(vp->v_dump_fd); - spa_strfree(vp->v_path); - umem_free(vp, sizeof (vnode_t)); -} - -/* - * At a minimum we need to update the size since vdev_reopen() - * will no longer call vn_openat(). - */ -int -fop_getattr(vnode_t *vp, vattr_t *vap) -{ - struct stat64 st; - - if (fstat64(vp->v_fd, &st) == -1) { - close(vp->v_fd); - return (errno); - } - - vap->va_size = st.st_size; - return (0); -} - -#ifdef ZFS_DEBUG - -/* - * ========================================================================= - * Figure out which debugging statements to print - * ========================================================================= - */ - -static char *dprintf_string; -static int dprintf_print_all; - -int -dprintf_find_string(const char *string) -{ - char *tmp_str = dprintf_string; - int len = strlen(string); - - /* - * Find out if this is a string we want to print. - * String format: file1.c,function_name1,file2.c,file3.c - */ - - while (tmp_str != NULL) { - if (strncmp(tmp_str, string, len) == 0 && - (tmp_str[len] == ',' || tmp_str[len] == '\0')) - return (1); - tmp_str = strchr(tmp_str, ','); - if (tmp_str != NULL) - tmp_str++; /* Get rid of , */ - } - return (0); -} - -void -dprintf_setup(int *argc, char **argv) -{ - int i, j; - - /* - * Debugging can be specified two ways: by setting the - * environment variable ZFS_DEBUG, or by including a - * "debug=..." argument on the command line. The command - * line setting overrides the environment variable. 
- */ - - for (i = 1; i < *argc; i++) { - int len = strlen("debug="); - /* First look for a command line argument */ - if (strncmp("debug=", argv[i], len) == 0) { - dprintf_string = argv[i] + len; - /* Remove from args */ - for (j = i; j < *argc; j++) - argv[j] = argv[j+1]; - argv[j] = NULL; - (*argc)--; - } - } - - if (dprintf_string == NULL) { - /* Look for ZFS_DEBUG environment variable */ - dprintf_string = getenv("ZFS_DEBUG"); - } - - /* - * Are we just turning on all debugging? - */ - if (dprintf_find_string("on")) - dprintf_print_all = 1; - - if (dprintf_string != NULL) - zfs_flags |= ZFS_DEBUG_DPRINTF; -} - -int -sysctl_handle_64(SYSCTL_HANDLER_ARGS) -{ - return (0); -} - -/* - * ========================================================================= - * debug printfs - * ========================================================================= - */ -void -__dprintf(const char *file, const char *func, int line, const char *fmt, ...) -{ - const char *newfile; - va_list adx; - - /* - * Get rid of annoying "../common/" prefix to filename. 
- */ - newfile = strrchr(file, '/'); - if (newfile != NULL) { - newfile = newfile + 1; /* Get rid of leading / */ - } else { - newfile = file; - } - - if (dprintf_print_all || - dprintf_find_string(newfile) || - dprintf_find_string(func)) { - /* Print out just the function name if requested */ - flockfile(stdout); - if (dprintf_find_string("pid")) - (void) printf("%d ", getpid()); - if (dprintf_find_string("tid")) - (void) printf("%lu ", thr_self()); -#if 0 - if (dprintf_find_string("cpu")) - (void) printf("%u ", getcpuid()); -#endif - if (dprintf_find_string("time")) - (void) printf("%llu ", gethrtime()); - if (dprintf_find_string("long")) - (void) printf("%s, line %d: ", newfile, line); - (void) printf("%s: ", func); - va_start(adx, fmt); - (void) vprintf(fmt, adx); - va_end(adx); - funlockfile(stdout); - } -} - -#endif /* ZFS_DEBUG */ - -/* - * ========================================================================= - * cmn_err() and panic() - * ========================================================================= - */ -static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; -static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; - -void -vpanic(const char *fmt, va_list adx) -{ - char buf[512]; - (void) vsnprintf(buf, 512, fmt, adx); - assfail(buf, NULL, 0); - abort(); /* necessary to make vpanic meet noreturn requirements */ -} - -void -panic(const char *fmt, ...) -{ - va_list adx; - - va_start(adx, fmt); - vpanic(fmt, adx); - va_end(adx); -} - -void -vcmn_err(int ce, const char *fmt, va_list adx) -{ - if (ce == CE_PANIC) - vpanic(fmt, adx); - if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ - (void) fprintf(stderr, "%s", ce_prefix[ce]); - (void) vfprintf(stderr, fmt, adx); - (void) fprintf(stderr, "%s", ce_suffix[ce]); - } -} - -/*PRINTFLIKE2*/ -void -cmn_err(int ce, const char *fmt, ...) 
-{ - va_list adx; - - va_start(adx, fmt); - vcmn_err(ce, fmt, adx); - va_end(adx); -} - -/* - * ========================================================================= - * kobj interfaces - * ========================================================================= - */ -struct _buf * -kobj_open_file(char *name) -{ - struct _buf *file; - vnode_t *vp; - - /* set vp as the _fd field of the file */ - if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, - -1) != 0) - return ((void *)-1UL); - - file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); - file->_fd = (intptr_t)vp; - return (file); -} - -int -kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - ssize_t resid; - - vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, - UIO_SYSSPACE, 0, 0, 0, &resid); - - return (size - resid); -} - -void -kobj_close_file(struct _buf *file) -{ - vn_close((vnode_t *)file->_fd, 0, NULL, NULL); - umem_free(file, sizeof (struct _buf)); -} - -int -kobj_get_filesize(struct _buf *file, uint64_t *size) -{ - struct stat64 st; - vnode_t *vp = (vnode_t *)file->_fd; - - if (fstat64(vp->v_fd, &st) == -1) { - vn_close(vp, 0, NULL, NULL); - return (errno); - } - *size = st.st_size; - return (0); -} - -/* - * ========================================================================= - * misc routines - * ========================================================================= - */ - -void -delay(clock_t ticks) -{ - poll(0, 0, ticks * (1000 / hz)); -} - -#if 0 -/* - * Find highest one bit set. - * Returns bit number + 1 of highest bit that is set, otherwise returns 0. 
- */ -int -highbit64(uint64_t i) -{ - int h = 1; - - if (i == 0) - return (0); - if (i & 0xffffffff00000000ULL) { - h += 32; i >>= 32; - } - if (i & 0xffff0000) { - h += 16; i >>= 16; - } - if (i & 0xff00) { - h += 8; i >>= 8; - } - if (i & 0xf0) { - h += 4; i >>= 4; - } - if (i & 0xc) { - h += 2; i >>= 2; - } - if (i & 0x2) { - h += 1; - } - return (h); -} -#endif - -static int random_fd = -1, urandom_fd = -1; - -static int -random_get_bytes_common(uint8_t *ptr, size_t len, int fd) -{ - size_t resid = len; - ssize_t bytes; - - ASSERT(fd != -1); - - while (resid != 0) { - bytes = read(fd, ptr, resid); - ASSERT3S(bytes, >=, 0); - ptr += bytes; - resid -= bytes; - } - - return (0); -} - -int -random_get_bytes(uint8_t *ptr, size_t len) -{ - return (random_get_bytes_common(ptr, len, random_fd)); -} - -int -random_get_pseudo_bytes(uint8_t *ptr, size_t len) -{ - return (random_get_bytes_common(ptr, len, urandom_fd)); -} - -int -ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) -{ - char *end; - - *result = strtoul(hw_serial, &end, base); - if (*result == 0) - return (errno); - return (0); -} - -int -ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) -{ - char *end; - - *result = strtoull(str, &end, base); - if (*result == 0) - return (errno); - return (0); -} - -#ifdef illumos -/* ARGSUSED */ -cyclic_id_t -cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when) -{ - return (1); -} - -/* ARGSUSED */ -void -cyclic_remove(cyclic_id_t id) -{ -} - -/* ARGSUSED */ -int -cyclic_reprogram(cyclic_id_t id, hrtime_t expiration) -{ - return (1); -} -#endif - -/* - * ========================================================================= - * kernel emulation setup & teardown - * ========================================================================= - */ -static int -umem_out_of_memory(void) -{ - char errmsg[] = "out of memory -- generating core dump\n"; - - write(fileno(stderr), errmsg, sizeof (errmsg)); - abort(); - return (0); 
-} - -void -kernel_init(int mode) -{ - extern uint_t rrw_tsd_key; - - umem_nofail_callback(umem_out_of_memory); - - physmem = sysconf(_SC_PHYS_PAGES); - - dprintf("physmem = %llu pages (%.2f GB)\n", physmem, - (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); - - (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", - (mode & FWRITE) ? get_system_hostid() : 0); - - VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); - VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); - - system_taskq_init(); - -#ifdef illumos - mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL); -#endif - - spa_init(mode); - - tsd_create(&rrw_tsd_key, rrw_tsd_destroy); -} - -void -kernel_fini(void) -{ - spa_fini(); - - system_taskq_fini(); - - close(random_fd); - close(urandom_fd); - - random_fd = -1; - urandom_fd = -1; -} - -/* ARGSUSED */ -uint32_t -zone_get_hostid(void *zonep) -{ - /* - * We're emulating the system's hostid in userland. - */ - return (strtoul(hw_serial, NULL, 10)); -} - -int -z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) -{ - int ret; - uLongf len = *dstlen; - - if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK) - *dstlen = (size_t)len; - - return (ret); -} - -int -z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, - int level) -{ - int ret; - uLongf len = *dstlen; - - if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK) - *dstlen = (size_t)len; - - return (ret); -} - -uid_t -crgetuid(cred_t *cr) -{ - return (0); -} - -uid_t -crgetruid(cred_t *cr) -{ - return (0); -} - -gid_t -crgetgid(cred_t *cr) -{ - return (0); -} - -int -crgetngroups(cred_t *cr) -{ - return (0); -} - -gid_t * -crgetgroups(cred_t *cr) -{ - return (NULL); -} - -int -zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) -{ - return (0); -} - -int -zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) -{ - return (0); -} - -int -zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) -{ - return 
(0); -} - -ksiddomain_t * -ksid_lookupdomain(const char *dom) -{ - ksiddomain_t *kd; - - kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); - kd->kd_name = spa_strdup(dom); - return (kd); -} - -void -ksiddomain_rele(ksiddomain_t *ksid) -{ - spa_strfree(ksid->kd_name); - umem_free(ksid, sizeof (ksiddomain_t)); -} - -/* - * Do not change the length of the returned string; it must be freed - * with strfree(). - */ -char * -kmem_asprintf(const char *fmt, ...) -{ - int size; - va_list adx; - char *buf; - - va_start(adx, fmt); - size = vsnprintf(NULL, 0, fmt, adx) + 1; - va_end(adx); - - buf = kmem_alloc(size, KM_SLEEP); - - va_start(adx, fmt); - size = vsnprintf(buf, size, fmt, adx); - va_end(adx); - - return (buf); -} - -/* ARGSUSED */ -int -zfs_onexit_fd_hold(int fd, minor_t *minorp) -{ - *minorp = 0; - return (0); -} - -/* ARGSUSED */ -void -zfs_onexit_fd_rele(int fd) -{ -} - -/* ARGSUSED */ -int -zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle) -{ - return (0); -} - -/* ARGSUSED */ -int -zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) -{ - return (0); -} - -/* ARGSUSED */ -int -zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) -{ - return (0); -} - -#ifdef __FreeBSD__ -/* ARGSUSED */ -int -zvol_create_minors(const char *name) -{ - return (0); -} -#endif - -#ifdef illumos -void -bioinit(buf_t *bp) -{ - bzero(bp, sizeof (buf_t)); -} - -void -biodone(buf_t *bp) -{ - if (bp->b_iodone != NULL) { - (*(bp->b_iodone))(bp); - return; - } - ASSERT((bp->b_flags & B_DONE) == 0); - bp->b_flags |= B_DONE; -} - -void -bioerror(buf_t *bp, int error) -{ - ASSERT(bp != NULL); - ASSERT(error >= 0); - - if (error != 0) { - bp->b_flags |= B_ERROR; - } else { - bp->b_flags &= ~B_ERROR; - } - bp->b_error = error; -} - - -int -geterror(struct buf *bp) -{ - int error = 0; - - if (bp->b_flags & B_ERROR) { - error = bp->b_error; - if (!error) - error = EIO; - } - return (error); -} -#endif diff --git 
a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h deleted file mode 100644 index 6f1a17f27852..000000000000 --- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h +++ /dev/null @@ -1,838 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - */ -/* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
- */ - -#ifndef _SYS_ZFS_CONTEXT_H -#define _SYS_ZFS_CONTEXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define _SYS_MUTEX_H -#define _SYS_RWLOCK_H -#define _SYS_CONDVAR_H -#define _SYS_SYSTM_H -#define _SYS_T_LOCK_H -#define _SYS_VNODE_H -#define _SYS_VFS_H -#define _SYS_SUNDDI_H -#define _SYS_CALLB_H -#define _SYS_SCHED_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef illumos -#include "zfs.h" -#endif - -#define ZFS_EXPORTS_PATH "/etc/zfs/exports" - -/* - * Debugging - */ - -/* - * Note that we are not using the debugging levels. - */ - -#define CE_CONT 0 /* continuation */ -#define CE_NOTE 1 /* notice */ -#define CE_WARN 2 /* warning */ -#define CE_PANIC 3 /* panic */ -#define CE_IGNORE 4 /* print nothing */ - -/* - * ZFS debugging - */ - -#define ZFS_LOG(...) do { } while (0) - -typedef u_longlong_t rlim64_t; -#define RLIM64_INFINITY ((rlim64_t)-3) - -#ifdef ZFS_DEBUG -extern void dprintf_setup(int *argc, char **argv); -#endif /* ZFS_DEBUG */ - -extern void cmn_err(int, const char *, ...); -extern void vcmn_err(int, const char *, __va_list); -extern void panic(const char *, ...) __NORETURN; -extern void vpanic(const char *, __va_list) __NORETURN; - -#define fm_panic panic - -extern int aok; - -/* - * DTrace SDT probes have different signatures in userland than they do in - * the kernel. If they're being used in kernel code, re-define them out of - * existence for their counterparts in libzpool. 
- * - * Here's an example of how to use the set-error probes in userland: - * zfs$target:::set-error /arg0 == EBUSY/ {stack();} - * - * Here's an example of how to use DTRACE_PROBE probes in userland: - * If there is a probe declared as follows: - * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); - * Then you can use it as follows: - * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ - * {printf("%u %p\n", arg1, arg2);} - */ - -#ifdef DTRACE_PROBE -#undef DTRACE_PROBE -#endif /* DTRACE_PROBE */ -#ifdef illumos -#define DTRACE_PROBE(a) \ - ZFS_PROBE0(#a) -#endif - -#ifdef DTRACE_PROBE1 -#undef DTRACE_PROBE1 -#endif /* DTRACE_PROBE1 */ -#ifdef illumos -#define DTRACE_PROBE1(a, b, c) \ - ZFS_PROBE1(#a, (unsigned long)c) -#endif - -#ifdef DTRACE_PROBE2 -#undef DTRACE_PROBE2 -#endif /* DTRACE_PROBE2 */ -#ifdef illumos -#define DTRACE_PROBE2(a, b, c, d, e) \ - ZFS_PROBE2(#a, (unsigned long)c, (unsigned long)e) -#endif - -#ifdef DTRACE_PROBE3 -#undef DTRACE_PROBE3 -#endif /* DTRACE_PROBE3 */ -#ifdef illumos -#define DTRACE_PROBE3(a, b, c, d, e, f, g) \ - ZFS_PROBE3(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g) -#endif - -#ifdef DTRACE_PROBE4 -#undef DTRACE_PROBE4 -#endif /* DTRACE_PROBE4 */ -#ifdef illumos -#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) \ - ZFS_PROBE4(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g, \ - (unsigned long)i) -#endif - -#ifdef illumos -/* - * We use the comma operator so that this macro can be used without much - * additional code. For example, "return (EINVAL);" becomes - * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated - * twice, so it should not have side effects (e.g. something like: - * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). 
- */ -#define SET_ERROR(err) (ZFS_SET_ERROR(err), err) -#else /* !illumos */ - -#define DTRACE_PROBE(a) ((void)0) -#define DTRACE_PROBE1(a, b, c) ((void)0) -#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) -#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) -#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) - -#define SET_ERROR(err) (err) -#endif /* !illumos */ - -/* - * Threads - */ -#define curthread ((void *)(uintptr_t)thr_self()) - -#define kpreempt(x) sched_yield() - -typedef struct kthread kthread_t; - -#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ - zk_thread_create(func, arg) -#define thread_exit() thr_exit(NULL) -#define thread_join(t) panic("libzpool cannot join threads") - -#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) - -/* in libzpool, p0 exists only to have its address taken */ -struct proc { - uintptr_t this_is_never_used_dont_dereference_it; -}; - -extern struct proc p0; -#define curproc (&p0) - -#define PS_NONE -1 - -extern kthread_t *zk_thread_create(void (*func)(void*), void *arg); - -#define issig(why) (FALSE) -#define ISSIG(thr, why) (FALSE) - -/* - * Mutexes - */ -typedef struct kmutex { - void *m_owner; - boolean_t initialized; - mutex_t m_lock; -} kmutex_t; - -#define MUTEX_DEFAULT USYNC_THREAD -#undef MUTEX_HELD -#undef MUTEX_NOT_HELD -#define MUTEX_HELD(m) ((m)->m_owner == curthread) -#define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m)) -#define _mutex_held(m) pthread_mutex_isowned_np(m) - -/* - * Argh -- we have to get cheesy here because the kernel and userland - * have different signatures for the same routine. 
- */ -//extern int _mutex_init(mutex_t *mp, int type, void *arg); -//extern int _mutex_destroy(mutex_t *mp); -//extern int _mutex_owned(mutex_t *mp); - -#define mutex_init(mp, b, c, d) zmutex_init((kmutex_t *)(mp)) -#define mutex_destroy(mp) zmutex_destroy((kmutex_t *)(mp)) -#define mutex_owned(mp) zmutex_owned((kmutex_t *)(mp)) - -extern void zmutex_init(kmutex_t *mp); -extern void zmutex_destroy(kmutex_t *mp); -extern int zmutex_owned(kmutex_t *mp); -extern void mutex_enter(kmutex_t *mp); -extern void mutex_exit(kmutex_t *mp); -extern int mutex_tryenter(kmutex_t *mp); -extern void *mutex_owner(kmutex_t *mp); - -/* - * RW locks - */ -typedef struct krwlock { - int rw_count; - void *rw_owner; - boolean_t initialized; - rwlock_t rw_lock; -} krwlock_t; - -typedef int krw_t; - -#define RW_READER 0 -#define RW_WRITER 1 -#define RW_DEFAULT USYNC_THREAD - -#undef RW_READ_HELD -#define RW_READ_HELD(x) ((x)->rw_owner == NULL && (x)->rw_count > 0) - -#undef RW_WRITE_HELD -#define RW_WRITE_HELD(x) ((x)->rw_owner == curthread) -#define RW_LOCK_HELD(x) rw_lock_held(x) - -#undef RW_LOCK_HELD -#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) - -extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); -extern void rw_destroy(krwlock_t *rwlp); -extern void rw_enter(krwlock_t *rwlp, krw_t rw); -extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); -extern int rw_tryupgrade(krwlock_t *rwlp); -extern void rw_exit(krwlock_t *rwlp); -extern int rw_lock_held(krwlock_t *rwlp); -#define rw_downgrade(rwlp) do { } while (0) - -extern uid_t crgetuid(cred_t *cr); -extern uid_t crgetruid(cred_t *cr); -extern gid_t crgetgid(cred_t *cr); -extern int crgetngroups(cred_t *cr); -extern gid_t *crgetgroups(cred_t *cr); - -/* - * Condition variables - */ -typedef cond_t kcondvar_t; - -#define CV_DEFAULT USYNC_THREAD -#define CALLOUT_FLAG_ABSOLUTE 0x2 - -extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); -extern void cv_destroy(kcondvar_t *cv); -extern void 
cv_wait(kcondvar_t *cv, kmutex_t *mp); -extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); -extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); -#define cv_timedwait_sig(cvp, mp, t) cv_timedwait(cvp, mp, t) -extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, - hrtime_t res, int flag); -#define cv_timedwait_sig_hires(cvp, mp, t, r, f) \ - cv_timedwait_hires(cvp, mp, t, r, f) -extern void cv_signal(kcondvar_t *cv); -extern void cv_broadcast(kcondvar_t *cv); - -/* - * Thread-specific data - */ -#define tsd_get(k) pthread_getspecific(k) -#define tsd_set(k, v) pthread_setspecific(k, v) -#define tsd_create(kp, d) pthread_key_create(kp, d) -#define tsd_destroy(kp) /* nothing */ - -/* - * Kernel memory - */ -#define KM_SLEEP UMEM_NOFAIL -#define KM_PUSHPAGE KM_SLEEP -#define KM_NOSLEEP UMEM_DEFAULT -#define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */ -#define KMC_NODEBUG UMC_NODEBUG -#define KMC_NOTOUCH 0 /* not needed for userland caches */ -#define KM_NODEBUG 0 -#define kmem_alloc(_s, _f) umem_alloc(_s, _f) -#define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) -#define kmem_free(_b, _s) umem_free(_b, _s) -#define kmem_size() (physmem * PAGESIZE) -#define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ - umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) -#define kmem_cache_destroy(_c) umem_cache_destroy(_c) -#define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) -#define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) -#define kmem_debugging() 0 -#define kmem_cache_reap_active() (B_FALSE) -#define kmem_cache_reap_soon(_c) /* nothing */ -#define kmem_cache_set_move(_c, _cb) /* nothing */ -#define POINTER_INVALIDATE(_pp) /* nothing */ -#define POINTER_IS_VALID(_p) 0 - -typedef umem_cache_t kmem_cache_t; - -typedef enum kmem_cbrc { - KMEM_CBRC_YES, - KMEM_CBRC_NO, - KMEM_CBRC_LATER, - KMEM_CBRC_DONT_NEED, - KMEM_CBRC_DONT_KNOW -} kmem_cbrc_t; - -/* - * Task queues - */ -typedef struct taskq taskq_t; 
-typedef uintptr_t taskqid_t; -typedef void (task_func_t)(void *); - -typedef struct taskq_ent { - struct taskq_ent *tqent_next; - struct taskq_ent *tqent_prev; - task_func_t *tqent_func; - void *tqent_arg; - uintptr_t tqent_flags; -} taskq_ent_t; - -#define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ - -#define TASKQ_PREPOPULATE 0x0001 -#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ -#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ -#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ -#define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ - -#define TQ_SLEEP KM_SLEEP /* Can block for memory */ -#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ -#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ -#define TQ_FRONT 0x08 /* Queue in front */ - -#define TASKQID_INVALID ((taskqid_t)0) - -extern taskq_t *system_taskq; - -extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); -#define taskq_create_proc(a, b, c, d, e, p, f) \ - (taskq_create(a, b, c, d, e, f)) -#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ - (taskq_create(a, b, maxclsyspri, d, e, f)) -extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); -extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, - taskq_ent_t *); -extern void taskq_destroy(taskq_t *); -extern void taskq_wait(taskq_t *); -extern void taskq_wait_id(taskq_t *, taskqid_t); -extern int taskq_member(taskq_t *, void *); -extern void system_taskq_init(void); -extern void system_taskq_fini(void); - -#define taskq_dispatch_safe(tq, func, arg, flags, task) \ - taskq_dispatch((tq), (func), (arg), (flags)) - -#define XVA_MAPSIZE 3 -#define XVA_MAGIC 0x78766174 - -/* - * vnodes - */ -typedef struct vnode { - uint64_t v_size; - int v_fd; - char *v_path; - int v_dump_fd; -} vnode_t; - -extern char *vn_dumpdir; -#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ - -typedef struct xoptattr { - 
timestruc_t xoa_createtime; /* Create time of file */ - uint8_t xoa_archive; - uint8_t xoa_system; - uint8_t xoa_readonly; - uint8_t xoa_hidden; - uint8_t xoa_nounlink; - uint8_t xoa_immutable; - uint8_t xoa_appendonly; - uint8_t xoa_nodump; - uint8_t xoa_settable; - uint8_t xoa_opaque; - uint8_t xoa_av_quarantined; - uint8_t xoa_av_modified; - uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; - uint8_t xoa_reparse; - uint8_t xoa_offline; - uint8_t xoa_sparse; -} xoptattr_t; - -typedef struct vattr { - uint_t va_mask; /* bit-mask of attributes */ - u_offset_t va_size; /* file size in bytes */ -} vattr_t; - - -typedef struct xvattr { - vattr_t xva_vattr; /* Embedded vattr structure */ - uint32_t xva_magic; /* Magic Number */ - uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ - uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ - uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ - uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ - xoptattr_t xva_xoptattrs; /* Optional attributes */ -} xvattr_t; - -typedef struct vsecattr { - uint_t vsa_mask; /* See below */ - int vsa_aclcnt; /* ACL entry count */ - void *vsa_aclentp; /* pointer to ACL entries */ - int vsa_dfaclcnt; /* default ACL entry count */ - void *vsa_dfaclentp; /* pointer to default ACL entries */ - size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ -} vsecattr_t; - -#define AT_TYPE 0x00001 -#define AT_MODE 0x00002 -#define AT_UID 0x00004 -#define AT_GID 0x00008 -#define AT_FSID 0x00010 -#define AT_NODEID 0x00020 -#define AT_NLINK 0x00040 -#define AT_SIZE 0x00080 -#define AT_ATIME 0x00100 -#define AT_MTIME 0x00200 -#define AT_CTIME 0x00400 -#define AT_RDEV 0x00800 -#define AT_BLKSIZE 0x01000 -#define AT_NBLOCKS 0x02000 -#define AT_SEQ 0x08000 -#define AT_XVATTR 0x10000 - -#define CRCREAT 0 - -extern int fop_getattr(vnode_t *vp, vattr_t *vap); - -#define VOP_CLOSE(vp, f, c, o, cr, ct) 0 -#define VOP_PUTPAGE(vp, of, sz, fl, cr, ct) 0 -#define VOP_GETATTR(vp, vap, cr) 
fop_getattr((vp), (vap)); - -#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd) - -#define VN_RELE(vp) vn_close(vp, 0, NULL, NULL) -#define VN_RELE_ASYNC(vp, taskq) vn_close(vp, 0, NULL, NULL) - -#define vn_lock(vp, type) -#define VOP_UNLOCK(vp) - -extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp, - int x2, int x3); -extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp, - int x2, int x3, vnode_t *vp, int fd); -extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, - offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp); -extern void vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td); - -#define vn_remove(path, x1, x2) remove(path) -#define vn_rename(from, to, seg) rename((from), (to)) -#define vn_is_readonly(vp) B_FALSE - -extern vnode_t *rootdir; - -#include /* for FREAD, FWRITE, etc */ -#define FTRUNC O_TRUNC - -/* - * Random stuff - */ -#define ddi_get_lbolt() (gethrtime() >> 23) -#define ddi_get_lbolt64() (gethrtime() >> 23) -#define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ - -extern void delay(clock_t ticks); - -#define SEC_TO_TICK(sec) ((sec) * hz) -#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) - -#define gethrestime_sec() time(NULL) -#define gethrestime(t) \ - do {\ - (t)->tv_sec = gethrestime_sec();\ - (t)->tv_nsec = 0;\ - } while (0); - -#define max_ncpus 64 -#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) - -#define minclsyspri 60 -#define maxclsyspri 99 - -#define CPU_SEQID (thr_self() & (max_ncpus - 1)) - -#define kcred NULL -#define CRED() NULL - -#ifndef ptob -#define ptob(x) ((x) * PAGESIZE) -#endif - -extern uint64_t physmem; - -extern int highbit64(uint64_t i); -extern int random_get_bytes(uint8_t *ptr, size_t len); -extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); - -extern void kernel_init(int); -extern void kernel_fini(void); - -struct spa; -extern void nicenum(uint64_t num, char *buf, size_t); -extern void 
show_pool_stats(struct spa *); -extern int set_global_var(char *arg); - -typedef struct callb_cpr { - kmutex_t *cc_lockp; -} callb_cpr_t; - -#define CALLB_CPR_INIT(cp, lockp, func, name) { \ - (cp)->cc_lockp = lockp; \ -} - -#define CALLB_CPR_SAFE_BEGIN(cp) { \ - ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ -} - -#define CALLB_CPR_SAFE_END(cp, lockp) { \ - ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ -} - -#define CALLB_CPR_EXIT(cp) { \ - ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ - mutex_exit((cp)->cc_lockp); \ -} - -#define zone_dataset_visible(x, y) (1) -#define INGLOBALZONE(z) (1) -extern uint32_t zone_get_hostid(void *zonep); - -extern char *kmem_asprintf(const char *fmt, ...); -#define strfree(str) kmem_free((str), strlen(str) + 1) - -/* - * Hostname information - */ -extern struct utsname utsname; -extern char hw_serial[]; /* for userland-emulated hostid access */ -extern int ddi_strtoul(const char *str, char **nptr, int base, - unsigned long *result); - -extern int ddi_strtoull(const char *str, char **nptr, int base, - u_longlong_t *result); - -/* ZFS Boot Related stuff. 
*/ - -struct _buf { - intptr_t _fd; -}; - -struct bootstat { - uint64_t st_size; -}; - -typedef struct ace_object { - uid_t a_who; - uint32_t a_access_mask; - uint16_t a_flags; - uint16_t a_type; - uint8_t a_obj_type[16]; - uint8_t a_inherit_obj_type[16]; -} ace_object_t; - - -#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 -#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 -#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 -#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 - -extern struct _buf *kobj_open_file(char *name); -extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, - unsigned off); -extern void kobj_close_file(struct _buf *file); -extern int kobj_get_filesize(struct _buf *file, uint64_t *size); -extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); -extern int zfs_secpolicy_rename_perms(const char *from, const char *to, - cred_t *cr); -extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); -extern zoneid_t getzoneid(void); -/* Random compatibility stuff. */ -#define pwrite64(d, p, n, o) pwrite(d, p, n, o) -#define readdir64(d) readdir(d) -#define SIGPENDING(td) (0) -#define root_mount_wait() do { } while (0) -#define root_mounted() (1) - -#define noinline __attribute__((noinline)) -#define likely(x) __builtin_expect((x), 1) - -struct file { - void *dummy; -}; - -#define FCREAT O_CREAT -#define FOFFMAX 0x0 - -/* SID stuff */ -typedef struct ksiddomain { - uint_t kd_ref; - uint_t kd_len; - char *kd_name; -} ksiddomain_t; - -ksiddomain_t *ksid_lookupdomain(const char *); -void ksiddomain_rele(ksiddomain_t *); - -typedef uint32_t idmap_rid_t; - -#define DDI_SLEEP KM_SLEEP -#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) (0) - -#define SX_SYSINIT(name, lock, desc) - -#define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \ - intptr_t arg2, struct sysctl_req *req - -/* - * This describes the access space for a sysctl request. 
This is needed - * so that we can use the interface from the kernel or from user-space. - */ -struct sysctl_req { - struct thread *td; /* used for access checking */ - int lock; /* wiring state */ - void *oldptr; - size_t oldlen; - size_t oldidx; - int (*oldfunc)(struct sysctl_req *, const void *, size_t); - void *newptr; - size_t newlen; - size_t newidx; - int (*newfunc)(struct sysctl_req *, void *, size_t); - size_t validlen; - int flags; -}; - -SLIST_HEAD(sysctl_oid_list, sysctl_oid); - -/* - * This describes one "oid" in the MIB tree. Potentially more nodes can - * be hidden behind it, expanded by the handler. - */ -struct sysctl_oid { - struct sysctl_oid_list *oid_parent; - SLIST_ENTRY(sysctl_oid) oid_link; - int oid_number; - u_int oid_kind; - void *oid_arg1; - intptr_t oid_arg2; - const char *oid_name; - int (*oid_handler)(SYSCTL_HANDLER_ARGS); - const char *oid_fmt; - int oid_refcnt; - u_int oid_running; - const char *oid_descr; -}; - -#define SYSCTL_DECL(...) -#define SYSCTL_NODE(...) -#define SYSCTL_INT(...) -#define SYSCTL_UINT(...) -#define SYSCTL_ULONG(...) -#define SYSCTL_PROC(...) -#define SYSCTL_QUAD(...) -#define SYSCTL_UQUAD(...) -#ifdef TUNABLE_INT -#undef TUNABLE_INT -#undef TUNABLE_ULONG -#undef TUNABLE_QUAD -#endif -#define TUNABLE_INT(...) -#define TUNABLE_ULONG(...) -#define TUNABLE_QUAD(...) 
- -int sysctl_handle_64(SYSCTL_HANDLER_ARGS); - -/* Errors */ - -#ifndef ERESTART -#define ERESTART (-1) -#endif - -#ifdef illumos -/* - * Cyclic information - */ -extern kmutex_t cpu_lock; - -typedef uintptr_t cyclic_id_t; -typedef uint16_t cyc_level_t; -typedef void (*cyc_func_t)(void *); - -#define CY_LOW_LEVEL 0 -#define CY_INFINITY INT64_MAX -#define CYCLIC_NONE ((cyclic_id_t)0) - -typedef struct cyc_time { - hrtime_t cyt_when; - hrtime_t cyt_interval; -} cyc_time_t; - -typedef struct cyc_handler { - cyc_func_t cyh_func; - void *cyh_arg; - cyc_level_t cyh_level; -} cyc_handler_t; - -extern cyclic_id_t cyclic_add(cyc_handler_t *, cyc_time_t *); -extern void cyclic_remove(cyclic_id_t); -extern int cyclic_reprogram(cyclic_id_t, hrtime_t); -#endif /* illumos */ - -#ifdef illumos -/* - * Buf structure - */ -#define B_BUSY 0x0001 -#define B_DONE 0x0002 -#define B_ERROR 0x0004 -#define B_READ 0x0040 /* read when I/O occurs */ -#define B_WRITE 0x0100 /* non-read pseudo-flag */ - -typedef struct buf { - int b_flags; - size_t b_bcount; - union { - caddr_t b_addr; - } b_un; - - lldaddr_t _b_blkno; -#define b_lblkno _b_blkno._f - size_t b_resid; - size_t b_bufsize; - int (*b_iodone)(struct buf *); - int b_error; - void *b_private; -} buf_t; - -extern void bioinit(buf_t *); -extern void biodone(buf_t *); -extern void bioerror(buf_t *, int); -extern int geterror(buf_t *); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c b/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c deleted file mode 100644 index 595d766e93df..000000000000 --- a/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c +++ /dev/null @@ -1,353 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2012 Garrett D'Amore . All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. - */ - -#include - -int taskq_now; -taskq_t *system_taskq; - -#define TASKQ_ACTIVE 0x00010000 -#define TASKQ_NAMELEN 31 - -struct taskq { - char tq_name[TASKQ_NAMELEN + 1]; - kmutex_t tq_lock; - krwlock_t tq_threadlock; - kcondvar_t tq_dispatch_cv; - kcondvar_t tq_wait_cv; - thread_t *tq_threadlist; - int tq_flags; - int tq_active; - int tq_nthreads; - int tq_nalloc; - int tq_minalloc; - int tq_maxalloc; - kcondvar_t tq_maxalloc_cv; - int tq_maxalloc_wait; - taskq_ent_t *tq_freelist; - taskq_ent_t tq_task; -}; - -static taskq_ent_t * -task_alloc(taskq_t *tq, int tqflags) -{ - taskq_ent_t *t; - int rv; - -again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { - tq->tq_freelist = t->tqent_next; - } else { - if (tq->tq_nalloc >= tq->tq_maxalloc) { - if (!(tqflags & KM_SLEEP)) - return (NULL); - - /* - * We don't want to exceed tq_maxalloc, but we can't - * wait for other tasks to complete (and thus free up - * task structures) without risking deadlock with - * the caller. So, we just delay for one second - * to throttle the allocation rate. 
If we have tasks - * complete before one second timeout expires then - * taskq_ent_free will signal us and we will - * immediately retry the allocation. - */ - tq->tq_maxalloc_wait++; -#ifdef __FreeBSD__ - rv = cv_timedwait(&tq->tq_maxalloc_cv, - &tq->tq_lock, hz); -#else - rv = cv_timedwait(&tq->tq_maxalloc_cv, - &tq->tq_lock, ddi_get_lbolt() + hz); -#endif - tq->tq_maxalloc_wait--; - if (rv > 0) - goto again; /* signaled */ - } - mutex_exit(&tq->tq_lock); - - t = kmem_alloc(sizeof (taskq_ent_t), tqflags & KM_SLEEP); - - mutex_enter(&tq->tq_lock); - if (t != NULL) - tq->tq_nalloc++; - } - return (t); -} - -static void -task_free(taskq_t *tq, taskq_ent_t *t) -{ - if (tq->tq_nalloc <= tq->tq_minalloc) { - t->tqent_next = tq->tq_freelist; - tq->tq_freelist = t; - } else { - tq->tq_nalloc--; - mutex_exit(&tq->tq_lock); - kmem_free(t, sizeof (taskq_ent_t)); - mutex_enter(&tq->tq_lock); - } - - if (tq->tq_maxalloc_wait) - cv_signal(&tq->tq_maxalloc_cv); -} - -taskqid_t -taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) -{ - taskq_ent_t *t; - - if (taskq_now) { - func(arg); - return (1); - } - - mutex_enter(&tq->tq_lock); - ASSERT(tq->tq_flags & TASKQ_ACTIVE); - if ((t = task_alloc(tq, tqflags)) == NULL) { - mutex_exit(&tq->tq_lock); - return (0); - } - if (tqflags & TQ_FRONT) { - t->tqent_next = tq->tq_task.tqent_next; - t->tqent_prev = &tq->tq_task; - } else { - t->tqent_next = &tq->tq_task; - t->tqent_prev = tq->tq_task.tqent_prev; - } - t->tqent_next->tqent_prev = t; - t->tqent_prev->tqent_next = t; - t->tqent_func = func; - t->tqent_arg = arg; - t->tqent_flags = 0; - cv_signal(&tq->tq_dispatch_cv); - mutex_exit(&tq->tq_lock); - return (1); -} - -void -taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, - taskq_ent_t *t) -{ - ASSERT(func != NULL); - ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); - - /* - * Mark it as a prealloc'd task. This is important - * to ensure that we don't free it later. 
- */ - t->tqent_flags |= TQENT_FLAG_PREALLOC; - /* - * Enqueue the task to the underlying queue. - */ - mutex_enter(&tq->tq_lock); - - if (flags & TQ_FRONT) { - t->tqent_next = tq->tq_task.tqent_next; - t->tqent_prev = &tq->tq_task; - } else { - t->tqent_next = &tq->tq_task; - t->tqent_prev = tq->tq_task.tqent_prev; - } - t->tqent_next->tqent_prev = t; - t->tqent_prev->tqent_next = t; - t->tqent_func = func; - t->tqent_arg = arg; - cv_signal(&tq->tq_dispatch_cv); - mutex_exit(&tq->tq_lock); -} - -void -taskq_wait(taskq_t *tq) -{ - mutex_enter(&tq->tq_lock); - while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) - cv_wait(&tq->tq_wait_cv, &tq->tq_lock); - mutex_exit(&tq->tq_lock); -} - -void -taskq_wait_id(taskq_t *tq, taskqid_t id) -{ - taskq_wait(tq); -} - -static void * -taskq_thread(void *arg) -{ - taskq_t *tq = arg; - taskq_ent_t *t; - boolean_t prealloc; - - mutex_enter(&tq->tq_lock); - while (tq->tq_flags & TASKQ_ACTIVE) { - if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { - if (--tq->tq_active == 0) - cv_broadcast(&tq->tq_wait_cv); - cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); - tq->tq_active++; - continue; - } - t->tqent_prev->tqent_next = t->tqent_next; - t->tqent_next->tqent_prev = t->tqent_prev; - t->tqent_next = NULL; - t->tqent_prev = NULL; - prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC; - mutex_exit(&tq->tq_lock); - - rw_enter(&tq->tq_threadlock, RW_READER); - t->tqent_func(t->tqent_arg); - rw_exit(&tq->tq_threadlock); - - mutex_enter(&tq->tq_lock); - if (!prealloc) - task_free(tq, t); - } - tq->tq_nthreads--; - cv_broadcast(&tq->tq_wait_cv); - mutex_exit(&tq->tq_lock); - return (NULL); -} - -/*ARGSUSED*/ -taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, - int minalloc, int maxalloc, uint_t flags) -{ - taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); - int t; - - if (flags & TASKQ_THREADS_CPU_PCT) { - int pct; - ASSERT3S(nthreads, >=, 0); - ASSERT3S(nthreads, <=, 100); - pct = MIN(nthreads, 100); - pct = 
MAX(pct, 0); - - nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; - nthreads = MAX(nthreads, 1); /* need at least 1 thread */ - } else { - ASSERT3S(nthreads, >=, 1); - } - - rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); - mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); - (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1); - tq->tq_flags = flags | TASKQ_ACTIVE; - tq->tq_active = nthreads; - tq->tq_nthreads = nthreads; - tq->tq_minalloc = minalloc; - tq->tq_maxalloc = maxalloc; - tq->tq_task.tqent_next = &tq->tq_task; - tq->tq_task.tqent_prev = &tq->tq_task; - tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); - - if (flags & TASKQ_PREPOPULATE) { - mutex_enter(&tq->tq_lock); - while (minalloc-- > 0) - task_free(tq, task_alloc(tq, KM_SLEEP)); - mutex_exit(&tq->tq_lock); - } - - for (t = 0; t < nthreads; t++) - (void) thr_create(0, 0, taskq_thread, - tq, THR_BOUND, &tq->tq_threadlist[t]); - - return (tq); -} - -void -taskq_destroy(taskq_t *tq) -{ - int t; - int nthreads = tq->tq_nthreads; - - taskq_wait(tq); - - mutex_enter(&tq->tq_lock); - - tq->tq_flags &= ~TASKQ_ACTIVE; - cv_broadcast(&tq->tq_dispatch_cv); - - while (tq->tq_nthreads != 0) - cv_wait(&tq->tq_wait_cv, &tq->tq_lock); - - tq->tq_minalloc = 0; - while (tq->tq_nalloc != 0) { - ASSERT(tq->tq_freelist != NULL); - task_free(tq, task_alloc(tq, KM_SLEEP)); - } - - mutex_exit(&tq->tq_lock); - - for (t = 0; t < nthreads; t++) - (void) thr_join(tq->tq_threadlist[t], NULL, NULL); - - kmem_free(tq->tq_threadlist, nthreads * sizeof (thread_t)); - - rw_destroy(&tq->tq_threadlock); - mutex_destroy(&tq->tq_lock); - cv_destroy(&tq->tq_dispatch_cv); - cv_destroy(&tq->tq_wait_cv); - cv_destroy(&tq->tq_maxalloc_cv); - - kmem_free(tq, sizeof (taskq_t)); -} - -int -taskq_member(taskq_t *tq, void *t) -{ - int i; - - if 
(taskq_now) - return (1); - - for (i = 0; i < tq->tq_nthreads; i++) - if (tq->tq_threadlist[i] == (thread_t)(uintptr_t)t) - return (1); - - return (0); -} - -void -system_taskq_init(void) -{ - system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512, - TASKQ_DYNAMIC | TASKQ_PREPOPULATE); -} - -void -system_taskq_fini(void) -{ - taskq_destroy(system_taskq); - system_taskq = NULL; /* defensive */ -} diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/util.c b/cddl/contrib/opensolaris/lib/libzpool/common/util.c deleted file mode 100644 index d2ed31a46832..000000000000 --- a/cddl/contrib/opensolaris/lib/libzpool/common/util.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Routines needed by more than one client of libzpool. 
- */ - -static void -show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) -{ - vdev_stat_t *vs; - vdev_stat_t v0 = { 0 }; - uint64_t sec; - uint64_t is_log = 0; - nvlist_t **child; - uint_t c, children; - char used[6], avail[6]; - char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; - - if (indent == 0 && desc != NULL) { - (void) printf(" " - " capacity operations bandwidth ---- errors ----\n"); - (void) printf("description " - "used avail read write read write read write cksum\n"); - } - - if (desc != NULL) { - char *suffix = "", *bias = NULL; - char bias_suffix[32]; - - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); - (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, - &bias); - if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) != 0) - vs = &v0; - - if (bias != NULL) { - (void) snprintf(bias_suffix, sizeof (bias_suffix), - " (%s)", bias); - suffix = bias_suffix; - } else if (is_log) { - suffix = " (log)"; - } - - sec = MAX(1, vs->vs_timestamp / NANOSEC); - - nicenum(vs->vs_alloc, used, sizeof (used)); - nicenum(vs->vs_space - vs->vs_alloc, avail, sizeof (avail)); - nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops, sizeof (rops)); - nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops, sizeof (wops)); - nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes, - sizeof (rbytes)); - nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes, - sizeof (wbytes)); - nicenum(vs->vs_read_errors, rerr, sizeof (rerr)); - nicenum(vs->vs_write_errors, werr, sizeof (werr)); - nicenum(vs->vs_checksum_errors, cerr, sizeof (cerr)); - - (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n", - indent, "", - desc, - (int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)), - suffix, - vs->vs_space ? 6 : 0, vs->vs_space ? used : "", - vs->vs_space ? 6 : 0, vs->vs_space ? 
avail : "", - rops, wops, rbytes, wbytes, rerr, werr, cerr); - } - - if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0) - return; - - for (c = 0; c < children; c++) { - nvlist_t *cnv = child[c]; - char *cname, *tname; - uint64_t np; - if (nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &cname) && - nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &cname)) - cname = ""; - tname = calloc(1, strlen(cname) + 2); - (void) strcpy(tname, cname); - if (nvlist_lookup_uint64(cnv, ZPOOL_CONFIG_NPARITY, &np) == 0) - tname[strlen(tname)] = '0' + np; - show_vdev_stats(tname, ctype, cnv, indent + 2); - free(tname); - } -} - -void -show_pool_stats(spa_t *spa) -{ - nvlist_t *config, *nvroot; - char *name; - - VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); - - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); - - show_vdev_stats(name, ZPOOL_CONFIG_CHILDREN, nvroot, 0); - show_vdev_stats(NULL, ZPOOL_CONFIG_L2CACHE, nvroot, 0); - show_vdev_stats(NULL, ZPOOL_CONFIG_SPARES, nvroot, 0); - - nvlist_free(config); -} - -/* - * Sets given global variable in libzpool to given unsigned 32-bit value. - * arg: "=" - */ -int -set_global_var(char *arg) -{ - void *zpoolhdl; - char *varname = arg, *varval; - u_longlong_t val; - -#ifndef _LITTLE_ENDIAN - /* - * On big endian systems changing a 64-bit variable would set the high - * 32 bits instead of the low 32 bits, which could cause unexpected - * results. 
- */ - fprintf(stderr, "Setting global variables is only supported on " - "little-endian systems\n", varname); - return (ENOTSUP); -#endif - if ((varval = strchr(arg, '=')) != NULL) { - *varval = '\0'; - varval++; - val = strtoull(varval, NULL, 0); - if (val > UINT32_MAX) { - fprintf(stderr, "Value for global variable '%s' must " - "be a 32-bit unsigned integer\n", varname); - return (EOVERFLOW); - } - } else { - return (EINVAL); - } - - zpoolhdl = dlopen("libzpool.so", RTLD_LAZY); - if (zpoolhdl != NULL) { - uint32_t *var; - var = dlsym(zpoolhdl, varname); - if (var == NULL) { - fprintf(stderr, "Global variable '%s' does not exist " - "in libzpool.so\n", varname); - return (EINVAL); - } - *var = (uint32_t)val; - - dlclose(zpoolhdl); - } else { - fprintf(stderr, "Failed to open libzpool.so to set global " - "variable\n"); - return (EIO); - } - - return (0); -} diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/zfs.d b/cddl/contrib/opensolaris/lib/libzpool/common/zfs.d deleted file mode 100644 index 1351733c807b..000000000000 --- a/cddl/contrib/opensolaris/lib/libzpool/common/zfs.d +++ /dev/null @@ -1,36 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ - -provider zfs { - probe probe0(char *probename); - probe probe1(char *probename, unsigned long arg1); - probe probe2(char *probename, unsigned long arg1, unsigned long arg2); - probe probe3(char *probename, unsigned long arg1, unsigned long arg2, - unsigned long arg3); - probe probe4(char *probename, unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4); - - probe set__error(int err); -}; - -#pragma D attributes Evolving/Evolving/ISA provider zfs provider -#pragma D attributes Private/Private/Unknown provider zfs module -#pragma D attributes Private/Private/Unknown provider zfs function -#pragma D attributes Evolving/Evolving/ISA provider zfs name -#pragma D attributes Evolving/Evolving/ISA provider zfs args diff --git a/cddl/contrib/opensolaris/tools/ctf/cvt/util.c b/cddl/contrib/opensolaris/tools/ctf/cvt/util.c index fb76cbaeb422..0eda56dbf65a 100644 --- a/cddl/contrib/opensolaris/tools/ctf/cvt/util.c +++ b/cddl/contrib/opensolaris/tools/ctf/cvt/util.c @@ -29,6 +29,7 @@ * Utility functions */ +#include #include #include #include diff --git a/cddl/lib/Makefile b/cddl/lib/Makefile index b65983dd64a7..399e314e76b1 100644 --- a/cddl/lib/Makefile +++ b/cddl/lib/Makefile @@ -6,27 +6,40 @@ SUBDIR= drti \ libavl \ libctf \ libdtrace \ + ${_libicp} \ + ${_libicp_rescue} \ libnvpair \ + libspl \ + ${_libtpool} \ libumem \ libuutil \ ${_libzfs_core} \ ${_libzfs} \ ${_libzpool} \ + ${_libzutil} SUBDIR.${MK_TESTS}+= tests .if ${MK_ZFS} != "no" _libzfs_core= libzfs_core +_libicp= libicp +_libicp_rescue= libicp_rescue _libzfs= libzfs +_libzutil= libzutil .if ${MK_LIBTHR} != "no" _libzpool= libzpool +_libtpool= libtpool .endif .endif +SUBDIR_DEPEND_libctf= libspl SUBDIR_DEPEND_libdtrace= libctf +SUBDIR_DEPEND_libtpool= libspl +SUBDIR_DEPEND_libuutil= libavl libspl SUBDIR_DEPEND_libzfs_core= libnvpair -SUBDIR_DEPEND_libzfs= libavl libnvpair libumem libuutil libzfs_core -SUBDIR_DEPEND_libzpool= libavl libnvpair libumem +SUBDIR_DEPEND_libzfs= 
libavl libnvpair libumem libuutil libzfs_core libzutil +SUBDIR_DEPEND_libzpool= libavl libnvpair libumem libicp +SUBDIR_DEPEND_libzutil= libavl libtpool SUBDIR_PARALLEL= diff --git a/cddl/lib/drti/Makefile b/cddl/lib/drti/Makefile index 19cbd748d234..eeb1c865e632 100644 --- a/cddl/lib/drti/Makefile +++ b/cddl/lib/drti/Makefile @@ -11,7 +11,14 @@ FILESDIR= ${LIBDIR}/dtrace CLEANFILES= ${FILES} # These FILES qualify as libraries for the purpose of LIBRARIES_ONLY. .undef LIBRARIES_ONLY - +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ diff --git a/cddl/lib/libavl/Makefile b/cddl/lib/libavl/Makefile index 995e4d013e50..2f7b9ad30856 100644 --- a/cddl/lib/libavl/Makefile +++ b/cddl/lib/libavl/Makefile @@ -1,12 +1,15 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/avl +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/avl PACKAGE= runtime LIB= avl SRCS= avl.c WARNS?= 3 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common - +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h .include diff --git a/cddl/lib/libctf/Makefile b/cddl/lib/libctf/Makefile index ed7ebc5351de..b45bda3385e6 100644 --- a/cddl/lib/libctf/Makefile +++ 
b/cddl/lib/libctf/Makefile @@ -21,6 +21,14 @@ MAN= ctf.5 WARNS?= 2 CFLAGS+= -DCTF_OLD_VERSIONS +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID + CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -28,6 +36,6 @@ CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${OPENSOLARIS_USR_DISTDIR}/lib/libctf/common \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common -LIBADD+= z +LIBADD+= spl z .include diff --git a/cddl/lib/libdtrace/Makefile b/cddl/lib/libdtrace/Makefile index 11565b03b0ac..e0125cad2b38 100644 --- a/cddl/lib/libdtrace/Makefile +++ b/cddl/lib/libdtrace/Makefile @@ -66,6 +66,16 @@ FILESMODE= ${NOBINMODE} WARNS?= 1 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID + + CFLAGS+= -I${.OBJDIR} -I${.CURDIR} \ -I${SRCTOP}/sys/cddl/dev/dtrace/${MACHINE_ARCH} \ -I${SRCTOP}/sys/cddl/compat/opensolaris \ diff --git a/cddl/lib/libicp/Makefile b/cddl/lib/libicp/Makefile new file mode 100644 index 000000000000..36858338ac6b --- /dev/null +++ b/cddl/lib/libicp/Makefile @@ -0,0 +1,101 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp + +PACKAGE= runtime +LIB= icp +LIBADD= + + +.if ${MACHINE_ARCH} == "amd64" +ASM_SOURCES_C = asm-x86_64/aes/aeskey.c +ASM_SOURCES_AS = \ + 
asm-x86_64/aes/aes_amd64.S \ + asm-x86_64/aes/aes_aesni.S \ + asm-x86_64/modes/gcm_pclmulqdq.S \ + asm-x86_64/modes/aesni-gcm-x86_64.S \ + asm-x86_64/modes/ghash-x86_64.S \ + asm-x86_64/sha1/sha1-x86_64.S \ + asm-x86_64/sha2/sha256_impl.S \ + asm-x86_64/sha2/sha512_impl.S + +CFLAGS+= -D__amd64 -D_SYS_STACK_H -UHAVE_AES +.else +ASM_SOURCES_C = +ASM_SOURCES_AS = +.endif + + +KERNEL_C = \ + spi/kcf_spi.c \ + api/kcf_ctxops.c \ + api/kcf_digest.c \ + api/kcf_cipher.c \ + api/kcf_miscapi.c \ + api/kcf_mac.c \ + algs/aes/aes_impl_aesni.c \ + algs/aes/aes_impl_generic.c \ + algs/aes/aes_impl_x86-64.c \ + algs/aes/aes_impl.c \ + algs/aes/aes_modes.c \ + algs/edonr/edonr.c \ + algs/modes/modes.c \ + algs/modes/cbc.c \ + algs/modes/gcm_generic.c \ + algs/modes/gcm_pclmulqdq.c \ + algs/modes/gcm.c \ + algs/modes/ctr.c \ + algs/modes/ccm.c \ + algs/modes/ecb.c \ + algs/sha1/sha1.c \ + algs/sha2/sha2.c \ + algs/skein/skein.c \ + algs/skein/skein_block.c \ + algs/skein/skein_iv.c \ + illumos-crypto.c \ + io/aes.c \ + io/edonr_mod.c \ + io/sha1_mod.c \ + io/sha2_mod.c \ + io/skein_mod.c \ + os/modhash.c \ + os/modconf.c \ + core/kcf_sched.c \ + core/kcf_prov_lib.c \ + core/kcf_callprov.c \ + core/kcf_mech_tabs.c \ + core/kcf_prov_tabs.c \ + $(ASM_SOURCES_C) + + + + + + +SRCS= $(ASM_SOURCES_AS) $(KERNEL_C) + +WARNS?= 2 +SHLIB_MAJOR= 3 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + + +CFLAGS.aes_amd64.S+= -DLOCORE +CFLAGS.aes_aesni.S+= -DLOCORE +CFLAGS.gcm_pclmulqdq.S+= -DLOCORE +CFLAGS.aesni-gcm-x86_64.S+= 
-DLOCORE +CFLAGS.ghash-x86_64.S+= -DLOCORE +CFLAGS.sha1-x86_64.S+= -DLOCORE +CFLAGS.sha256_impl.S+= -DLOCORE +CFLAGS.sha512_impl.S+= -DLOCORE + +.include diff --git a/cddl/lib/libicp_rescue/Makefile b/cddl/lib/libicp_rescue/Makefile new file mode 100644 index 000000000000..1ebe1b0ff649 --- /dev/null +++ b/cddl/lib/libicp_rescue/Makefile @@ -0,0 +1,99 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp + +PACKAGE= runtime +LIB= icp_rescue +LIBADD= + + +.if ${MACHINE_ARCH} == "amd64" +ASM_SOURCES_C = asm-x86_64/aes/aeskey.c +ASM_SOURCES_AS = \ + asm-x86_64/aes/aes_amd64.S \ + asm-x86_64/aes/aes_aesni.S \ + asm-x86_64/modes/gcm_pclmulqdq.S \ + asm-x86_64/modes/aesni-gcm-x86_64.S \ + asm-x86_64/sha1/sha1-x86_64.S \ + asm-x86_64/sha2/sha256_impl.S \ + asm-x86_64/sha2/sha512_impl.S + +CFLAGS+= -D__amd64 -D_SYS_STACK_H +.else +ASM_SOURCES_C = +ASM_SOURCES_AS = +.endif + + +KERNEL_C = \ + spi/kcf_spi.c \ + api/kcf_ctxops.c \ + api/kcf_digest.c \ + api/kcf_cipher.c \ + api/kcf_miscapi.c \ + api/kcf_mac.c \ + algs/aes/aes_impl_aesni.c \ + algs/aes/aes_impl_generic.c \ + algs/aes/aes_impl_x86-64.c \ + algs/aes/aes_impl.c \ + algs/aes/aes_modes.c \ + algs/edonr/edonr.c \ + algs/modes/modes.c \ + algs/modes/cbc.c \ + algs/modes/gcm_generic.c \ + algs/modes/gcm_pclmulqdq.c \ + algs/modes/gcm.c \ + algs/modes/ctr.c \ + algs/modes/ccm.c \ + algs/modes/ecb.c \ + algs/sha1/sha1.c \ + algs/sha2/sha2.c \ + algs/skein/skein_block.c \ + illumos-crypto.c \ + io/aes.c \ + io/edonr_mod.c \ + io/sha1_mod.c \ + io/sha2_mod.c \ + io/skein_mod.c \ + os/modhash.c \ + os/modconf.c \ + core/kcf_sched.c \ + core/kcf_prov_lib.c \ + core/kcf_callprov.c \ + core/kcf_mech_tabs.c \ + core/kcf_prov_tabs.c \ + $(ASM_SOURCES_C) + + + + + + +SRCS= $(ASM_SOURCES_AS) $(KERNEL_C) + +WARNS?= 2 +SHLIB_MAJOR= 3 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= 
-I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -UHAVE_AVX -DRESCUE +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + + +CFLAGS.aes_amd64.S+= -DLOCORE +CFLAGS.aes_aesni.S+= -DLOCORE +CFLAGS.gcm_pclmulqdq.S+= -DLOCORE +CFLAGS.aesni-gcm-x86_64.S+= -DLOCORE +CFLAGS.ghash-x86_64.S+= -DLOCORE +CFLAGS.sha1-x86_64.S+= -DLOCORE +CFLAGS.sha256_impl.S+= -DLOCORE +CFLAGS.sha512_impl.S+= -DLOCORE +CFLAGS.gcm.c+= -UCAN_USE_GCM_ASM + +.include diff --git a/cddl/lib/libnvpair/Makefile b/cddl/lib/libnvpair/Makefile index 90c3295d5048..670253eff7c1 100644 --- a/cddl/lib/libnvpair/Makefile +++ b/cddl/lib/libnvpair/Makefile @@ -1,36 +1,30 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/nvpair +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/nvpair +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libnvpair LIB= nvpair PACKAGE= runtime -INCS= libnvpair.h +# user SRCS= libnvpair.c \ - nvpair_alloc_system.c \ - nvpair_json.c \ - opensolaris_fnvpair.c \ - opensolaris_nvpair.c \ - opensolaris_nvpair_alloc_fixed.c + libnvpair_json.c \ + nvpair_alloc_system.c +# kernel +SRCS+= nvpair_alloc_fixed.c \ + nvpair.c \ + fnvpair.c -WARNS?= 1 -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs +WARNS?= 2 +CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= 
-I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -DHAVE_CONFIG_H -DHAVE_XDR_BYTESREC -# This library uses macros to define fprintf behavior for several object types -# The compiler will see the non-string literal arguments to the fprintf calls and -# omit warnings for them. Quiesce these warnings in contrib code: -# -# cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c:743:12: warning: format -# string is not a string literal (potentially insecure) [-Wformat-security] -# ARENDER(pctl, nvlist_array, nvl, name, val, nelem); -# -CFLAGS+= -Wno-format-security + +CFLAGS.nvpair.c+= -UHAVE_RPC_TYPES .include diff --git a/cddl/lib/libspl/Makefile b/cddl/lib/libspl/Makefile new file mode 100644 index 000000000000..64317c41b730 --- /dev/null +++ b/cddl/lib/libspl/Makefile @@ -0,0 +1,56 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/os/freebsd +.PATH: ${SRCTOP}/sys/contrib/openzfs/include + + +LIB= spl +LIBADD= +PACKAGE= runtime + +SRCS = \ + assert.c \ + list.c \ + mkdirp.c \ + page.c \ + strlcat.c \ + strlcpy.c \ + timestamp.c \ + zone.c \ + include/sys/list.h \ + include/sys/list_impl.h + +SRCS += \ + getexecname.c \ + gethostid.c \ + getmntany.c \ + mnttab.c + + +.if ${MACHINE_ARCH} == "amd64" +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-x86_64 +SRCS += atomic.S +.elif ${MACHINE_ARCH} == "i386" +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-i386 +SRCS += atomic.S +.else +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-generic +SRCS += atomic.c +.endif + + +WARNS?= 2 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= 
-I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS.atomic.S+= -DLOCORE + +.include diff --git a/cddl/lib/libtpool/Makefile b/cddl/lib/libtpool/Makefile new file mode 100644 index 000000000000..637385bc842e --- /dev/null +++ b/cddl/lib/libtpool/Makefile @@ -0,0 +1,27 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libtpool +.PATH: ${SRCTOP}/sys/contrib/openzfs/include + + +LIB= tpool +LIBADD= spl +PACKAGE= runtime + +INCS= thread_pool_impl.h +SRCS= thread_pool.c + +WARNS?= 2 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +.include diff --git a/cddl/lib/libuutil/Makefile b/cddl/lib/libuutil/Makefile index 63997c70c236..76567b50c610 100644 --- a/cddl/lib/libuutil/Makefile +++ b/cddl/lib/libuutil/Makefile @@ -1,11 +1,10 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/avl +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libuutil PACKAGE= runtime LIB= uutil -SRCS= avl.c \ +SRCS=\ uu_alloc.c \ uu_avl.c \ uu_dprintf.c \ @@ -14,14 +13,17 @@ SRCS= avl.c \ uu_misc.c \ uu_open.c \ uu_pname.c \ - uu_strtoint.c + uu_string.c -WARNS?= 1 -CFLAGS+= 
-DNATIVE_BUILD -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h + +LIBADD= avl spl .include diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile index b42365cf0d93..cd2d2b99deb0 100644 --- a/cddl/lib/libzfs/Makefile +++ b/cddl/lib/libzfs/Makefile @@ -1,62 +1,102 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/compat/opensolaris/misc -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils/common +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs/os/freebsd +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare/os/freebsd +.PATH: ${SRCTOP}/sys/contrib/openzfs/include +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd/lib PACKAGE= runtime LIB= zfs -LIBADD= md pthread umem util uutil m avl bsdxml geom nvpair z zfs_core -SRCS= deviceid.c \ - fsshare.c \ - mkdirp.c \ - mnttab.c \ - thread_pool.c \ - zmount.c \ - zone.c +LIBADD= md pthread umem util uutil m avl bsdxml geom nvpair z zfs_core zutil -SRCS+= nicenum.c +INCS= libzfs.h +USER_C = \ + libzfs_changelist.c \ + libzfs_config.c \ + 
libzfs_crypto.c \ + libzfs_dataset.c \ + libzfs_diff.c \ + libzfs_import.c \ + libzfs_iter.c \ + libzfs_mount.c \ + libzfs_pool.c \ + libzfs_sendrecv.c \ + libzfs_status.c \ + libzfs_util.c -SRCS+= libzfs_changelist.c \ - libzfs_compat.c \ - libzfs_config.c \ - libzfs_dataset.c \ - libzfs_diff.c \ - libzfs_import.c \ - libzfs_iter.c \ - libzfs_mount.c \ - libzfs_pool.c \ - libzfs_sendrecv.c \ - libzfs_status.c \ - libzfs_util.c \ - zfeature_common.c \ - zfs_comutil.c \ - zfs_deleg.c \ - zfs_fletcher.c \ - zfs_namecheck.c \ - zfs_prop.c \ - zpool_prop.c \ - zprop_common.c \ +# FreeBSD +USER_C += \ + libzfs_compat.c \ + libzfs_ioctl_compat.c \ + libzfs_zmount.c -WARNS?= 0 -SHLIB_MAJOR= 3 +# libshare +USER_C += \ + libshare.c \ + nfs.c \ + smb.c + + +KERNEL_C = \ + algs/sha2/sha2.c \ + cityhash.c \ + zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zfs_uio.c \ + zpool_prop.c \ + zprop_common.c + + +KERNEL_C+= zstd.c \ + zfs_zstd.c + + +ARCH_C = +.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" +ARCH_C += zfs_fletcher_intel.c \ + zfs_fletcher_sse.c +CFLAGS += -DHAVE_SSE2 +.endif +.if ${MACHINE_ARCH} == "amd64" +ARCH_C += zfs_fletcher_avx512.c +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F +.endif +.if ${MACHINE_ARCH} == "aarch64" +ARCH_C += zfs_fletcher_aarch64_neon.c +.endif + +SRCS= $(USER_C) $(KERNEL_C) $(ARCH_C) + +WARNS?= 2 +SHLIB_MAJOR= 4 CSTD= c99 -CFLAGS+= -DZFS_NO_ACL -CFLAGS+= -I${SRCTOP}/sbin/mount -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare +CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include +CFLAGS+= 
-I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -I${SRCDIR}/sys/contrib/openzfs/module/zstd/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zstd.c= -fno-tree-vectorize + .include diff --git a/cddl/lib/libzfs_core/Makefile b/cddl/lib/libzfs_core/Makefile index 412a5d2be938..52747bfcc2d8 100644 --- a/cddl/lib/libzfs_core/Makefile +++ b/cddl/lib/libzfs_core/Makefile @@ -1,37 +1,28 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/compat/opensolaris/misc -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core +.PATH: ${SRCTOP}/sys/contrib/openzfs/include + LIB= zfs_core LIBADD= nvpair PACKAGE= runtime INCS= libzfs_core.h -SRCS= libzfs_core.c \ - libzfs_core_compat.c \ - zfs_ioctl_compat.c +SRCS= libzfs_core.c -SRCS+= libzfs_compat.c - -WARNS?= 0 +WARNS?= 2 CSTD= c99 
-CFLAGS+= -DZFS_NO_ACL -CFLAGS+= -I${SRCTOP}/sbin/mount -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core/common +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h .include diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile index 576b89b7725a..e1090c628c1f 100644 --- a/cddl/lib/libzpool/Makefile +++ b/cddl/lib/libzpool/Makefile @@ -1,20 +1,17 @@ # $FreeBSD$ -.include "${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/Makefile.files" # ZFS_COMMON_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/unicode # LUA_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua -# ZFS_SHARED_SRCS -.PATH: 
${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -# LZ4_COMMON_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/lz4 -# KERNEL_SRCS -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -# LIST_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/os -# ATOMIC_SRCS +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/lua + +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/linux/zfs + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzpool + .if exists(${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S) .PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH} ATOMIC_SRCS= opensolaris_atomic.S @@ -23,40 +20,218 @@ ACFLAGS+= -Wa,--noexecstack .PATH: ${SRCTOP}/sys/cddl/compat/opensolaris/kern ATOMIC_SRCS= opensolaris_atomic.c .endif -# UNICODE_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/unicode -# LIBCMDUTILS_SRCS -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils/common + +.if ${MACHINE_ARCH} == "powerpc" +# Don't waste GOT entries on small data. 
+PICFLAG= -fPIC +.endif LIB= zpool -ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} trim_map.c -ZFS_SHARED_SRCS= ${ZFS_SHARED_OBJS:C/.o$/.c/} -LZ4_COMMON_SRCS= lz4.c -LUA_SRCS= ${LUA_OBJS:C/.o$/.c/} -KERNEL_SRCS= kernel.c taskq.c util.c -LIST_SRCS= list.c -UNICODE_SRCS= u8_textprep.c -LIBCMDUTILS_SRCS=nicenum.c -SRCS= ${ZFS_COMMON_SRCS} ${ZFS_SHARED_SRCS} ${LUA_SRCS} \ - ${LZ4_COMMON_SRCS} ${KERNEL_SRCS} ${LIST_SRCS} ${ATOMIC_SRCS} \ - ${UNICODE_SRCS} ${LIBCMDUTILS_SRCS} -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/lz4 -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils +USER_C = \ + kernel.c \ + taskq.c \ + util.c + +KERNEL_C = \ + zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zfs_uio.c \ + zpool_prop.c \ + zprop_common.c \ + abd.c \ + abd_os.c \ + aggsum.c \ + arc.c \ + arc_os.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + bptree.c \ + btree.c \ + bqueue.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_bookmark.c \ + dsl_dataset.c \ + 
dsl_deadlist.c \ + dsl_deleg.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_destroy.c \ + dsl_userhold.c \ + edonr_zfs.c \ + hkdf.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + trace.c \ + uberblock.c \ + unique.c \ + vdev.c \ + vdev_cache.c \ + vdev_file.c \ + vdev_indirect_births.c \ + vdev_indirect.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math_aarch64_neon.c \ + vdev_raidz_math_aarch64_neonx2.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_rebuild.c \ + vdev_removal.c \ + vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_debug.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_sa.c \ + zfs_znode.c \ + zfs_ratelimit.c \ + zfs_rlock.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_crypt.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c + +ARCH_C = +.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" +ARCH_C += vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c +CFLAGS += -DHAVE_SSE2 -DHAVE_SSE3 +.endif +.if ${MACHINE_ARCH} == "amd64" +ARCH_C += zfs_fletcher_avx512.c +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F \ + -DHAVE_AVX512BW +.endif +.if ${MACHINE_ARCH} == "aarch64" +ARCH_C += zfs_fletcher_aarch64_neon.c +.endif + +LUA_C 
= \ + lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +UNICODE_C = u8_textprep.c uconv.c + +SRCS= ${USER_C} ${KERNEL_C} ${LUA_C} ${UNICODE_C} ${ARCH_C} + +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS+= -I${SRCTOP}/sys/modules/zfs +CFLAGS+= -DLIB_ZPOOL_BUILD -DZFS_DEBUG + + # XXX: pthread doesn't have mutex_owned() equivalent, so we need to look # into libthr private structures. That's sooo evil, but it's only for # ZFS debugging tools needs. @@ -64,10 +239,9 @@ CFLAGS+= -DWANTS_MUTEX_OWNED CFLAGS+= -I${SRCTOP}/lib/libpthread/thread CFLAGS+= -I${SRCTOP}/lib/libpthread/sys CFLAGS+= -I${SRCTOP}/lib/libthr/arch/${MACHINE_CPUARCH}/include -CFLAGS.lz4.c+= -D_FAKE_KERNEL CFLAGS.gcc+= -fms-extensions -LIBADD= md pthread z nvpair avl umem +LIBADD= md pthread z spl icp nvpair avl umem # atomic.S doesn't like profiling. 
MK_PROFILE= no diff --git a/cddl/lib/libzpool/Makefile.depend b/cddl/lib/libzpool/Makefile.depend index 97914fc35322..06045f57b437 100644 --- a/cddl/lib/libzpool/Makefile.depend +++ b/cddl/lib/libzpool/Makefile.depend @@ -3,6 +3,7 @@ DIRDEPS = \ cddl/lib/libavl \ + cddl/lib/libicp \ cddl/lib/libnvpair \ cddl/lib/libumem \ gnu/lib/csu \ diff --git a/cddl/lib/libzutil/Makefile b/cddl/lib/libzutil/Makefile new file mode 100644 index 000000000000..7aea9da14e90 --- /dev/null +++ b/cddl/lib/libzutil/Makefile @@ -0,0 +1,42 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil/os/freebsd +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/freebsd/zfs + +LIB= zutil +LIBADD= avl tpool +PACKAGE= runtime + +INCS = zutil_import.h + +SRCS = \ + zutil_device_path.c \ + zutil_import.c \ + zutil_import.h \ + zutil_nicenum.c \ + zutil_pool.c + +SRCS += \ + zutil_device_path_os.c \ + zutil_import_os.c \ + zutil_compat.c + +SRCS += zfs_ioctl_compat.c + + +WARNS?= 2 +CSTD= c99 + +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzutil +CFLAGS+= -DHAVE_ISSETUGID -DIN_BASE +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +.include diff --git a/cddl/sbin/zfs/Makefile b/cddl/sbin/zfs/Makefile index e3645bd362b7..40f713d069c8 100644 --- a/cddl/sbin/zfs/Makefile +++ b/cddl/sbin/zfs/Makefile @@ -1,27 +1,77 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zfs +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs + +.PATH: ${ZFSTOP}/cmd/zfs +.PATH: ${ZFSTOP}/man/man8 +.PATH: ${ZFSTOP}/module/os/freebsd/spl PACKAGE= 
runtime PROG= zfs -MAN= zfs.8 zfs-program.8 -SRCS= zfs_main.c zfs_iter.c +MAN= \ + zfs.8 \ + zfs-allow.8 \ + zfs-bookmark.8 \ + zfs-change-key.8 \ + zfs-clone.8 \ + zfs-create.8 \ + zfs-destroy.8 \ + zfs-diff.8 \ + zfs-get.8 \ + zfs-groupspace.8 \ + zfs-hold.8 \ + zfs-inherit.8 \ + zfs-jail.8 \ + zfs-list.8 \ + zfs-load-key.8 \ + zfs-mount.8 \ + zfs-program.8 \ + zfs-project.8 \ + zfs-projectspace.8 \ + zfs-promote.8 \ + zfs-receive.8 \ + zfs-recv.8 \ + zfs-redact.8 \ + zfs-release.8 \ + zfs-rename.8 \ + zfs-rollback.8 \ + zfs-send.8 \ + zfs-set.8 \ + zfs-share.8 \ + zfs-snapshot.8 \ + zfs-unallow.8 \ + zfs-unjail.8 \ + zfs-unload-key.8 \ + zfs-unmount.8 \ + zfs-upgrade.8 \ + zfs-userspace.8 \ + zfs-wait.8 \ + zfsconcepts.8 \ + zfsprops.8 +SRCS= \ + zfs_iter.c \ + zfs_iter.h \ + zfs_main.c \ + zfs_util.h \ + zfs_project.c \ + zfs_projectutil.h -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs +WARNS?= 2 -LIBADD= jail nvpair uutil zfs_core zfs +CFLAGS+= \ + -DIN_BASE \ + -I${ZFSTOP}/include \ + -I${ZFSTOP}/include/os/freebsd \ + -I${ZFSTOP}/lib/libspl/include \ + -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys \ + -I${SRCTOP}/cddl/compat/opensolaris/include \ + -I${ZFSTOP}/module/icp/include \ + 
-include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ + -DHAVE_ISSETUGID \ + -include ${SRCTOP}/sys/modules/zfs/zfs_config.h \ + -I${SRCTOP}/sys/modules/zfs +LIBADD= jail avl nvpair geom uutil zfs_core spl tpool zutil zfs m crypto +LDADD+= -pthread .include diff --git a/cddl/sbin/zpool/Makefile b/cddl/sbin/zpool/Makefile index f4a3b9b6f5cc..6928cb132780 100644 --- a/cddl/sbin/zpool/Makefile +++ b/cddl/sbin/zpool/Makefile @@ -1,32 +1,76 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zpool -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/stat/common -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs + +.PATH: ${ZFSTOP}/man/man5 +.PATH: ${ZFSTOP}/man/man8 +.PATH: ${ZFSTOP}/cmd/zpool +.PATH: ${ZFSTOP}/cmd/zpool/os/freebsd + PACKAGE= runtime PROG= zpool -MAN= zpool.8 zpool-features.7 -SRCS= zpool_main.c zpool_vdev.c zpool_iter.c zpool_util.c zfs_comutil.c -SRCS+= timestamp.c +MAN= \ + spl-module-parameters.5 \ + zfs-module-parameters.5 \ + zpool.8 \ + zpool-add.8 \ + zpool-attach.8 \ + zpool-checkpoint.8 \ + zpool-clear.8 \ + zpool-create.8 \ + zpool-destroy.8 \ + zpool-detach.8 \ + zpool-events.8 \ + zpool-export.8 \ + zpool-features.5 \ + zpool-get.8 \ + zpool-history.8 \ + zpool-import.8 \ + zpool-initialize.8 \ + zpool-iostat.8 \ + zpool-labelclear.8 \ + zpool-list.8 \ + zpool-offline.8 \ + zpool-online.8 \ + zpool-reguid.8 \ + zpool-remove.8 \ + zpool-reopen.8 \ + zpool-replace.8 \ + zpool-resilver.8 \ + zpool-scrub.8 \ + zpool-set.8 \ + zpool-split.8 \ + zpool-status.8 \ + zpool-sync.8 \ + zpool-trim.8 \ + zpool-upgrade.8 \ + zpool-wait.8 \ + zpoolconcepts.8 \ + zpoolprops.8 +SRCS= \ + zpool_iter.c \ + zpool_main.c \ + zpool_util.c \ + zpool_util.h \ + zpool_vdev.c \ + zpool_vdev_os.c -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= 
-I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/cmd/stat/common +WARNS?= 2 -LIBADD= geom nvpair uutil zfs +CFLAGS+= \ + -DIN_BASE \ + -I${ZFSTOP}/include \ + -I${ZFSTOP}/lib/libspl/include \ + -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys \ + -I${SRCTOP}/cddl/compat/opensolaris/include \ + -I${ZFSTOP}/cmd/zpool \ + -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ + -DHAVE_ISSETUGID \ + -include ${SRCTOP}/sys/modules/zfs/zfs_config.h \ + -DSYSCONFDIR=\"/etc\" +LIBADD= geom nvpair uutil zfs zutil avl spl tpool zfs_core m +LDADD+= -pthread .include diff --git a/cddl/usr.bin/Makefile b/cddl/usr.bin/Makefile index 573f99b55bd8..3416244b5b0b 100644 --- a/cddl/usr.bin/Makefile +++ b/cddl/usr.bin/Makefile @@ -7,7 +7,7 @@ SUBDIR= \ ctfdump \ ctfmerge \ ${_zinject} \ - ${_zlook} \ + ${_zstream} \ ${_zstreamdump} \ ${_ztest} @@ -15,10 +15,9 @@ SUBDIR.${MK_TESTS}+= tests .if ${MK_ZFS} != "no" _zinject= zinject -#_zlook= zlook .if ${MK_LIBTHR} != "no" _ztest= ztest -_zstreamdump = zstreamdump +_zstream = zstream .endif .endif diff --git a/cddl/usr.bin/ctfconvert/Makefile b/cddl/usr.bin/ctfconvert/Makefile index 42061c1fa3d3..699b1985e246 100644 --- a/cddl/usr.bin/ctfconvert/Makefile +++ b/cddl/usr.bin/ctfconvert/Makefile @@ -27,6 +27,12 @@ SRCS= alist.c \ traverse.c \ util.c +CFLAGS+= 
-DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR} \ @@ -35,8 +41,9 @@ CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/cvt \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common +CFLAGS+= -DHAVE_ISSETUGID -LIBADD= dwarf elf z pthread +LIBADD= spl dwarf elf z pthread HAS_TESTS= SUBDIR.${MK_TESTS}+= tests diff --git a/cddl/usr.bin/ctfconvert/Makefile.depend b/cddl/usr.bin/ctfconvert/Makefile.depend index 44e570745e22..eb40124b7c90 100644 --- a/cddl/usr.bin/ctfconvert/Makefile.depend +++ b/cddl/usr.bin/ctfconvert/Makefile.depend @@ -5,6 +5,7 @@ DIRDEPS = \ gnu/lib/csu \ include \ include/xlocale \ + cddl/lib/libspl \ lib/${CSU_DIR} \ lib/libc \ lib/libcompiler_rt \ diff --git a/cddl/usr.bin/ctfdump/Makefile b/cddl/usr.bin/ctfdump/Makefile index 953acfd3abb8..e7347b272dac 100644 --- a/cddl/usr.bin/ctfdump/Makefile +++ b/cddl/usr.bin/ctfdump/Makefile @@ -8,6 +8,13 @@ SRCS= dump.c \ symbol.c \ utils.c +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${OPENSOLARIS_USR_DISTDIR} \ -I${OPENSOLARIS_SYS_DISTDIR} \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -16,6 +23,7 @@ CFLAGS+= -I${OPENSOLARIS_USR_DISTDIR} \ -I${SRCTOP}/cddl/compat/opensolaris/include \ 
-I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common +CFLAGS+= -DHAVE_ISSETUGID LIBADD= elf z diff --git a/cddl/usr.bin/ctfmerge/Makefile b/cddl/usr.bin/ctfmerge/Makefile index c863ea0edf87..20e79463fea5 100644 --- a/cddl/usr.bin/ctfmerge/Makefile +++ b/cddl/usr.bin/ctfmerge/Makefile @@ -24,6 +24,13 @@ SRCS= alist.c \ WARNS?= 1 + +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR} \ @@ -32,7 +39,8 @@ CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/cvt \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common +CFLAGS+= -DHAVE_ISSETUGID -LIBADD= elf z pthread +LIBADD= spl elf z pthread .include diff --git a/cddl/usr.bin/zinject/Makefile b/cddl/usr.bin/zinject/Makefile index 5cfd7e29edb0..fcc472c24214 100644 --- a/cddl/usr.bin/zinject/Makefile +++ b/cddl/usr.bin/zinject/Makefile @@ -1,24 +1,28 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zinject +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs + +.PATH: ${ZFSTOP}/cmd/zinject +.PATH: ${ZFSTOP}/man/man8 PROG= zinject +INCS= zinject.h SRCS= zinject.c translate.c -MAN= +MAN= zinject.8 -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= 
-I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs/ -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +WARNS?= 2 +CFLAGS+= \ + -DIN_BASE \ + -I${ZFSTOP}/include \ + -I${ZFSTOP}/lib/libspl/include \ + -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys \ + -I${SRCTOP}/cddl/compat/opensolaris/include \ + -I${ZFSTOP}/module/icp/include \ + -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ + -DHAVE_ISSETUGID \ + -include ${SRCTOP}/sys/modules/zfs/zfs_config.h -LIBADD= geom m nvpair umem uutil zfs_core zfs zpool +LIBADD= geom m nvpair umem uutil avl spl zfs_core zfs zutil zpool .include diff --git a/cddl/usr.bin/zlook/Makefile b/cddl/usr.bin/zlook/Makefile deleted file mode 100644 index e36be05e7b71..000000000000 --- a/cddl/usr.bin/zlook/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# $FreeBSD$ - -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zlook - -PROG= zlook -MAN= - -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common - -.include diff --git a/cddl/usr.bin/zstream/Makefile b/cddl/usr.bin/zstream/Makefile new file mode 100644 index 000000000000..d6ac8c5f3b16 --- /dev/null +++ b/cddl/usr.bin/zstream/Makefile @@ -0,0 +1,32 @@ +# $FreeBSD$ + +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs + +.PATH: ${ZFSTOP}/cmd/zstream +.PATH: ${ZFSTOP}/man/man8 + +PROG= zstream +MAN= zstream.8 +INCS= zstream.h +SRCS= \ + zstream.c \ + zstream_dump.c \ + zstream_redup.c \ + zstream_token.c + +WARNS?= 2 +CFLAGS+= \ + -DIN_BASE \ + -I${ZFSTOP}/include \ + -I${ZFSTOP}/lib/libspl/include \ + -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys \ + -I${SRCTOP}/cddl/compat/opensolaris/include \ + -I${ZFSTOP}/module/icp/include \ + -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ + -DHAVE_ISSETUGID \ + 
-include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +LIBADD= geom m nvpair umem uutil avl spl zfs_core zfs zutil zpool + +.include diff --git a/cddl/usr.bin/zstreamdump/Makefile b/cddl/usr.bin/zstreamdump/Makefile index 69fd399e14d2..63f365d0445a 100644 --- a/cddl/usr.bin/zstreamdump/Makefile +++ b/cddl/usr.bin/zstreamdump/Makefile @@ -1,23 +1,11 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zstreamdump +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs -PROG= zstreamdump -MAN= zstreamdump.1 +.PATH: ${ZFSTOP}/cmd/zstreamdump +.PATH: ${ZFSTOP}/man/man8 -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head - -LIBADD= m nvpair umem zpool zfs pthread z avl - -CSTD= c99 +SCRIPTS= zstreamdump +MAN= zstreamdump.8 .include diff --git a/cddl/usr.bin/ztest/Makefile b/cddl/usr.bin/ztest/Makefile index 5b44aa2d8097..de8ae801bb76 100644 --- a/cddl/usr.bin/ztest/Makefile +++ b/cddl/usr.bin/ztest/Makefile @@ -1,30 +1,33 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/ztest +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs + +.PATH: ${ZFSTOP}/cmd/ztest +.PATH: ${ZFSTOP}/man/man1 PROG= ztest -MAN= +MAN= ztest.1 -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= 
-I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +WARNS?= 2 +CFLAGS+= \ + -DIN_BASE \ + -I${ZFSTOP}/include \ + -I${ZFSTOP}/lib/libspl/include \ + -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${SRCTOP}/sys \ + -I${SRCTOP}/cddl/compat/opensolaris/include \ + -I${ZFSTOP}/module/icp/include \ + -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ + -DHAVE_ISSETUGID \ + -include ${SRCTOP}/sys/modules/zfs/zfs_config.h -LIBADD= geom m nvpair umem zpool pthread avl zfs_core zfs uutil +LIBADD= geom m nvpair umem zpool pthread avl zfs_core spl zutil zfs uutil icp CSTD= c99 # Since there are many asserts in this program, it makes no sense to compile # it without debugging. -CFLAGS+= -g -DDEBUG=1 -Wno-format +CFLAGS+= -g -DDEBUG=1 -Wno-format -DZFS_DEBUG=1 CFLAGS.gcc+= -fms-extensions HAS_TESTS= diff --git a/cddl/usr.sbin/dtrace/Makefile b/cddl/usr.sbin/dtrace/Makefile index 501b7c14861a..19a55d5dc1be 100644 --- a/cddl/usr.sbin/dtrace/Makefile +++ b/cddl/usr.sbin/dtrace/Makefile @@ -10,6 +10,13 @@ BINDIR?= /usr/sbin WARNS?= 1 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -17,12 +24,13 @@ CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${OPENSOLARIS_USR_DISTDIR}/lib/libproc/common \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common \ -I${OPENSOLARIS_SYS_DISTDIR}/compat +CFLAGS+= 
-DHAVE_ISSETUGID # Optional debugging stuff... #CFLAGS+= -DNEED_ERRLOC #YFLAGS+= -d -LIBADD= dtrace ctf elf proc +LIBADD= dtrace ctf elf proc spl .if ${MK_DTRACE_TESTS} != "no" SUBDIR+= tests diff --git a/cddl/usr.sbin/lockstat/Makefile b/cddl/usr.sbin/lockstat/Makefile index 3ffcc7049f35..6efb9e6fa5c5 100644 --- a/cddl/usr.sbin/lockstat/Makefile +++ b/cddl/usr.sbin/lockstat/Makefile @@ -8,6 +8,14 @@ BINDIR?= /usr/sbin WARNS?= 1 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -16,6 +24,7 @@ CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common \ -I${OPENSOLARIS_SYS_DISTDIR}/compat \ -I${SRCTOP}/sys +CFLAGS+= -DHAVE_ISSETUGID CFLAGS+= -DNEED_ERRLOC -g diff --git a/cddl/usr.sbin/plockstat/Makefile b/cddl/usr.sbin/plockstat/Makefile index cb263fb427a7..f65ac8146e4e 100644 --- a/cddl/usr.sbin/plockstat/Makefile +++ b/cddl/usr.sbin/plockstat/Makefile @@ -8,6 +8,13 @@ BINDIR?= /usr/sbin WARNS?= 1 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -17,6 +24,7 @@ CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris 
\ -I${OPENSOLARIS_SYS_DISTDIR}/compat \ -I${SRCTOP}/cddl/lib/libdtrace \ -I${SRCTOP}/sys +CFLAGS+= -DHAVE_ISSETUGID LIBADD= dtrace proc diff --git a/cddl/usr.sbin/zdb/Makefile b/cddl/usr.sbin/zdb/Makefile index 8c919341f8e3..efd8c746128f 100644 --- a/cddl/usr.sbin/zdb/Makefile +++ b/cddl/usr.sbin/zdb/Makefile @@ -1,33 +1,33 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zdb +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs + +.PATH: ${ZFSTOP}/cmd/zdb +.PATH: ${ZFSTOP}/man/man8 PROG= zdb MAN= zdb.8 +INCS= zdb.h SRCS= zdb.c zdb_il.c WARNS?= 2 CSTD= c99 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +CFLAGS+= \ + -DIN_BASE \ + -I${ZFSTOP}/include \ + -I${ZFSTOP}/lib/libspl/include \ + -I${ZFSTOP}/lib/libspl/include/os/freebsd \ + -I${ZFSTOP}/lib/libspl/include/os/freebsd/spl \ + -I${SRCTOP}/sys \ + -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ + -DHAVE_ISSETUGID -LIBADD= nvpair umem uutil zfs zpool +LIBADD= nvpair umem uutil zfs spl avl zutil zpool CFLAGS.gcc+= -fms-extensions # Since there are many asserts in this program, it makes no sense to compile # it without debugging. 
-CFLAGS+= -g -DDEBUG=1 +CFLAGS+= -g -DDEBUG=1 -DZFS_DEBUG=1 .include diff --git a/cddl/usr.sbin/zfsd/Makefile.common b/cddl/usr.sbin/zfsd/Makefile.common index 7d45a3ed1695..95882dec175a 100644 --- a/cddl/usr.sbin/zfsd/Makefile.common +++ b/cddl/usr.sbin/zfsd/Makefile.common @@ -10,29 +10,24 @@ SRCS= callout.cc \ zpool_list.cc \ zfsd_main.cc -WARNS?= 3 +WARNS?= 2 # Ignore warnings about Solaris specific pragmas. IGNORE_PRAGMA= YES -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -INCFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -INCFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -INCFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -INCFLAGS+= -I${SRCTOP}/cddl/usr.sbin +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -I${SRCTOP}/cddl/usr.sbin -CFLAGS+= -DNEED_SOLARIS_BOOLEAN ${INCFLAGS} +# use issetugid(2) +CFLAGS+= -D_MACHINE_FLOAT_H_ -DHAVE_ISSETUGID -LIBADD+= devdctl zfs zfs_core util geom bsdxml sbuf nvpair uutil +LIBADD+= devdctl zfs zfs_core util geom bsdxml sbuf nvpair avl uutil zutil cscope: find 
${.CURDIR} -type f -a \( -name "*.[ch]" -o -name "*.cc" \) \ diff --git a/cddl/usr.sbin/zfsd/callout.cc b/cddl/usr.sbin/zfsd/callout.cc index 2671c5d3e783..3e5cd5779559 100644 --- a/cddl/usr.sbin/zfsd/callout.cc +++ b/cddl/usr.sbin/zfsd/callout.cc @@ -39,6 +39,7 @@ * timer services built on top of the POSIX interval timer. */ +#include #include #include diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc index 19c4abe45fc9..da2125b4d716 100644 --- a/cddl/usr.sbin/zfsd/case_file.cc +++ b/cddl/usr.sbin/zfsd/case_file.cc @@ -39,11 +39,13 @@ * accumulate in order to mark a device as degraded. */ #include +#include #include #include #include +#include #include #include #include @@ -75,7 +77,6 @@ __FBSDID("$FreeBSD$"); /*============================ Namespace Control =============================*/ -using std::auto_ptr; using std::hex; using std::ifstream; using std::stringstream; @@ -239,8 +240,6 @@ CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) { ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); - zpool_boot_label_t boot_type; - uint64_t boot_size; if (pool == NULL || !RefreshVdevState()) { /* @@ -333,13 +332,7 @@ CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) } /* Write a label on the newly inserted disk. 
*/ - if (zpool_is_bootable(pool)) - boot_type = ZPOOL_COPY_BOOT_LABEL; - else - boot_type = ZPOOL_NO_BOOT_LABEL; - boot_size = zpool_get_prop_int(pool, ZPOOL_PROP_BOOTSIZE, NULL); - if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str(), - boot_type, boot_size, NULL) != 0) { + if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s) by physical path (label): %s: %s\n", zpool_get_name(pool), VdevGUIDString().c_str(), @@ -1118,7 +1111,7 @@ CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { nvlist_free(newvd); retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, - /*replace*/B_TRUE) == 0); + /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); if (retval) syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", poolname, oldstr.c_str(), path); diff --git a/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc b/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc index d65295075c1f..e2833170f9ac 100644 --- a/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc +++ b/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc @@ -30,6 +30,7 @@ * Authors: Alan Somers (Spectra Logic Corporation) */ #include +#include #include #include diff --git a/cddl/usr.sbin/zfsd/vdev.cc b/cddl/usr.sbin/zfsd/vdev.cc index 687b0647ceb5..508af8cda925 100644 --- a/cddl/usr.sbin/zfsd/vdev.cc +++ b/cddl/usr.sbin/zfsd/vdev.cc @@ -39,6 +39,7 @@ */ #include #include +#include #include #include diff --git a/cddl/usr.sbin/zfsd/vdev_iterator.cc b/cddl/usr.sbin/zfsd/vdev_iterator.cc index 31a4ce962970..b5a4f22c1c60 100644 --- a/cddl/usr.sbin/zfsd/vdev_iterator.cc +++ b/cddl/usr.sbin/zfsd/vdev_iterator.cc @@ -38,6 +38,7 @@ * Implementation of the VdevIterator class. 
*/ #include +#include #include #include diff --git a/cddl/usr.sbin/zfsd/zfsd.cc b/cddl/usr.sbin/zfsd/zfsd.cc index 2f17b474e493..876cca836e9e 100644 --- a/cddl/usr.sbin/zfsd/zfsd.cc +++ b/cddl/usr.sbin/zfsd/zfsd.cc @@ -42,10 +42,12 @@ */ #include +#include #include #include #include +#include #include #include #include diff --git a/cddl/usr.sbin/zfsd/zfsd_event.cc b/cddl/usr.sbin/zfsd/zfsd_event.cc index 707a868c67e8..688e7c0354a2 100644 --- a/cddl/usr.sbin/zfsd/zfsd_event.cc +++ b/cddl/usr.sbin/zfsd/zfsd_event.cc @@ -34,6 +34,7 @@ * \file zfsd_event.cc */ #include +#include #include #include #include @@ -41,12 +42,13 @@ #include #include +#include /* * Undefine flush, defined by cpufunc.h on sparc64, because it conflicts with * C++ flush methods */ #undef flush - +#undef __init #include #include #include @@ -190,7 +192,8 @@ GeomEvent::ReadLabel(int devFd, bool &inUse, bool °raded) if (poolName != NULL) free(poolName); - nlabels = zpool_read_all_labels(devFd, &devLabel); + if (zpool_read_label(devFd, &devLabel, &nlabels) != 0) + return (NULL); /* * If we find a disk with fewer than the maximum number of * labels, it might be the whole disk of a partitioned disk diff --git a/cddl/usr.sbin/zfsd/zfsd_exception.cc b/cddl/usr.sbin/zfsd/zfsd_exception.cc index 7859a654003e..7ebb74ca8c6a 100644 --- a/cddl/usr.sbin/zfsd/zfsd_exception.cc +++ b/cddl/usr.sbin/zfsd/zfsd_exception.cc @@ -36,6 +36,7 @@ * Implementation of the ZfsdException class. */ #include +#include #include #include diff --git a/cddl/usr.sbin/zfsd/zpool_list.cc b/cddl/usr.sbin/zfsd/zpool_list.cc index 1d1540b294a2..82c35736df13 100644 --- a/cddl/usr.sbin/zfsd/zpool_list.cc +++ b/cddl/usr.sbin/zfsd/zpool_list.cc @@ -38,6 +38,7 @@ * Implementation of the ZpoolList class. 
*/ #include +#include #include #include diff --git a/cddl/usr.sbin/zhack/Makefile b/cddl/usr.sbin/zhack/Makefile index 1347111c7c17..24800c693cf3 100644 --- a/cddl/usr.sbin/zhack/Makefile +++ b/cddl/usr.sbin/zhack/Makefile @@ -1,6 +1,6 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zhack +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zhack PROG= zhack MAN= @@ -8,20 +8,20 @@ MAN= WARNS?= 0 CSTD= c99 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h -LIBADD= nvpair zfs zpool + +LIBADD= nvpair zfs spl zutil zpool CFLAGS+= -DDEBUG=1 #DEBUG_FLAGS+= -g diff --git a/include/Makefile b/include/Makefile index 91ad90e2c402..bafb5ef7c9f5 100644 --- a/include/Makefile +++ b/include/Makefile @@ -244,7 +244,7 @@ copies: .PHONY .META ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 teken.h \ ${SDESTDIR}${INCLUDEDIR}/teken 
.if ${MK_CDDL} != "no" - cd ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/sys; \ + cd ${SRCTOP}/sys/contrib/openzfs/include/sys; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 nvpair.h \ ${SDESTDIR}${INCLUDEDIR}/sys .endif @@ -377,7 +377,7 @@ symlinks: .PHONY .META done .if ${MK_CDDL} != "no" ${INSTALL_SYMLINK} ${TAG_ARGS} \ - ../../../sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h \ + ../../../sys/contrib/openzfs/include/sys/nvpair.h \ ${SDESTDIR}${INCLUDEDIR}/sys .endif .if ${MK_MLX5TOOL} != "no" diff --git a/lib/libbe/Makefile b/lib/libbe/Makefile index 3a02f22e69a7..c7dfde4b0dba 100644 --- a/lib/libbe/Makefile +++ b/lib/libbe/Makefile @@ -16,19 +16,18 @@ WARNS?= 2 IGNORE_PRAGMA= yes LIBADD+= zfs -LIBADD+= nvpair +LIBADD+= nvpair spl -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -CFLAGS+= -DNEED_SOLARIS_BOOLEAN HAS_TESTS= YES SUBDIR.${MK_TESTS}+= tests diff --git a/lib/libbe/be.c b/lib/libbe/be.c index 98304c8bd166..a0de973cd478 100644 --- a/lib/libbe/be.c +++ b/lib/libbe/be.c @@ -35,10 +35,13 @@ __FBSDID("$FreeBSD$"); #include #include 
#include +#include +#include #include #include #include +#include #include #include #include @@ -993,12 +996,8 @@ be_rename(libbe_handle_t *lbh, const char *old, const char *new) ZFS_TYPE_FILESYSTEM)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); - /* recurse, nounmount, forceunmount */ - struct renameflags flags = { - .nounmount = 1, - }; - err = zfs_rename(zfs_hdl, NULL, full_new, flags); + err = zfs_rename(zfs_hdl,full_new, B_FALSE, B_FALSE); zfs_close(zfs_hdl); if (err != 0) @@ -1025,7 +1024,7 @@ be_export(libbe_handle_t *lbh, const char *bootenv, int fd) if ((zfs = zfs_open(lbh->lzh, buf, ZFS_TYPE_DATASET)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); - err = zfs_send_one(zfs, NULL, fd, flags); + err = zfs_send_one(zfs, NULL, fd, &flags, /* redactbook */ NULL); zfs_close(zfs); return (err); diff --git a/lib/libbe/tests/Makefile b/lib/libbe/tests/Makefile index cc5f40fc4bf9..fc5691bbe6f8 100644 --- a/lib/libbe/tests/Makefile +++ b/lib/libbe/tests/Makefile @@ -8,14 +8,19 @@ PROGS= target_prog SRCS_target_prog= target_prog.c BINDIR_target_prog= ${TESTSDIR} -LIBADD+= zfs -LIBADD+= nvpair -LIBADD+= be +LIBADD+= zfs \ + spl \ + nvpair \ + be \ CFLAGS+= -I${SRCTOP}/lib/libbe -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common - -CFLAGS+= -DNEED_SOLARIS_BOOLEAN +CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID .include diff --git a/lib/libproc/Makefile b/lib/libproc/Makefile index a20dd69ef346..624b90e571cc 100644 --- a/lib/libproc/Makefile +++ b/lib/libproc/Makefile @@ -29,6 +29,13 @@ LIBADD+= elf procstat rtld_db util .if ${MK_CDDL} != "no" 
LIBADD+= ctf IGNORE_PRAGMA= YES +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -DHAVE_BOOLEAN CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libctf/common \ -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common \ -I${SRCTOP}/sys/cddl/compat/opensolaris diff --git a/lib/libproc/proc_bkpt.c b/lib/libproc/proc_bkpt.c index af16877c619f..ff37bb7c4461 100644 --- a/lib/libproc/proc_bkpt.c +++ b/lib/libproc/proc_bkpt.c @@ -112,7 +112,7 @@ proc_bkptset(struct proc_handle *phdl, uintptr_t address, return (-1); } - DPRINTFX("adding breakpoint at 0x%lx", address); + DPRINTFX("adding breakpoint at 0x%lx", (unsigned long)address); stopped = 0; if (phdl->status != PS_STOP) { @@ -173,7 +173,7 @@ proc_bkptdel(struct proc_handle *phdl, uintptr_t address, return (-1); } - DPRINTFX("removing breakpoint at 0x%lx", address); + DPRINTFX("removing breakpoint at 0x%lx", (unsigned long)address); stopped = 0; if (phdl->status != PS_STOP) { diff --git a/lib/libproc/proc_sym.c b/lib/libproc/proc_sym.c index c3a84a9403a2..ad6673f9f9ba 100644 --- a/lib/libproc/proc_sym.c +++ b/lib/libproc/proc_sym.c @@ -307,7 +307,7 @@ open_object(struct map_info *mapping) */ if (data->d_size < sizeof(crc) + 1) { DPRINTFX("ERROR: debuglink section is too small (%zd bytes)", - data->d_size); + (ssize_t)data->d_size); goto internal; } if (strnlen(data->d_buf, data->d_size) >= data->d_size - sizeof(crc)) { @@ -510,7 +510,7 @@ proc_addr2sym(struct proc_handle *p, uintptr_t addr, char *name, int error; if ((mapping = _proc_addr2map(p, addr)) == NULL) { - DPRINTFX("ERROR: proc_addr2map failed to resolve 0x%jx", addr); + DPRINTFX("ERROR: proc_addr2map failed to resolve 0x%jx", 
(uintmax_t)addr); return (-1); } if (open_object(mapping) != 0) { diff --git a/lib/libprocstat/libprocstat.c b/lib/libprocstat/libprocstat.c index f48abd99c12f..7be6a224eb82 100644 --- a/lib/libprocstat/libprocstat.c +++ b/lib/libprocstat/libprocstat.c @@ -70,6 +70,7 @@ __FBSDID("$FreeBSD$"); #include #define _KERNEL #include +#include #include #include #include diff --git a/lib/libprocstat/zfs/Makefile b/lib/libprocstat/zfs/Makefile index 880118fbc643..75e4a253fd4d 100644 --- a/lib/libprocstat/zfs/Makefile +++ b/lib/libprocstat/zfs/Makefile @@ -6,15 +6,19 @@ SRCS= zfs_defs.c OBJS= zfs_defs.o WARNS?= 1 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${.CURDIR:H} -CFLAGS+= -DNEED_SOLARIS_BOOLEAN + +CFLAGS+= -DIN_BASE -D__KERNEL__ -D_KERNEL -I. -I${.CURDIR} +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs +CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include + +CFLAGS+= -I${SRCTOP}/sys -I. -I.. 
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -D_SYS_VMEM_H_ -UKLD_TIED -DKLD_MODULE + CFLAGS+= -fno-builtin -nostdlib all: ${OBJS} diff --git a/lib/libprocstat/zfs_defs.c b/lib/libprocstat/zfs_defs.c index 3258b6c5567e..c41054f05136 100644 --- a/lib/libprocstat/zfs_defs.c +++ b/lib/libprocstat/zfs_defs.c @@ -26,13 +26,9 @@ */ #include +#include __FBSDID("$FreeBSD$"); -/* Pretend we are kernel to get the same binary layout. */ -#define _KERNEL - -/* A hack to deal with kpilite.h. */ -#define KLD_MODULE /* * Prevent some headers from getting included and fake some types @@ -41,14 +37,40 @@ __FBSDID("$FreeBSD$"); */ #define _OPENSOLARIS_SYS_PATHNAME_H_ #define _OPENSOLARIS_SYS_POLICY_H_ -#define _OPENSOLARIS_SYS_VNODE_H_ #define _VNODE_PAGER_ -typedef struct vnode vnode_t; -typedef struct vattr vattr_t; -typedef struct xvattr xvattr_t; -typedef struct vsecattr vsecattr_t; -typedef enum vtype vtype_t; + +enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, + VMARKER }; + +/* + * Vnode attributes. A field value of VNOVAL represents a field whose value + * is unavailable (getattr) or which is not to be changed (setattr). 
+ */ +struct vattr { + enum vtype va_type; /* vnode type (for create) */ + u_short va_mode; /* files access mode and type */ + u_short va_padding0; + uid_t va_uid; /* owner user id */ + gid_t va_gid; /* owner group id */ + nlink_t va_nlink; /* number of references to file */ + dev_t va_fsid; /* filesystem id */ + ino_t va_fileid; /* file id */ + u_quad_t va_size; /* file size in bytes */ + long va_blocksize; /* blocksize preferred for i/o */ + struct timespec va_atime; /* time of last access */ + struct timespec va_mtime; /* time of last modification */ + struct timespec va_ctime; /* time file changed */ + struct timespec va_birthtime; /* time file created */ + u_long va_gen; /* generation number of file */ + u_long va_flags; /* flags defined for file */ + dev_t va_rdev; /* device the special file represents */ + u_quad_t va_bytes; /* bytes of disk space held by file */ + u_quad_t va_filerev; /* file modification number */ + u_int va_vaflags; /* operations flags, see below */ + long va_spare; /* remain quad aligned */ +}; + #include #include diff --git a/libexec/rc/rc.d/zfs b/libexec/rc/rc.d/zfs index 2d35f9b54642..69e3ad5ce671 100755 --- a/libexec/rc/rc.d/zfs +++ b/libexec/rc/rc.d/zfs @@ -25,6 +25,13 @@ zfs_start_jail() zfs_start_main() { + local cachefile + + for cachefile in /boot/zfs/zpool.cache /etc/zfs/zpool.cache; do + if [ -r $cachefile ]; then + zpool import -c $cachefile -a + fi + done zfs mount -va zfs share -a if [ ! 
-r /etc/zfs/exports ]; then diff --git a/rescue/rescue/Makefile b/rescue/rescue/Makefile index 66aa7f188ec1..bda45c7d2884 100644 --- a/rescue/rescue/Makefile +++ b/rescue/rescue/Makefile @@ -129,7 +129,7 @@ CRUNCH_PROGS_usr.sbin+= zdb CRUNCH_LIBS+= -l80211 -lalias -lcam -lncursesw -ldevstat -lipsec -llzma .if ${MK_ZFS} != "no" CRUNCH_LIBS+= -lavl -lzpool -lzfs_core -lzfs -lnvpair -lpthread -luutil -lumem -CRUNCH_LIBS+= -lbe +CRUNCH_LIBS+= -lbe -lzutil -ltpool -lspl -licp_rescue .else # liblzma needs pthread CRUNCH_LIBS+= -lpthread diff --git a/sbin/bectl/Makefile b/sbin/bectl/Makefile index 9fc66a097cd8..e261c85e1e10 100644 --- a/sbin/bectl/Makefile +++ b/sbin/bectl/Makefile @@ -7,16 +7,22 @@ MAN= bectl.8 SRCS= bectl.c bectl_jail.c bectl_list.c -LIBADD+= be -LIBADD+= jail -LIBADD+= nvpair -LIBADD+= util +LIBADD+= be \ + jail \ + nvpair \ + spl \ + util \ -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common - -CFLAGS+= -DNEED_SOLARIS_BOOLEAN +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h HAS_TESTS= yes SUBDIR.${MK_TESTS}+= tests diff --git a/sbin/bectl/Makefile.depend b/sbin/bectl/Makefile.depend index 0c584612f6ec..125289c3cdc7 100644 --- a/sbin/bectl/Makefile.depend +++ b/sbin/bectl/Makefile.depend @@ -4,6 +4,7 @@ DIRDEPS = \ cddl/lib/libavl \ cddl/lib/libnvpair \ + cddl/lib/libspl \ cddl/lib/libumem \ cddl/lib/libuutil \ cddl/lib/libzfs \ diff --git a/sbin/bectl/bectl.c 
b/sbin/bectl/bectl.c index 71108b9c0855..31ece42c6ff2 100644 --- a/sbin/bectl/bectl.c +++ b/sbin/bectl/bectl.c @@ -60,6 +60,8 @@ static int bectl_cmd_unmount(int argc, char *argv[]); libbe_handle_t *be; +int aok; + int usage(bool explicit) { diff --git a/sbin/zfsbootcfg/Makefile b/sbin/zfsbootcfg/Makefile index d485d8dad8dc..8bc73ffa1e06 100644 --- a/sbin/zfsbootcfg/Makefile +++ b/sbin/zfsbootcfg/Makefile @@ -2,7 +2,7 @@ # $FreeBSD$ PROG= zfsbootcfg -WARNS?= 1 +WARNS?= 2 MAN= zfsbootcfg.8 LIBADD+=zfs @@ -11,17 +11,16 @@ LIBADD+=umem LIBADD+=uutil LIBADD+=geom +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head - -CFLAGS+= -DNEED_SOLARIS_BOOLEAN +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h .include diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk index 0f97e9c29bac..dc7e9b1c2737 100644 --- a/share/mk/bsd.libnames.mk +++ b/share/mk/bsd.libnames.mk @@ -80,6 +80,7 @@ LIBIBMAD?= ${LIBDESTDIR}${LIBDIR_BASE}/libibmad.a LIBIBNETDISC?= 
${LIBDESTDIR}${LIBDIR_BASE}/libibnetdisc.a LIBIBUMAD?= ${LIBDESTDIR}${LIBDIR_BASE}/libibumad.a LIBIBVERBS?= ${LIBDESTDIR}${LIBDIR_BASE}/libibverbs.a +LIBICP?= ${LIBDESTDIR}${LIBDIR_BASE}/libicp.a LIBIPSEC?= ${LIBDESTDIR}${LIBDIR_BASE}/libipsec.a LIBIPT?= ${LIBDESTDIR}${LIBDIR_BASE}/libipt.a LIBJAIL?= ${LIBDESTDIR}${LIBDIR_BASE}/libjail.a @@ -135,6 +136,7 @@ LIBRTLD_DB?= ${LIBDESTDIR}${LIBDIR_BASE}/librtld_db.a LIBSBUF?= ${LIBDESTDIR}${LIBDIR_BASE}/libsbuf.a LIBSDP?= ${LIBDESTDIR}${LIBDIR_BASE}/libsdp.a LIBSMB?= ${LIBDESTDIR}${LIBDIR_BASE}/libsmb.a +LIBSPL?= ${LIBDESTDIR}${LIBDIR_BASE}/libspl.a LIBSSL?= ${LIBDESTDIR}${LIBDIR_BASE}/libssl.a LIBSSP_NONSHARED?= ${LIBDESTDIR}${LIBDIR_BASE}/libssp_nonshared.a LIBSTATS?= ${LIBDESTDIR}${LIBDIR_BASE}/libstats.a @@ -146,6 +148,7 @@ LIBTERMCAP?= ${LIBDESTDIR}${LIBDIR_BASE}/libtermcap.a LIBTERMCAPW?= ${LIBDESTDIR}${LIBDIR_BASE}/libtermcapw.a LIBTERMLIB?= "don't use LIBTERMLIB, use LIBTERMCAP" LIBTINFO?= "don't use LIBTINFO, use LIBNCURSES" +LIBTPOOL?= ${LIBDESTDIR}${LIBDIR_BASE}/libtpool.a LIBUFS?= ${LIBDESTDIR}${LIBDIR_BASE}/libufs.a LIBUGIDFW?= ${LIBDESTDIR}${LIBDIR_BASE}/libugidfw.a LIBULOG?= ${LIBDESTDIR}${LIBDIR_BASE}/libulog.a @@ -166,6 +169,7 @@ LIBZ?= ${LIBDESTDIR}${LIBDIR_BASE}/libz.a LIBZFS?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs.a LIBZFS_CORE?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs_core.a LIBZPOOL?= ${LIBDESTDIR}${LIBDIR_BASE}/libzpool.a +LIBZUTIL?= ${LIBDESTDIR}${LIBDIR_BASE}/libzutil.a # enforce the 2 -lpthread and -lc to always be the last in that exact order .if defined(LDADD) diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk index 2488527a5a6f..6ba2b0109bb9 100644 --- a/share/mk/src.libnames.mk +++ b/share/mk/src.libnames.mk @@ -125,6 +125,7 @@ _LIBRARIES= \ heimntlm \ heimsqlite \ hx509 \ + icp \ ipsec \ ipt \ jail \ @@ -172,6 +173,7 @@ _LIBRARIES= \ sdp \ sm \ smb \ + spl \ ssl \ ssp_nonshared \ stats \ @@ -181,6 +183,7 @@ _LIBRARIES= \ tacplus \ termcap \ termcapw \ + tpool \ ufs \ ugidfw \ ulog \ 
@@ -199,6 +202,7 @@ _LIBRARIES= \ zfs_core \ zfs \ zpool \ + zutil .if ${MK_BLACKLIST} != "no" _LIBRARIES+= \ @@ -355,9 +359,10 @@ _DP_lzma= md pthread _DP_ucl= m _DP_vmmapi= util _DP_opencsd= cxxrt -_DP_ctf= z +_DP_ctf= spl z _DP_dtrace= ctf elf proc pthread rtld_db _DP_xo= util +_DP_ztest= geom m nvpair umem zpool pthread avl zfs_core spl zutil zfs uutil icp # The libc dependencies are not strictly needed but are defined to make the # assert happy. _DP_c= compiler_rt @@ -375,11 +380,14 @@ _DP_smb= kiconv _DP_ulog= md _DP_fifolog= z _DP_ipf= kvm -_DP_zfs= md pthread umem util uutil m nvpair avl bsdxml geom nvpair z \ - zfs_core +_DP_tpool= spl +_DP_uutil= avl spl +_DP_zfs= md pthread umem util uutil m avl bsdxml geom nvpair \ + z zfs_core zutil _DP_zfs_core= nvpair -_DP_zpool= md pthread z nvpair avl umem -_DP_be= zfs nvpair +_DP_zpool= md pthread z icp spl nvpair avl umem +_DP_zutil= avl tpool +_DP_be= zfs spl nvpair # OFED support .if ${MK_OFED} != "no" @@ -583,12 +591,15 @@ LIBC_NOSSP_PIC?= ${LIBC_NOSSP_PICDIR}/libc_nossp_pic.a LIBAVLDIR= ${OBJTOP}/cddl/lib/libavl LIBCTFDIR= ${OBJTOP}/cddl/lib/libctf LIBDTRACEDIR= ${OBJTOP}/cddl/lib/libdtrace +LIBICPDIR= ${OBJTOP}/cddl/lib/libicp LIBNVPAIRDIR= ${OBJTOP}/cddl/lib/libnvpair LIBUMEMDIR= ${OBJTOP}/cddl/lib/libumem LIBUUTILDIR= ${OBJTOP}/cddl/lib/libuutil LIBZFSDIR= ${OBJTOP}/cddl/lib/libzfs LIBZFS_COREDIR= ${OBJTOP}/cddl/lib/libzfs_core LIBZPOOLDIR= ${OBJTOP}/cddl/lib/libzpool +LIBZUTILDIR= ${OBJTOP}/cddl/lib/libzutil +LIBTPOOLDIR= ${OBJTOP}/cddl/lib/libtpool # OFED support LIBCXGB4DIR= ${OBJTOP}/lib/ofed/libcxgb4 @@ -655,6 +666,7 @@ LIBNCURSESWDIR= ${OBJTOP}/lib/ncurses/ncursesw LIBPANELDIR= ${OBJTOP}/lib/ncurses/panel LIBPANELWDIR= ${OBJTOP}/lib/ncurses/panelw LIBCRYPTODIR= ${OBJTOP}/secure/lib/libcrypto +LIBSPLDIR= ${OBJTOP}/cddl/lib/libspl LIBSSHDIR= ${OBJTOP}/secure/lib/libssh LIBSSLDIR= ${OBJTOP}/secure/lib/libssl LIBTEKENDIR= ${OBJTOP}/sys/teken/libteken diff --git 
a/sys/cddl/compat/opensolaris/kern/opensolaris.c b/sys/cddl/compat/opensolaris/kern/opensolaris.c index f804a1b561d1..bed1f21235d5 100644 --- a/sys/cddl/compat/opensolaris/kern/opensolaris.c +++ b/sys/cddl/compat/opensolaris/kern/opensolaris.c @@ -37,6 +37,8 @@ #include #include +extern struct opensolaris_utsname utsname; + cpu_core_t cpu_core[MAXCPU]; kmutex_t cpu_lock; solaris_cpu_t solaris_cpu[MAXCPU]; @@ -82,7 +84,6 @@ opensolaris_modevent(module_t mod __unused, int type, void *data __unused) switch (type) { case MOD_LOAD: - utsname.nodename = prison0.pr_hostname; break; case MOD_UNLOAD: diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c deleted file mode 100644 index fa23233833bb..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c +++ /dev/null @@ -1,296 +0,0 @@ -/*- - * Copyright (c) 2006-2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef KMEM_DEBUG -#include -#include -#endif - -#ifdef _KERNEL -MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris"); -#else -#define malloc(size, type, flags) malloc(size) -#define free(addr, type) free(addr) -#endif - -#ifdef KMEM_DEBUG -struct kmem_item { - struct stack stack; - LIST_ENTRY(kmem_item) next; -}; -static LIST_HEAD(, kmem_item) kmem_items; -static struct mtx kmem_items_mtx; -MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF); -#endif /* KMEM_DEBUG */ - -#include - -void * -zfs_kmem_alloc(size_t size, int kmflags) -{ - void *p; -#ifdef KMEM_DEBUG - struct kmem_item *i; - - size += sizeof(struct kmem_item); -#endif - p = malloc(size, M_SOLARIS, kmflags); -#ifndef _KERNEL - if (kmflags & KM_SLEEP) - assert(p != NULL); -#endif -#ifdef KMEM_DEBUG - if (p != NULL) { - i = p; - p = (u_char *)p + sizeof(struct kmem_item); - stack_save(&i->stack); - mtx_lock(&kmem_items_mtx); - LIST_INSERT_HEAD(&kmem_items, i, next); - mtx_unlock(&kmem_items_mtx); - } -#endif - return (p); -} - -void -zfs_kmem_free(void *buf, size_t size __unused) -{ -#ifdef KMEM_DEBUG - if (buf == NULL) { - printf("%s: attempt to free NULL\n", __func__); - return; - } - struct kmem_item *i; - - buf = (u_char *)buf - sizeof(struct kmem_item); - mtx_lock(&kmem_items_mtx); - LIST_FOREACH(i, 
&kmem_items, next) { - if (i == buf) - break; - } - ASSERT(i != NULL); - LIST_REMOVE(i, next); - mtx_unlock(&kmem_items_mtx); -#endif - free(buf, M_SOLARIS); -} - -static uint64_t kmem_size_val; - -static void -kmem_size_init(void *unused __unused) -{ - - kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE; - if (kmem_size_val > vm_kmem_size) - kmem_size_val = vm_kmem_size; -} -SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL); - -uint64_t -kmem_size(void) -{ - - return (kmem_size_val); -} - -static int -kmem_std_constructor(void *mem, int size __unused, void *private, int flags) -{ - struct kmem_cache *cache = private; - - return (cache->kc_constructor(mem, cache->kc_private, flags)); -} - -static void -kmem_std_destructor(void *mem, int size __unused, void *private) -{ - struct kmem_cache *cache = private; - - cache->kc_destructor(mem, cache->kc_private); -} - -kmem_cache_t * -kmem_cache_create(char *name, size_t bufsize, size_t align, - int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), - void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags) -{ - kmem_cache_t *cache; - - ASSERT(vmp == NULL); - - cache = kmem_alloc(sizeof(*cache), KM_SLEEP); - strlcpy(cache->kc_name, name, sizeof(cache->kc_name)); - cache->kc_constructor = constructor; - cache->kc_destructor = destructor; - cache->kc_private = private; -#if defined(_KERNEL) && !defined(KMEM_DEBUG) - cache->kc_zone = uma_zcreate(cache->kc_name, bufsize, - constructor != NULL ? kmem_std_constructor : NULL, - destructor != NULL ? kmem_std_destructor : NULL, - NULL, NULL, align > 0 ? 
align - 1 : 0, cflags); -#else - cache->kc_size = bufsize; -#endif - - return (cache); -} - -void -kmem_cache_destroy(kmem_cache_t *cache) -{ -#if defined(_KERNEL) && !defined(KMEM_DEBUG) - uma_zdestroy(cache->kc_zone); -#endif - kmem_free(cache, sizeof(*cache)); -} - -void * -kmem_cache_alloc(kmem_cache_t *cache, int flags) -{ -#if defined(_KERNEL) && !defined(KMEM_DEBUG) - return (uma_zalloc_arg(cache->kc_zone, cache, flags)); -#else - void *p; - - p = kmem_alloc(cache->kc_size, flags); - if (p != NULL && cache->kc_constructor != NULL) - kmem_std_constructor(p, cache->kc_size, cache, flags); - return (p); -#endif -} - -void -kmem_cache_free(kmem_cache_t *cache, void *buf) -{ -#if defined(_KERNEL) && !defined(KMEM_DEBUG) - uma_zfree_arg(cache->kc_zone, buf, cache); -#else - if (cache->kc_destructor != NULL) - kmem_std_destructor(buf, cache->kc_size, cache); - kmem_free(buf, cache->kc_size); -#endif -} - -/* - * Allow our caller to determine if there are running reaps. - * - * This call is very conservative and may return B_TRUE even when - * reaping activity isn't active. If it returns B_FALSE, then reaping - * activity is definitely inactive. - */ -boolean_t -kmem_cache_reap_active(void) -{ - - return (B_FALSE); -} - -/* - * Reap (almost) everything soon. - * - * Note: this does not wait for the reap-tasks to complete. Caller - * should use kmem_cache_reap_active() (above) and/or moderation to - * avoid scheduling too many reap-tasks. 
- */ -#ifdef _KERNEL -void -kmem_cache_reap_soon(kmem_cache_t *cache) -{ -#ifndef KMEM_DEBUG - uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN); -#endif -} - -void -kmem_reap(void) -{ - uma_reclaim(UMA_RECLAIM_TRIM); -} -#else -void -kmem_cache_reap_soon(kmem_cache_t *cache __unused) -{ -} - -void -kmem_reap(void) -{ -} -#endif - -int -kmem_debugging(void) -{ - return (0); -} - -void * -calloc(size_t n, size_t s) -{ - return (kmem_zalloc(n * s, KM_NOSLEEP)); -} - -#ifdef KMEM_DEBUG -void kmem_show(void *); -void -kmem_show(void *dummy __unused) -{ - struct kmem_item *i; - - mtx_lock(&kmem_items_mtx); - if (LIST_EMPTY(&kmem_items)) - printf("KMEM_DEBUG: No leaked elements.\n"); - else { - printf("KMEM_DEBUG: Leaked elements:\n\n"); - LIST_FOREACH(i, &kmem_items, next) { - printf("address=%p\n", i); - stack_print_ddb(&i->stack); - printf("\n"); - } - } - mtx_unlock(&kmem_items_mtx); -} - -SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL); -#endif /* KMEM_DEBUG */ diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c deleted file mode 100644 index 14d48da82ac0..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c +++ /dev/null @@ -1,210 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void -kobj_free(void *address, size_t size) -{ - - kmem_free(address, size); -} - -void * -kobj_alloc(size_t size, int flag) -{ - - return (kmem_alloc(size, (flag & KM_NOWAIT) ? KM_NOSLEEP : KM_SLEEP)); -} - -void * -kobj_zalloc(size_t size, int flag) -{ - void *p; - - if ((p = kobj_alloc(size, flag)) != NULL) - bzero(p, size); - return (p); -} - -static void * -kobj_open_file_vnode(const char *file) -{ - struct thread *td = curthread; - struct nameidata nd; - int error, flags; - - pwd_ensure_dirs(); - - flags = FREAD | O_NOFOLLOW; - NDINIT(&nd, LOOKUP, 0, UIO_SYSSPACE, file, td); - error = vn_open_cred(&nd, &flags, 0, 0, curthread->td_ucred, NULL); - if (error != 0) - return (NULL); - NDFREE(&nd, NDF_ONLY_PNBUF); - /* We just unlock so we hold a reference. 
*/ - VOP_UNLOCK(nd.ni_vp); - return (nd.ni_vp); -} - -static void * -kobj_open_file_loader(const char *file) -{ - - return (preload_search_by_name(file)); -} - -struct _buf * -kobj_open_file(const char *file) -{ - struct _buf *out; - - out = kmem_alloc(sizeof(*out), KM_SLEEP); - out->mounted = root_mounted(); - /* - * If root is already mounted we read file using file system, - * if not, we use loader. - */ - if (out->mounted) - out->ptr = kobj_open_file_vnode(file); - else - out->ptr = kobj_open_file_loader(file); - if (out->ptr == NULL) { - kmem_free(out, sizeof(*out)); - return ((struct _buf *)-1); - } - return (out); -} - -static int -kobj_get_filesize_vnode(struct _buf *file, uint64_t *size) -{ - struct vnode *vp = file->ptr; - struct vattr va; - int error; - - vn_lock(vp, LK_SHARED | LK_RETRY); - error = VOP_GETATTR(vp, &va, curthread->td_ucred); - VOP_UNLOCK(vp); - if (error == 0) - *size = (uint64_t)va.va_size; - return (error); -} - -static int -kobj_get_filesize_loader(struct _buf *file, uint64_t *size) -{ - void *ptr; - - ptr = preload_search_info(file->ptr, MODINFO_SIZE); - if (ptr == NULL) - return (ENOENT); - *size = (uint64_t)*(size_t *)ptr; - return (0); -} - -int -kobj_get_filesize(struct _buf *file, uint64_t *size) -{ - - if (file->mounted) - return (kobj_get_filesize_vnode(file, size)); - else - return (kobj_get_filesize_loader(file, size)); -} - -int -kobj_read_file_vnode(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - struct vnode *vp = file->ptr; - struct thread *td = curthread; - struct uio auio; - struct iovec aiov; - int error; - - bzero(&aiov, sizeof(aiov)); - bzero(&auio, sizeof(auio)); - - aiov.iov_base = buf; - aiov.iov_len = size; - - auio.uio_iov = &aiov; - auio.uio_offset = (off_t)off; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_iovcnt = 1; - auio.uio_resid = size; - auio.uio_td = td; - - vn_lock(vp, LK_SHARED | LK_RETRY); - error = VOP_READ(vp, &auio, IO_UNIT | IO_SYNC, td->td_ucred); - 
VOP_UNLOCK(vp); - return (error != 0 ? -1 : size - auio.uio_resid); -} - -int -kobj_read_file_loader(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - char *ptr; - - ptr = preload_fetch_addr(file->ptr); - if (ptr == NULL) - return (ENOENT); - bcopy(ptr + off, buf, size); - return (0); -} - -int -kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - - if (file->mounted) - return (kobj_read_file_vnode(file, buf, size, off)); - else - return (kobj_read_file_loader(file, buf, size, off)); -} - -void -kobj_close_file(struct _buf *file) -{ - - if (file->mounted) - vn_close(file->ptr, FREAD, curthread->td_ucred, curthread); - kmem_free(file, sizeof(*file)); -} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c b/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c deleted file mode 100644 index 51fe07497f98..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_kstat.c +++ /dev/null @@ -1,148 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include - -static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); - -SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "Kernel statistics"); - -kstat_t * -kstat_create(char *module, int instance, char *name, char *class, uchar_t type, - ulong_t ndata, uchar_t flags) -{ - struct sysctl_oid *root; - kstat_t *ksp; - - KASSERT(instance == 0, ("instance=%d", instance)); - KASSERT(type == KSTAT_TYPE_NAMED, ("type=%hhu", type)); - KASSERT(flags == KSTAT_FLAG_VIRTUAL, ("flags=%02hhx", flags)); - - /* - * Allocate the main structure. We don't need to copy module/class/name - * stuff in here, because it is only used for sysctl node creation - * done in this function. - */ - ksp = malloc(sizeof(*ksp), M_KSTAT, M_WAITOK); - ksp->ks_ndata = ndata; - - /* - * Create sysctl tree for those statistics: - * - * kstat.... 
- */ - sysctl_ctx_init(&ksp->ks_sysctl_ctx); - root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, - SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, - CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); - if (root == NULL) { - printf("%s: Cannot create kstat.%s tree!\n", __func__, module); - sysctl_ctx_free(&ksp->ks_sysctl_ctx); - free(ksp, M_KSTAT); - return (NULL); - } - root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), - OID_AUTO, class, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); - if (root == NULL) { - printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, - module, class); - sysctl_ctx_free(&ksp->ks_sysctl_ctx); - free(ksp, M_KSTAT); - return (NULL); - } - root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), - OID_AUTO, name, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); - if (root == NULL) { - printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__, - module, class, name); - sysctl_ctx_free(&ksp->ks_sysctl_ctx); - free(ksp, M_KSTAT); - return (NULL); - } - ksp->ks_sysctl_root = root; - - return (ksp); -} - -static int -kstat_sysctl(SYSCTL_HANDLER_ARGS) -{ - kstat_named_t *ksent = arg1; - uint64_t val; - - val = ksent->value.ui64; - return sysctl_handle_64(oidp, &val, 0, req); -} - -void -kstat_install(kstat_t *ksp) -{ - kstat_named_t *ksent; - u_int i; - - ksent = ksp->ks_data; - for (i = 0; i < ksp->ks_ndata; i++, ksent++) { - KASSERT(ksent->data_type == KSTAT_DATA_UINT64, - ("data_type=%d", ksent->data_type)); - SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, - SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksent->name, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RD, ksent, - sizeof(*ksent), kstat_sysctl, "QU", ksent->desc); - } -} - -void -kstat_delete(kstat_t *ksp) -{ - - sysctl_ctx_free(&ksp->ks_sysctl_ctx); - free(ksp, M_KSTAT); -} - -void -kstat_set_string(char *dst, const char *src) -{ - - bzero(dst, KSTAT_STRLEN); - (void) strncpy(dst, src, KSTAT_STRLEN - 1); -} - -void -kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type) -{ - - 
kstat_set_string(knp->name, name); - knp->data_type = data_type; -} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c b/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c deleted file mode 100644 index e025e10b240f..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c +++ /dev/null @@ -1,64 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include - -int -lookupname(char *dirname, enum uio_seg seg, enum symfollow follow, - vnode_t **dirvpp, vnode_t **compvpp) -{ - - return (lookupnameat(dirname, seg, follow, dirvpp, compvpp, NULL)); -} - -int -lookupnameat(char *dirname, enum uio_seg seg, enum symfollow follow, - vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp) -{ - struct nameidata nd; - int error, ltype; - - ASSERT(dirvpp == NULL); - - vref(startvp); - ltype = VOP_ISLOCKED(startvp); - VOP_UNLOCK(startvp); - NDINIT_ATVP(&nd, LOOKUP, LOCKLEAF | follow, seg, dirname, - startvp, curthread); - error = namei(&nd); - *compvpp = nd.ni_vp; - NDFREE(&nd, NDF_ONLY_PNBUF); - vn_lock(startvp, ltype | LK_RETRY); - return (error); -} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c b/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c deleted file mode 100644 index 595f3c0b3c55..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c +++ /dev/null @@ -1,54 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include - -char hw_serial[11] = "0"; - -struct opensolaris_utsname utsname = { - .machine = MACHINE -}; - -static void -opensolaris_utsname_init(void *arg) -{ - - utsname.sysname = ostype; - utsname.nodename = prison0.pr_hostname; - utsname.release = osrelease; - snprintf(utsname.version, sizeof(utsname.version), "%d", osreldate); -} -SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, - opensolaris_utsname_init, NULL); diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c b/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c deleted file mode 100644 index e49cee11a101..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_policy.c +++ /dev/null @@ -1,429 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int -secpolicy_nfs(cred_t *cr) -{ - - return (priv_check_cred(cr, PRIV_NFS_DAEMON)); -} - -int -secpolicy_zfs(cred_t *cr) -{ - - return (priv_check_cred(cr, PRIV_VFS_MOUNT)); -} - -int -secpolicy_sys_config(cred_t *cr, int checkonly __unused) -{ - - return (priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG)); -} - -int -secpolicy_zinject(cred_t *cr) -{ - - return (priv_check_cred(cr, PRIV_ZFS_INJECT)); -} - -int -secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused) -{ - - return (priv_check_cred(cr, PRIV_VFS_UNMOUNT)); -} - -int -secpolicy_fs_owner(struct mount *mp, cred_t *cr) -{ - - if (zfs_super_owner) { - if (cr->cr_uid == mp->mnt_cred->cr_uid && - cr->cr_prison == mp->mnt_cred->cr_prison) { - return (0); - } - } - return (EPERM); -} - -/* - * This check is done in kern_link(), so we could just return 0 here. 
- */ -extern int hardlink_check_uid; -int -secpolicy_basic_link(vnode_t *vp, cred_t *cr) -{ - - if (!hardlink_check_uid) - return (0); - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - return (priv_check_cred(cr, PRIV_VFS_LINK)); -} - -int -secpolicy_vnode_stky_modify(cred_t *cr) -{ - - return (EPERM); -} - -int -secpolicy_vnode_remove(vnode_t *vp, cred_t *cr) -{ - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - return (priv_check_cred(cr, PRIV_VFS_ADMIN)); -} - -int -secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode) -{ - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - - if ((accmode & VREAD) && priv_check_cred(cr, PRIV_VFS_READ) != 0) - return (EACCES); - if ((accmode & VWRITE) && - priv_check_cred(cr, PRIV_VFS_WRITE) != 0) { - return (EACCES); - } - if (accmode & VEXEC) { - if (vp->v_type == VDIR) { - if (priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0) - return (EACCES); - } else { - if (priv_check_cred(cr, PRIV_VFS_EXEC) != 0) - return (EACCES); - } - } - return (0); -} - -/* - * Like secpolicy_vnode_access() but we get the actual wanted mode and the - * current mode of the file, not the missing bits. 
- */ -int -secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, - accmode_t curmode, accmode_t wantmode) -{ - accmode_t mode; - - mode = ~curmode & wantmode; - - if (mode == 0) - return (0); - - return (secpolicy_vnode_access(cr, vp, owner, mode)); -} - -int -secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner) -{ - static int privs[] = { - PRIV_VFS_ADMIN, - PRIV_VFS_READ, - PRIV_VFS_WRITE, - PRIV_VFS_EXEC, - PRIV_VFS_LOOKUP - }; - int i; - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - - /* Same as secpolicy_vnode_setdac */ - if (owner == cr->cr_uid) - return (0); - - for (i = 0; i < sizeof (privs)/sizeof (int); i++) { - boolean_t allzone = B_FALSE; - int priv; - - switch (priv = privs[i]) { - case PRIV_VFS_EXEC: - if (vp->v_type == VDIR) - continue; - break; - case PRIV_VFS_LOOKUP: - if (vp->v_type != VDIR) - continue; - break; - } - if (priv_check_cred(cr, priv) == 0) - return (0); - } - return (EPERM); -} - -int -secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner) -{ - - if (owner == cr->cr_uid) - return (0); - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - return (priv_check_cred(cr, PRIV_VFS_ADMIN)); -} - -int -secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, - const struct vattr *ovap, int flags, - int unlocked_access(void *, int, cred_t *), void *node) -{ - int mask = vap->va_mask; - int error; - - if (mask & AT_SIZE) { - if (vp->v_type == VDIR) - return (EISDIR); - error = unlocked_access(node, VWRITE, cr); - if (error) - return (error); - } - if (mask & AT_MODE) { - /* - * If not the owner of the file then check privilege - * for two things: the privilege to set the mode at all - * and, if we're setting setuid, we also need permissions - * to add the set-uid bit, if we're not the owner. - * In the specific case of creating a set-uid root - * file, we need even more permissions. 
- */ - error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); - if (error) - return (error); - error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr); - if (error) - return (error); - } else { - vap->va_mode = ovap->va_mode; - } - if (mask & (AT_UID | AT_GID)) { - error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); - if (error) - return (error); - - /* - * To change the owner of a file, or change the group of a file to a - * group of which we are not a member, the caller must have - * privilege. - */ - if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || - ((mask & AT_GID) && vap->va_gid != ovap->va_gid && - !groupmember(vap->va_gid, cr))) { - if (secpolicy_fs_owner(vp->v_mount, cr) != 0) { - error = priv_check_cred(cr, PRIV_VFS_CHOWN); - if (error) - return (error); - } - } - - if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || - ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) { - secpolicy_setid_clear(vap, vp, cr); - } - } - if (mask & (AT_ATIME | AT_MTIME)) { - /* - * From utimes(2): - * If times is NULL, ... The caller must be the owner of - * the file, have permission to write the file, or be the - * super-user. - * If times is non-NULL, ... The caller must be the owner of - * the file or be the super-user. 
- */ - error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); - if (error && (vap->va_vaflags & VA_UTIMES_NULL)) - error = unlocked_access(node, VWRITE, cr); - if (error) - return (error); - } - return (0); -} - -int -secpolicy_vnode_create_gid(cred_t *cr) -{ - - return (EPERM); -} - -int -secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) -{ - - if (groupmember(gid, cr)) - return (0); - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - return (priv_check_cred(cr, PRIV_VFS_SETGID)); -} - -int -secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr, - boolean_t issuidroot __unused) -{ - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - return (priv_check_cred(cr, PRIV_VFS_RETAINSUGID)); -} - -void -secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr) -{ - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return; - - if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) { - if (priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) { - vap->va_mask |= AT_MODE; - vap->va_mode &= ~(S_ISUID|S_ISGID); - } - } -} - -int -secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, - const struct vattr *ovap, cred_t *cr) -{ - int error; - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - - /* - * Privileged processes may set the sticky bit on non-directories, - * as well as set the setgid bit on a file with a group that the process - * is not a member of. Both of these are allowed in jail(8). - */ - if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) { - if (priv_check_cred(cr, PRIV_VFS_STICKYFILE)) - return (EFTYPE); - } - /* - * Check for privilege if attempting to set the - * group-id bit. - */ - if ((vap->va_mode & S_ISGID) != 0) { - error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid); - if (error) - return (error); - } - /* - * Deny setting setuid if we are not the file owner. 
- */ - if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) { - error = priv_check_cred(cr, PRIV_VFS_ADMIN); - if (error) - return (error); - } - return (0); -} - -int -secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp) -{ - - return (priv_check_cred(cr, PRIV_VFS_MOUNT)); -} - -int -secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner) -{ - - if (owner == cr->cr_uid) - return (0); - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - - /* XXX: vfs_suser()? */ - return (priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER)); -} - -int -secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner) -{ - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - return (priv_check_cred(cr, PRIV_VFS_CHOWN)); -} - -void -secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp) -{ - - if (priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) { - MNT_ILOCK(vfsp); - vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER; - vfs_clearmntopt(vfsp, MNTOPT_SETUID); - vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0); - MNT_IUNLOCK(vfsp); - } -} - -/* - * Check privileges for setting xvattr attributes - */ -int -secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr, - vtype_t vtype) -{ - - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) - return (0); - return (priv_check_cred(cr, PRIV_VFS_SYSFLAGS)); -} - -int -secpolicy_smb(cred_t *cr) -{ - - return (priv_check_cred(cr, PRIV_NETSMB)); -} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c b/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c deleted file mode 100644 index 4a13cd8956c0..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_sunddi.c +++ /dev/null @@ -1,194 +0,0 @@ -/*- - * Copyright (c) 2010 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include - -int -ddi_strtol(const char *str, char **nptr, int base, long *result) -{ - - *result = strtol(str, nptr, base); - return (0); -} - -int -ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) -{ - - if (str == hw_serial) { - *result = prison0.pr_hostid; - return (0); - } - - *result = strtoul(str, nptr, base); - return (0); -} - -int -ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) -{ - - *result = (unsigned long long)strtouq(str, nptr, base); - return (0); -} - -int -ddi_strtoll(const char *str, char **nptr, int base, long long *result) -{ - - *result = (long long)strtoq(str, nptr, base); - return (0); -} - -struct ddi_soft_state_item { - int ssi_item; - void *ssi_data; - LIST_ENTRY(ddi_soft_state_item) ssi_next; -}; - -struct ddi_soft_state { - size_t ss_size; - kmutex_t ss_lock; - LIST_HEAD(, ddi_soft_state_item) ss_list; -}; - -static void * -ddi_get_soft_state_locked(struct ddi_soft_state *ss, int item) -{ - struct ddi_soft_state_item *itemp; - - ASSERT(MUTEX_HELD(&ss->ss_lock)); - - LIST_FOREACH(itemp, &ss->ss_list, ssi_next) { - if (itemp->ssi_item == item) - return (itemp->ssi_data); - } - return (NULL); -} - -void * -ddi_get_soft_state(void *state, int item) -{ - struct ddi_soft_state *ss = state; - void *data; - - mutex_enter(&ss->ss_lock); - data = ddi_get_soft_state_locked(ss, item); - mutex_exit(&ss->ss_lock); - return (data); -} - -int -ddi_soft_state_zalloc(void *state, int item) -{ - struct ddi_soft_state *ss = state; - struct ddi_soft_state_item *itemp; - - itemp = kmem_alloc(sizeof(*itemp), KM_SLEEP); - itemp->ssi_item = item; - itemp->ssi_data = kmem_zalloc(ss->ss_size, KM_SLEEP); - - mutex_enter(&ss->ss_lock); - if (ddi_get_soft_state_locked(ss, item) != NULL) { - mutex_exit(&ss->ss_lock); - kmem_free(itemp->ssi_data, ss->ss_size); - kmem_free(itemp, sizeof(*itemp)); - 
return (DDI_FAILURE); - } - LIST_INSERT_HEAD(&ss->ss_list, itemp, ssi_next); - mutex_exit(&ss->ss_lock); - return (DDI_SUCCESS); -} - -static void -ddi_soft_state_free_locked(struct ddi_soft_state *ss, int item) -{ - struct ddi_soft_state_item *itemp; - - ASSERT(MUTEX_HELD(&ss->ss_lock)); - - LIST_FOREACH(itemp, &ss->ss_list, ssi_next) { - if (itemp->ssi_item == item) - break; - } - if (itemp != NULL) { - LIST_REMOVE(itemp, ssi_next); - kmem_free(itemp->ssi_data, ss->ss_size); - kmem_free(itemp, sizeof(*itemp)); - } -} - -void -ddi_soft_state_free(void *state, int item) -{ - struct ddi_soft_state *ss = state; - - mutex_enter(&ss->ss_lock); - ddi_soft_state_free_locked(ss, item); - mutex_exit(&ss->ss_lock); -} - -int -ddi_soft_state_init(void **statep, size_t size, size_t nitems __unused) -{ - struct ddi_soft_state *ss; - - ss = kmem_alloc(sizeof(*ss), KM_SLEEP); - mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); - ss->ss_size = size; - LIST_INIT(&ss->ss_list); - *statep = ss; - return (0); -} - -void -ddi_soft_state_fini(void **statep) -{ - struct ddi_soft_state *ss = *statep; - struct ddi_soft_state_item *itemp; - int item; - - mutex_enter(&ss->ss_lock); - while ((itemp = LIST_FIRST(&ss->ss_list)) != NULL) { - item = itemp->ssi_item; - ddi_soft_state_free_locked(ss, item); - } - mutex_exit(&ss->ss_lock); - mutex_destroy(&ss->ss_lock); - kmem_free(ss, sizeof(*ss)); - - *statep = NULL; -} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c b/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c deleted file mode 100644 index e810ae458140..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c +++ /dev/null @@ -1,338 +0,0 @@ -/*- - * Copyright (c) 2010 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct sysevent { - nvlist_t *se_nvl; - char se_class[128]; - char se_subclass[128]; - char se_pub[128]; -}; - -sysevent_t * -sysevent_alloc(char *class, char *subclass, char *pub, int flag) -{ - struct sysevent *ev; - - ASSERT(class != NULL); - ASSERT(subclass != NULL); - ASSERT(pub != NULL); - ASSERT(flag == SE_SLEEP); - - ev = kmem_alloc(sizeof(*ev), KM_SLEEP); - ev->se_nvl = NULL; - strlcpy(ev->se_class, class, sizeof(ev->se_class)); - strlcpy(ev->se_subclass, subclass, sizeof(ev->se_subclass)); - strlcpy(ev->se_pub, pub, sizeof(ev->se_pub)); - - return ((sysevent_t *)ev); -} - -void -sysevent_free(sysevent_t *evp) -{ - struct sysevent *ev = (struct sysevent *)evp; - - ASSERT(evp != NULL); - - if (ev->se_nvl != NULL) - sysevent_free_attr(ev->se_nvl); - kmem_free(ev, sizeof(*ev)); -} - -int -sysevent_add_attr(sysevent_attr_list_t **ev_attr_list, char *name, - sysevent_value_t *se_value, int flag) -{ - nvlist_t *nvl; - int error; - - ASSERT(ev_attr_list != NULL); - ASSERT(name != NULL); - ASSERT(se_value != NULL); - ASSERT(flag == SE_SLEEP); - - if (strlen(name) >= MAX_ATTR_NAME) - return (SE_EINVAL); - - nvl = *ev_attr_list; - if (nvl == NULL) { - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0) - return (SE_ENOMEM); - } - - error = 0; - - switch (se_value->value_type) { - case SE_DATA_TYPE_UINT64: - error = nvlist_add_uint64(nvl, name, se_value->value.sv_uint64); - break; - case SE_DATA_TYPE_STRING: - if (strlen(se_value->value.sv_string) >= MAX_STRING_SZ) - error = SE_EINVAL; - if (error == 0) { - error = nvlist_add_string(nvl, name, - se_value->value.sv_string); - } - break; - default: -#if 0 - printf("%s: type %d is not implemented\n", __func__, - se_value->value_type); -#endif - break; - } - - if (error != 0) { - nvlist_free(nvl); - return (error); - } - - *ev_attr_list = nvl; - - return (0); 
-} - -void -sysevent_free_attr(sysevent_attr_list_t *ev_attr_list) -{ - - nvlist_free(ev_attr_list); -} - -int -sysevent_attach_attributes(sysevent_t *evp, sysevent_attr_list_t *ev_attr_list) -{ - struct sysevent *ev = (struct sysevent *)evp; - - ASSERT(ev->se_nvl == NULL); - - ev->se_nvl = ev_attr_list; - - return (0); -} - -void -sysevent_detach_attributes(sysevent_t *evp) -{ - struct sysevent *ev = (struct sysevent *)evp; - - ASSERT(ev->se_nvl != NULL); - - ev->se_nvl = NULL; -} - -int -log_sysevent(sysevent_t *evp, int flag, sysevent_id_t *eid) -{ - struct sysevent *ev = (struct sysevent *)evp; - struct sbuf *sb; - const char *type; - char typestr[128]; - nvpair_t *elem = NULL; - - ASSERT(evp != NULL); - ASSERT(ev->se_nvl != NULL); - ASSERT(flag == SE_SLEEP); - ASSERT(eid != NULL); - - sb = sbuf_new_auto(); - if (sb == NULL) - return (SE_ENOMEM); - type = NULL; - - while ((elem = nvlist_next_nvpair(ev->se_nvl, elem)) != NULL) { - switch (nvpair_type(elem)) { - case DATA_TYPE_BOOLEAN: - { - boolean_t value; - - (void) nvpair_value_boolean_value(elem, &value); - sbuf_printf(sb, " %s=%s", nvpair_name(elem), - value ? 
"true" : "false"); - break; - } - case DATA_TYPE_UINT8: - { - uint8_t value; - - (void) nvpair_value_uint8(elem, &value); - sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); - break; - } - case DATA_TYPE_INT32: - { - int32_t value; - - (void) nvpair_value_int32(elem, &value); - sbuf_printf(sb, " %s=%jd", nvpair_name(elem), - (intmax_t)value); - break; - } - case DATA_TYPE_UINT32: - { - uint32_t value; - - (void) nvpair_value_uint32(elem, &value); - sbuf_printf(sb, " %s=%ju", nvpair_name(elem), - (uintmax_t)value); - break; - } - case DATA_TYPE_INT64: - { - int64_t value; - - (void) nvpair_value_int64(elem, &value); - sbuf_printf(sb, " %s=%jd", nvpair_name(elem), - (intmax_t)value); - break; - } - case DATA_TYPE_UINT64: - { - uint64_t value; - - (void) nvpair_value_uint64(elem, &value); - sbuf_printf(sb, " %s=%ju", nvpair_name(elem), - (uintmax_t)value); - break; - } - case DATA_TYPE_STRING: - { - char *value; - - (void) nvpair_value_string(elem, &value); - sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); - if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) - type = value; - break; - } - case DATA_TYPE_UINT8_ARRAY: - { - uint8_t *value; - uint_t ii, nelem; - - (void) nvpair_value_uint8_array(elem, &value, &nelem); - sbuf_printf(sb, " %s=", nvpair_name(elem)); - for (ii = 0; ii < nelem; ii++) - sbuf_printf(sb, "%02hhx", value[ii]); - break; - } - case DATA_TYPE_UINT16_ARRAY: - { - uint16_t *value; - uint_t ii, nelem; - - (void) nvpair_value_uint16_array(elem, &value, &nelem); - sbuf_printf(sb, " %s=", nvpair_name(elem)); - for (ii = 0; ii < nelem; ii++) - sbuf_printf(sb, "%04hx", value[ii]); - break; - } - case DATA_TYPE_UINT32_ARRAY: - { - uint32_t *value; - uint_t ii, nelem; - - (void) nvpair_value_uint32_array(elem, &value, &nelem); - sbuf_printf(sb, " %s=", nvpair_name(elem)); - for (ii = 0; ii < nelem; ii++) - sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); - break; - } - case DATA_TYPE_UINT64_ARRAY: - { - uint64_t *value; - uint_t ii, nelem; - - (void) 
nvpair_value_uint64_array(elem, &value, &nelem); - sbuf_printf(sb, " %s=", nvpair_name(elem)); - for (ii = 0; ii < nelem; ii++) - sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); - break; - } - default: -#if 0 - printf("%s: type %d is not implemented\n", __func__, - nvpair_type(elem)); -#endif - break; - } - } - - if (sbuf_finish(sb) != 0) { - sbuf_delete(sb); - return (SE_ENOMEM); - } - - if (type == NULL) - type = ev->se_subclass; - if (strncmp(type, "ESC_ZFS_", 8) == 0) { - snprintf(typestr, sizeof(typestr), "misc.fs.zfs.%s", type + 8); - type = typestr; - } - devctl_notify("ZFS", "ZFS", type, sbuf_data(sb)); - sbuf_delete(sb); - - return (0); -} - -int -_ddi_log_sysevent(char *vendor, char *class, char *subclass, - nvlist_t *attr_list, sysevent_id_t *eidp, int flag) -{ - sysevent_t *ev; - int ret; - - ASSERT(vendor != NULL); - ASSERT(class != NULL); - ASSERT(subclass != NULL); - ASSERT(attr_list != NULL); - ASSERT(eidp != NULL); - ASSERT(flag == DDI_SLEEP); - - ev = sysevent_alloc(class, subclass, vendor, SE_SLEEP); - ASSERT(ev != NULL); - (void)sysevent_attach_attributes(ev, attr_list); - ret = log_sysevent(ev, SE_SLEEP, eidp); - sysevent_detach_attributes(ev); - sysevent_free(ev); - - return (ret); -} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c b/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c deleted file mode 100644 index 0ba0338a9848..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c +++ /dev/null @@ -1,252 +0,0 @@ -/*- - * Copyright (c) 2006-2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MALLOC_DECLARE(M_MOUNT); - -void -vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, - int flags __unused) -{ - struct vfsopt *opt; - size_t namesize; - int locked; - - if (!(locked = mtx_owned(MNT_MTX(vfsp)))) - MNT_ILOCK(vfsp); - - if (vfsp->mnt_opt == NULL) { - void *opts; - - MNT_IUNLOCK(vfsp); - opts = malloc(sizeof(*vfsp->mnt_opt), M_MOUNT, M_WAITOK); - MNT_ILOCK(vfsp); - if (vfsp->mnt_opt == NULL) { - vfsp->mnt_opt = opts; - TAILQ_INIT(vfsp->mnt_opt); - } else { - free(opts, M_MOUNT); - } - } - - MNT_IUNLOCK(vfsp); - - opt = malloc(sizeof(*opt), M_MOUNT, M_WAITOK); - namesize = strlen(name) + 1; - opt->name = malloc(namesize, M_MOUNT, M_WAITOK); - strlcpy(opt->name, name, namesize); - opt->pos = -1; - opt->seen = 1; - if (arg == NULL) { - opt->value = NULL; - opt->len = 0; - } else { - opt->len = strlen(arg) + 1; - opt->value = 
malloc(opt->len, M_MOUNT, M_WAITOK); - bcopy(arg, opt->value, opt->len); - } - - MNT_ILOCK(vfsp); - TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); - if (!locked) - MNT_IUNLOCK(vfsp); -} - -void -vfs_clearmntopt(vfs_t *vfsp, const char *name) -{ - int locked; - - if (!(locked = mtx_owned(MNT_MTX(vfsp)))) - MNT_ILOCK(vfsp); - vfs_deleteopt(vfsp->mnt_opt, name); - if (!locked) - MNT_IUNLOCK(vfsp); -} - -int -vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) -{ - struct vfsoptlist *opts = vfsp->mnt_optnew; - int error; - - if (opts == NULL) - return (0); - error = vfs_getopt(opts, opt, (void **)argp, NULL); - return (error != 0 ? 0 : 1); -} - -int -mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, - char *fspec, int fsflags) -{ - struct vfsconf *vfsp; - struct mount *mp; - vnode_t *vp, *mvp; - struct ucred *cr; - int error; - - ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); - - vp = *vpp; - *vpp = NULL; - error = 0; - - /* - * Be ultra-paranoid about making sure the type and fspath - * variables will fit in our mp buffers, including the - * terminating NUL. - */ - if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) - error = ENAMETOOLONG; - if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) - error = ENODEV; - if (error == 0 && vp->v_type != VDIR) - error = ENOTDIR; - /* - * We need vnode lock to protect v_mountedhere and vnode interlock - * to protect v_iflag. - */ - if (error == 0) { - VI_LOCK(vp); - if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) - vp->v_iflag |= VI_MOUNT; - else - error = EBUSY; - VI_UNLOCK(vp); - } - if (error != 0) { - vput(vp); - return (error); - } - vn_seqc_write_begin(vp); - VOP_UNLOCK(vp); - - /* - * Allocate and initialize the filesystem. - * We don't want regular user that triggered snapshot mount to be able - * to unmount it, so pass credentials of the parent mount. 
- */ - mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred); - - mp->mnt_optnew = NULL; - vfs_setmntopt(mp, "from", fspec, 0); - mp->mnt_optnew = mp->mnt_opt; - mp->mnt_opt = NULL; - - /* - * Set the mount level flags. - */ - mp->mnt_flag = fsflags & MNT_UPDATEMASK; - /* - * Snapshots are always read-only. - */ - mp->mnt_flag |= MNT_RDONLY; - /* - * We don't want snapshots to allow access to vulnerable setuid - * programs, so we turn off setuid when mounting snapshots. - */ - mp->mnt_flag |= MNT_NOSUID; - /* - * We don't want snapshots to be visible in regular - * mount(8) and df(1) output. - */ - mp->mnt_flag |= MNT_IGNORE; - /* - * XXX: This is evil, but we can't mount a snapshot as a regular user. - * XXX: Is is safe when snapshot is mounted from within a jail? - */ - cr = td->td_ucred; - td->td_ucred = kcred; - error = VFS_MOUNT(mp); - td->td_ucred = cr; - - if (error != 0) { - /* - * Clear VI_MOUNT and decrement the use count "atomically", - * under the vnode lock. This is not strictly required, - * but makes it easier to reason about the life-cycle and - * ownership of the covered vnode. - */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - VI_LOCK(vp); - vp->v_iflag &= ~VI_MOUNT; - VI_UNLOCK(vp); - vn_seqc_write_end(vp); - vput(vp); - vfs_unbusy(mp); - vfs_freeopts(mp->mnt_optnew); - mp->mnt_vnodecovered = NULL; - vfs_mount_destroy(mp); - return (error); - } - - if (mp->mnt_opt != NULL) - vfs_freeopts(mp->mnt_opt); - mp->mnt_opt = mp->mnt_optnew; - (void)VFS_STATFS(mp, &mp->mnt_stat); - - /* - * Prevent external consumers of mount options from reading - * mnt_optnew. - */ - mp->mnt_optnew = NULL; - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); -#ifdef FREEBSD_NAMECACHE - cache_purge(vp); -#endif - VI_LOCK(vp); - vp->v_iflag &= ~VI_MOUNT; - VI_UNLOCK(vp); - - vp->v_mountedhere = mp; - /* Put the new filesystem on the mount list. 
*/ - mtx_lock(&mountlist_mtx); - TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); - mtx_unlock(&mountlist_mtx); - vfs_event_signal(NULL, VQ_MOUNT, 0); - if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp)) - panic("mount: lost mount"); - vn_seqc_write_end(vp); - VOP_UNLOCK(vp); - vfs_op_exit(mp); - vfs_unbusy(mp); - *vpp = mvp; - return (0); -} diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c b/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c deleted file mode 100644 index f4a6042ffef2..000000000000 --- a/sys/cddl/compat/opensolaris/kern/opensolaris_zone.c +++ /dev/null @@ -1,256 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data"); - -/* - * Structure to record list of ZFS datasets exported to a zone. - */ -typedef struct zone_dataset { - LIST_ENTRY(zone_dataset) zd_next; - char zd_dataset[0]; -} zone_dataset_t; - -LIST_HEAD(zone_dataset_head, zone_dataset); - -static int zone_slot; - -int -zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) -{ - struct zone_dataset_head *head; - zone_dataset_t *zd, *zd2; - struct prison *pr; - int dofree, error; - - if ((error = priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) - return (error); - - /* Allocate memory before we grab prison's mutex. */ - zd = malloc(sizeof(*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK); - - sx_slock(&allprison_lock); - pr = prison_find(jailid); /* Locks &pr->pr_mtx. */ - sx_sunlock(&allprison_lock); - if (pr == NULL) { - free(zd, M_ZONES); - return (ENOENT); - } - - head = osd_jail_get(pr, zone_slot); - if (head != NULL) { - dofree = 0; - LIST_FOREACH(zd2, head, zd_next) { - if (strcmp(dataset, zd2->zd_dataset) == 0) { - free(zd, M_ZONES); - error = EEXIST; - goto end; - } - } - } else { - dofree = 1; - prison_hold_locked(pr); - mtx_unlock(&pr->pr_mtx); - head = malloc(sizeof(*head), M_ZONES, M_WAITOK); - LIST_INIT(head); - mtx_lock(&pr->pr_mtx); - error = osd_jail_set(pr, zone_slot, head); - KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", error)); - } - strcpy(zd->zd_dataset, dataset); - LIST_INSERT_HEAD(head, zd, zd_next); -end: - if (dofree) - prison_free_locked(pr); - else - mtx_unlock(&pr->pr_mtx); - return (error); -} - -int -zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid) -{ - struct zone_dataset_head *head; - zone_dataset_t *zd; - struct prison *pr; - int error; - - if ((error = priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) - return (error); 
- - sx_slock(&allprison_lock); - pr = prison_find(jailid); - sx_sunlock(&allprison_lock); - if (pr == NULL) - return (ENOENT); - head = osd_jail_get(pr, zone_slot); - if (head == NULL) { - error = ENOENT; - goto end; - } - LIST_FOREACH(zd, head, zd_next) { - if (strcmp(dataset, zd->zd_dataset) == 0) - break; - } - if (zd == NULL) - error = ENOENT; - else { - LIST_REMOVE(zd, zd_next); - free(zd, M_ZONES); - if (LIST_EMPTY(head)) - osd_jail_del(pr, zone_slot); - error = 0; - } -end: - mtx_unlock(&pr->pr_mtx); - return (error); -} - -/* - * Returns true if the named dataset is visible in the current zone. - * The 'write' parameter is set to 1 if the dataset is also writable. - */ -int -zone_dataset_visible(const char *dataset, int *write) -{ - struct zone_dataset_head *head; - zone_dataset_t *zd; - struct prison *pr; - size_t len; - int ret = 0; - - if (dataset[0] == '\0') - return (0); - if (INGLOBALZONE(curthread)) { - if (write != NULL) - *write = 1; - return (1); - } - pr = curthread->td_ucred->cr_prison; - mtx_lock(&pr->pr_mtx); - head = osd_jail_get(pr, zone_slot); - if (head == NULL) - goto end; - - /* - * Walk the list once, looking for datasets which match exactly, or - * specify a dataset underneath an exported dataset. If found, return - * true and note that it is writable. - */ - LIST_FOREACH(zd, head, zd_next) { - len = strlen(zd->zd_dataset); - if (strlen(dataset) >= len && - bcmp(dataset, zd->zd_dataset, len) == 0 && - (dataset[len] == '\0' || dataset[len] == '/' || - dataset[len] == '@')) { - if (write) - *write = 1; - ret = 1; - goto end; - } - } - - /* - * Walk the list a second time, searching for datasets which are parents - * of exported datasets. These should be visible, but read-only. - * - * Note that we also have to support forms such as 'pool/dataset/', with - * a trailing slash. 
- */ - LIST_FOREACH(zd, head, zd_next) { - len = strlen(dataset); - if (dataset[len - 1] == '/') - len--; /* Ignore trailing slash */ - if (len < strlen(zd->zd_dataset) && - bcmp(dataset, zd->zd_dataset, len) == 0 && - zd->zd_dataset[len] == '/') { - if (write) - *write = 0; - ret = 1; - goto end; - } - } -end: - mtx_unlock(&pr->pr_mtx); - return (ret); -} - -static void -zone_destroy(void *arg) -{ - struct zone_dataset_head *head; - zone_dataset_t *zd; - - head = arg; - while ((zd = LIST_FIRST(head)) != NULL) { - LIST_REMOVE(zd, zd_next); - free(zd, M_ZONES); - } - free(head, M_ZONES); -} - -uint32_t -zone_get_hostid(void *ptr) -{ - - KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__)); - - return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid); -} - -static void -zone_sysinit(void *arg __unused) -{ - - zone_slot = osd_jail_register(zone_destroy, NULL); -} - -static void -zone_sysuninit(void *arg __unused) -{ - - osd_jail_deregister(zone_slot); -} - -SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL); -SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL); diff --git a/sys/cddl/compat/opensolaris/sys/acl.h b/sys/cddl/compat/opensolaris/sys/acl.h deleted file mode 100644 index 57fad6faa001..000000000000 --- a/sys/cddl/compat/opensolaris/sys/acl.h +++ /dev/null @@ -1,39 +0,0 @@ -/*- - * Copyright (c) 2008, 2009 Edward Tomasz NapieraÅ‚a - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef OPENSOLARIS_SYS_ACL_H -#define OPENSOLARIS_SYS_ACL_H - -#include_next - -struct acl; - -void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp); -int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries); - -#endif /* OPENSOLARIS_SYS_ACL_H */ diff --git a/sys/cddl/compat/opensolaris/sys/file.h b/sys/cddl/compat/opensolaris/sys/file.h deleted file mode 100644 index 04851ee86d6d..000000000000 --- a/sys/cddl/compat/opensolaris/sys/file.h +++ /dev/null @@ -1,64 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_FILE_H_ -#define _OPENSOLARIS_SYS_FILE_H_ - -#include_next - -#define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ - -#ifdef _KERNEL -typedef struct file file_t; - -#include - -static __inline file_t * -getf(int fd, cap_rights_t *rightsp) -{ - struct file *fp; - - if (fget(curthread, fd, rightsp, &fp) == 0) - return (fp); - return (NULL); -} - -static __inline void -releasef(int fd) -{ - struct file *fp; - - /* No CAP_ rights required, as we're only releasing. */ - if (fget(curthread, fd, &cap_no_rights, &fp) == 0) { - fdrop(fp, curthread); - fdrop(fp, curthread); - } -} -#endif /* _KERNEL */ - -#endif /* !_OPENSOLARIS_SYS_FILE_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/kobj.h b/sys/cddl/compat/opensolaris/sys/kobj.h deleted file mode 100644 index e060ff04ce2f..000000000000 --- a/sys/cddl/compat/opensolaris/sys/kobj.h +++ /dev/null @@ -1,60 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_KOBJ_H_ -#define _OPENSOLARIS_SYS_KOBJ_H_ - -#include -#include -#include_next -#ifdef AT_UID -#undef AT_UID -#endif -#ifdef AT_GID -#undef AT_GID -#endif -#include - -#define KM_NOWAIT 0x01 -#define KM_TMP 0x02 - -void kobj_free(void *address, size_t size); -void *kobj_alloc(size_t size, int flag); -void *kobj_zalloc(size_t size, int flag); - -struct _buf { - void *ptr; - int mounted; -}; - -struct _buf *kobj_open_file(const char *path); -int kobj_get_filesize(struct _buf *file, uint64_t *size); -int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off); -void kobj_close_file(struct _buf *file); - -#endif /* _OPENSOLARIS_SYS_KOBJ_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/lock.h b/sys/cddl/compat/opensolaris/sys/lock.h deleted file mode 100644 index 27663f46e446..000000000000 --- a/sys/cddl/compat/opensolaris/sys/lock.h +++ /dev/null @@ -1,45 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_LOCK_H_ -#define _OPENSOLARIS_SYS_LOCK_H_ - -#include_next - -#ifdef _KERNEL - -#define LO_ALLMASK (LO_INITIALIZED | LO_WITNESS | LO_QUIET | \ - LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE | \ - LO_DUPOK | LO_CLASSMASK | LO_NOPROFILE) -#define LO_EXPECTED (LO_INITIALIZED | LO_WITNESS | LO_RECURSABLE | \ - LO_SLEEPABLE | LO_UPGRADABLE | LO_DUPOK | \ - /* sx lock class */(2 << LO_CLASSSHIFT)) - -#endif /* defined(_KERNEL) */ - -#endif /* _OPENSOLARIS_SYS_LOCK_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/misc.h b/sys/cddl/compat/opensolaris/sys/misc.h index e128ce06d165..b2230da24e7a 100644 --- a/sys/cddl/compat/opensolaris/sys/misc.h +++ b/sys/cddl/compat/opensolaris/sys/misc.h @@ -55,7 +55,6 @@ struct opensolaris_utsname { }; extern char hw_serial[11]; -extern struct opensolaris_utsname utsname; #endif #endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/mman.h b/sys/cddl/compat/opensolaris/sys/mman.h deleted file mode 100644 index ca746898f65d..000000000000 --- a/sys/cddl/compat/opensolaris/sys/mman.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (C) 2007 John Birrell - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -#ifndef _COMPAT_OPENSOLARIS_SYS_MMAN_H_ -#define _COMPAT_OPENSOLARIS_SYS_MMAN_H_ - -#include_next - -#define mmap64(_a,_b,_c,_d,_e,_f) mmap(_a,_b,_c,_d,_e,_f) - -#endif diff --git a/sys/cddl/compat/opensolaris/sys/modctl.h b/sys/cddl/compat/opensolaris/sys/modctl.h index 7af39b090f3b..af91036a31b4 100644 --- a/sys/cddl/compat/opensolaris/sys/modctl.h +++ b/sys/cddl/compat/opensolaris/sys/modctl.h @@ -31,6 +31,7 @@ #define _COMPAT_OPENSOLARIS_SYS_MODCTL_H #include +#include #include typedef struct linker_file modctl_t; diff --git a/sys/cddl/compat/opensolaris/sys/mount.h b/sys/cddl/compat/opensolaris/sys/mount.h deleted file mode 100644 index e012597a92e6..000000000000 --- a/sys/cddl/compat/opensolaris/sys/mount.h +++ /dev/null @@ -1,41 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_MOUNT_H_ -#define _OPENSOLARIS_SYS_MOUNT_H_ - -#include - -#include_next - -#define MS_FORCE MNT_FORCE -#define MS_REMOUNT MNT_UPDATE - -typedef struct fid fid_t; - -#endif /* !_OPENSOLARIS_SYS_MOUNT_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/mutex.h b/sys/cddl/compat/opensolaris/sys/mutex.h deleted file mode 100644 index 45a33bd71cf6..000000000000 --- a/sys/cddl/compat/opensolaris/sys/mutex.h +++ /dev/null @@ -1,77 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_MUTEX_H_ -#define _OPENSOLARIS_SYS_MUTEX_H_ - -#ifdef _KERNEL - -#include -#include -#include_next -#include -#include - -typedef enum { - MUTEX_DEFAULT = 6 /* kernel default mutex */ -} kmutex_type_t; - -#define MUTEX_HELD(x) (mutex_owned(x)) -#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || KERNEL_PANICKED()) - -typedef struct sx kmutex_t; - -#ifndef OPENSOLARIS_WITNESS -#define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS) -#else -#define MUTEX_FLAGS (SX_DUPOK | SX_NEW) -#endif - -#define mutex_init(lock, desc, type, arg) do { \ - const char *_name; \ - ASSERT((type) == 0 || (type) == MUTEX_DEFAULT); \ - KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \ - LO_EXPECTED, ("lock %s already initialized", #lock)); \ - for (_name = #lock; *_name != '\0'; _name++) { \ - if (*_name >= 'a' && *_name <= 'z') \ - break; \ - } \ - if (*_name == '\0') \ - _name = #lock; \ - sx_init_flags((lock), _name, MUTEX_FLAGS); \ -} while (0) -#define mutex_destroy(lock) sx_destroy(lock) -#define mutex_enter(lock) sx_xlock(lock) -#define mutex_tryenter(lock) sx_try_xlock(lock) -#define mutex_exit(lock) sx_xunlock(lock) -#define mutex_owned(lock) sx_xlocked(lock) -#define mutex_owner(lock) sx_xholder(lock) - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/nvpair.h b/sys/cddl/compat/opensolaris/sys/nvpair.h deleted file mode 100644 index 33b62cbe49c5..000000000000 --- a/sys/cddl/compat/opensolaris/sys/nvpair.h +++ /dev/null @@ -1,230 +0,0 @@ -/*- - * Copyright (c) 2014 Sandvine Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_NVPAIR_H_ -#define _OPENSOLARIS_SYS_NVPAIR_H_ - -#ifdef _KERNEL - -/* - * Some of the symbols in the Illumos nvpair library conflict with symbols - * provided by nv(9), so we use this preprocessor hack to avoid the conflict. 
- * - * This list was generated by: - * cat nv.h nv_impl.h nvlist_* nvpair_impl.h | \ - * sed -nE 's/^[[:alnum:]_][[:alnum:]_ ]*[[:space:]]+[*]*([[:alnum:]_]+)\(.*$/#define \1 illumos_\1/p' | \ - * sort -u - */ -#define nvlist_add_binary illumos_nvlist_add_binary -#define nvlist_add_bool illumos_nvlist_add_bool -#define nvlist_add_bool_array illumos_nvlist_add_bool_array -#define nvlist_add_descriptor illumos_nvlist_add_descriptor -#define nvlist_add_descriptor_array illumos_nvlist_add_descriptor_array -#define nvlist_add_null illumos_nvlist_add_null -#define nvlist_add_number illumos_nvlist_add_number -#define nvlist_add_number_array illumos_nvlist_add_number_array -#define nvlist_add_nvlist illumos_nvlist_add_nvlist -#define nvlist_add_nvlist_array illumos_nvlist_add_nvlist_array -#define nvlist_add_nvpair illumos_nvlist_add_nvpair -#define nvlist_add_string illumos_nvlist_add_string -#define nvlist_add_string_array illumos_nvlist_add_string_array -#define nvlist_add_stringf illumos_nvlist_add_stringf -#define nvlist_add_stringv illumos_nvlist_add_stringv -#define nvlist_clone illumos_nvlist_clone -#define nvlist_create illumos_nvlist_create -#define nvlist_descriptors illumos_nvlist_descriptors -#define nvlist_destroy illumos_nvlist_destroy -#define nvlist_dump illumos_nvlist_dump -#define nvlist_empty illumos_nvlist_empty -#define nvlist_error illumos_nvlist_error -#define nvlist_exists illumos_nvlist_exists -#define nvlist_exists_binary illumos_nvlist_exists_binary -#define nvlist_exists_bool illumos_nvlist_exists_bool -#define nvlist_exists_bool_array illumos_nvlist_exists_bool_array -#define nvlist_exists_descriptor illumos_nvlist_exists_descriptor -#define nvlist_exists_descriptor_array illumos_nvlist_exists_descriptor_array -#define nvlist_exists_null illumos_nvlist_exists_null -#define nvlist_exists_number illumos_nvlist_exists_number -#define nvlist_exists_number_array illumos_nvlist_exists_number_array -#define nvlist_exists_nvlist 
illumos_nvlist_exists_nvlist -#define nvlist_exists_nvlist_array illumos_nvlist_exists_nvlist_array -#define nvlist_exists_string illumos_nvlist_exists_string -#define nvlist_exists_string_array illumos_nvlist_exists_string_array -#define nvlist_exists_type illumos_nvlist_exists_type -#define nvlist_fdump illumos_nvlist_fdump -#define nvlist_first_nvpair illumos_nvlist_first_nvpair -#define nvlist_flags illumos_nvlist_flags -#define nvlist_free illumos_nvlist_free -#define nvlist_free_binary illumos_nvlist_free_binary -#define nvlist_free_binary_array illumos_nvlist_free_binary_array -#define nvlist_free_bool illumos_nvlist_free_bool -#define nvlist_free_bool_array illumos_nvlist_free_bool_array -#define nvlist_free_descriptor illumos_nvlist_free_descriptor -#define nvlist_free_descriptor_array illumos_nvlist_free_descriptor_array -#define nvlist_free_null illumos_nvlist_free_null -#define nvlist_free_number illumos_nvlist_free_number -#define nvlist_free_number_array illumos_nvlist_free_number_array -#define nvlist_free_nvlist illumos_nvlist_free_nvlist -#define nvlist_free_nvlist_array illumos_nvlist_free_nvlist_array -#define nvlist_free_nvpair illumos_nvlist_free_nvpair -#define nvlist_free_string illumos_nvlist_free_string -#define nvlist_free_string_array illumos_nvlist_free_string_array -#define nvlist_free_type illumos_nvlist_free_type -#define nvlist_get_array_next illumos_nvlist_get_array_next -#define nvlist_get_binary illumos_nvlist_get_binary -#define nvlist_get_bool illumos_nvlist_get_bool -#define nvlist_get_bool_array illumos_nvlist_get_bool_array -#define nvlist_get_descriptor illumos_nvlist_get_descriptor -#define nvlist_get_descriptor_array illumos_nvlist_get_descriptor_array -#define nvlist_get_number illumos_nvlist_get_number -#define nvlist_get_number_array illumos_nvlist_get_number_array -#define nvlist_get_nvlist illumos_nvlist_get_nvlist -#define nvlist_get_nvpair illumos_nvlist_get_nvpair -#define nvlist_get_nvpair_parent 
illumos_nvlist_get_nvpair_parent -#define nvlist_get_pararr illumos_nvlist_get_pararr -#define nvlist_get_parent illumos_nvlist_get_parent -#define nvlist_get_string illumos_nvlist_get_string -#define nvlist_in_array illumos_nvlist_in_array -#define nvlist_move_binary illumos_nvlist_move_binary -#define nvlist_move_bool_array illumos_nvlist_move_bool_array -#define nvlist_move_descriptor illumos_nvlist_move_descriptor -#define nvlist_move_descriptor_array illumos_nvlist_move_descriptor_array -#define nvlist_move_number_array illumos_nvlist_move_number_array -#define nvlist_move_nvlist illumos_nvlist_move_nvlist -#define nvlist_move_nvlist_array illumos_nvlist_move_nvlist_array -#define nvlist_move_nvpair illumos_nvlist_move_nvpair -#define nvlist_move_string illumos_nvlist_move_string -#define nvlist_move_string_array illumos_nvlist_move_string_array -#define nvlist_ndescriptors illumos_nvlist_ndescriptors -#define nvlist_next illumos_nvlist_next -#define nvlist_next_nvpair illumos_nvlist_next_nvpair -#define nvlist_pack illumos_nvlist_pack -#define nvlist_prev_nvpair illumos_nvlist_prev_nvpair -#define nvlist_recv illumos_nvlist_recv -#define nvlist_remove_nvpair illumos_nvlist_remove_nvpair -#define nvlist_send illumos_nvlist_send -#define nvlist_set_array_next illumos_nvlist_set_array_next -#define nvlist_set_error illumos_nvlist_set_error -#define nvlist_set_flags illumos_nvlist_set_flags -#define nvlist_set_parent illumos_nvlist_set_parent -#define nvlist_size illumos_nvlist_size -#define nvlist_take_binary illumos_nvlist_take_binary -#define nvlist_take_bool illumos_nvlist_take_bool -#define nvlist_take_bool_array illumos_nvlist_take_bool_array -#define nvlist_take_descriptor illumos_nvlist_take_descriptor -#define nvlist_take_descriptor_array illumos_nvlist_take_descriptor_array -#define nvlist_take_number illumos_nvlist_take_number -#define nvlist_take_number_array illumos_nvlist_take_number_array -#define nvlist_take_nvlist illumos_nvlist_take_nvlist 
-#define nvlist_take_nvlist_array illumos_nvlist_take_nvlist_array -#define nvlist_take_nvpair illumos_nvlist_take_nvpair -#define nvlist_take_string illumos_nvlist_take_string -#define nvlist_take_string_array illumos_nvlist_take_string_array -#define nvlist_unpack illumos_nvlist_unpack -#define nvlist_unpack_header illumos_nvlist_unpack_header -#define nvlist_xfer illumos_nvlist_xfer -#define nvpair_assert illumos_nvpair_assert -#define nvpair_clone illumos_nvpair_clone -#define nvpair_create_binary illumos_nvpair_create_binary -#define nvpair_create_bool illumos_nvpair_create_bool -#define nvpair_create_bool_array illumos_nvpair_create_bool_array -#define nvpair_create_descriptor illumos_nvpair_create_descriptor -#define nvpair_create_descriptor_array illumos_nvpair_create_descriptor_array -#define nvpair_create_null illumos_nvpair_create_null -#define nvpair_create_number illumos_nvpair_create_number -#define nvpair_create_number_array illumos_nvpair_create_number_array -#define nvpair_create_nvlist illumos_nvpair_create_nvlist -#define nvpair_create_nvlist_array illumos_nvpair_create_nvlist_array -#define nvpair_create_string illumos_nvpair_create_string -#define nvpair_create_string_array illumos_nvpair_create_string_array -#define nvpair_create_stringf illumos_nvpair_create_stringf -#define nvpair_create_stringv illumos_nvpair_create_stringv -#define nvpair_free illumos_nvpair_free -#define nvpair_free_structure illumos_nvpair_free_structure -#define nvpair_get_binary illumos_nvpair_get_binary -#define nvpair_get_bool illumos_nvpair_get_bool -#define nvpair_get_bool_array illumos_nvpair_get_bool_array -#define nvpair_get_descriptor illumos_nvpair_get_descriptor -#define nvpair_get_descriptor_array illumos_nvpair_get_descriptor_array -#define nvpair_get_number illumos_nvpair_get_number -#define nvpair_get_number_array illumos_nvpair_get_number_array -#define nvpair_get_nvlist illumos_nvpair_get_nvlist -#define nvpair_get_string illumos_nvpair_get_string 
-#define nvpair_header_size illumos_nvpair_header_size -#define nvpair_init_datasize illumos_nvpair_init_datasize -#define nvpair_insert illumos_nvpair_insert -#define nvpair_move_binary illumos_nvpair_move_binary -#define nvpair_move_bool_array illumos_nvpair_move_bool_array -#define nvpair_move_descriptor illumos_nvpair_move_descriptor -#define nvpair_move_descriptor_array illumos_nvpair_move_descriptor_array -#define nvpair_move_number_array illumos_nvpair_move_number_array -#define nvpair_move_nvlist illumos_nvpair_move_nvlist -#define nvpair_move_nvlist_array illumos_nvpair_move_nvlist_array -#define nvpair_move_string illumos_nvpair_move_string -#define nvpair_move_string_array illumos_nvpair_move_string_array -#define nvpair_name illumos_nvpair_name -#define nvpair_next illumos_nvpair_next -#define nvpair_nvlist illumos_nvpair_nvlist -#define nvpair_pack_binary illumos_nvpair_pack_binary -#define nvpair_pack_bool illumos_nvpair_pack_bool -#define nvpair_pack_bool_array illumos_nvpair_pack_bool_array -#define nvpair_pack_descriptor illumos_nvpair_pack_descriptor -#define nvpair_pack_descriptor_array illumos_nvpair_pack_descriptor_array -#define nvpair_pack_header illumos_nvpair_pack_header -#define nvpair_pack_null illumos_nvpair_pack_null -#define nvpair_pack_number illumos_nvpair_pack_number -#define nvpair_pack_number_array illumos_nvpair_pack_number_array -#define nvpair_pack_nvlist_array_next illumos_nvpair_pack_nvlist_array_next -#define nvpair_pack_nvlist_up illumos_nvpair_pack_nvlist_up -#define nvpair_pack_string illumos_nvpair_pack_string -#define nvpair_pack_string_array illumos_nvpair_pack_string_array -#define nvpair_prev illumos_nvpair_prev -#define nvpair_remove illumos_nvpair_remove -#define nvpair_size illumos_nvpair_size -#define nvpair_type illumos_nvpair_type -#define nvpair_type_string illumos_nvpair_type_string -#define nvpair_unpack illumos_nvpair_unpack -#define nvpair_unpack_binary illumos_nvpair_unpack_binary -#define 
nvpair_unpack_bool illumos_nvpair_unpack_bool -#define nvpair_unpack_bool_array illumos_nvpair_unpack_bool_array -#define nvpair_unpack_descriptor illumos_nvpair_unpack_descriptor -#define nvpair_unpack_descriptor_array illumos_nvpair_unpack_descriptor_array -#define nvpair_unpack_header illumos_nvpair_unpack_header -#define nvpair_unpack_null illumos_nvpair_unpack_null -#define nvpair_unpack_number illumos_nvpair_unpack_number -#define nvpair_unpack_number_array illumos_nvpair_unpack_number_array -#define nvpair_unpack_nvlist illumos_nvpair_unpack_nvlist -#define nvpair_unpack_nvlist_array illumos_nvpair_unpack_nvlist_array -#define nvpair_unpack_string illumos_nvpair_unpack_string -#define nvpair_unpack_string_array illumos_nvpair_unpack_string_array - -#endif /* _KERNEL */ - -#include_next - -#endif diff --git a/sys/cddl/compat/opensolaris/sys/param.h b/sys/cddl/compat/opensolaris/sys/param.h deleted file mode 100644 index 609d22afe8c1..000000000000 --- a/sys/cddl/compat/opensolaris/sys/param.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2007 John Birrell - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -#ifndef _COMPAT_OPENSOLARIS_SYS_PARAM_H_ -#define _COMPAT_OPENSOLARIS_SYS_PARAM_H_ - -#include_next - -#define PAGESIZE PAGE_SIZE - -#ifdef _KERNEL -#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) -#endif - -#endif diff --git a/sys/cddl/compat/opensolaris/sys/proc.h b/sys/cddl/compat/opensolaris/sys/proc.h deleted file mode 100644 index d91833a58f8c..000000000000 --- a/sys/cddl/compat/opensolaris/sys/proc.h +++ /dev/null @@ -1,105 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_PROC_H_ -#define _OPENSOLARIS_SYS_PROC_H_ - -#include -#include -#include_next -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL - -#define CPU curcpu -#define minclsyspri PRIBIO -#define maxclsyspri PVM -#define max_ncpus (mp_maxid + 1) -#define boot_max_ncpus (mp_maxid + 1) -#define syscid 1 - -#define TS_RUN 0 - -#define p0 proc0 - -#define t_did td_tid - -typedef short pri_t; -typedef struct thread _kthread; -typedef struct thread kthread_t; -typedef struct thread *kthread_id_t; -typedef struct proc proc_t; - -extern struct proc *system_proc; - -static __inline kthread_t * -do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, - size_t len, proc_t *pp, int state, pri_t pri) -{ - kthread_t *td = NULL; - proc_t **ppp; - int error; - - /* - * Be sure there are no surprises. 
- */ - ASSERT(stk == NULL); - ASSERT(len == 0); - ASSERT(state == TS_RUN); - ASSERT(pp != NULL); - - if (pp == &p0) - ppp = &system_proc; - else - ppp = &pp; - error = kproc_kthread_add(proc, arg, ppp, &td, RFSTOPPED, - stksize / PAGE_SIZE, "zfskern", "solthread %p", proc); - if (error == 0) { - thread_lock(td); - sched_prio(td, pri); - sched_add(td, SRQ_BORING); - } - return (td); -} - -#define thread_create(stk, stksize, proc, arg, len, pp, state, pri) \ - do_thread_create(stk, stksize, proc, arg, len, pp, state, pri) -#define thread_exit() kthread_exit() - -int uread(proc_t *, void *, size_t, uintptr_t); -int uwrite(proc_t *, void *, size_t, uintptr_t); - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_PROC_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/stat.h b/sys/cddl/compat/opensolaris/sys/stat.h deleted file mode 100644 index 05b9671789dd..000000000000 --- a/sys/cddl/compat/opensolaris/sys/stat.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (C) 2007 John Birrell - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -#ifndef _COMPAT_OPENSOLARIS_SYS_STAT_H_ -#define _COMPAT_OPENSOLARIS_SYS_STAT_H_ - -#include_next - -/* - * When bootstrapping on Linux a stat64/fstat64 functions exists in both - * glibc and musl libc. To avoid compilation errors, use those functions instead - * of redefining them to stat/fstat. - * Similarly, macOS provides (deprecated) stat64 functions that we can use - * for now. - */ -#if !defined(__linux__) && !defined(__APPLE__) -#define stat64 stat - -#define MAXOFFSET_T OFF_MAX - -#if !defined(_KERNEL) -#include - -static __inline int -fstat64(int fd, struct stat *sb) -{ - int ret; - - ret = fstat(fd, sb); - if (ret == 0) { - if (S_ISCHR(sb->st_mode)) - (void)ioctl(fd, DIOCGMEDIASIZE, &sb->st_size); - } - return (ret); -} -#endif /* !defined(_KERNEL) */ -#endif /* !defined(__linux__) && !defined(__APPLE__) */ - -#endif /* !_COMPAT_OPENSOLARIS_SYS_STAT_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/systm.h b/sys/cddl/compat/opensolaris/sys/systm.h deleted file mode 100644 index fe0e1998c2c4..000000000000 --- a/sys/cddl/compat/opensolaris/sys/systm.h +++ /dev/null @@ -1,47 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_SYSTM_H_ -#define _OPENSOLARIS_SYS_SYSTM_H_ - -#ifdef _KERNEL - -#include -#include_next - -#include - -#define PAGESIZE PAGE_SIZE -#define PAGEOFFSET (PAGESIZE - 1) -#define PAGEMASK (~PAGEOFFSET) - -#define delay(x) pause("soldelay", (x)) - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/time.h b/sys/cddl/compat/opensolaris/sys/time.h deleted file mode 100644 index 5f51d08550f4..000000000000 --- a/sys/cddl/compat/opensolaris/sys/time.h +++ /dev/null @@ -1,95 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_TIME_H_ -#define _OPENSOLARIS_SYS_TIME_H_ - -#include -#include_next - -#define SEC 1 -#define MILLISEC 1000 -#define MICROSEC 1000000 -#define NANOSEC 1000000000 -#define TIME_MAX LLONG_MAX - -#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) -#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) - -#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) -#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) - -#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) -#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) - -typedef longlong_t hrtime_t; - -#if defined(__i386__) || defined(__powerpc__) -#define TIMESPEC_OVERFLOW(ts) \ - ((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX) -#else -#define TIMESPEC_OVERFLOW(ts) \ - ((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX) -#endif - -#define SEC_TO_TICK(sec) ((sec) * hz) -#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) - -#ifdef _KERNEL -static __inline hrtime_t -gethrtime(void) { - - struct timespec ts; - hrtime_t nsec; - - getnanouptime(&ts); - nsec = (hrtime_t)ts.tv_sec * NANOSEC + ts.tv_nsec; - return (nsec); -} - -#define gethrestime_sec() (time_second) -#define gethrestime(ts) getnanotime(ts) -#define gethrtime_waitfree() gethrtime() - -extern int nsec_per_tick; /* nanoseconds per clock tick */ - -#define ddi_get_lbolt64() \ - (int64_t)(((getsbinuptime() >> 16) * hz) >> 16) -#define ddi_get_lbolt() (clock_t)ddi_get_lbolt64() - -#else - -static __inline hrtime_t gethrtime(void) { - struct timespec ts; - clock_gettime(CLOCK_UPTIME,&ts); - return (((u_int64_t) ts.tv_sec) * NANOSEC + ts.tv_nsec); -} - -#endif /* _KERNEL */ - -#endif /* !_OPENSOLARIS_SYS_TIME_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/types.h b/sys/cddl/compat/opensolaris/sys/types.h deleted file mode 100644 index ee065f023af0..000000000000 --- a/sys/cddl/compat/opensolaris/sys/types.h +++ /dev/null @@ -1,101 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights 
reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_TYPES_H_ -#define _OPENSOLARIS_SYS_TYPES_H_ - -/* - * This is a bag of dirty hacks to keep things compiling. 
- */ - -#include - -#ifdef _KERNEL -typedef int64_t clock_t; -#define _CLOCK_T_DECLARED -#endif - -#include_next - -#define MAXNAMELEN 256 - -typedef struct timespec timestruc_t; -typedef struct timespec timespec_t; -typedef u_int uint_t; -typedef u_char uchar_t; -typedef u_short ushort_t; -typedef u_long ulong_t; -typedef long long longlong_t; -typedef unsigned long long u_longlong_t; -#ifndef _OFF64_T_DECLARED -#define _OFF64_T_DECLARED -typedef off_t off64_t; -#endif -typedef id_t taskid_t; -typedef id_t projid_t; -typedef id_t poolid_t; -typedef id_t zoneid_t; -typedef id_t ctid_t; -typedef mode_t o_mode_t; -typedef uint64_t pgcnt_t; -typedef u_int minor_t; - -#ifdef _KERNEL - -#define B_FALSE 0 -#define B_TRUE 1 - -typedef short index_t; -typedef off_t offset_t; -#ifndef _PTRDIFF_T_DECLARED -typedef __ptrdiff_t ptrdiff_t; /* pointer difference */ -#define _PTRDIFF_T_DECLARED -#endif -typedef int64_t rlim64_t; -typedef int major_t; - -#else -#ifdef NEED_SOLARIS_BOOLEAN -#if defined(__XOPEN_OR_POSIX) -typedef enum { _B_FALSE, _B_TRUE } boolean_t; -#else -typedef enum { B_FALSE, B_TRUE } boolean_t; -#endif /* defined(__XOPEN_OR_POSIX) */ -#endif - -typedef longlong_t offset_t; -typedef u_longlong_t u_offset_t; -typedef uint64_t upad64_t; -typedef short pri_t; -typedef int32_t daddr32_t; -typedef int32_t time32_t; -typedef u_longlong_t diskaddr_t; - -#endif /* !_KERNEL */ - -#endif /* !_OPENSOLARIS_SYS_TYPES_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/uio.h b/sys/cddl/compat/opensolaris/sys/uio.h deleted file mode 100644 index f0edfb1c0541..000000000000 --- a/sys/cddl/compat/opensolaris/sys/uio.h +++ /dev/null @@ -1,89 +0,0 @@ -/*- - * Copyright (c) 2010 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_UIO_H_ -#define _OPENSOLARIS_SYS_UIO_H_ - -#include_next -#include - -#ifndef _KERNEL -#define FOF_OFFSET 1 /* Use the offset in uio argument */ - -struct uio { - struct iovec *uio_iov; - int uio_iovcnt; - off_t uio_offset; - int uio_resid; - enum uio_seg uio_segflg; - enum uio_rw uio_rw; - void *uio_td; -}; -#endif - -#define uio_loffset uio_offset - -typedef struct uio uio_t; -typedef struct iovec iovec_t; - -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY -} xuio_type_t; - -typedef struct xuio { - uio_t xu_uio; - - /* Extended uio fields */ - enum xuio_type xu_type; /* What kind of uio structure? 
*/ - union { - struct { - int xu_zc_rw; - void *xu_zc_priv; - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - -#ifdef BUILDING_ZFS -static __inline int -zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio) -{ - - ASSERT(uio->uio_rw == dir); - return (uiomove(cp, (int)n, uio)); -} -#define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio)) - -int uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes); -void uioskip(uio_t *uiop, size_t n); -#endif /* BUILDING_ZFS */ - -#endif /* !_OPENSOLARIS_SYS_UIO_H_ */ diff --git a/sys/cddl/compat/opensolaris/sys/vnode.h b/sys/cddl/compat/opensolaris/sys/vnode.h deleted file mode 100644 index 0d1646396538..000000000000 --- a/sys/cddl/compat/opensolaris/sys/vnode.h +++ /dev/null @@ -1,287 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_VNODE_H_ -#define _OPENSOLARIS_SYS_VNODE_H_ - -#ifdef _KERNEL - -struct vnode; -struct vattr; - -typedef struct vnode vnode_t; -typedef struct vattr vattr_t; -typedef enum vtype vtype_t; - -#include -enum symfollow { NO_FOLLOW = NOFOLLOW }; - -#include -#include_next -#include -#include -#include -#include -#include -#include - -typedef struct vop_vector vnodeops_t; -#define VOP_FID VOP_VPTOFH -#define vop_fid vop_vptofh -#define vop_fid_args vop_vptofh_args -#define a_fid a_fhp - -#define IS_XATTRDIR(dvp) (0) - -#define v_count v_usecount - -#define V_APPEND VAPPEND - -#define rootvfs (rootvnode == NULL ? 
NULL : rootvnode->v_mount) - -static __inline int -vn_is_readonly(vnode_t *vp) -{ - return (vp->v_mount->mnt_flag & MNT_RDONLY); -} -#define vn_vfswlock(vp) (0) -#define vn_vfsunlock(vp) do { } while (0) -#define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) -#define vn_mountedvfs(vp) ((vp)->v_mountedhere) -#define vn_has_cached_data(vp) \ - ((vp)->v_object != NULL && \ - (vp)->v_object->resident_page_count > 0) -#define vn_exists(vp) do { } while (0) -#define vn_invalid(vp) do { } while (0) -#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) -#define vn_free(vp) do { } while (0) -#define vn_matchops(vp, vops) ((vp)->v_op == &(vops)) - -#define VN_HOLD(v) vref(v) -#define VN_RELE(v) vrele(v) -#define VN_URELE(v) vput(v) - -#define vnevent_create(vp, ct) do { } while (0) -#define vnevent_link(vp, ct) do { } while (0) -#define vnevent_remove(vp, dvp, name, ct) do { } while (0) -#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0) -#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0) -#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) -#define vnevent_rename_dest_dir(vp, ct) do { } while (0) - -#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) -#define MANDMODE(mode) (0) -#define MANDLOCK(vp, mode) (0) -#define chklock(vp, op, offset, size, mode, ct) (0) -#define cleanlocks(vp, pid, foo) do { } while (0) -#define cleanshares(vp, pid) do { } while (0) - -/* - * We will use va_spare is place of Solaris' va_mask. - * This field is initialized in zfs_setattr(). - */ -#define va_mask va_spare -/* TODO: va_fileid is shorter than va_nodeid !!! */ -#define va_nodeid va_fileid -/* TODO: This field needs conversion! 
*/ -#define va_nblocks va_bytes -#define va_blksize va_blocksize -#define va_seq va_gen - -#define MAXOFFSET_T OFF_MAX -#define EXCL 0 - -#define ACCESSED (AT_ATIME) -#define STATE_CHANGED (AT_CTIME) -#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) - -static __inline void -vattr_init_mask(vattr_t *vap) -{ - - vap->va_mask = 0; - - if (vap->va_type != VNON) - vap->va_mask |= AT_TYPE; - if (vap->va_uid != (uid_t)VNOVAL) - vap->va_mask |= AT_UID; - if (vap->va_gid != (gid_t)VNOVAL) - vap->va_mask |= AT_GID; - if (vap->va_size != (u_quad_t)VNOVAL) - vap->va_mask |= AT_SIZE; - if (vap->va_atime.tv_sec != VNOVAL) - vap->va_mask |= AT_ATIME; - if (vap->va_mtime.tv_sec != VNOVAL) - vap->va_mask |= AT_MTIME; - if (vap->va_mode != (u_short)VNOVAL) - vap->va_mask |= AT_MODE; - if (vap->va_flags != VNOVAL) - vap->va_mask |= AT_XVATTR; -} - -#define FCREAT O_CREAT -#define FTRUNC O_TRUNC -#define FEXCL O_EXCL -#define FDSYNC FFSYNC -#define FRSYNC FFSYNC -#define FSYNC FFSYNC -#define FOFFMAX 0x00 -#define FIGNORECASE 0x00 - -static __inline int -vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode, - vnode_t **vpp, enum create crwhy, mode_t umask, struct vnode *startvp, - int fd) -{ - struct thread *td = curthread; - struct nameidata nd; - int error, operation; - - ASSERT(seg == UIO_SYSSPACE); - if ((filemode & FCREAT) != 0) { - ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); - ASSERT(crwhy == CRCREAT); - operation = CREATE; - } else { - ASSERT(filemode == (FREAD | FOFFMAX) || - filemode == (FREAD | FWRITE | FOFFMAX)); - ASSERT(crwhy == 0); - operation = LOOKUP; - } - ASSERT(umask == 0); - - pwd_ensure_dirs(); - - if (startvp != NULL) - vref(startvp); - NDINIT_ATVP(&nd, operation, 0, UIO_SYSSPACE, pnamep, startvp, td); - filemode |= O_NOFOLLOW; - error = vn_open_cred(&nd, &filemode, createmode, 0, td->td_ucred, NULL); - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error == 0) { - /* We just unlock so we hold a reference. 
*/ - VOP_UNLOCK(nd.ni_vp); - *vpp = nd.ni_vp; - } - return (error); -} - -static __inline int -zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, - vnode_t **vpp, enum create crwhy, mode_t umask) -{ - - return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, - umask, NULL, -1)); -} -#define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \ - zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask)) - -#define RLIM64_INFINITY 0 -static __inline int -zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len, - offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr, - ssize_t *residp) -{ - struct thread *td = curthread; - int error; - ssize_t resid; - - ASSERT(ioflag == 0); - ASSERT(ulimit == RLIM64_INFINITY); - - if (rw == UIO_WRITE) { - ioflag = IO_SYNC; - } else { - ioflag = IO_DIRECT; - } - error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED, - &resid, td); - if (residp != NULL) - *residp = (ssize_t)resid; - return (error); -} -#define vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \ - zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp)) - -static __inline int -zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr) -{ - struct mount *mp; - int error; - - ASSERT(flag == FSYNC); - - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto drop; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_FSYNC(vp, MNT_WAIT, curthread); - VOP_UNLOCK(vp); - vn_finished_write(mp); -drop: - return (error); -} -#define VOP_FSYNC(vp, flag, cr, ct) zfs_vop_fsync((vp), (flag), (cr)) - -static __inline int -zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) -{ - int error; - - ASSERT(count == 1); - ASSERT(offset == 0); - - error = vn_close(vp, flag, cr, curthread); - return (error); -} -#define VOP_CLOSE(vp, oflags, count, offset, cr, ct) \ - zfs_vop_close((vp), (oflags), (count), 
(offset), (cr)) - -static __inline int -vn_rename(char *from, char *to, enum uio_seg seg) -{ - - ASSERT(seg == UIO_SYSSPACE); - - return (kern_renameat(curthread, AT_FDCWD, from, AT_FDCWD, to, seg)); -} - -static __inline int -vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) -{ - - ASSERT(seg == UIO_SYSSPACE); - ASSERT(dirflag == RMFILE); - - return (kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, - 0)); -} - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_VNODE_H_ */ diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c b/sys/cddl/contrib/opensolaris/common/acl/acl_common.c deleted file mode 100644 index a681905579c6..000000000000 --- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.c +++ /dev/null @@ -1,1765 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
- */ - -#include -#include -#include -#include -#if defined(_KERNEL) -#include -#include -#include -#include -#include -#else -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define ASSERT assert -#endif - -#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \ - ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \ - ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL) - - -#define ACL_SYNCHRONIZE_SET_DENY 0x0000001 -#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002 -#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004 -#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008 - -#define ACL_WRITE_OWNER_SET_DENY 0x0000010 -#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020 -#define ACL_WRITE_OWNER_ERR_DENY 0x0000040 -#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080 - -#define ACL_DELETE_SET_DENY 0x0000100 -#define ACL_DELETE_SET_ALLOW 0x0000200 -#define ACL_DELETE_ERR_DENY 0x0000400 -#define ACL_DELETE_ERR_ALLOW 0x0000800 - -#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000 -#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000 -#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000 -#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000 - -#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000 -#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000 -#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000 -#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000 - -#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000 -#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000 -#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000 -#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000 - -#define ACL_READ_NAMED_READER_SET_DENY 0x1000000 -#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000 -#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000 -#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000 - - -#define ACE_VALID_MASK_BITS (\ - ACE_READ_DATA | \ - ACE_LIST_DIRECTORY | \ - ACE_WRITE_DATA | \ - ACE_ADD_FILE | \ - ACE_APPEND_DATA | \ - ACE_ADD_SUBDIRECTORY | \ - ACE_READ_NAMED_ATTRS | \ - ACE_WRITE_NAMED_ATTRS | \ - ACE_EXECUTE | \ - 
ACE_DELETE_CHILD | \ - ACE_READ_ATTRIBUTES | \ - ACE_WRITE_ATTRIBUTES | \ - ACE_DELETE | \ - ACE_READ_ACL | \ - ACE_WRITE_ACL | \ - ACE_WRITE_OWNER | \ - ACE_SYNCHRONIZE) - -#define ACE_MASK_UNDEFINED 0x80000000 - -#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \ - ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \ - ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \ - ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE) - -/* - * ACL conversion helpers - */ - -typedef enum { - ace_unused, - ace_user_obj, - ace_user, - ace_group, /* includes GROUP and GROUP_OBJ */ - ace_other_obj -} ace_to_aent_state_t; - -typedef struct acevals { - uid_t key; - avl_node_t avl; - uint32_t mask; - uint32_t allowed; - uint32_t denied; - int aent_type; -} acevals_t; - -typedef struct ace_list { - acevals_t user_obj; - avl_tree_t user; - int numusers; - acevals_t group_obj; - avl_tree_t group; - int numgroups; - acevals_t other_obj; - uint32_t acl_mask; - int hasmask; - int dfacl_flag; - ace_to_aent_state_t state; - int seen; /* bitmask of all aclent_t a_type values seen */ -} ace_list_t; - -/* - * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. 
- * v = Ptr to array/vector of objs - * n = # objs in the array - * s = size of each obj (must be multiples of a word size) - * f = ptr to function to compare two objs - * returns (-1 = less than, 0 = equal, 1 = greater than - */ -void -ksort(caddr_t v, int n, int s, int (*f)()) -{ - int g, i, j, ii; - unsigned int *p1, *p2; - unsigned int tmp; - - /* No work to do */ - if (v == NULL || n <= 1) - return; - - /* Sanity check on arguments */ - ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0); - ASSERT(s > 0); - for (g = n / 2; g > 0; g /= 2) { - for (i = g; i < n; i++) { - for (j = i - g; j >= 0 && - (*f)(v + j * s, v + (j + g) * s) == 1; - j -= g) { - p1 = (void *)(v + j * s); - p2 = (void *)(v + (j + g) * s); - for (ii = 0; ii < s / 4; ii++) { - tmp = *p1; - *p1++ = *p2; - *p2++ = tmp; - } - } - } - } -} - -/* - * Compare two acls, all fields. Returns: - * -1 (less than) - * 0 (equal) - * +1 (greater than) - */ -int -cmp2acls(void *a, void *b) -{ - aclent_t *x = (aclent_t *)a; - aclent_t *y = (aclent_t *)b; - - /* Compare types */ - if (x->a_type < y->a_type) - return (-1); - if (x->a_type > y->a_type) - return (1); - /* Equal types; compare id's */ - if (x->a_id < y->a_id) - return (-1); - if (x->a_id > y->a_id) - return (1); - /* Equal ids; compare perms */ - if (x->a_perm < y->a_perm) - return (-1); - if (x->a_perm > y->a_perm) - return (1); - /* Totally equal */ - return (0); -} - -/*ARGSUSED*/ -static void * -cacl_realloc(void *ptr, size_t size, size_t new_size) -{ -#if defined(_KERNEL) - void *tmp; - - tmp = kmem_alloc(new_size, KM_SLEEP); - (void) memcpy(tmp, ptr, (size < new_size) ? 
size : new_size); - kmem_free(ptr, size); - return (tmp); -#else - return (realloc(ptr, new_size)); -#endif -} - -static int -cacl_malloc(void **ptr, size_t size) -{ -#if defined(_KERNEL) - *ptr = kmem_zalloc(size, KM_SLEEP); - return (0); -#else - *ptr = calloc(1, size); - if (*ptr == NULL) - return (errno); - - return (0); -#endif -} - -/*ARGSUSED*/ -static void -cacl_free(void *ptr, size_t size) -{ -#if defined(_KERNEL) - kmem_free(ptr, size); -#else - free(ptr); -#endif -} - -#if !defined(_KERNEL) -acl_t * -acl_alloc(enum acl_type type) -{ - acl_t *aclp; - - if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0) - return (NULL); - - aclp->acl_aclp = NULL; - aclp->acl_cnt = 0; - - switch (type) { - case ACE_T: - aclp->acl_type = ACE_T; - aclp->acl_entry_size = sizeof (ace_t); - break; - case ACLENT_T: - aclp->acl_type = ACLENT_T; - aclp->acl_entry_size = sizeof (aclent_t); - break; - default: - acl_free(aclp); - aclp = NULL; - } - return (aclp); -} - -/* - * Free acl_t structure - */ -void -acl_free(acl_t *aclp) -{ - int acl_size; - - if (aclp == NULL) - return; - - if (aclp->acl_aclp) { - acl_size = aclp->acl_cnt * aclp->acl_entry_size; - cacl_free(aclp->acl_aclp, acl_size); - } - - cacl_free(aclp, sizeof (acl_t)); -} - -static uint32_t -access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) -{ - uint32_t access_mask = 0; - int acl_produce; - int synchronize_set = 0, write_owner_set = 0; - int delete_set = 0, write_attrs_set = 0; - int read_named_set = 0, write_named_set = 0; - - acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW | - ACL_WRITE_ATTRS_OWNER_SET_ALLOW | - ACL_WRITE_ATTRS_WRITER_SET_DENY); - - if (isallow) { - synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW; - write_owner_set = ACL_WRITE_OWNER_SET_ALLOW; - delete_set = ACL_DELETE_SET_ALLOW; - if (hasreadperm) - read_named_set = ACL_READ_NAMED_READER_SET_ALLOW; - if (haswriteperm) - write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW; - if (isowner) - write_attrs_set = 
ACL_WRITE_ATTRS_OWNER_SET_ALLOW; - else if (haswriteperm) - write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; - } else { - - synchronize_set = ACL_SYNCHRONIZE_SET_DENY; - write_owner_set = ACL_WRITE_OWNER_SET_DENY; - delete_set = ACL_DELETE_SET_DENY; - if (hasreadperm) - read_named_set = ACL_READ_NAMED_READER_SET_DENY; - if (haswriteperm) - write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY; - if (isowner) - write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY; - else if (haswriteperm) - write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY; - else - /* - * If the entity is not the owner and does not - * have write permissions ACE_WRITE_ATTRIBUTES will - * always go in the DENY ACE. - */ - access_mask |= ACE_WRITE_ATTRIBUTES; - } - - if (acl_produce & synchronize_set) - access_mask |= ACE_SYNCHRONIZE; - if (acl_produce & write_owner_set) - access_mask |= ACE_WRITE_OWNER; - if (acl_produce & delete_set) - access_mask |= ACE_DELETE; - if (acl_produce & write_attrs_set) - access_mask |= ACE_WRITE_ATTRIBUTES; - if (acl_produce & read_named_set) - access_mask |= ACE_READ_NAMED_ATTRS; - if (acl_produce & write_named_set) - access_mask |= ACE_WRITE_NAMED_ATTRS; - - return (access_mask); -} - -/* - * Given an mode_t, convert it into an access_mask as used - * by nfsace, assuming aclent_t -> nfsace semantics. 
- */ -static uint32_t -mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) -{ - uint32_t access = 0; - int haswriteperm = 0; - int hasreadperm = 0; - - if (isallow) { - haswriteperm = (mode & S_IWOTH); - hasreadperm = (mode & S_IROTH); - } else { - haswriteperm = !(mode & S_IWOTH); - hasreadperm = !(mode & S_IROTH); - } - - /* - * The following call takes care of correctly setting the following - * mask bits in the access_mask: - * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE, - * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS - */ - access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow); - - if (isallow) { - access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES; - if (isowner) - access |= ACE_WRITE_ACL; - } else { - if (! isowner) - access |= ACE_WRITE_ACL; - } - - /* read */ - if (mode & S_IROTH) { - access |= ACE_READ_DATA; - } - /* write */ - if (mode & S_IWOTH) { - access |= ACE_WRITE_DATA | - ACE_APPEND_DATA; - if (isdir) - access |= ACE_DELETE_CHILD; - } - /* exec */ - if (mode & S_IXOTH) { - access |= ACE_EXECUTE; - } - - return (access); -} - -/* - * Given an nfsace (presumably an ALLOW entry), make a - * corresponding DENY entry at the address given. - */ -static void -ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner) -{ - (void) memcpy(deny, allow, sizeof (ace_t)); - - deny->a_who = allow->a_who; - - deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE; - deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS; - if (isdir) - deny->a_access_mask ^= ACE_DELETE_CHILD; - - deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER | - ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS | - ACE_WRITE_NAMED_ATTRS); - deny->a_access_mask |= access_mask_set((allow->a_access_mask & - ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner, - B_FALSE); -} -/* - * Make an initial pass over an array of aclent_t's. 
Gather - * information such as an ACL_MASK (if any), number of users, - * number of groups, and whether the array needs to be sorted. - */ -static int -ln_aent_preprocess(aclent_t *aclent, int n, - int *hasmask, mode_t *mask, - int *numuser, int *numgroup, int *needsort) -{ - int error = 0; - int i; - int curtype = 0; - - *hasmask = 0; - *mask = 07; - *needsort = 0; - *numuser = 0; - *numgroup = 0; - - for (i = 0; i < n; i++) { - if (aclent[i].a_type < curtype) - *needsort = 1; - else if (aclent[i].a_type > curtype) - curtype = aclent[i].a_type; - if (aclent[i].a_type & USER) - (*numuser)++; - if (aclent[i].a_type & (GROUP | GROUP_OBJ)) - (*numgroup)++; - if (aclent[i].a_type & CLASS_OBJ) { - if (*hasmask) { - error = EINVAL; - goto out; - } else { - *hasmask = 1; - *mask = aclent[i].a_perm; - } - } - } - - if ((! *hasmask) && (*numuser + *numgroup > 1)) { - error = EINVAL; - goto out; - } - -out: - return (error); -} - -/* - * Convert an array of aclent_t into an array of nfsace entries, - * following POSIX draft -> nfsv4 conversion semantics as outlined in - * the IETF draft. - */ -static int -ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir) -{ - int error = 0; - mode_t mask; - int numuser, numgroup, needsort; - int resultsize = 0; - int i, groupi = 0, skip; - ace_t *acep, *result = NULL; - int hasmask; - - error = ln_aent_preprocess(aclent, n, &hasmask, &mask, - &numuser, &numgroup, &needsort); - if (error != 0) - goto out; - - /* allow + deny for each aclent */ - resultsize = n * 2; - if (hasmask) { - /* - * stick extra deny on the group_obj and on each - * user|group for the mask (the group_obj was added - * into the count for numgroup) - */ - resultsize += numuser + numgroup; - /* ... 
and don't count the mask itself */ - resultsize -= 2; - } - - /* sort the source if necessary */ - if (needsort) - ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls); - - if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) - goto out; - - acep = result; - - for (i = 0; i < n; i++) { - /* - * don't process CLASS_OBJ (mask); mask was grabbed in - * ln_aent_preprocess() - */ - if (aclent[i].a_type & CLASS_OBJ) - continue; - - /* If we need an ACL_MASK emulator, prepend it now */ - if ((hasmask) && - (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) { - acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE; - acep->a_flags = 0; - if (aclent[i].a_type & GROUP_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= - (ACE_IDENTIFIER_GROUP|ACE_GROUP); - } else if (aclent[i].a_type & USER) { - acep->a_who = aclent[i].a_id; - } else { - acep->a_who = aclent[i].a_id; - acep->a_flags |= ACE_IDENTIFIER_GROUP; - } - if (aclent[i].a_type & ACL_DEFAULT) { - acep->a_flags |= ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE; - } - /* - * Set the access mask for the prepended deny - * ace. To do this, we invert the mask (found - * in ln_aent_preprocess()) then convert it to an - * DENY ace access_mask. - */ - acep->a_access_mask = mode_to_ace_access((mask ^ 07), - isdir, 0, 0); - acep += 1; - } - - /* handle a_perm -> access_mask */ - acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm, - isdir, aclent[i].a_type & USER_OBJ, 1); - - /* emulate a default aclent */ - if (aclent[i].a_type & ACL_DEFAULT) { - acep->a_flags |= ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE; - } - - /* - * handle a_perm and a_id - * - * this must be done last, since it involves the - * corresponding deny aces, which are handled - * differently for each different a_type. 
- */ - if (aclent[i].a_type & USER_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= ACE_OWNER; - ace_make_deny(acep, acep + 1, isdir, B_TRUE); - acep += 2; - } else if (aclent[i].a_type & USER) { - acep->a_who = aclent[i].a_id; - ace_make_deny(acep, acep + 1, isdir, B_FALSE); - acep += 2; - } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) { - if (aclent[i].a_type & GROUP_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= ACE_GROUP; - } else { - acep->a_who = aclent[i].a_id; - } - acep->a_flags |= ACE_IDENTIFIER_GROUP; - /* - * Set the corresponding deny for the group ace. - * - * The deny aces go after all of the groups, unlike - * everything else, where they immediately follow - * the allow ace. - * - * We calculate "skip", the number of slots to - * skip ahead for the deny ace, here. - * - * The pattern is: - * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3 - * thus, skip is - * (2 * numgroup) - 1 - groupi - * (2 * numgroup) to account for MD + A - * - 1 to account for the fact that we're on the - * access (A), not the mask (MD) - * - groupi to account for the fact that we have - * passed up groupi number of MD's. - */ - skip = (2 * numgroup) - 1 - groupi; - ace_make_deny(acep, acep + skip, isdir, B_FALSE); - /* - * If we just did the last group, skip acep past - * all of the denies; else, just move ahead one. 
- */ - if (++groupi >= numgroup) - acep += numgroup + 1; - else - acep += 1; - } else if (aclent[i].a_type & OTHER_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= ACE_EVERYONE; - ace_make_deny(acep, acep + 1, isdir, B_FALSE); - acep += 2; - } else { - error = EINVAL; - goto out; - } - } - - *acepp = result; - *rescount = resultsize; - -out: - if (error != 0) { - if ((result != NULL) && (resultsize > 0)) { - cacl_free(result, resultsize * sizeof (ace_t)); - } - } - - return (error); -} - -static int -convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, - ace_t **retacep, int *retacecnt) -{ - ace_t *acep; - ace_t *dfacep; - int acecnt = 0; - int dfacecnt = 0; - int dfaclstart = 0; - int dfaclcnt = 0; - aclent_t *aclp; - int i; - int error; - int acesz, dfacesz; - - ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls); - - for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) { - if (aclp->a_type & ACL_DEFAULT) - break; - } - - if (i < aclcnt) { - dfaclstart = i; - dfaclcnt = aclcnt - i; - } - - if (dfaclcnt && !isdir) { - return (EINVAL); - } - - error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir); - if (error) - return (error); - - if (dfaclcnt) { - error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt, - &dfacep, &dfacecnt, isdir); - if (error) { - if (acep) { - cacl_free(acep, acecnt * sizeof (ace_t)); - } - return (error); - } - } - - if (dfacecnt != 0) { - acesz = sizeof (ace_t) * acecnt; - dfacesz = sizeof (ace_t) * dfacecnt; - acep = cacl_realloc(acep, acesz, acesz + dfacesz); - if (acep == NULL) - return (ENOMEM); - if (dfaclcnt) { - (void) memcpy(acep + acecnt, dfacep, dfacesz); - } - } - if (dfaclcnt) - cacl_free(dfacep, dfacecnt * sizeof (ace_t)); - - *retacecnt = acecnt + dfacecnt; - *retacep = acep; - return (0); -} - -static int -ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) -{ - int error = 0; - o_mode_t mode = 0; - uint32_t bits, wantbits; - - /* read */ - if (mask & ACE_READ_DATA) - mode |= 
S_IROTH; - - /* write */ - wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA); - if (isdir) - wantbits |= ACE_DELETE_CHILD; - bits = mask & wantbits; - if (bits != 0) { - if (bits != wantbits) { - error = ENOTSUP; - goto out; - } - mode |= S_IWOTH; - } - - /* exec */ - if (mask & ACE_EXECUTE) { - mode |= S_IXOTH; - } - - *modep = mode; - -out: - return (error); -} - -static void -acevals_init(acevals_t *vals, uid_t key) -{ - bzero(vals, sizeof (*vals)); - vals->allowed = ACE_MASK_UNDEFINED; - vals->denied = ACE_MASK_UNDEFINED; - vals->mask = ACE_MASK_UNDEFINED; - vals->key = key; -} - -static void -ace_list_init(ace_list_t *al, int dfacl_flag) -{ - acevals_init(&al->user_obj, 0); - acevals_init(&al->group_obj, 0); - acevals_init(&al->other_obj, 0); - al->numusers = 0; - al->numgroups = 0; - al->acl_mask = 0; - al->hasmask = 0; - al->state = ace_unused; - al->seen = 0; - al->dfacl_flag = dfacl_flag; -} - -/* - * Find or create an acevals holder for a given id and avl tree. - * - * Note that only one thread will ever touch these avl trees, so - * there is no need for locking. - */ -static acevals_t * -acevals_find(ace_t *ace, avl_tree_t *avl, int *num) -{ - acevals_t key, *rc; - avl_index_t where; - - key.key = ace->a_who; - rc = avl_find(avl, &key, &where); - if (rc != NULL) - return (rc); - - /* this memory is freed by ln_ace_to_aent()->ace_list_free() */ - if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0) - return (NULL); - - acevals_init(rc, ace->a_who); - avl_insert(avl, rc, where); - (*num)++; - - return (rc); -} - -static int -access_mask_check(ace_t *acep, int mask_bit, int isowner) -{ - int set_deny, err_deny; - int set_allow, err_allow; - int acl_consume; - int haswriteperm, hasreadperm; - - if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { - haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1; - hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1; - } else { - haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 
1 : 0; - hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0; - } - - acl_consume = (ACL_SYNCHRONIZE_ERR_DENY | - ACL_DELETE_ERR_DENY | - ACL_WRITE_OWNER_ERR_DENY | - ACL_WRITE_OWNER_ERR_ALLOW | - ACL_WRITE_ATTRS_OWNER_SET_ALLOW | - ACL_WRITE_ATTRS_OWNER_ERR_DENY | - ACL_WRITE_ATTRS_WRITER_SET_DENY | - ACL_WRITE_ATTRS_WRITER_ERR_ALLOW | - ACL_WRITE_NAMED_WRITER_ERR_DENY | - ACL_READ_NAMED_READER_ERR_DENY); - - if (mask_bit == ACE_SYNCHRONIZE) { - set_deny = ACL_SYNCHRONIZE_SET_DENY; - err_deny = ACL_SYNCHRONIZE_ERR_DENY; - set_allow = ACL_SYNCHRONIZE_SET_ALLOW; - err_allow = ACL_SYNCHRONIZE_ERR_ALLOW; - } else if (mask_bit == ACE_WRITE_OWNER) { - set_deny = ACL_WRITE_OWNER_SET_DENY; - err_deny = ACL_WRITE_OWNER_ERR_DENY; - set_allow = ACL_WRITE_OWNER_SET_ALLOW; - err_allow = ACL_WRITE_OWNER_ERR_ALLOW; - } else if (mask_bit == ACE_DELETE) { - set_deny = ACL_DELETE_SET_DENY; - err_deny = ACL_DELETE_ERR_DENY; - set_allow = ACL_DELETE_SET_ALLOW; - err_allow = ACL_DELETE_ERR_ALLOW; - } else if (mask_bit == ACE_WRITE_ATTRIBUTES) { - if (isowner) { - set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY; - err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY; - set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; - err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW; - } else if (haswriteperm) { - set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY; - err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY; - set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; - err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW; - } else { - if ((acep->a_access_mask & mask_bit) && - (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) { - return (ENOTSUP); - } - return (0); - } - } else if (mask_bit == ACE_READ_NAMED_ATTRS) { - if (!hasreadperm) - return (0); - - set_deny = ACL_READ_NAMED_READER_SET_DENY; - err_deny = ACL_READ_NAMED_READER_ERR_DENY; - set_allow = ACL_READ_NAMED_READER_SET_ALLOW; - err_allow = ACL_READ_NAMED_READER_ERR_ALLOW; - } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) { - if (!haswriteperm) - return (0); - - set_deny = 
ACL_WRITE_NAMED_WRITER_SET_DENY; - err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY; - set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW; - err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW; - } else { - return (EINVAL); - } - - if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { - if (acl_consume & set_deny) { - if (!(acep->a_access_mask & mask_bit)) { - return (ENOTSUP); - } - } else if (acl_consume & err_deny) { - if (acep->a_access_mask & mask_bit) { - return (ENOTSUP); - } - } - } else { - /* ACE_ACCESS_ALLOWED_ACE_TYPE */ - if (acl_consume & set_allow) { - if (!(acep->a_access_mask & mask_bit)) { - return (ENOTSUP); - } - } else if (acl_consume & err_allow) { - if (acep->a_access_mask & mask_bit) { - return (ENOTSUP); - } - } - } - return (0); -} - -static int -ace_to_aent_legal(ace_t *acep) -{ - int error = 0; - int isowner; - - /* only ALLOW or DENY */ - if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) && - (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) { - error = ENOTSUP; - goto out; - } - - /* check for invalid flags */ - if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) { - error = EINVAL; - goto out; - } - - /* some flags are illegal */ - if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG | - ACE_FAILED_ACCESS_ACE_FLAG | - ACE_NO_PROPAGATE_INHERIT_ACE)) { - error = ENOTSUP; - goto out; - } - - /* check for invalid masks */ - if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) { - error = EINVAL; - goto out; - } - - if ((acep->a_flags & ACE_OWNER)) { - isowner = 1; - } else { - isowner = 0; - } - - error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_WRITE_OWNER, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_DELETE, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner); - if (error) - goto out; - - error = access_mask_check(acep, 
ACE_WRITE_NAMED_ATTRS, isowner); - if (error) - goto out; - - /* more detailed checking of masks */ - if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { - if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) { - error = ENOTSUP; - goto out; - } - if ((acep->a_access_mask & ACE_WRITE_DATA) && - (! (acep->a_access_mask & ACE_APPEND_DATA))) { - error = ENOTSUP; - goto out; - } - if ((! (acep->a_access_mask & ACE_WRITE_DATA)) && - (acep->a_access_mask & ACE_APPEND_DATA)) { - error = ENOTSUP; - goto out; - } - } - - /* ACL enforcement */ - if ((acep->a_access_mask & ACE_READ_ACL) && - (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) { - error = ENOTSUP; - goto out; - } - if (acep->a_access_mask & ACE_WRITE_ACL) { - if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) && - (isowner)) { - error = ENOTSUP; - goto out; - } - if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) && - (! isowner)) { - error = ENOTSUP; - goto out; - } - } - -out: - return (error); -} - -static int -ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) -{ - /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ - if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != - (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) { - return (ENOTSUP); - } - - return (ace_mask_to_mode(mask, modep, isdir)); -} - -static int -acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, - uid_t owner, gid_t group, boolean_t isdir) -{ - int error; - uint32_t flips = ACE_POSIX_SUPPORTED_BITS; - - if (isdir) - flips |= ACE_DELETE_CHILD; - if (vals->allowed != (vals->denied ^ flips)) { - error = ENOTSUP; - goto out; - } - if ((list->hasmask) && (list->acl_mask != vals->mask) && - (vals->aent_type & (USER | GROUP | GROUP_OBJ))) { - error = ENOTSUP; - goto out; - } - error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir); - if (error != 0) - goto out; - dest->a_type = vals->aent_type; - if (dest->a_type & (USER | GROUP)) { - dest->a_id = vals->key; - } else if (dest->a_type & USER_OBJ) { - dest->a_id = owner; 
- } else if (dest->a_type & GROUP_OBJ) { - dest->a_id = group; - } else if (dest->a_type & OTHER_OBJ) { - dest->a_id = 0; - } else { - error = EINVAL; - goto out; - } - -out: - return (error); -} - - -static int -ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, - uid_t owner, gid_t group, boolean_t isdir) -{ - int error = 0; - aclent_t *aent, *result = NULL; - acevals_t *vals; - int resultcount; - - if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) != - (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) { - error = ENOTSUP; - goto out; - } - if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) { - error = ENOTSUP; - goto out; - } - - resultcount = 3 + list->numusers + list->numgroups; - /* - * This must be the same condition as below, when we add the CLASS_OBJ - * (aka ACL mask) - */ - if ((list->hasmask) || (! list->dfacl_flag)) - resultcount += 1; - - if (cacl_malloc((void **)&result, - resultcount * sizeof (aclent_t)) != 0) { - error = ENOMEM; - goto out; - } - aent = result; - - /* USER_OBJ */ - if (!(list->user_obj.aent_type & USER_OBJ)) { - error = EINVAL; - goto out; - } - - error = acevals_to_aent(&list->user_obj, aent, list, owner, group, - isdir); - - if (error != 0) - goto out; - ++aent; - /* USER */ - vals = NULL; - for (vals = avl_first(&list->user); vals != NULL; - vals = AVL_NEXT(&list->user, vals)) { - if (!(vals->aent_type & USER)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(vals, aent, list, owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - } - /* GROUP_OBJ */ - if (!(list->group_obj.aent_type & GROUP_OBJ)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(&list->group_obj, aent, list, owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - /* GROUP */ - vals = NULL; - for (vals = avl_first(&list->group); vals != NULL; - vals = AVL_NEXT(&list->group, vals)) { - if (!(vals->aent_type & GROUP)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(vals, aent, list, 
owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - } - /* - * CLASS_OBJ (aka ACL_MASK) - * - * An ACL_MASK is not fabricated if the ACL is a default ACL. - * This is to follow UFS's behavior. - */ - if ((list->hasmask) || (! list->dfacl_flag)) { - if (list->hasmask) { - uint32_t flips = ACE_POSIX_SUPPORTED_BITS; - if (isdir) - flips |= ACE_DELETE_CHILD; - error = ace_mask_to_mode(list->acl_mask ^ flips, - &aent->a_perm, isdir); - if (error != 0) - goto out; - } else { - /* fabricate the ACL_MASK from the group permissions */ - error = ace_mask_to_mode(list->group_obj.allowed, - &aent->a_perm, isdir); - if (error != 0) - goto out; - } - aent->a_id = 0; - aent->a_type = CLASS_OBJ | list->dfacl_flag; - ++aent; - } - /* OTHER_OBJ */ - if (!(list->other_obj.aent_type & OTHER_OBJ)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(&list->other_obj, aent, list, owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - - *aclentp = result; - *aclcnt = resultcount; - -out: - if (error != 0) { - if (result != NULL) - cacl_free(result, resultcount * sizeof (aclent_t)); - } - - return (error); -} - - -/* - * free all data associated with an ace_list - */ -static void -ace_list_free(ace_list_t *al) -{ - acevals_t *node; - void *cookie; - - if (al == NULL) - return; - - cookie = NULL; - while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL) - cacl_free(node, sizeof (acevals_t)); - cookie = NULL; - while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL) - cacl_free(node, sizeof (acevals_t)); - - avl_destroy(&al->user); - avl_destroy(&al->group); - - /* free the container itself */ - cacl_free(al, sizeof (ace_list_t)); -} - -static int -acevals_compare(const void *va, const void *vb) -{ - const acevals_t *a = va, *b = vb; - - if (a->key == b->key) - return (0); - - if (a->key > b->key) - return (1); - - else - return (-1); -} - -/* - * Convert a list of ace_t entries to equivalent regular and default - * aclent_t lists. 
Return error (ENOTSUP) when conversion is not possible. - */ -static int -ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, - aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, - boolean_t isdir) -{ - int error = 0; - ace_t *acep; - uint32_t bits; - int i; - ace_list_t *normacl = NULL, *dfacl = NULL, *acl; - acevals_t *vals; - - *aclentp = NULL; - *aclcnt = 0; - *dfaclentp = NULL; - *dfaclcnt = 0; - - /* we need at least user_obj, group_obj, and other_obj */ - if (n < 6) { - error = ENOTSUP; - goto out; - } - if (ace == NULL) { - error = EINVAL; - goto out; - } - - error = cacl_malloc((void **)&normacl, sizeof (ace_list_t)); - if (error != 0) - goto out; - - avl_create(&normacl->user, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - avl_create(&normacl->group, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - - ace_list_init(normacl, 0); - - error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t)); - if (error != 0) - goto out; - - avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - ace_list_init(dfacl, ACL_DEFAULT); - - /* process every ace_t... */ - for (i = 0; i < n; i++) { - acep = &ace[i]; - - /* rule out certain cases quickly */ - error = ace_to_aent_legal(acep); - if (error != 0) - goto out; - - /* - * Turn off these bits in order to not have to worry about - * them when doing the checks for compliments. 
- */ - acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE | - ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES | - ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS); - - /* see if this should be a regular or default acl */ - bits = acep->a_flags & - (ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE); - if (bits != 0) { - /* all or nothing on these inherit bits */ - if (bits != (ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) { - error = ENOTSUP; - goto out; - } - acl = dfacl; - } else { - acl = normacl; - } - - if ((acep->a_flags & ACE_OWNER)) { - if (acl->state > ace_user_obj) { - error = ENOTSUP; - goto out; - } - acl->state = ace_user_obj; - acl->seen |= USER_OBJ; - vals = &acl->user_obj; - vals->aent_type = USER_OBJ | acl->dfacl_flag; - } else if ((acep->a_flags & ACE_EVERYONE)) { - acl->state = ace_other_obj; - acl->seen |= OTHER_OBJ; - vals = &acl->other_obj; - vals->aent_type = OTHER_OBJ | acl->dfacl_flag; - } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) { - if (acl->state > ace_group) { - error = ENOTSUP; - goto out; - } - if ((acep->a_flags & ACE_GROUP)) { - acl->seen |= GROUP_OBJ; - vals = &acl->group_obj; - vals->aent_type = GROUP_OBJ | acl->dfacl_flag; - } else { - acl->seen |= GROUP; - vals = acevals_find(acep, &acl->group, - &acl->numgroups); - if (vals == NULL) { - error = ENOMEM; - goto out; - } - vals->aent_type = GROUP | acl->dfacl_flag; - } - acl->state = ace_group; - } else { - if (acl->state > ace_user) { - error = ENOTSUP; - goto out; - } - acl->state = ace_user; - acl->seen |= USER; - vals = acevals_find(acep, &acl->user, - &acl->numusers); - if (vals == NULL) { - error = ENOMEM; - goto out; - } - vals->aent_type = USER | acl->dfacl_flag; - } - - if (!(acl->state > ace_unused)) { - error = EINVAL; - goto out; - } - - if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { - /* no more than one allowed per aclent_t */ - if (vals->allowed != ACE_MASK_UNDEFINED) { - error = ENOTSUP; - goto out; - } - 
vals->allowed = acep->a_access_mask; - } else { - /* - * it's a DENY; if there was a previous DENY, it - * must have been an ACL_MASK. - */ - if (vals->denied != ACE_MASK_UNDEFINED) { - /* ACL_MASK is for USER and GROUP only */ - if ((acl->state != ace_user) && - (acl->state != ace_group)) { - error = ENOTSUP; - goto out; - } - - if (! acl->hasmask) { - acl->hasmask = 1; - acl->acl_mask = vals->denied; - /* check for mismatched ACL_MASK emulations */ - } else if (acl->acl_mask != vals->denied) { - error = ENOTSUP; - goto out; - } - vals->mask = vals->denied; - } - vals->denied = acep->a_access_mask; - } - } - - /* done collating; produce the aclent_t lists */ - if (normacl->state != ace_unused) { - error = ace_list_to_aent(normacl, aclentp, aclcnt, - owner, group, isdir); - if (error != 0) { - goto out; - } - } - if (dfacl->state != ace_unused) { - error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt, - owner, group, isdir); - if (error != 0) { - goto out; - } - } - -out: - if (normacl != NULL) - ace_list_free(normacl); - if (dfacl != NULL) - ace_list_free(dfacl); - - return (error); -} - -static int -convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, - uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) -{ - int error = 0; - aclent_t *aclentp, *dfaclentp; - int aclcnt, dfaclcnt; - int aclsz, dfaclsz; - - error = ln_ace_to_aent(acebufp, acecnt, owner, group, - &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir); - - if (error) - return (error); - - - if (dfaclcnt != 0) { - /* - * Slap aclentp and dfaclentp into a single array. 
- */ - aclsz = sizeof (aclent_t) * aclcnt; - dfaclsz = sizeof (aclent_t) * dfaclcnt; - aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz); - if (aclentp != NULL) { - (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz); - } else { - error = ENOMEM; - } - } - - if (aclentp) { - *retaclentp = aclentp; - *retaclcnt = aclcnt + dfaclcnt; - } - - if (dfaclentp) - cacl_free(dfaclentp, dfaclsz); - - return (error); -} - - -int -acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, - gid_t group) -{ - int aclcnt; - void *acldata; - int error; - - /* - * See if we need to translate - */ - if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) || - (target_flavor == _ACL_ACLENT_ENABLED && - aclp->acl_type == ACLENT_T)) - return (0); - - if (target_flavor == -1) { - error = EINVAL; - goto out; - } - - if (target_flavor == _ACL_ACE_ENABLED && - aclp->acl_type == ACLENT_T) { - error = convert_aent_to_ace(aclp->acl_aclp, - aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt); - if (error) - goto out; - - } else if (target_flavor == _ACL_ACLENT_ENABLED && - aclp->acl_type == ACE_T) { - error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt, - isdir, owner, group, (aclent_t **)&acldata, &aclcnt); - if (error) - goto out; - } else { - error = ENOTSUP; - goto out; - } - - /* - * replace old acl with newly translated acl - */ - cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size); - aclp->acl_aclp = acldata; - aclp->acl_cnt = aclcnt; - if (target_flavor == _ACL_ACE_ENABLED) { - aclp->acl_type = ACE_T; - aclp->acl_entry_size = sizeof (ace_t); - } else { - aclp->acl_type = ACLENT_T; - aclp->acl_entry_size = sizeof (aclent_t); - } - return (0); - -out: - -#if !defined(_KERNEL) - errno = error; - return (-1); -#else - return (error); -#endif -} -#endif /* !_KERNEL */ - -#define SET_ACE(acl, index, who, mask, type, flags) { \ - acl[0][index].a_who = (uint32_t)who; \ - acl[0][index].a_type = type; \ - acl[0][index].a_flags = flags; \ - 
acl[0][index++].a_access_mask = mask; \ -} - -void -acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) -{ - uint32_t read_mask = ACE_READ_DATA; - uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; - uint32_t execute_mask = ACE_EXECUTE; - - (void) isdir; /* will need this later */ - - masks->deny1 = 0; - if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) - masks->deny1 |= read_mask; - if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) - masks->deny1 |= write_mask; - if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) - masks->deny1 |= execute_mask; - - masks->deny2 = 0; - if (!(mode & S_IRGRP) && (mode & S_IROTH)) - masks->deny2 |= read_mask; - if (!(mode & S_IWGRP) && (mode & S_IWOTH)) - masks->deny2 |= write_mask; - if (!(mode & S_IXGRP) && (mode & S_IXOTH)) - masks->deny2 |= execute_mask; - - masks->allow0 = 0; - if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) - masks->allow0 |= read_mask; - if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) - masks->allow0 |= write_mask; - if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) - masks->allow0 |= execute_mask; - - masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| - ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; - if (mode & S_IRUSR) - masks->owner |= read_mask; - if (mode & S_IWUSR) - masks->owner |= write_mask; - if (mode & S_IXUSR) - masks->owner |= execute_mask; - - masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| - ACE_SYNCHRONIZE; - if (mode & S_IRGRP) - masks->group |= read_mask; - if (mode & S_IWGRP) - masks->group |= write_mask; - if (mode & S_IXGRP) - masks->group |= execute_mask; - - masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| - ACE_SYNCHRONIZE; - if (mode & S_IROTH) - masks->everyone |= read_mask; - if (mode & S_IWOTH) - masks->everyone |= write_mask; - if (mode & S_IXOTH) - masks->everyone |= execute_mask; -} - 
-int -acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) -{ - int index = 0; - int error; - trivial_acl_t masks; - - *count = 3; - acl_trivial_access_masks(mode, isdir, &masks); - - if (masks.allow0) - (*count)++; - if (masks.deny1) - (*count)++; - if (masks.deny2) - (*count)++; - - if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) - return (error); - - if (masks.allow0) { - SET_ACE(acl, index, -1, masks.allow0, - ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); - } - if (masks.deny1) { - SET_ACE(acl, index, -1, masks.deny1, - ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); - } - if (masks.deny2) { - SET_ACE(acl, index, -1, masks.deny2, - ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); - } - - SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, - ACE_OWNER); - SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, - ACE_IDENTIFIER_GROUP|ACE_GROUP); - SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, - ACE_EVERYONE); - - return (0); -} - -/* - * ace_trivial: - * determine whether an ace_t acl is trivial - * - * Trivialness implies that the acl is composed of only - * owner, group, everyone entries. ACL can't - * have read_acl denied, and write_owner/write_acl/write_attributes - * can only be owner@ entry. 
- */ -int -ace_trivial_common(void *acep, int aclcnt, - uint64_t (*walk)(void *, uint64_t, int aclcnt, - uint16_t *, uint16_t *, uint32_t *)) -{ - uint16_t flags; - uint32_t mask; - uint16_t type; - uint64_t cookie = 0; - - while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) { - switch (flags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - case ACE_GROUP|ACE_IDENTIFIER_GROUP: - case ACE_EVERYONE: - break; - default: - return (1); - - } - - if (flags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE)) - return (1); - - /* - * Special check for some special bits - * - * Don't allow anybody to deny reading basic - * attributes or a files ACL. - */ - if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - (type == ACE_ACCESS_DENIED_ACE_TYPE)) - return (1); - - /* - * Delete permissions are never set by default - */ - if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) - return (1); - /* - * only allow owner@ to have - * write_acl/write_owner/write_attributes/write_xattr/ - */ - if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && - (!(flags & ACE_OWNER) && (mask & - (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| - ACE_WRITE_NAMED_ATTRS)))) - return (1); - - } - return (0); -} - -uint64_t -ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, - uint16_t *type, uint32_t *mask) -{ - ace_t *acep = datap; - - if (cookie >= aclcnt) - return (0); - - *flags = acep[cookie].a_flags; - *type = acep[cookie].a_type; - *mask = acep[cookie++].a_access_mask; - - return (cookie); -} - -int -ace_trivial(ace_t *acep, int aclcnt) -{ - return (ace_trivial_common(acep, aclcnt, ace_walk)); -} diff --git a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h b/sys/cddl/contrib/opensolaris/common/acl/acl_common.h deleted file mode 100644 index acf1f5da89d6..000000000000 --- a/sys/cddl/contrib/opensolaris/common/acl/acl_common.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to 
the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - */ - -#ifndef _ACL_COMMON_H -#define _ACL_COMMON_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct trivial_acl { - uint32_t allow0; /* allow mask for bits only in owner */ - uint32_t deny1; /* deny mask for bits not in owner */ - uint32_t deny2; /* deny mask for bits not in group */ - uint32_t owner; /* allow mask matching mode */ - uint32_t group; /* allow mask matching mode */ - uint32_t everyone; /* allow mask matching mode */ -} trivial_acl_t; - -extern int acltrivial(const char *); -extern void adjust_ace_pair(ace_t *pair, mode_t mode); -extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t); -extern int ace_trivial(ace_t *acep, int aclcnt); -extern int ace_trivial_common(void *, int, - uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, - uint32_t *mask)); -#if !defined(_KERNEL) -extern acl_t *acl_alloc(acl_type_t); -extern void acl_free(acl_t *aclp); -extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, - uid_t owner, gid_t group); -#endif /* !_KERNEL */ -void 
ksort(caddr_t v, int n, int s, int (*f)()); -int cmp2acls(void *a, void *b); -int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count); -void acl_trivial_access_masks(mode_t mode, boolean_t isdir, - trivial_acl_t *masks); - -#ifdef __cplusplus -} -#endif - -#endif /* _ACL_COMMON_H */ diff --git a/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S b/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S deleted file mode 100644 index bc21e85878df..000000000000 --- a/sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S +++ /dev/null @@ -1,133 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - .file "atomic.s" - -#define _ASM -#include - - /* - * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_dec_64_nv. 
- */ - ENTRY(atomic_dec_64) - ALTENTRY(atomic_dec_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi // %edi = target address - movl (%edi), %eax - movl 4(%edi), %edx // %edx:%eax = old value -1: - xorl %ebx, %ebx - xorl %ecx, %ecx - not %ecx - not %ebx // %ecx:%ebx = -1 - addl %eax, %ebx - adcl %edx, %ecx // add in the carry from inc - lock - cmpxchg8b (%edi) // try to stick it in - jne 1b - movl %ebx, %eax - movl %ecx, %edx // return new value - popl %ebx - popl %edi - ret - SET_SIZE(atomic_dec_64_nv) - SET_SIZE(atomic_dec_64) - - /* - * NOTE: If atomic_add_64 and atomic_add_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_add_64_nv. - */ - ENTRY(atomic_add_64) - ALTENTRY(atomic_add_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi // %edi = target address - movl (%edi), %eax - movl 4(%edi), %edx // %edx:%eax = old value -1: - movl 16(%esp), %ebx - movl 20(%esp), %ecx // %ecx:%ebx = delta - addl %eax, %ebx - adcl %edx, %ecx // %ecx:%ebx = new value - lock - cmpxchg8b (%edi) // try to stick it in - jne 1b - movl %ebx, %eax - movl %ecx, %edx // return new value - popl %ebx - popl %edi - ret - SET_SIZE(atomic_add_64_nv) - SET_SIZE(atomic_add_64) - - ENTRY(atomic_cas_64) - pushl %ebx - pushl %esi - movl 12(%esp), %esi - movl 16(%esp), %eax - movl 20(%esp), %edx - movl 24(%esp), %ebx - movl 28(%esp), %ecx - lock - cmpxchg8b (%esi) - popl %esi - popl %ebx - ret - SET_SIZE(atomic_cas_64) - - ENTRY(atomic_swap_64) - pushl %esi - pushl %ebx - movl 12(%esp), %esi - movl 16(%esp), %ebx - movl 20(%esp), %ecx - movl (%esi), %eax - movl 4(%esi), %edx // %edx:%eax = old value -1: - lock - cmpxchg8b (%esi) - jne 1b - popl %ebx - popl %esi - ret - SET_SIZE(atomic_swap_64) - - ENTRY(atomic_load_64) - pushl %esi - movl 8(%esp), %esi - movl %ebx, %eax // make old and new values equal, so that - movl %ecx, %edx // destination is never changed - lock - cmpxchg8b (%esi) - popl 
%esi - ret - SET_SIZE(atomic_load_64) diff --git a/sys/cddl/contrib/opensolaris/common/avl/avl.c b/sys/cddl/contrib/opensolaris/common/avl/avl.c deleted file mode 100644 index 2349aba2bf3e..000000000000 --- a/sys/cddl/contrib/opensolaris/common/avl/avl.c +++ /dev/null @@ -1,1063 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - */ - -/* - * AVL - generic AVL tree implementation for kernel use - * - * A complete description of AVL trees can be found in many CS textbooks. - * - * Here is a very brief overview. An AVL tree is a binary search tree that is - * almost perfectly balanced. By "almost" perfectly balanced, we mean that at - * any given node, the left and right subtrees are allowed to differ in height - * by at most 1 level. - * - * This relaxation from a perfectly balanced binary tree allows doing - * insertion and deletion relatively efficiently. Searching the tree is - * still a fast operation, roughly O(log(N)). 
- * - * The key to insertion and deletion is a set of tree manipulations called - * rotations, which bring unbalanced subtrees back into the semi-balanced state. - * - * This implementation of AVL trees has the following peculiarities: - * - * - The AVL specific data structures are physically embedded as fields - * in the "using" data structures. To maintain generality the code - * must constantly translate between "avl_node_t *" and containing - * data structure "void *"s by adding/subtracting the avl_offset. - * - * - Since the AVL data is always embedded in other structures, there is - * no locking or memory allocation in the AVL routines. This must be - * provided for by the enclosing data structure's semantics. Typically, - * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of - * exclusive write lock. Other operations require a read lock. - * - * - The implementation uses iteration instead of explicit recursion, - * since it is intended to run on limited size kernel stacks. Since - * there is no recursion stack present to move "up" in the tree, - * there is an explicit "parent" link in the avl_node_t. - * - * - The left/right children pointers of a node are in an array. - * In the code, variables (instead of constants) are used to represent - * left and right indices. The implementation is written as if it only - * dealt with left handed manipulations. By changing the value assigned - * to "left", the code also works for right handed trees. The - * following variables/terms are frequently used: - * - * int left; // 0 when dealing with left children, - * // 1 for dealing with right children - * - * int left_heavy; // -1 when left subtree is taller at some node, - * // +1 when right subtree is taller - * - * int right; // will be the opposite of left (0 or 1) - * int right_heavy;// will be the opposite of left_heavy (-1 or 1) - * - * int direction; // 0 for "<" (ie. 
left child); 1 for ">" (right) - * - * Though it is a little more confusing to read the code, the approach - * allows using half as much code (and hence cache footprint) for tree - * manipulations and eliminates many conditional branches. - * - * - The avl_index_t is an opaque "cookie" used to find nodes at or - * adjacent to where a new value would be inserted in the tree. The value - * is a modified "avl_node_t *". The bottom bit (normally 0 for a - * pointer) is set to indicate if that the new node has a value greater - * than the value of the indicated "avl_node_t *". - * - * Note - in addition to userland (e.g. libavl and libutil) and the kernel - * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, - * which each have their own compilation environments and subsequent - * requirements. Each of these environments must be considered when adding - * dependencies from avl.c. - */ - -#include -#include -#include -#include -#include - -/* - * Small arrays to translate between balance (or diff) values and child indices. - * - * Code that deals with binary tree data structures will randomly use - * left and right children when examining a tree. C "if()" statements - * which evaluate randomly suffer from very poor hardware branch prediction. - * In this code we avoid some of the branch mispredictions by using the - * following translation arrays. They replace random branches with an - * additional memory reference. Since the translation arrays are both very - * small the data should remain efficiently in cache. - */ -static const int avl_child2balance[2] = {-1, 1}; -static const int avl_balance2child[] = {0, 0, 1}; - - -/* - * Walk from one node to the previous valued node (ie. an infix walk - * towards the left). At any given node we do one of 2 things: - * - * - If there is a left child, go to it, then to it's rightmost descendant. - * - * - otherwise we return through parent nodes until we've come from a right - * child. 
- * - * Return Value: - * NULL - if at the end of the nodes - * otherwise next node - */ -void * -avl_walk(avl_tree_t *tree, void *oldnode, int left) -{ - size_t off = tree->avl_offset; - avl_node_t *node = AVL_DATA2NODE(oldnode, off); - int right = 1 - left; - int was_child; - - - /* - * nowhere to walk to if tree is empty - */ - if (node == NULL) - return (NULL); - - /* - * Visit the previous valued node. There are two possibilities: - * - * If this node has a left child, go down one left, then all - * the way right. - */ - if (node->avl_child[left] != NULL) { - for (node = node->avl_child[left]; - node->avl_child[right] != NULL; - node = node->avl_child[right]) - ; - /* - * Otherwise, return thru left children as far as we can. - */ - } else { - for (;;) { - was_child = AVL_XCHILD(node); - node = AVL_XPARENT(node); - if (node == NULL) - return (NULL); - if (was_child == right) - break; - } - } - - return (AVL_NODE2DATA(node, off)); -} - -/* - * Return the lowest valued node in a tree or NULL. - * (leftmost child from root of tree) - */ -void * -avl_first(avl_tree_t *tree) -{ - avl_node_t *node; - avl_node_t *prev = NULL; - size_t off = tree->avl_offset; - - for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) - prev = node; - - if (prev != NULL) - return (AVL_NODE2DATA(prev, off)); - return (NULL); -} - -/* - * Return the highest valued node in a tree or NULL. - * (rightmost child from root of tree) - */ -void * -avl_last(avl_tree_t *tree) -{ - avl_node_t *node; - avl_node_t *prev = NULL; - size_t off = tree->avl_offset; - - for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) - prev = node; - - if (prev != NULL) - return (AVL_NODE2DATA(prev, off)); - return (NULL); -} - -/* - * Access the node immediately before or after an insertion point. 
- * - * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child - * - * Return value: - * NULL: no node in the given direction - * "void *" of the found tree node - */ -void * -avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) -{ - int child = AVL_INDEX2CHILD(where); - avl_node_t *node = AVL_INDEX2NODE(where); - void *data; - size_t off = tree->avl_offset; - - if (node == NULL) { - ASSERT(tree->avl_root == NULL); - return (NULL); - } - data = AVL_NODE2DATA(node, off); - if (child != direction) - return (data); - - return (avl_walk(tree, data, direction)); -} - - -/* - * Search for the node which contains "value". The algorithm is a - * simple binary tree search. - * - * return value: - * NULL: the value is not in the AVL tree - * *where (if not NULL) is set to indicate the insertion point - * "void *" of the found tree node - */ -void * -avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) -{ - avl_node_t *node; - avl_node_t *prev = NULL; - int child = 0; - int diff; - size_t off = tree->avl_offset; - - for (node = tree->avl_root; node != NULL; - node = node->avl_child[child]) { - - prev = node; - - diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); - ASSERT(-1 <= diff && diff <= 1); - if (diff == 0) { -#ifdef DEBUG - if (where != NULL) - *where = 0; -#endif - return (AVL_NODE2DATA(node, off)); - } - child = avl_balance2child[1 + diff]; - - } - - if (where != NULL) - *where = AVL_MKINDEX(prev, child); - - return (NULL); -} - - -/* - * Perform a rotation to restore balance at the subtree given by depth. - * - * This routine is used by both insertion and deletion. The return value - * indicates: - * 0 : subtree did not change height - * !0 : subtree was reduced in height - * - * The code is written as if handling left rotations, right rotations are - * symmetric and handled by swapping values of variables right/left[_heavy] - * - * On input balance is the "new" balance at "node". This value is either - * -2 or +2. 
- */ -static int -avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) -{ - int left = !(balance < 0); /* when balance = -2, left will be 0 */ - int right = 1 - left; - int left_heavy = balance >> 1; - int right_heavy = -left_heavy; - avl_node_t *parent = AVL_XPARENT(node); - avl_node_t *child = node->avl_child[left]; - avl_node_t *cright; - avl_node_t *gchild; - avl_node_t *gright; - avl_node_t *gleft; - int which_child = AVL_XCHILD(node); - int child_bal = AVL_XBALANCE(child); - - /* BEGIN CSTYLED */ - /* - * case 1 : node is overly left heavy, the left child is balanced or - * also left heavy. This requires the following rotation. - * - * (node bal:-2) - * / \ - * / \ - * (child bal:0 or -1) - * / \ - * / \ - * cright - * - * becomes: - * - * (child bal:1 or 0) - * / \ - * / \ - * (node bal:-1 or 0) - * / \ - * / \ - * cright - * - * we detect this situation by noting that child's balance is not - * right_heavy. - */ - /* END CSTYLED */ - if (child_bal != right_heavy) { - - /* - * compute new balance of nodes - * - * If child used to be left heavy (now balanced) we reduced - * the height of this sub-tree -- used in "return...;" below - */ - child_bal += right_heavy; /* adjust towards right */ - - /* - * move "cright" to be node's left child - */ - cright = child->avl_child[right]; - node->avl_child[left] = cright; - if (cright != NULL) { - AVL_SETPARENT(cright, node); - AVL_SETCHILD(cright, left); - } - - /* - * move node to be child's right child - */ - child->avl_child[right] = node; - AVL_SETBALANCE(node, -child_bal); - AVL_SETCHILD(node, right); - AVL_SETPARENT(node, child); - - /* - * update the pointer into this subtree - */ - AVL_SETBALANCE(child, child_bal); - AVL_SETCHILD(child, which_child); - AVL_SETPARENT(child, parent); - if (parent != NULL) - parent->avl_child[which_child] = child; - else - tree->avl_root = child; - - return (child_bal == 0); - } - - /* BEGIN CSTYLED */ - /* - * case 2 : When node is left heavy, but child is right heavy we 
use - * a different rotation. - * - * (node b:-2) - * / \ - * / \ - * / \ - * (child b:+1) - * / \ - * / \ - * (gchild b: != 0) - * / \ - * / \ - * gleft gright - * - * becomes: - * - * (gchild b:0) - * / \ - * / \ - * / \ - * (child b:?) (node b:?) - * / \ / \ - * / \ / \ - * gleft gright - * - * computing the new balances is more complicated. As an example: - * if gchild was right_heavy, then child is now left heavy - * else it is balanced - */ - /* END CSTYLED */ - gchild = child->avl_child[right]; - gleft = gchild->avl_child[left]; - gright = gchild->avl_child[right]; - - /* - * move gright to left child of node and - * - * move gleft to right child of node - */ - node->avl_child[left] = gright; - if (gright != NULL) { - AVL_SETPARENT(gright, node); - AVL_SETCHILD(gright, left); - } - - child->avl_child[right] = gleft; - if (gleft != NULL) { - AVL_SETPARENT(gleft, child); - AVL_SETCHILD(gleft, right); - } - - /* - * move child to left child of gchild and - * - * move node to right child of gchild and - * - * fixup parent of all this to point to gchild - */ - balance = AVL_XBALANCE(gchild); - gchild->avl_child[left] = child; - AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); - AVL_SETPARENT(child, gchild); - AVL_SETCHILD(child, left); - - gchild->avl_child[right] = node; - AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); - AVL_SETPARENT(node, gchild); - AVL_SETCHILD(node, right); - - AVL_SETBALANCE(gchild, 0); - AVL_SETPARENT(gchild, parent); - AVL_SETCHILD(gchild, which_child); - if (parent != NULL) - parent->avl_child[which_child] = gchild; - else - tree->avl_root = gchild; - - return (1); /* the new tree is always shorter */ -} - - -/* - * Insert a new node into an AVL tree at the specified (from avl_find()) place. - * - * Newly inserted nodes are always leaf nodes in the tree, since avl_find() - * searches out to the leaf positions. The avl_index_t indicates the node - * which will be the parent of the new node. 
- * - * After the node is inserted, a single rotation further up the tree may - * be necessary to maintain an acceptable AVL balance. - */ -void -avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) -{ - avl_node_t *node; - avl_node_t *parent = AVL_INDEX2NODE(where); - int old_balance; - int new_balance; - int which_child = AVL_INDEX2CHILD(where); - size_t off = tree->avl_offset; - - ASSERT(tree); -#ifdef _LP64 - ASSERT(((uintptr_t)new_data & 0x7) == 0); -#endif - - node = AVL_DATA2NODE(new_data, off); - - /* - * First, add the node to the tree at the indicated position. - */ - ++tree->avl_numnodes; - - node->avl_child[0] = NULL; - node->avl_child[1] = NULL; - - AVL_SETCHILD(node, which_child); - AVL_SETBALANCE(node, 0); - AVL_SETPARENT(node, parent); - if (parent != NULL) { - ASSERT(parent->avl_child[which_child] == NULL); - parent->avl_child[which_child] = node; - } else { - ASSERT(tree->avl_root == NULL); - tree->avl_root = node; - } - /* - * Now, back up the tree modifying the balance of all nodes above the - * insertion point. If we get to a highly unbalanced ancestor, we - * need to do a rotation. If we back out of the tree we are done. - * If we brought any subtree into perfect balance (0), we are also done. - */ - for (;;) { - node = parent; - if (node == NULL) - return; - - /* - * Compute the new balance - */ - old_balance = AVL_XBALANCE(node); - new_balance = old_balance + avl_child2balance[which_child]; - - /* - * If we introduced equal balance, then we are done immediately - */ - if (new_balance == 0) { - AVL_SETBALANCE(node, 0); - return; - } - - /* - * If both old and new are not zero we went - * from -1 to -2 balance, do a rotation. 
- */ - if (old_balance != 0) - break; - - AVL_SETBALANCE(node, new_balance); - parent = AVL_XPARENT(node); - which_child = AVL_XCHILD(node); - } - - /* - * perform a rotation to fix the tree and return - */ - (void) avl_rotation(tree, node, new_balance); -} - -/* - * Insert "new_data" in "tree" in the given "direction" either after or - * before (AVL_AFTER, AVL_BEFORE) the data "here". - * - * Insertions can only be done at empty leaf points in the tree, therefore - * if the given child of the node is already present we move to either - * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since - * every other node in the tree is a leaf, this always works. - * - * To help developers using this interface, we assert that the new node - * is correctly ordered at every step of the way in DEBUG kernels. - */ -void -avl_insert_here( - avl_tree_t *tree, - void *new_data, - void *here, - int direction) -{ - avl_node_t *node; - int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ -#ifdef DEBUG - int diff; -#endif - - ASSERT(tree != NULL); - ASSERT(new_data != NULL); - ASSERT(here != NULL); - ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); - - /* - * If corresponding child of node is not NULL, go to the neighboring - * node and reverse the insertion direction. - */ - node = AVL_DATA2NODE(here, tree->avl_offset); - -#ifdef DEBUG - diff = tree->avl_compar(new_data, here); - ASSERT(-1 <= diff && diff <= 1); - ASSERT(diff != 0); - ASSERT(diff > 0 ? child == 1 : child == 0); -#endif - - if (node->avl_child[child] != NULL) { - node = node->avl_child[child]; - child = 1 - child; - while (node->avl_child[child] != NULL) { -#ifdef DEBUG - diff = tree->avl_compar(new_data, - AVL_NODE2DATA(node, tree->avl_offset)); - ASSERT(-1 <= diff && diff <= 1); - ASSERT(diff != 0); - ASSERT(diff > 0 ? 
child == 1 : child == 0); -#endif - node = node->avl_child[child]; - } -#ifdef DEBUG - diff = tree->avl_compar(new_data, - AVL_NODE2DATA(node, tree->avl_offset)); - ASSERT(-1 <= diff && diff <= 1); - ASSERT(diff != 0); - ASSERT(diff > 0 ? child == 1 : child == 0); -#endif - } - ASSERT(node->avl_child[child] == NULL); - - avl_insert(tree, new_data, AVL_MKINDEX(node, child)); -} - -/* - * Add a new node to an AVL tree. - */ -void -avl_add(avl_tree_t *tree, void *new_node) -{ - avl_index_t where; - - /* - * This is unfortunate. We want to call panic() here, even for - * non-DEBUG kernels. In userland, however, we can't depend on anything - * in libc or else the rtld build process gets confused. - * Thankfully, rtld provides us with its own assfail() so we can use - * that here. We use assfail() directly to get a nice error message - * in the core - much like what panic() does for crashdumps. - */ - if (avl_find(tree, new_node, &where) != NULL) -#ifdef _KERNEL - panic("avl_find() succeeded inside avl_add()"); -#else - (void) assfail("avl_find() succeeded inside avl_add()", - __FILE__, __LINE__); -#endif - avl_insert(tree, new_node, where); -} - -/* - * Delete a node from the AVL tree. Deletion is similar to insertion, but - * with 2 complications. - * - * First, we may be deleting an interior node. Consider the following subtree: - * - * d c c - * / \ / \ / \ - * b e b e b e - * / \ / \ / - * a c a a - * - * When we are deleting node (d), we find and bring up an adjacent valued leaf - * node, say (c), to take the interior node's place. In the code this is - * handled by temporarily swapping (d) and (c) in the tree and then using - * common code to delete (d) from the leaf position. - * - * Secondly, an interior deletion from a deep tree may require more than one - * rotation to fix the balance. This is handled by moving up the tree through - * parents and applying rotations as needed. 
The return value from - * avl_rotation() is used to detect when a subtree did not change overall - * height due to a rotation. - */ -void -avl_remove(avl_tree_t *tree, void *data) -{ - avl_node_t *delete; - avl_node_t *parent; - avl_node_t *node; - avl_node_t tmp; - int old_balance; - int new_balance; - int left; - int right; - int which_child; - size_t off = tree->avl_offset; - - ASSERT(tree); - - delete = AVL_DATA2NODE(data, off); - - /* - * Deletion is easiest with a node that has at most 1 child. - * We swap a node with 2 children with a sequentially valued - * neighbor node. That node will have at most 1 child. Note this - * has no effect on the ordering of the remaining nodes. - * - * As an optimization, we choose the greater neighbor if the tree - * is right heavy, otherwise the left neighbor. This reduces the - * number of rotations needed. - */ - if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { - - /* - * choose node to swap from whichever side is taller - */ - old_balance = AVL_XBALANCE(delete); - left = avl_balance2child[old_balance + 1]; - right = 1 - left; - - /* - * get to the previous value'd node - * (down 1 left, as far as possible right) - */ - for (node = delete->avl_child[left]; - node->avl_child[right] != NULL; - node = node->avl_child[right]) - ; - - /* - * create a temp placeholder for 'node' - * move 'node' to delete's spot in the tree - */ - tmp = *node; - - *node = *delete; - if (node->avl_child[left] == node) - node->avl_child[left] = &tmp; - - parent = AVL_XPARENT(node); - if (parent != NULL) - parent->avl_child[AVL_XCHILD(node)] = node; - else - tree->avl_root = node; - AVL_SETPARENT(node->avl_child[left], node); - AVL_SETPARENT(node->avl_child[right], node); - - /* - * Put tmp where node used to be (just temporary). - * It always has a parent and at most 1 child. 
- */ - delete = &tmp; - parent = AVL_XPARENT(delete); - parent->avl_child[AVL_XCHILD(delete)] = delete; - which_child = (delete->avl_child[1] != 0); - if (delete->avl_child[which_child] != NULL) - AVL_SETPARENT(delete->avl_child[which_child], delete); - } - - - /* - * Here we know "delete" is at least partially a leaf node. It can - * be easily removed from the tree. - */ - ASSERT(tree->avl_numnodes > 0); - --tree->avl_numnodes; - parent = AVL_XPARENT(delete); - which_child = AVL_XCHILD(delete); - if (delete->avl_child[0] != NULL) - node = delete->avl_child[0]; - else - node = delete->avl_child[1]; - - /* - * Connect parent directly to node (leaving out delete). - */ - if (node != NULL) { - AVL_SETPARENT(node, parent); - AVL_SETCHILD(node, which_child); - } - if (parent == NULL) { - tree->avl_root = node; - return; - } - parent->avl_child[which_child] = node; - - - /* - * Since the subtree is now shorter, begin adjusting parent balances - * and performing any needed rotations. - */ - do { - - /* - * Move up the tree and adjust the balance - * - * Capture the parent and which_child values for the next - * iteration before any rotations occur. - */ - node = parent; - old_balance = AVL_XBALANCE(node); - new_balance = old_balance - avl_child2balance[which_child]; - parent = AVL_XPARENT(node); - which_child = AVL_XCHILD(node); - - /* - * If a node was in perfect balance but isn't anymore then - * we can stop, since the height didn't change above this point - * due to a deletion. - */ - if (old_balance == 0) { - AVL_SETBALANCE(node, new_balance); - break; - } - - /* - * If the new balance is zero, we don't need to rotate - * else - * need a rotation to fix the balance. - * If the rotation doesn't change the height - * of the sub-tree we have finished adjusting. 
- */ - if (new_balance == 0) - AVL_SETBALANCE(node, new_balance); - else if (!avl_rotation(tree, node, new_balance)) - break; - } while (parent != NULL); -} - -#define AVL_REINSERT(tree, obj) \ - avl_remove((tree), (obj)); \ - avl_add((tree), (obj)) - -boolean_t -avl_update_lt(avl_tree_t *t, void *obj) -{ - void *neighbor; - - ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || - (t->avl_compar(obj, neighbor) <= 0)); - - neighbor = AVL_PREV(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - return (B_FALSE); -} - -boolean_t -avl_update_gt(avl_tree_t *t, void *obj) -{ - void *neighbor; - - ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || - (t->avl_compar(obj, neighbor) >= 0)); - - neighbor = AVL_NEXT(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - return (B_FALSE); -} - -boolean_t -avl_update(avl_tree_t *t, void *obj) -{ - void *neighbor; - - neighbor = AVL_PREV(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - neighbor = AVL_NEXT(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - return (B_FALSE); -} - -void -avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) -{ - avl_node_t *temp_node; - ulong_t temp_numnodes; - - ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); - ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); - ASSERT3U(tree1->avl_size, ==, tree2->avl_size); - - temp_node = tree1->avl_root; - temp_numnodes = tree1->avl_numnodes; - tree1->avl_root = tree2->avl_root; - tree1->avl_numnodes = tree2->avl_numnodes; - tree2->avl_root = temp_node; - tree2->avl_numnodes = temp_numnodes; -} - -/* - * initialize a new AVL tree - */ -void -avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), - size_t size, size_t offset) -{ - ASSERT(tree); - 
ASSERT(compar); - ASSERT(size > 0); - ASSERT(size >= offset + sizeof (avl_node_t)); -#ifdef _LP64 - ASSERT((offset & 0x7) == 0); -#endif - - tree->avl_compar = compar; - tree->avl_root = NULL; - tree->avl_numnodes = 0; - tree->avl_size = size; - tree->avl_offset = offset; -} - -/* - * Delete a tree. - */ -/* ARGSUSED */ -void -avl_destroy(avl_tree_t *tree) -{ - ASSERT(tree); - ASSERT(tree->avl_numnodes == 0); - ASSERT(tree->avl_root == NULL); -} - - -/* - * Return the number of nodes in an AVL tree. - */ -ulong_t -avl_numnodes(avl_tree_t *tree) -{ - ASSERT(tree); - return (tree->avl_numnodes); -} - -boolean_t -avl_is_empty(avl_tree_t *tree) -{ - ASSERT(tree); - return (tree->avl_numnodes == 0); -} - -#define CHILDBIT (1L) - -/* - * Post-order tree walk used to visit all tree nodes and destroy the tree - * in post order. This is used for destroying a tree without paying any cost - * for rebalancing it. - * - * example: - * - * void *cookie = NULL; - * my_data_t *node; - * - * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) - * free(node); - * avl_destroy(tree); - * - * The cookie is really an avl_node_t to the current node's parent and - * an indication of which child you looked at last. - * - * On input, a cookie value of CHILDBIT indicates the tree is done. - */ -void * -avl_destroy_nodes(avl_tree_t *tree, void **cookie) -{ - avl_node_t *node; - avl_node_t *parent; - int child; - void *first; - size_t off = tree->avl_offset; - - /* - * Initial calls go to the first node or it's right descendant. - */ - if (*cookie == NULL) { - first = avl_first(tree); - - /* - * deal with an empty tree - */ - if (first == NULL) { - *cookie = (void *)CHILDBIT; - return (NULL); - } - - node = AVL_DATA2NODE(first, off); - parent = AVL_XPARENT(node); - goto check_right_side; - } - - /* - * If there is no parent to return to we are done. 
- */ - parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); - if (parent == NULL) { - if (tree->avl_root != NULL) { - ASSERT(tree->avl_numnodes == 1); - tree->avl_root = NULL; - tree->avl_numnodes = 0; - } - return (NULL); - } - - /* - * Remove the child pointer we just visited from the parent and tree. - */ - child = (uintptr_t)(*cookie) & CHILDBIT; - parent->avl_child[child] = NULL; - ASSERT(tree->avl_numnodes > 1); - --tree->avl_numnodes; - - /* - * If we just did a right child or there isn't one, go up to parent. - */ - if (child == 1 || parent->avl_child[1] == NULL) { - node = parent; - parent = AVL_XPARENT(parent); - goto done; - } - - /* - * Do parent's right child, then leftmost descendent. - */ - node = parent->avl_child[1]; - while (node->avl_child[0] != NULL) { - parent = node; - node = node->avl_child[0]; - } - - /* - * If here, we moved to a left child. It may have one - * child on the right (when balance == +1). - */ -check_right_side: - if (node->avl_child[1] != NULL) { - ASSERT(AVL_XBALANCE(node) == 1); - parent = node; - node = node->avl_child[1]; - ASSERT(node->avl_child[0] == NULL && - node->avl_child[1] == NULL); - } else { - ASSERT(AVL_XBALANCE(node) <= 0); - } - -done: - if (parent == NULL) { - *cookie = (void *)CHILDBIT; - ASSERT(node == tree->avl_root); - } else { - *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); - } - - return (AVL_NODE2DATA(node, off)); -} diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c deleted file mode 100644 index eb200a24e6d2..000000000000 --- a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c +++ /dev/null @@ -1,512 +0,0 @@ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#ifndef _KERNEL -#include -#else -#include -#include -#include -#include -#endif - -/* - * "Force" nvlist wrapper. - * - * These functions wrap the nvlist_* functions with assertions that assume - * the operation is successful. This allows the caller's code to be much - * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* - * functions, which can return the requested value (rather than filling in - * a pointer). - * - * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate - * with KM_SLEEP. - * - * More wrappers should be added as needed -- for example - * nvlist_lookup_*_array and nvpair_value_*_array. - */ - -nvlist_t * -fnvlist_alloc(void) -{ - nvlist_t *nvl; - VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)); - return (nvl); -} - -void -fnvlist_free(nvlist_t *nvl) -{ - nvlist_free(nvl); -} - -size_t -fnvlist_size(nvlist_t *nvl) -{ - size_t size; - VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE)); - return (size); -} - -/* - * Returns allocated buffer of size *sizep. Caller must free the buffer with - * fnvlist_pack_free(). 
- */ -char * -fnvlist_pack(nvlist_t *nvl, size_t *sizep) -{ - char *packed = 0; - VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, - KM_SLEEP), ==, 0); - return (packed); -} - -/*ARGSUSED*/ -void -fnvlist_pack_free(char *pack, size_t size) -{ -#ifdef _KERNEL - kmem_free(pack, size); -#else - free(pack); -#endif -} - -nvlist_t * -fnvlist_unpack(char *buf, size_t buflen) -{ - nvlist_t *rv; - VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP)); - return (rv); -} - -nvlist_t * -fnvlist_dup(nvlist_t *nvl) -{ - nvlist_t *rv; - VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP)); - return (rv); -} - -void -fnvlist_merge(nvlist_t *dst, nvlist_t *src) -{ - VERIFY0(nvlist_merge(dst, src, KM_SLEEP)); -} - -size_t -fnvlist_num_pairs(nvlist_t *nvl) -{ - size_t count = 0; - nvpair_t *pair; - - for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL; - pair = nvlist_next_nvpair(nvl, pair)) - count++; - return (count); -} - -void -fnvlist_add_boolean(nvlist_t *nvl, const char *name) -{ - VERIFY0(nvlist_add_boolean(nvl, name)); -} - -void -fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) -{ - VERIFY0(nvlist_add_boolean_value(nvl, name, val)); -} - -void -fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) -{ - VERIFY0(nvlist_add_byte(nvl, name, val)); -} - -void -fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) -{ - VERIFY0(nvlist_add_int8(nvl, name, val)); -} - -void -fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) -{ - VERIFY0(nvlist_add_uint8(nvl, name, val)); -} - -void -fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) -{ - VERIFY0(nvlist_add_int16(nvl, name, val)); -} - -void -fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) -{ - VERIFY0(nvlist_add_uint16(nvl, name, val)); -} - -void -fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) -{ - VERIFY0(nvlist_add_int32(nvl, name, val)); -} - -void -fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) -{ - 
VERIFY0(nvlist_add_uint32(nvl, name, val)); -} - -void -fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) -{ - VERIFY0(nvlist_add_int64(nvl, name, val)); -} - -void -fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) -{ - VERIFY0(nvlist_add_uint64(nvl, name, val)); -} - -void -fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val) -{ - VERIFY0(nvlist_add_string(nvl, name, val)); -} - -void -fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) -{ - VERIFY0(nvlist_add_nvlist(nvl, name, val)); -} - -void -fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair) -{ - VERIFY0(nvlist_add_nvpair(nvl, pair)); -} - -void -fnvlist_add_boolean_array(nvlist_t *nvl, const char *name, - boolean_t *val, uint_t n) -{ - VERIFY0(nvlist_add_boolean_array(nvl, name, val, n)); -} - -void -fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n) -{ - VERIFY0(nvlist_add_byte_array(nvl, name, val, n)); -} - -void -fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n) -{ - VERIFY0(nvlist_add_int8_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint8_array(nvl, name, val, n)); -} - -void -fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) -{ - VERIFY0(nvlist_add_int16_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, - uint16_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint16_array(nvl, name, val, n)); -} - -void -fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) -{ - VERIFY0(nvlist_add_int32_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, - uint32_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint32_array(nvl, name, val, n)); -} - -void -fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) -{ - 
VERIFY0(nvlist_add_int64_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, - uint64_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint64_array(nvl, name, val, n)); -} - -void -fnvlist_add_string_array(nvlist_t *nvl, const char *name, - char * const *val, uint_t n) -{ - VERIFY0(nvlist_add_string_array(nvl, name, val, n)); -} - -void -fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, - nvlist_t **val, uint_t n) -{ - VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n)); -} - -void -fnvlist_remove(nvlist_t *nvl, const char *name) -{ - VERIFY0(nvlist_remove_all(nvl, name)); -} - -void -fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) -{ - VERIFY0(nvlist_remove_nvpair(nvl, pair)); -} - -nvpair_t * -fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) -{ - nvpair_t *rv; - VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv)); - return (rv); -} - -/* returns B_TRUE if the entry exists */ -boolean_t -fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) -{ - return (nvlist_lookup_boolean(nvl, name) == 0); -} - -boolean_t -fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name) -{ - boolean_t rv; - VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv)); - return (rv); -} - -uchar_t -fnvlist_lookup_byte(nvlist_t *nvl, const char *name) -{ - uchar_t rv; - VERIFY0(nvlist_lookup_byte(nvl, name, &rv)); - return (rv); -} - -int8_t -fnvlist_lookup_int8(nvlist_t *nvl, const char *name) -{ - int8_t rv; - VERIFY0(nvlist_lookup_int8(nvl, name, &rv)); - return (rv); -} - -int16_t -fnvlist_lookup_int16(nvlist_t *nvl, const char *name) -{ - int16_t rv; - VERIFY0(nvlist_lookup_int16(nvl, name, &rv)); - return (rv); -} - -int32_t -fnvlist_lookup_int32(nvlist_t *nvl, const char *name) -{ - int32_t rv; - VERIFY0(nvlist_lookup_int32(nvl, name, &rv)); - return (rv); -} - -int64_t -fnvlist_lookup_int64(nvlist_t *nvl, const char *name) -{ - int64_t rv; - VERIFY0(nvlist_lookup_int64(nvl, name, &rv)); - return (rv); -} - -uint8_t 
-fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name) -{ - uint8_t rv; - VERIFY0(nvlist_lookup_uint8(nvl, name, &rv)); - return (rv); -} - -uint16_t -fnvlist_lookup_uint16(nvlist_t *nvl, const char *name) -{ - uint16_t rv; - VERIFY0(nvlist_lookup_uint16(nvl, name, &rv)); - return (rv); -} - -uint32_t -fnvlist_lookup_uint32(nvlist_t *nvl, const char *name) -{ - uint32_t rv; - VERIFY0(nvlist_lookup_uint32(nvl, name, &rv)); - return (rv); -} - -uint64_t -fnvlist_lookup_uint64(nvlist_t *nvl, const char *name) -{ - uint64_t rv; - VERIFY0(nvlist_lookup_uint64(nvl, name, &rv)); - return (rv); -} - -char * -fnvlist_lookup_string(nvlist_t *nvl, const char *name) -{ - char *rv; - VERIFY0(nvlist_lookup_string(nvl, name, &rv)); - return (rv); -} - -nvlist_t * -fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) -{ - nvlist_t *rv; - VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv)); - return (rv); -} - -boolean_t -fnvpair_value_boolean_value(nvpair_t *nvp) -{ - boolean_t rv; - VERIFY0(nvpair_value_boolean_value(nvp, &rv)); - return (rv); -} - -uchar_t -fnvpair_value_byte(nvpair_t *nvp) -{ - uchar_t rv; - VERIFY0(nvpair_value_byte(nvp, &rv)); - return (rv); -} - -int8_t -fnvpair_value_int8(nvpair_t *nvp) -{ - int8_t rv; - VERIFY0(nvpair_value_int8(nvp, &rv)); - return (rv); -} - -int16_t -fnvpair_value_int16(nvpair_t *nvp) -{ - int16_t rv; - VERIFY0(nvpair_value_int16(nvp, &rv)); - return (rv); -} - -int32_t -fnvpair_value_int32(nvpair_t *nvp) -{ - int32_t rv; - VERIFY0(nvpair_value_int32(nvp, &rv)); - return (rv); -} - -int64_t -fnvpair_value_int64(nvpair_t *nvp) -{ - int64_t rv; - VERIFY0(nvpair_value_int64(nvp, &rv)); - return (rv); -} - -uint8_t -fnvpair_value_uint8_t(nvpair_t *nvp) -{ - uint8_t rv; - VERIFY0(nvpair_value_uint8(nvp, &rv)); - return (rv); -} - -uint16_t -fnvpair_value_uint16(nvpair_t *nvp) -{ - uint16_t rv; - VERIFY0(nvpair_value_uint16(nvp, &rv)); - return (rv); -} - -uint32_t -fnvpair_value_uint32(nvpair_t *nvp) -{ - uint32_t rv; - 
VERIFY0(nvpair_value_uint32(nvp, &rv)); - return (rv); -} - -uint64_t -fnvpair_value_uint64(nvpair_t *nvp) -{ - uint64_t rv; - VERIFY0(nvpair_value_uint64(nvp, &rv)); - return (rv); -} - -char * -fnvpair_value_string(nvpair_t *nvp) -{ - char *rv; - VERIFY0(nvpair_value_string(nvp, &rv)); - return (rv); -} - -nvlist_t * -fnvpair_value_nvlist(nvpair_t *nvp) -{ - nvlist_t *rv; - VERIFY0(nvpair_value_nvlist(nvp, &rv)); - return (rv); -} diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c deleted file mode 100644 index c322a5bd2179..000000000000 --- a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c +++ /dev/null @@ -1,3600 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include - -#if defined(_KERNEL) && !defined(_BOOT) -#include -#include -#else -#include -#include -#include -#include -#endif - -#ifndef offsetof -#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) -#endif -#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++ - -#if defined(__FreeBSD__) && !defined(_KERNEL) -/* - * libnvpair is the lowest commen denominator for ZFS related libraries, - * defining aok here makes it usable by all ZFS related libraries - */ -int aok; -#endif - -/* - * nvpair.c - Provides kernel & userland interfaces for manipulating - * name-value pairs. - * - * Overview Diagram - * - * +--------------+ - * | nvlist_t | - * |--------------| - * | nvl_version | - * | nvl_nvflag | - * | nvl_priv -+-+ - * | nvl_flag | | - * | nvl_pad | | - * +--------------+ | - * V - * +--------------+ last i_nvp in list - * | nvpriv_t | +---------------------> - * |--------------| | - * +--+- nvp_list | | +------------+ - * | | nvp_last -+--+ + nv_alloc_t | - * | | nvp_curr | |------------| - * | | nvp_nva -+----> | nva_ops | - * | | nvp_stat | | nva_arg | - * | +--------------+ +------------+ - * | - * +-------+ - * V - * +---------------------+ +-------------------+ - * | i_nvp_t | +-->| i_nvp_t | +--> - * |---------------------| | |-------------------| | - * | nvi_next -+--+ | nvi_next -+--+ - * | nvi_prev (NULL) | <----+ nvi_prev | - * | . . . . . . . . . . | | . . . . . . . . . | - * | nvp (nvpair_t) | | nvp (nvpair_t) | - * | - nvp_size | | - nvp_size | - * | - nvp_name_sz | | - nvp_name_sz | - * | - nvp_value_elem | | - nvp_value_elem | - * | - nvp_type | | - nvp_type | - * | - data ... | | - data ... | - * +---------------------+ +-------------------+ - * - * - * - * +---------------------+ +---------------------+ - * | i_nvp_t | +--> +-->| i_nvp_t (last) | - * |---------------------| | | |---------------------| - * | nvi_next -+--+ ... --+ | nvi_next (NULL) | - * <-+- nvi_prev |<-- ... 
<----+ nvi_prev | - * | . . . . . . . . . | | . . . . . . . . . | - * | nvp (nvpair_t) | | nvp (nvpair_t) | - * | - nvp_size | | - nvp_size | - * | - nvp_name_sz | | - nvp_name_sz | - * | - nvp_value_elem | | - nvp_value_elem | - * | - DATA_TYPE_NVLIST | | - nvp_type | - * | - data (embedded) | | - data ... | - * | nvlist name | +---------------------+ - * | +--------------+ | - * | | nvlist_t | | - * | |--------------| | - * | | nvl_version | | - * | | nvl_nvflag | | - * | | nvl_priv --+---+----> - * | | nvl_flag | | - * | | nvl_pad | | - * | +--------------+ | - * +---------------------+ - * - * - * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will - * allow value to be aligned on 8 byte boundary - * - * name_len is the length of the name string including the null terminator - * so it must be >= 1 - */ -#define NVP_SIZE_CALC(name_len, data_len) \ - (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) - -static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); -static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, - uint_t nelem, const void *data); - -#define NV_STAT_EMBEDDED 0x1 -#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp)) -#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp)) - -#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz)) -#define NVPAIR2I_NVP(nvp) \ - ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) - -#ifdef _KERNEL -int nvpair_max_recursion = 20; -#else -int nvpair_max_recursion = 100; -#endif - -uint64_t nvlist_hashtable_init_size = (1 << 4); - -int -nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) 
-{ - va_list valist; - int err = 0; - - nva->nva_ops = nvo; - nva->nva_arg = NULL; - - va_start(valist, nvo); - if (nva->nva_ops->nv_ao_init != NULL) - err = nva->nva_ops->nv_ao_init(nva, valist); - va_end(valist); - - return (err); -} - -void -nv_alloc_reset(nv_alloc_t *nva) -{ - if (nva->nva_ops->nv_ao_reset != NULL) - nva->nva_ops->nv_ao_reset(nva); -} - -void -nv_alloc_fini(nv_alloc_t *nva) -{ - if (nva->nva_ops->nv_ao_fini != NULL) - nva->nva_ops->nv_ao_fini(nva); -} - -nv_alloc_t * -nvlist_lookup_nv_alloc(nvlist_t *nvl) -{ - nvpriv_t *priv; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (NULL); - - return (priv->nvp_nva); -} - -static void * -nv_mem_zalloc(nvpriv_t *nvp, size_t size) -{ - nv_alloc_t *nva = nvp->nvp_nva; - void *buf; - - if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL) - bzero(buf, size); - - return (buf); -} - -static void -nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) -{ - nv_alloc_t *nva = nvp->nvp_nva; - - nva->nva_ops->nv_ao_free(nva, buf, size); -} - -static void -nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) -{ - bzero(priv, sizeof (nvpriv_t)); - - priv->nvp_nva = nva; - priv->nvp_stat = stat; -} - -static nvpriv_t * -nv_priv_alloc(nv_alloc_t *nva) -{ - nvpriv_t *priv; - - /* - * nv_mem_alloc() cannot called here because it needs the priv - * argument. - */ - if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL) - return (NULL); - - nv_priv_init(priv, nva, 0); - - return (priv); -} - -/* - * Embedded lists need their own nvpriv_t's. We create a new - * nvpriv_t using the parameters and allocator from the parent - * list's nvpriv_t. 
- */ -static nvpriv_t * -nv_priv_alloc_embedded(nvpriv_t *priv) -{ - nvpriv_t *emb_priv; - - if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL) - return (NULL); - - nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED); - - return (emb_priv); -} - -static int -nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) -{ - ASSERT3P(priv->nvp_hashtable, ==, NULL); - ASSERT0(priv->nvp_nbuckets); - ASSERT0(priv->nvp_nentries); - - i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *)); - if (tab == NULL) - return (ENOMEM); - - priv->nvp_hashtable = tab; - priv->nvp_nbuckets = buckets; - return (0); -} - -static void -nvt_tab_free(nvpriv_t *priv) -{ - i_nvp_t **tab = priv->nvp_hashtable; - if (tab == NULL) { - ASSERT0(priv->nvp_nbuckets); - ASSERT0(priv->nvp_nentries); - return; - } - - nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *)); - - priv->nvp_hashtable = NULL; - priv->nvp_nbuckets = 0; - priv->nvp_nentries = 0; -} - -static uint32_t -nvt_hash(const char *p) -{ - uint32_t g, hval = 0; - - while (*p) { - hval = (hval << 4) + *p++; - if ((g = (hval & 0xf0000000)) != 0) - hval ^= g >> 24; - hval &= ~g; - } - return (hval); -} - -static boolean_t -nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag) -{ - boolean_t match = B_FALSE; - if (nvflag & NV_UNIQUE_NAME_TYPE) { - if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 && - NVP_TYPE(nvp1) == NVP_TYPE(nvp2)) - match = B_TRUE; - } else { - ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME); - if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0) - match = B_TRUE; - } - return (match); -} - -static nvpair_t * -nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - ASSERT(priv != NULL); - - i_nvp_t **tab = priv->nvp_hashtable; - - if (tab == NULL) { - ASSERT3P(priv->nvp_list, ==, NULL); - ASSERT0(priv->nvp_nbuckets); - ASSERT0(priv->nvp_nentries); - return (NULL); - } else { - ASSERT(priv->nvp_nbuckets != 0); - 
} - - uint64_t hash = nvt_hash(name); - uint64_t index = hash & (priv->nvp_nbuckets - 1); - - ASSERT3U(index, <, priv->nvp_nbuckets); - i_nvp_t *entry = tab[index]; - - for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) { - if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 && - (type == DATA_TYPE_DONTCARE || - NVP_TYPE(&e->nvi_nvp) == type)) - return (&e->nvi_nvp); - } - return (NULL); -} - -static nvpair_t * -nvt_lookup_name(nvlist_t *nvl, const char *name) -{ - return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE)); -} - -static int -nvt_resize(nvpriv_t *priv, uint32_t new_size) -{ - i_nvp_t **tab = priv->nvp_hashtable; - - /* - * Migrate all the entries from the current table - * to a newly-allocated table with the new size by - * re-adjusting the pointers of their entries. - */ - uint32_t size = priv->nvp_nbuckets; - uint32_t new_mask = new_size - 1; - ASSERT(((new_size) & ((new_size) - 1)) == 0); - - i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *)); - if (new_tab == NULL) - return (ENOMEM); - - uint32_t nentries = 0; - for (uint32_t i = 0; i < size; i++) { - i_nvp_t *next, *e = tab[i]; - - while (e != NULL) { - next = e->nvi_hashtable_next; - - uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp)); - uint32_t index = hash & new_mask; - - e->nvi_hashtable_next = new_tab[index]; - new_tab[index] = e; - nentries++; - - e = next; - } - tab[i] = NULL; - } - ASSERT3U(nentries, ==, priv->nvp_nentries); - - nvt_tab_free(priv); - - priv->nvp_hashtable = new_tab; - priv->nvp_nbuckets = new_size; - priv->nvp_nentries = nentries; - - return (0); -} - -static boolean_t -nvt_needs_togrow(nvpriv_t *priv) -{ - /* - * Grow only when we have more elements than buckets - * and the # of buckets doesn't overflow. 
- */ - return (priv->nvp_nentries > priv->nvp_nbuckets && - (UINT32_MAX >> 1) >= priv->nvp_nbuckets); -} - -/* - * Allocate a new table that's twice the size of the old one, - * and migrate all the entries from the old one to the new - * one by re-adjusting their pointers. - */ -static int -nvt_grow(nvpriv_t *priv) -{ - uint32_t current_size = priv->nvp_nbuckets; - /* ensure we won't overflow */ - ASSERT3U(UINT32_MAX >> 1, >=, current_size); - return (nvt_resize(priv, current_size << 1)); -} - -static boolean_t -nvt_needs_toshrink(nvpriv_t *priv) -{ - /* - * Shrink only when the # of elements is less than or - * equal to 1/4 the # of buckets. Never shrink less than - * nvlist_hashtable_init_size. - */ - ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size); - if (priv->nvp_nbuckets == nvlist_hashtable_init_size) - return (B_FALSE); - return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2)); -} - -/* - * Allocate a new table that's half the size of the old one, - * and migrate all the entries from the old one to the new - * one by re-adjusting their pointers. 
- */ -static int -nvt_shrink(nvpriv_t *priv) -{ - uint32_t current_size = priv->nvp_nbuckets; - /* ensure we won't overflow */ - ASSERT3U(current_size, >=, nvlist_hashtable_init_size); - return (nvt_resize(priv, current_size >> 1)); -} - -static int -nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - - if (nvt_needs_toshrink(priv)) { - int err = nvt_shrink(priv); - if (err != 0) - return (err); - } - i_nvp_t **tab = priv->nvp_hashtable; - - char *name = NVP_NAME(nvp); - uint64_t hash = nvt_hash(name); - uint64_t index = hash & (priv->nvp_nbuckets - 1); - - ASSERT3U(index, <, priv->nvp_nbuckets); - i_nvp_t *bucket = tab[index]; - - for (i_nvp_t *prev = NULL, *e = bucket; - e != NULL; prev = e, e = e->nvi_hashtable_next) { - if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) { - if (prev != NULL) { - prev->nvi_hashtable_next = - e->nvi_hashtable_next; - } else { - ASSERT3P(e, ==, bucket); - tab[index] = e->nvi_hashtable_next; - } - e->nvi_hashtable_next = NULL; - priv->nvp_nentries--; - break; - } - } - - return (0); -} - -static int -nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - - /* initialize nvpair table now if it doesn't exist. */ - if (priv->nvp_hashtable == NULL) { - int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size); - if (err != 0) - return (err); - } - - /* - * if we don't allow duplicate entries, make sure to - * unlink any existing entries from the table. 
- */ - if (nvl->nvl_nvflag != 0) { - int err = nvt_remove_nvpair(nvl, nvp); - if (err != 0) - return (err); - } - - if (nvt_needs_togrow(priv)) { - int err = nvt_grow(priv); - if (err != 0) - return (err); - } - i_nvp_t **tab = priv->nvp_hashtable; - - char *name = NVP_NAME(nvp); - uint64_t hash = nvt_hash(name); - uint64_t index = hash & (priv->nvp_nbuckets - 1); - - ASSERT3U(index, <, priv->nvp_nbuckets); - i_nvp_t *bucket = tab[index]; - - /* insert link at the beginning of the bucket */ - i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); - ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); - new_entry->nvi_hashtable_next = bucket; - tab[index] = new_entry; - - priv->nvp_nentries++; - return (0); -} - -static void -nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) -{ - nvl->nvl_version = NV_VERSION; - nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE); - nvl->nvl_priv = (uint64_t)(uintptr_t)priv; - nvl->nvl_flag = 0; - nvl->nvl_pad = 0; -} - -uint_t -nvlist_nvflag(nvlist_t *nvl) -{ - return (nvl->nvl_nvflag); -} - -/* - * nvlist_alloc - Allocate nvlist. - */ -/*ARGSUSED1*/ -int -nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xalloc(nvlp, nvflag, - (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva) -{ - nvpriv_t *priv; - - if (nvlp == NULL || nva == NULL) - return (EINVAL); - - if ((priv = nv_priv_alloc(nva)) == NULL) - return (ENOMEM); - - if ((*nvlp = nv_mem_zalloc(priv, - NV_ALIGN(sizeof (nvlist_t)))) == NULL) { - nv_mem_free(priv, priv, sizeof (nvpriv_t)); - return (ENOMEM); - } - - nvlist_init(*nvlp, nvflag, priv); - - return (0); -} - -/* - * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair. 
- */ -static nvpair_t * -nvp_buf_alloc(nvlist_t *nvl, size_t len) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *buf; - nvpair_t *nvp; - size_t nvsize; - - /* - * Allocate the buffer - */ - nvsize = len + offsetof(i_nvp_t, nvi_nvp); - - if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL) - return (NULL); - - nvp = &buf->nvi_nvp; - nvp->nvp_size = len; - - return (nvp); -} - -/* - * nvp_buf_free - de-Allocate an i_nvp_t. - */ -static void -nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp); - - nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize); -} - -/* - * nvp_buf_link - link a new nv pair into the nvlist. - */ -static void -nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr = NVPAIR2I_NVP(nvp); - - /* Put element at end of nvlist */ - if (priv->nvp_list == NULL) { - priv->nvp_list = priv->nvp_last = curr; - } else { - curr->nvi_prev = priv->nvp_last; - priv->nvp_last->nvi_next = curr; - priv->nvp_last = curr; - } -} - -/* - * nvp_buf_unlink - unlink an removed nvpair out of the nvlist. - */ -static void -nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr = NVPAIR2I_NVP(nvp); - - /* - * protect nvlist_next_nvpair() against walking on freed memory. 
- */ - if (priv->nvp_curr == curr) - priv->nvp_curr = curr->nvi_next; - - if (curr == priv->nvp_list) - priv->nvp_list = curr->nvi_next; - else - curr->nvi_prev->nvi_next = curr->nvi_next; - - if (curr == priv->nvp_last) - priv->nvp_last = curr->nvi_prev; - else - curr->nvi_next->nvi_prev = curr->nvi_prev; -} - -/* - * take a nvpair type and number of elements and make sure the are valid - */ -static int -i_validate_type_nelem(data_type_t type, uint_t nelem) -{ - switch (type) { - case DATA_TYPE_BOOLEAN: - if (nelem != 0) - return (EINVAL); - break; - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_STRING: - case DATA_TYPE_HRTIME: - case DATA_TYPE_NVLIST: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - if (nelem != 1) - return (EINVAL); - break; - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_NVLIST_ARRAY: - /* we allow arrays with 0 elements */ - break; - default: - return (EINVAL); - } - return (0); -} - -/* - * Verify nvp_name_sz and check the name string length. - */ -static int -i_validate_nvpair_name(nvpair_t *nvp) -{ - if ((nvp->nvp_name_sz <= 0) || - (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0))) - return (EFAULT); - - /* verify the name string, make sure its terminated */ - if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0') - return (EFAULT); - - return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 
0 : EFAULT); -} - -static int -i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data) -{ - switch (type) { - case DATA_TYPE_BOOLEAN_VALUE: - if (*(boolean_t *)data != B_TRUE && - *(boolean_t *)data != B_FALSE) - return (EINVAL); - break; - case DATA_TYPE_BOOLEAN_ARRAY: { - int i; - - for (i = 0; i < nelem; i++) - if (((boolean_t *)data)[i] != B_TRUE && - ((boolean_t *)data)[i] != B_FALSE) - return (EINVAL); - break; - } - default: - break; - } - - return (0); -} - -/* - * This function takes a pointer to what should be a nvpair and it's size - * and then verifies that all the nvpair fields make sense and can be - * trusted. This function is used when decoding packed nvpairs. - */ -static int -i_validate_nvpair(nvpair_t *nvp) -{ - data_type_t type = NVP_TYPE(nvp); - int size1, size2; - - /* verify nvp_name_sz, check the name string length */ - if (i_validate_nvpair_name(nvp) != 0) - return (EFAULT); - - if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0) - return (EFAULT); - - /* - * verify nvp_type, nvp_value_elem, and also possibly - * verify string values and get the value size. - */ - size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp)); - size1 = nvp->nvp_size - NVP_VALOFF(nvp); - if (size2 < 0 || size1 != NV_ALIGN(size2)) - return (EFAULT); - - return (0); -} - -static int -nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL) - return (EINVAL); - - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - nvpair_t *nvp = &curr->nvi_nvp; - int err; - - if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp), - NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0) - return (err); - } - - return (0); -} - -/* - * Frees all memory allocated for an nvpair (like embedded lists) with - * the exception of the nvpair buffer itself. 
- */ -static void -nvpair_free(nvpair_t *nvp) -{ - switch (NVP_TYPE(nvp)) { - case DATA_TYPE_NVLIST: - nvlist_free(EMBEDDED_NVL(nvp)); - break; - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); - int i; - - for (i = 0; i < NVP_NELEM(nvp); i++) - nvlist_free(nvlp[i]); - break; - } - default: - break; - } -} - -/* - * nvlist_free - free an unpacked nvlist - */ -void -nvlist_free(nvlist_t *nvl) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return; - - /* - * Unpacked nvlist are linked through i_nvp_t - */ - curr = priv->nvp_list; - while (curr != NULL) { - nvpair_t *nvp = &curr->nvi_nvp; - curr = curr->nvi_next; - - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - } - - if (!(priv->nvp_stat & NV_STAT_EMBEDDED)) - nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t))); - else - nvl->nvl_priv = 0; - - nvt_tab_free(priv); - nv_mem_free(priv, priv, sizeof (nvpriv_t)); -} - -static int -nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr; - - if (nvp == NULL) - return (0); - - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) - if (&curr->nvi_nvp == nvp) - return (1); - - return (0); -} - -/* - * Make a copy of nvlist - */ -/*ARGSUSED1*/ -int -nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xdup(nvl, nvlp, - (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) -{ - int err; - nvlist_t *ret; - - if (nvl == NULL || nvlp == NULL) - return (EINVAL); - - if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0) - return (err); - - if ((err = nvlist_copy_pairs(nvl, ret)) != 0) - nvlist_free(ret); - else - *nvlp = ret; - - return (err); -} - -/* - * Remove all with matching name - */ -int -nvlist_remove_all(nvlist_t *nvl, const char *name) -{ - int error = ENOENT; - - if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - nvpair_t *nvp; - while ((nvp = nvt_lookup_name(nvl, name)) != NULL) { - VERIFY0(nvlist_remove_nvpair(nvl, nvp)); - error = 0; - } - - return (error); -} - -/* - * Remove first one with matching name and type - */ -int -nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) -{ - if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); - if (nvp == NULL) - return (ENOENT); - - return (nvlist_remove_nvpair(nvl, nvp)); -} - -int -nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - if (nvl == NULL || nvp == NULL) - return (EINVAL); - - int err = nvt_remove_nvpair(nvl, nvp); - if (err != 0) - return (err); - - nvp_buf_unlink(nvl, nvp); - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (0); -} - -/* - * This function calculates the size of an nvpair value. - * - * The data argument controls the behavior in case of the data types - * DATA_TYPE_STRING and - * DATA_TYPE_STRING_ARRAY - * Is data == NULL then the size of the string(s) is excluded. 
- */ -static int -i_get_value_size(data_type_t type, const void *data, uint_t nelem) -{ - uint64_t value_sz; - - if (i_validate_type_nelem(type, nelem) != 0) - return (-1); - - /* Calculate required size for holding value */ - switch (type) { - case DATA_TYPE_BOOLEAN: - value_sz = 0; - break; - case DATA_TYPE_BOOLEAN_VALUE: - value_sz = sizeof (boolean_t); - break; - case DATA_TYPE_BYTE: - value_sz = sizeof (uchar_t); - break; - case DATA_TYPE_INT8: - value_sz = sizeof (int8_t); - break; - case DATA_TYPE_UINT8: - value_sz = sizeof (uint8_t); - break; - case DATA_TYPE_INT16: - value_sz = sizeof (int16_t); - break; - case DATA_TYPE_UINT16: - value_sz = sizeof (uint16_t); - break; - case DATA_TYPE_INT32: - value_sz = sizeof (int32_t); - break; - case DATA_TYPE_UINT32: - value_sz = sizeof (uint32_t); - break; - case DATA_TYPE_INT64: - value_sz = sizeof (int64_t); - break; - case DATA_TYPE_UINT64: - value_sz = sizeof (uint64_t); - break; -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: - value_sz = sizeof (double); - break; -#endif - case DATA_TYPE_STRING: - if (data == NULL) - value_sz = 0; - else - value_sz = strlen(data) + 1; - break; - case DATA_TYPE_BOOLEAN_ARRAY: - value_sz = (uint64_t)nelem * sizeof (boolean_t); - break; - case DATA_TYPE_BYTE_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uchar_t); - break; - case DATA_TYPE_INT8_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int8_t); - break; - case DATA_TYPE_UINT8_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint8_t); - break; - case DATA_TYPE_INT16_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int16_t); - break; - case DATA_TYPE_UINT16_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint16_t); - break; - case DATA_TYPE_INT32_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int32_t); - break; - case DATA_TYPE_UINT32_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint32_t); - break; - case DATA_TYPE_INT64_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int64_t); - break; - case DATA_TYPE_UINT64_ARRAY: - value_sz = 
(uint64_t)nelem * sizeof (uint64_t); - break; - case DATA_TYPE_STRING_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint64_t); - - if (data != NULL) { - char *const *strs = data; - uint_t i; - - /* no alignment requirement for strings */ - for (i = 0; i < nelem; i++) { - if (strs[i] == NULL) - return (-1); - value_sz += strlen(strs[i]) + 1; - } - } - break; - case DATA_TYPE_HRTIME: - value_sz = sizeof (hrtime_t); - break; - case DATA_TYPE_NVLIST: - value_sz = NV_ALIGN(sizeof (nvlist_t)); - break; - case DATA_TYPE_NVLIST_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint64_t) + - (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t)); - break; - default: - return (-1); - } - - return (value_sz > INT32_MAX ? -1 : (int)value_sz); -} - -static int -nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl) -{ - nvpriv_t *priv; - int err; - - if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t) - nvl->nvl_priv)) == NULL) - return (ENOMEM); - - nvlist_init(emb_nvl, onvl->nvl_nvflag, priv); - - if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) { - nvlist_free(emb_nvl); - emb_nvl->nvl_priv = 0; - } - - return (err); -} - -/* - * nvlist_add_common - Add new pair to nvlist - */ -static int -nvlist_add_common(nvlist_t *nvl, const char *name, - data_type_t type, uint_t nelem, const void *data) -{ - nvpair_t *nvp; - uint_t i; - - int nvp_sz, name_sz, value_sz; - int err = 0; - - if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - if (nelem != 0 && data == NULL) - return (EINVAL); - - /* - * Verify type and nelem and get the value size. - * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY - * is the size of the string(s) included. 
- */ - if ((value_sz = i_get_value_size(type, data, nelem)) < 0) - return (EINVAL); - - if (i_validate_nvpair_value(type, nelem, data) != 0) - return (EINVAL); - - /* - * If we're adding an nvlist or nvlist array, ensure that we are not - * adding the input nvlist to itself, which would cause recursion, - * and ensure that no NULL nvlist pointers are present. - */ - switch (type) { - case DATA_TYPE_NVLIST: - if (data == nvl || data == NULL) - return (EINVAL); - break; - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **onvlp = (nvlist_t **)data; - for (i = 0; i < nelem; i++) { - if (onvlp[i] == nvl || onvlp[i] == NULL) - return (EINVAL); - } - break; - } - default: - break; - } - - /* calculate sizes of the nvpair elements and the nvpair itself */ - name_sz = strlen(name) + 1; - if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * 8 - 1)) - return (EINVAL); - - nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); - - if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) - return (ENOMEM); - - ASSERT(nvp->nvp_size == nvp_sz); - nvp->nvp_name_sz = name_sz; - nvp->nvp_value_elem = nelem; - nvp->nvp_type = type; - bcopy(name, NVP_NAME(nvp), name_sz); - - switch (type) { - case DATA_TYPE_BOOLEAN: - break; - case DATA_TYPE_STRING_ARRAY: { - char *const *strs = data; - char *buf = NVP_VALUE(nvp); - char **cstrs = (void *)buf; - - /* skip pre-allocated space for pointer array */ - buf += nelem * sizeof (uint64_t); - for (i = 0; i < nelem; i++) { - int slen = strlen(strs[i]) + 1; - bcopy(strs[i], buf, slen); - cstrs[i] = buf; - buf += slen; - } - break; - } - case DATA_TYPE_NVLIST: { - nvlist_t *nnvl = EMBEDDED_NVL(nvp); - nvlist_t *onvl = (nvlist_t *)data; - - if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { - nvp_buf_free(nvl, nvp); - return (err); - } - break; - } - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **onvlp = (nvlist_t **)data; - nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); - nvlist_t *embedded = (nvlist_t *) - ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); - - for (i = 0; i < 
nelem; i++) { - if ((err = nvlist_copy_embedded(nvl, - onvlp[i], embedded)) != 0) { - /* - * Free any successfully created lists - */ - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (err); - } - - nvlp[i] = embedded++; - } - break; - } - default: - bcopy(data, NVP_VALUE(nvp), value_sz); - } - - /* if unique name, remove before add */ - if (nvl->nvl_nvflag & NV_UNIQUE_NAME) - (void) nvlist_remove_all(nvl, name); - else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) - (void) nvlist_remove(nvl, name, type); - - err = nvt_add_nvpair(nvl, nvp); - if (err != 0) { - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (err); - } - nvp_buf_link(nvl, nvp); - - return (0); -} - -int -nvlist_add_boolean(nvlist_t *nvl, const char *name) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); -} - -int -nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); -} - -int -nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); -} - -int -nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); -} - -int -nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); -} - -int -nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); -} - -int -nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); -} - -int -nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); -} - -int -nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); -} - -int 
-nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); -} - -int -nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); -} - -#if !defined(_KERNEL) -int -nvlist_add_double(nvlist_t *nvl, const char *name, double val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); -} -#endif - -int -nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); -} - -int -nvlist_add_boolean_array(nvlist_t *nvl, const char *name, - boolean_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); -} - -int -nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); -} - -int -nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); -} - -int -nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); -} - -int -nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); -} - -int -nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); -} - -int -nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); -} - -int -nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); -} - -int -nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, 
uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); -} - -int -nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); -} - -int -nvlist_add_string_array(nvlist_t *nvl, const char *name, - char *const *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); -} - -int -nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); -} - -int -nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); -} - -int -nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); -} - -/* reading name-value pairs */ -nvpair_t * -nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (NULL); - - curr = NVPAIR2I_NVP(nvp); - - /* - * Ensure that nvp is a valid nvpair on this nvlist. - * NB: nvp_curr is used only as a hint so that we don't always - * have to walk the list to determine if nvp is still on the list. - */ - if (nvp == NULL) - curr = priv->nvp_list; - else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) - curr = curr->nvi_next; - else - curr = NULL; - - priv->nvp_curr = curr; - - return (curr != NULL ? 
&curr->nvi_nvp : NULL); -} - -nvpair_t * -nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (NULL); - - curr = NVPAIR2I_NVP(nvp); - - if (nvp == NULL) - curr = priv->nvp_last; - else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) - curr = curr->nvi_prev; - else - curr = NULL; - - priv->nvp_curr = curr; - - return (curr != NULL ? &curr->nvi_nvp : NULL); -} - -boolean_t -nvlist_empty(nvlist_t *nvl) -{ - nvpriv_t *priv; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (B_TRUE); - - return (priv->nvp_list == NULL); -} - -char * -nvpair_name(nvpair_t *nvp) -{ - return (NVP_NAME(nvp)); -} - -data_type_t -nvpair_type(nvpair_t *nvp) -{ - return (NVP_TYPE(nvp)); -} - -int -nvpair_type_is_array(nvpair_t *nvp) -{ - data_type_t type = NVP_TYPE(nvp); - - if ((type == DATA_TYPE_BYTE_ARRAY) || - (type == DATA_TYPE_INT8_ARRAY) || - (type == DATA_TYPE_UINT8_ARRAY) || - (type == DATA_TYPE_INT16_ARRAY) || - (type == DATA_TYPE_UINT16_ARRAY) || - (type == DATA_TYPE_INT32_ARRAY) || - (type == DATA_TYPE_UINT32_ARRAY) || - (type == DATA_TYPE_INT64_ARRAY) || - (type == DATA_TYPE_UINT64_ARRAY) || - (type == DATA_TYPE_BOOLEAN_ARRAY) || - (type == DATA_TYPE_STRING_ARRAY) || - (type == DATA_TYPE_NVLIST_ARRAY)) - return (1); - return (0); - -} - -static int -nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) -{ - if (nvp == NULL || nvpair_type(nvp) != type) - return (EINVAL); - - /* - * For non-array types, we copy the data. - * For array types (including string), we set a pointer. 
- */ - switch (type) { - case DATA_TYPE_BOOLEAN: - if (nelem != NULL) - *nelem = 0; - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_HRTIME: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - if (data == NULL) - return (EINVAL); - bcopy(NVP_VALUE(nvp), data, - (size_t)i_get_value_size(type, NULL, 1)); - if (nelem != NULL) - *nelem = 1; - break; - - case DATA_TYPE_NVLIST: - case DATA_TYPE_STRING: - if (data == NULL) - return (EINVAL); - *(void **)data = (void *)NVP_VALUE(nvp); - if (nelem != NULL) - *nelem = 1; - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_NVLIST_ARRAY: - if (nelem == NULL || data == NULL) - return (EINVAL); - if ((*nelem = NVP_NELEM(nvp)) != 0) - *(void **)data = (void *)NVP_VALUE(nvp); - else - *(void **)data = NULL; - break; - - default: - return (ENOTSUP); - } - - return (0); -} - -static int -nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, - uint_t *nelem, void *data) -{ - if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) - return (ENOTSUP); - - nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); - if (nvp == NULL) - return (ENOENT); - - return (nvpair_value_common(nvp, type, nelem, data)); -} - -int -nvlist_lookup_boolean(nvlist_t *nvl, const char *name) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); -} - -int -nvlist_lookup_boolean_value(nvlist_t *nvl, 
const char *name, boolean_t *val) -{ - return (nvlist_lookup_common(nvl, name, - DATA_TYPE_BOOLEAN_VALUE, NULL, val)); -} - -int -nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); -} - -int -nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); -} - -int -nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); -} - -int -nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); -} - -int -nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); -} - -int -nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); -} - -int -nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); -} - -int -nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); -} - -int -nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); -} - -#if !defined(_KERNEL) -int -nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); -} -#endif - -int -nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); -} - -int -nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); -} - 
-int -nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, - boolean_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, - DATA_TYPE_BOOLEAN_ARRAY, n, a)); -} - -int -nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, - uchar_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); -} - -int -nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); -} - -int -nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, - uint8_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); -} - -int -nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, - int16_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); -} - -int -nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, - uint16_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); -} - -int -nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, - int32_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); -} - -int -nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, - uint32_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); -} - -int -nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, - int64_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); -} - -int -nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, - uint64_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); -} - -int -nvlist_lookup_string_array(nvlist_t *nvl, const char *name, - char ***a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); -} - -int -nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name, - nvlist_t ***a, uint_t *n) -{ 
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); -} - -int -nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val)); -} - -int -nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) -{ - va_list ap; - char *name; - int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0); - int ret = 0; - - va_start(ap, flag); - while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { - data_type_t type; - void *val; - uint_t *nelem; - - switch (type = va_arg(ap, data_type_t)) { - case DATA_TYPE_BOOLEAN: - ret = nvlist_lookup_common(nvl, name, type, NULL, NULL); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_HRTIME: - case DATA_TYPE_STRING: - case DATA_TYPE_NVLIST: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - val = va_arg(ap, void *); - ret = nvlist_lookup_common(nvl, name, type, NULL, val); - break; - - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_NVLIST_ARRAY: - val = va_arg(ap, void *); - nelem = va_arg(ap, uint_t *); - ret = nvlist_lookup_common(nvl, name, type, nelem, val); - break; - - default: - ret = EINVAL; - } - - if (ret == ENOENT && noentok) - ret = 0; - } - va_end(ap); - - return (ret); -} - -/* - * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function - * returns zero and a pointer to the matching nvpair is returned in '*ret' - * (given 'ret' is non-NULL). 
If 'sep' is specified then 'name' will penitrate - * multiple levels of embedded nvlists, with 'sep' as the separator. As an - * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or - * "a.d[3].e[1]". This matches the C syntax for array embed (for convience, - * code also supports "a.d[3]e[1]" syntax). - * - * If 'ip' is non-NULL and the last name component is an array, return the - * value of the "...[index]" array index in *ip. For an array reference that - * is not indexed, *ip will be returned as -1. If there is a syntax error in - * 'name', and 'ep' is non-NULL then *ep will be set to point to the location - * inside the 'name' string where the syntax error was detected. - */ -static int -nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, - nvpair_t **ret, int *ip, char **ep) -{ - nvpair_t *nvp; - const char *np; - char *sepp; - char *idxp, *idxep; - nvlist_t **nva; - long idx; - int n; - - if (ip) - *ip = -1; /* not indexed */ - if (ep) - *ep = NULL; - - if ((nvl == NULL) || (name == NULL)) - return (EINVAL); - - sepp = NULL; - idx = 0; - /* step through components of name */ - for (np = name; np && *np; np = sepp) { - /* ensure unique names */ - if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME)) - return (ENOTSUP); - - /* skip white space */ - skip_whitespace(np); - if (*np == 0) - break; - - /* set 'sepp' to end of current component 'np' */ - if (sep) - sepp = strchr(np, sep); - else - sepp = NULL; - - /* find start of next "[ index ]..." */ - idxp = strchr(np, '['); - - /* if sepp comes first, set idxp to NULL */ - if (sepp && idxp && (sepp < idxp)) - idxp = NULL; - - /* - * At this point 'idxp' is set if there is an index - * expected for the current component. 
- */ - if (idxp) { - /* set 'n' to length of current 'np' name component */ - n = idxp++ - np; - - /* keep sepp up to date for *ep use as we advance */ - skip_whitespace(idxp); - sepp = idxp; - - /* determine the index value */ -#if defined(_KERNEL) && !defined(_BOOT) - if (ddi_strtol(idxp, &idxep, 0, &idx)) - goto fail; -#else - idx = strtol(idxp, &idxep, 0); -#endif - if (idxep == idxp) - goto fail; - - /* keep sepp up to date for *ep use as we advance */ - sepp = idxep; - - /* skip white space index value and check for ']' */ - skip_whitespace(sepp); - if (*sepp++ != ']') - goto fail; - - /* for embedded arrays, support C syntax: "a[1].b" */ - skip_whitespace(sepp); - if (sep && (*sepp == sep)) - sepp++; - } else if (sepp) { - n = sepp++ - np; - } else { - n = strlen(np); - } - - /* trim trailing whitespace by reducing length of 'np' */ - if (n == 0) - goto fail; - for (n--; (np[n] == ' ') || (np[n] == '\t'); n--) - ; - n++; - - /* skip whitespace, and set sepp to NULL if complete */ - if (sepp) { - skip_whitespace(sepp); - if (*sepp == 0) - sepp = NULL; - } - - /* - * At this point: - * o 'n' is the length of current 'np' component. - * o 'idxp' is set if there was an index, and value 'idx'. - * o 'sepp' is set to the beginning of the next component, - * and set to NULL if we have no more components. - * - * Search for nvpair with matching component name. - */ - for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; - nvp = nvlist_next_nvpair(nvl, nvp)) { - - /* continue if no match on name */ - if (strncmp(np, nvpair_name(nvp), n) || - (strlen(nvpair_name(nvp)) != n)) - continue; - - /* if indexed, verify type is array oriented */ - if (idxp && !nvpair_type_is_array(nvp)) - goto fail; - - /* - * Full match found, return nvp and idx if this - * was the last component. 
- */ - if (sepp == NULL) { - if (ret) - *ret = nvp; - if (ip && idxp) - *ip = (int)idx; /* return index */ - return (0); /* found */ - } - - /* - * More components: current match must be - * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY - * to support going deeper. - */ - if (nvpair_type(nvp) == DATA_TYPE_NVLIST) { - nvl = EMBEDDED_NVL(nvp); - break; - } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) { - (void) nvpair_value_nvlist_array(nvp, - &nva, (uint_t *)&n); - if ((n < 0) || (idx >= n)) - goto fail; - nvl = nva[idx]; - break; - } - - /* type does not support more levels */ - goto fail; - } - if (nvp == NULL) - goto fail; /* 'name' not found */ - - /* search for match of next component in embedded 'nvl' list */ - } - -fail: if (ep && sepp) - *ep = sepp; - return (EINVAL); -} - -/* - * Return pointer to nvpair with specified 'name'. - */ -int -nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret) -{ - return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL)); -} - -/* - * Determine if named nvpair exists in nvlist (use embedded separator of '.' - * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed - * description. 
- */ -int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, - const char *name, nvpair_t **ret, int *ip, char **ep) -{ - return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep)); -} - -boolean_t -nvlist_exists(nvlist_t *nvl, const char *name) -{ - nvpriv_t *priv; - nvpair_t *nvp; - i_nvp_t *curr; - - if (name == NULL || nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (B_FALSE); - - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - nvp = &curr->nvi_nvp; - - if (strcmp(name, NVP_NAME(nvp)) == 0) - return (B_TRUE); - } - - return (B_FALSE); -} - -int -nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); -} - -int -nvpair_value_byte(nvpair_t *nvp, uchar_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val)); -} - -int -nvpair_value_int8(nvpair_t *nvp, int8_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val)); -} - -int -nvpair_value_uint8(nvpair_t *nvp, uint8_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val)); -} - -int -nvpair_value_int16(nvpair_t *nvp, int16_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val)); -} - -int -nvpair_value_uint16(nvpair_t *nvp, uint16_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val)); -} - -int -nvpair_value_int32(nvpair_t *nvp, int32_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val)); -} - -int -nvpair_value_uint32(nvpair_t *nvp, uint32_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val)); -} - -int -nvpair_value_int64(nvpair_t *nvp, int64_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val)); -} - -int -nvpair_value_uint64(nvpair_t *nvp, uint64_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); -} - -#if !defined(_KERNEL) -int -nvpair_value_double(nvpair_t *nvp, double *val) -{ - return 
(nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val)); -} -#endif - -int -nvpair_value_string(nvpair_t *nvp, char **val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val)); -} - -int -nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val)); -} - -int -nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val)); -} - -int -nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val)); -} - -int -nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val)); -} - -int -nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val)); -} - -int -nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val)); -} - -int -nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val)); -} - -int -nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val)); -} - -int -nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val)); -} - -int -nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val)); -} - -int -nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); -} - -int -nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) -{ - return 
(nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); -} - -int -nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); -} - -int -nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); -} - -/* - * Add specified pair to the list. - */ -int -nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - if (nvl == NULL || nvp == NULL) - return (EINVAL); - - return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), - NVP_NELEM(nvp), NVP_VALUE(nvp))); -} - -/* - * Merge the supplied nvlists and put the result in dst. - * The merged list will contain all names specified in both lists, - * the values are taken from nvl in the case of duplicates. - * Return 0 on success. - */ -/*ARGSUSED*/ -int -nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) -{ - if (nvl == NULL || dst == NULL) - return (EINVAL); - - if (dst != nvl) - return (nvlist_copy_pairs(nvl, dst)); - - return (0); -} - -/* - * Encoding related routines - */ -#define NVS_OP_ENCODE 0 -#define NVS_OP_DECODE 1 -#define NVS_OP_GETSIZE 2 - -typedef struct nvs_ops nvs_ops_t; - -typedef struct { - int nvs_op; - const nvs_ops_t *nvs_ops; - void *nvs_private; - nvpriv_t *nvs_priv; - int nvs_recursion; -} nvstream_t; - -/* - * nvs operations are: - * - nvs_nvlist - * encoding / decoding of a nvlist header (nvlist_t) - * calculates the size used for header and end detection - * - * - nvs_nvpair - * responsible for the first part of encoding / decoding of an nvpair - * calculates the decoded size of an nvpair - * - * - nvs_nvp_op - * second part of encoding / decoding of an nvpair - * - * - nvs_nvp_size - * calculates the encoding size of an nvpair - * - * - nvs_nvl_fini - * encodes the end detection mark (zeros). 
- */ -struct nvs_ops { - int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); - int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); - int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); - int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); - int (*nvs_nvl_fini)(nvstream_t *); -}; - -typedef struct { - char nvh_encoding; /* nvs encoding method */ - char nvh_endian; /* nvs endian */ - char nvh_reserved1; /* reserved for future use */ - char nvh_reserved2; /* reserved for future use */ -} nvs_header_t; - -static int -nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr; - - /* - * Walk nvpair in list and encode each nvpair - */ - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) - if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) - return (EFAULT); - - return (nvs->nvs_ops->nvs_nvl_fini(nvs)); -} - -static int -nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) -{ - nvpair_t *nvp; - size_t nvsize; - int err; - - /* - * Get decoded size of next pair in stream, alloc - * memory for nvpair_t, then decode the nvpair - */ - while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { - if (nvsize == 0) /* end of list */ - break; - - /* make sure len makes sense */ - if (nvsize < NVP_SIZE_CALC(1, 0)) - return (EFAULT); - - if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) - return (ENOMEM); - - if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { - nvp_buf_free(nvl, nvp); - return (err); - } - - if (i_validate_nvpair(nvp) != 0) { - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (EFAULT); - } - - err = nvt_add_nvpair(nvl, nvp); - if (err != 0) { - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (err); - } - nvp_buf_link(nvl, nvp); - } - return (err); -} - -static int -nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr; - uint64_t nvsize = *buflen; - size_t size; - - /* 
- * Get encoded size of nvpairs in nvlist - */ - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) - return (EINVAL); - - if ((nvsize += size) > INT32_MAX) - return (EINVAL); - } - - *buflen = nvsize; - return (0); -} - -static int -nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) -{ - int err; - - if (nvl->nvl_priv == 0) - return (EFAULT); - - /* - * Perform the operation, starting with header, then each nvpair - */ - if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) - return (err); - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - err = nvs_encode_pairs(nvs, nvl); - break; - - case NVS_OP_DECODE: - err = nvs_decode_pairs(nvs, nvl); - break; - - case NVS_OP_GETSIZE: - err = nvs_getsize_pairs(nvs, nvl, buflen); - break; - - default: - err = EINVAL; - } - - return (err); -} - -static int -nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: { - int err; - - if (nvs->nvs_recursion >= nvpair_max_recursion) - return (EINVAL); - nvs->nvs_recursion++; - err = nvs_operation(nvs, embedded, NULL); - nvs->nvs_recursion--; - return (err); - } - case NVS_OP_DECODE: { - nvpriv_t *priv; - int err; - - if (embedded->nvl_version != NV_VERSION) - return (ENOTSUP); - - if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) - return (ENOMEM); - - nvlist_init(embedded, embedded->nvl_nvflag, priv); - - if (nvs->nvs_recursion >= nvpair_max_recursion) { - nvlist_free(embedded); - return (EINVAL); - } - nvs->nvs_recursion++; - if ((err = nvs_operation(nvs, embedded, NULL)) != 0) - nvlist_free(embedded); - nvs->nvs_recursion--; - return (err); - } - default: - break; - } - - return (EINVAL); -} - -static int -nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - size_t nelem = NVP_NELEM(nvp); - nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); - int i; - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - for (i = 0; i < 
nelem; i++) - if (nvs_embedded(nvs, nvlp[i]) != 0) - return (EFAULT); - break; - - case NVS_OP_DECODE: { - size_t len = nelem * sizeof (uint64_t); - nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); - - bzero(nvlp, len); /* don't trust packed data */ - for (i = 0; i < nelem; i++) { - if (nvs_embedded(nvs, embedded) != 0) { - nvpair_free(nvp); - return (EFAULT); - } - - nvlp[i] = embedded++; - } - break; - } - case NVS_OP_GETSIZE: { - uint64_t nvsize = 0; - - for (i = 0; i < nelem; i++) { - size_t nvp_sz = 0; - - if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) - return (EINVAL); - - if ((nvsize += nvp_sz) > INT32_MAX) - return (EINVAL); - } - - *size = nvsize; - break; - } - default: - return (EINVAL); - } - - return (0); -} - -static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); -static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); - -/* - * Common routine for nvlist operations: - * encode, decode, getsize (encoded size). - */ -static int -nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, - int nvs_op) -{ - int err = 0; - nvstream_t nvs; - int nvl_endian; -#if BYTE_ORDER == _LITTLE_ENDIAN - int host_endian = 1; -#else - int host_endian = 0; -#endif /* _LITTLE_ENDIAN */ - nvs_header_t *nvh = (void *)buf; - - if (buflen == NULL || nvl == NULL || - (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (EINVAL); - - nvs.nvs_op = nvs_op; - nvs.nvs_recursion = 0; - - /* - * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and - * a buffer is allocated. The first 4 bytes in the buffer are - * used for encoding method and host endian. 
- */ - switch (nvs_op) { - case NVS_OP_ENCODE: - if (buf == NULL || *buflen < sizeof (nvs_header_t)) - return (EINVAL); - - nvh->nvh_encoding = encoding; - nvh->nvh_endian = nvl_endian = host_endian; - nvh->nvh_reserved1 = 0; - nvh->nvh_reserved2 = 0; - break; - - case NVS_OP_DECODE: - if (buf == NULL || *buflen < sizeof (nvs_header_t)) - return (EINVAL); - - /* get method of encoding from first byte */ - encoding = nvh->nvh_encoding; - nvl_endian = nvh->nvh_endian; - break; - - case NVS_OP_GETSIZE: - nvl_endian = host_endian; - - /* - * add the size for encoding - */ - *buflen = sizeof (nvs_header_t); - break; - - default: - return (ENOTSUP); - } - - /* - * Create an nvstream with proper encoding method - */ - switch (encoding) { - case NV_ENCODE_NATIVE: - /* - * check endianness, in case we are unpacking - * from a file - */ - if (nvl_endian != host_endian) - return (ENOTSUP); - err = nvs_native(&nvs, nvl, buf, buflen); - break; - case NV_ENCODE_XDR: - err = nvs_xdr(&nvs, nvl, buf, buflen); - break; - default: - err = ENOTSUP; - break; - } - - return (err); -} - -int -nvlist_size(nvlist_t *nvl, size_t *size, int encoding) -{ - return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); -} - -/* - * Pack nvlist into contiguous memory - */ -/*ARGSUSED1*/ -int -nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xpack(nvl, bufp, buflen, encoding, - (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - nv_alloc_t *nva) -{ - nvpriv_t nvpriv; - size_t alloc_size; - char *buf; - int err; - - if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) - return (EINVAL); - - if (*bufp != NULL) - return (nvlist_common(nvl, *bufp, buflen, encoding, - NVS_OP_ENCODE)); - - /* - * Here is a difficult situation: - * 1. The nvlist has fixed allocator properties. - * All other nvlist routines (like nvlist_add_*, ...) use - * these properties. - * 2. When using nvlist_pack() the user can specify their own - * allocator properties (e.g. by using KM_NOSLEEP). - * - * We use the user specified properties (2). A clearer solution - * will be to remove the kmflag from nvlist_pack(), but we will - * not change the interface. - */ - nv_priv_init(&nvpriv, nva, 0); - - if ((err = nvlist_size(nvl, &alloc_size, encoding))) - return (err); - - if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) - return (ENOMEM); - - if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, - NVS_OP_ENCODE)) != 0) { - nv_mem_free(&nvpriv, buf, alloc_size); - } else { - *buflen = alloc_size; - *bufp = buf; - } - - return (err); -} - -/* - * Unpack buf into an nvlist_t - */ -/*ARGSUSED1*/ -int -nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xunpack(buf, buflen, nvlp, - (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) -{ - nvlist_t *nvl; - int err; - - if (nvlp == NULL) - return (EINVAL); - - if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) - return (err); - - if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0) - nvlist_free(nvl); - else - *nvlp = nvl; - - return (err); -} - -/* - * Native encoding functions - */ -typedef struct { - /* - * This structure is used when decoding a packed nvpair in - * the native format. n_base points to a buffer containing the - * packed nvpair. n_end is a pointer to the end of the buffer. - * (n_end actually points to the first byte past the end of the - * buffer.) n_curr is a pointer that lies between n_base and n_end. - * It points to the current data that we are decoding. - * The amount of data left in the buffer is equal to n_end - n_curr. - * n_flag is used to recognize a packed embedded list. 
- */ - caddr_t n_base; - caddr_t n_end; - caddr_t n_curr; - uint_t n_flag; -} nvs_native_t; - -static int -nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, - size_t buflen) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - nvs->nvs_private = native; - native->n_curr = native->n_base = buf; - native->n_end = buf + buflen; - native->n_flag = 0; - return (0); - - case NVS_OP_GETSIZE: - nvs->nvs_private = native; - native->n_curr = native->n_base = native->n_end = NULL; - native->n_flag = 0; - return (0); - default: - return (EINVAL); - } -} - -/*ARGSUSED*/ -static void -nvs_native_destroy(nvstream_t *nvs) -{ -} - -static int -native_cp(nvstream_t *nvs, void *buf, size_t size) -{ - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - - if (native->n_curr + size > native->n_end) - return (EFAULT); - - /* - * The bcopy() below eliminates alignment requirement - * on the buffer (stream) and is preferred over direct access. - */ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - bcopy(buf, native->n_curr, size); - break; - case NVS_OP_DECODE: - bcopy(native->n_curr, buf, size); - break; - default: - return (EINVAL); - } - - native->n_curr += size; - return (0); -} - -/* - * operate on nvlist_t header - */ -static int -nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) -{ - nvs_native_t *native = nvs->nvs_private; - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - if (native->n_flag) - return (0); /* packed embedded list */ - - native->n_flag = 1; - - /* copy version and nvflag of the nvlist_t */ - if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || - native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) - return (EFAULT); - - return (0); - - case NVS_OP_GETSIZE: - /* - * if calculate for packed embedded list - * 4 for end of the embedded list - * else - * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag - * and 4 for end of the entire list - */ - if (native->n_flag) { - 
*size += 4; - } else { - native->n_flag = 1; - *size += 2 * sizeof (int32_t) + 4; - } - - return (0); - - default: - return (EINVAL); - } -} - -static int -nvs_native_nvl_fini(nvstream_t *nvs) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - /* - * Add 4 zero bytes at end of nvlist. They are used - * for end detection by the decode routine. - */ - if (native->n_curr + sizeof (int) > native->n_end) - return (EFAULT); - - bzero(native->n_curr, sizeof (int)); - native->n_curr += sizeof (int); - } - - return (0); -} - -static int -nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - char *packed = (void *) - (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); - /* - * Null out the pointer that is meaningless in the packed - * structure. The address may not be aligned, so we have - * to use bzero. - */ - bzero(packed + offsetof(nvlist_t, nvl_priv), - sizeof(((nvlist_t *)NULL)->nvl_priv)); - } - - return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); -} - -static int -nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); - size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); - int i; - /* - * Null out pointers that are meaningless in the packed - * structure. The addresses may not be aligned, so we have - * to use bzero. - */ - bzero(value, len); - - value += len; - for (i = 0; i < NVP_NELEM(nvp); i++) { - /* - * Null out the pointer that is meaningless in the - * packed structure. The address may not be aligned, - * so we have to use bzero. 
- */ - bzero(value + offsetof(nvlist_t, nvl_priv), - sizeof(((nvlist_t *)NULL)->nvl_priv)); - value += sizeof(nvlist_t); - } - } - - return (nvs_embedded_nvl_array(nvs, nvp, NULL)); -} - -static void -nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - uint64_t *strp = (void *) - (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); - /* - * Null out pointers that are meaningless in the packed - * structure. The addresses may not be aligned, so we have - * to use bzero. - */ - bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t)); - break; - } - case NVS_OP_DECODE: { - char **strp = (void *)NVP_VALUE(nvp); - char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); - int i; - - for (i = 0; i < NVP_NELEM(nvp); i++) { - strp[i] = buf; - buf += strlen(buf) + 1; - } - break; - } - } -} - -static int -nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) -{ - data_type_t type; - int value_sz; - int ret = 0; - - /* - * We do the initial bcopy of the data before we look at - * the nvpair type, because when we're decoding, we won't - * have the correct values for the pair until we do the bcopy. - */ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - if (native_cp(nvs, nvp, nvp->nvp_size) != 0) - return (EFAULT); - break; - default: - return (EINVAL); - } - - /* verify nvp_name_sz, check the name string length */ - if (i_validate_nvpair_name(nvp) != 0) - return (EFAULT); - - type = NVP_TYPE(nvp); - - /* - * Verify type and nelem and get the value size. - * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY - * is the size of the string(s) excluded. 
- */ - if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) - return (EFAULT); - - if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) - return (EFAULT); - - switch (type) { - case DATA_TYPE_NVLIST: - ret = nvpair_native_embedded(nvs, nvp); - break; - case DATA_TYPE_NVLIST_ARRAY: - ret = nvpair_native_embedded_array(nvs, nvp); - break; - case DATA_TYPE_STRING_ARRAY: - nvpair_native_string_array(nvs, nvp); - break; - default: - break; - } - - return (ret); -} - -static int -nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - uint64_t nvp_sz = nvp->nvp_size; - - switch (NVP_TYPE(nvp)) { - case DATA_TYPE_NVLIST: { - size_t nvsize = 0; - - if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) - return (EINVAL); - - nvp_sz += nvsize; - break; - } - case DATA_TYPE_NVLIST_ARRAY: { - size_t nvsize; - - if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) - return (EINVAL); - - nvp_sz += nvsize; - break; - } - default: - break; - } - - if (nvp_sz > INT32_MAX) - return (EINVAL); - - *size = nvp_sz; - - return (0); -} - -static int -nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - return (nvs_native_nvp_op(nvs, nvp)); - - case NVS_OP_DECODE: { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - int32_t decode_len; - - /* try to read the size value from the stream */ - if (native->n_curr + sizeof (int32_t) > native->n_end) - return (EFAULT); - bcopy(native->n_curr, &decode_len, sizeof (int32_t)); - - /* sanity check the size value */ - if (decode_len < 0 || - decode_len > native->n_end - native->n_curr) - return (EFAULT); - - *size = decode_len; - - /* - * If at the end of the stream then move the cursor - * forward, otherwise nvpair_native_op() will read - * the entire nvpair at the same cursor position. 
- */ - if (*size == 0) - native->n_curr += sizeof (int32_t); - break; - } - - default: - return (EINVAL); - } - - return (0); -} - -static const nvs_ops_t nvs_native_ops = { - nvs_native_nvlist, - nvs_native_nvpair, - nvs_native_nvp_op, - nvs_native_nvp_size, - nvs_native_nvl_fini -}; - -static int -nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) -{ - nvs_native_t native; - int err; - - nvs->nvs_ops = &nvs_native_ops; - - if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), - *buflen - sizeof (nvs_header_t))) != 0) - return (err); - - err = nvs_operation(nvs, nvl, buflen); - - nvs_native_destroy(nvs); - - return (err); -} - -/* - * XDR encoding functions - * - * An xdr packed nvlist is encoded as: - * - * - encoding methode and host endian (4 bytes) - * - nvl_version (4 bytes) - * - nvl_nvflag (4 bytes) - * - * - encoded nvpairs, the format of one xdr encoded nvpair is: - * - encoded size of the nvpair (4 bytes) - * - decoded size of the nvpair (4 bytes) - * - name string, (4 + sizeof(NV_ALIGN4(string)) - * a string is coded as size (4 bytes) and data - * - data type (4 bytes) - * - number of elements in the nvpair (4 bytes) - * - data - * - * - 2 zero's for end of the entire list (8 bytes) - */ -static int -nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) -{ - /* xdr data must be 4 byte aligned */ - if ((ulong_t)buf % 4 != 0) - return (EFAULT); - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); - nvs->nvs_private = xdr; - return (0); - case NVS_OP_DECODE: - xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); - nvs->nvs_private = xdr; - return (0); - case NVS_OP_GETSIZE: - nvs->nvs_private = NULL; - return (0); - default: - return (EINVAL); - } -} - -static void -nvs_xdr_destroy(nvstream_t *nvs) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - xdr_destroy((XDR *)nvs->nvs_private); - break; - default: - break; - } -} - -static 
int -nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: { - XDR *xdr = nvs->nvs_private; - - if (!xdr_int(xdr, &nvl->nvl_version) || - !xdr_u_int(xdr, &nvl->nvl_nvflag)) - return (EFAULT); - break; - } - case NVS_OP_GETSIZE: { - /* - * 2 * 4 for nvl_version + nvl_nvflag - * and 8 for end of the entire list - */ - *size += 2 * 4 + 8; - break; - } - default: - return (EINVAL); - } - return (0); -} - -static int -nvs_xdr_nvl_fini(nvstream_t *nvs) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - XDR *xdr = nvs->nvs_private; - int zero = 0; - - if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) - return (EFAULT); - } - - return (0); -} - -/* - * The format of xdr encoded nvpair is: - * encode_size, decode_size, name string, data type, nelem, data - */ -static int -nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) -{ - data_type_t type; - char *buf; - char *buf_end = (char *)nvp + nvp->nvp_size; - int value_sz; - uint_t nelem, buflen; - bool_t ret = FALSE; - XDR *xdr = nvs->nvs_private; - - ASSERT(xdr != NULL && nvp != NULL); - - /* name string */ - if ((buf = NVP_NAME(nvp)) >= buf_end) - return (EFAULT); - buflen = buf_end - buf; - - if (!xdr_string(xdr, &buf, buflen - 1)) - return (EFAULT); - nvp->nvp_name_sz = strlen(buf) + 1; - - /* type and nelem */ - if (!xdr_int(xdr, (int *)&nvp->nvp_type) || - !xdr_int(xdr, &nvp->nvp_value_elem)) - return (EFAULT); - - type = NVP_TYPE(nvp); - nelem = nvp->nvp_value_elem; - - /* - * Verify type and nelem and get the value size. - * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY - * is the size of the string(s) excluded. 
- */ - if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) - return (EFAULT); - - /* if there is no data to extract then return */ - if (nelem == 0) - return (0); - - /* value */ - if ((buf = NVP_VALUE(nvp)) >= buf_end) - return (EFAULT); - buflen = buf_end - buf; - - if (buflen < value_sz) - return (EFAULT); - - switch (type) { - case DATA_TYPE_NVLIST: - if (nvs_embedded(nvs, (void *)buf) == 0) - return (0); - break; - - case DATA_TYPE_NVLIST_ARRAY: - if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) - return (0); - break; - - case DATA_TYPE_BOOLEAN: - ret = TRUE; - break; - - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - ret = xdr_char(xdr, buf); - break; - - case DATA_TYPE_INT16: - ret = xdr_short(xdr, (void *)buf); - break; - - case DATA_TYPE_UINT16: - ret = xdr_u_short(xdr, (void *)buf); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_INT32: - ret = xdr_int(xdr, (void *)buf); - break; - - case DATA_TYPE_UINT32: - ret = xdr_u_int(xdr, (void *)buf); - break; - - case DATA_TYPE_INT64: - ret = xdr_longlong_t(xdr, (void *)buf); - break; - - case DATA_TYPE_UINT64: - ret = xdr_u_longlong_t(xdr, (void *)buf); - break; - - case DATA_TYPE_HRTIME: - /* - * NOTE: must expose the definition of hrtime_t here - */ - ret = xdr_longlong_t(xdr, (void *)buf); - break; -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: - ret = xdr_double(xdr, (void *)buf); - break; -#endif - case DATA_TYPE_STRING: - ret = xdr_string(xdr, &buf, buflen - 1); - break; - - case DATA_TYPE_BYTE_ARRAY: - ret = xdr_opaque(xdr, buf, nelem); - break; - - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), - (xdrproc_t)xdr_char); - break; - - case DATA_TYPE_INT16_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), - sizeof (int16_t), (xdrproc_t)xdr_short); - break; - - case DATA_TYPE_UINT16_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), - sizeof (uint16_t), 
(xdrproc_t)xdr_u_short); - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_INT32_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), - sizeof (int32_t), (xdrproc_t)xdr_int); - break; - - case DATA_TYPE_UINT32_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), - sizeof (uint32_t), (xdrproc_t)xdr_u_int); - break; - - case DATA_TYPE_INT64_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), - sizeof (int64_t), (xdrproc_t)xdr_longlong_t); - break; - - case DATA_TYPE_UINT64_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), - sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); - break; - - case DATA_TYPE_STRING_ARRAY: { - size_t len = nelem * sizeof (uint64_t); - char **strp = (void *)buf; - int i; - - if (nvs->nvs_op == NVS_OP_DECODE) - bzero(buf, len); /* don't trust packed data */ - - for (i = 0; i < nelem; i++) { - if (buflen <= len) - return (EFAULT); - - buf += len; - buflen -= len; - - if (xdr_string(xdr, &buf, buflen - 1) != TRUE) - return (EFAULT); - - if (nvs->nvs_op == NVS_OP_DECODE) - strp[i] = buf; - len = strlen(buf) + 1; - } - ret = TRUE; - break; - } - default: - break; - } - - return (ret == TRUE ? 
0 : EFAULT); -} - -static int -nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - data_type_t type = NVP_TYPE(nvp); - /* - * encode_size + decode_size + name string size + data type + nelem - * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) - */ - uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; - - switch (type) { - case DATA_TYPE_BOOLEAN: - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - nvp_sz += 4; /* 4 is the minimum xdr unit */ - break; - - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_HRTIME: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - nvp_sz += 8; - break; - - case DATA_TYPE_STRING: - nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); - break; - - case DATA_TYPE_BYTE_ARRAY: - nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); - break; - - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); - break; - - case DATA_TYPE_STRING_ARRAY: { - int i; - char **strs = (void *)NVP_VALUE(nvp); - - for (i = 0; i < NVP_NELEM(nvp); i++) - nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); - - break; - } - - case DATA_TYPE_NVLIST: - case DATA_TYPE_NVLIST_ARRAY: { - size_t nvsize = 0; - int old_nvs_op = nvs->nvs_op; - int err; - - nvs->nvs_op = NVS_OP_GETSIZE; - if (type == DATA_TYPE_NVLIST) - err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); - else - err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); - nvs->nvs_op = old_nvs_op; - - if (err != 0) - return (EINVAL); - - nvp_sz += nvsize; - break; - } - - default: - return 
(EINVAL); - } - - if (nvp_sz > INT32_MAX) - return (EINVAL); - - *size = nvp_sz; - - return (0); -} - - -/* - * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates - * the largest nvpair that could be encoded in the buffer. - * - * See comments above nvpair_xdr_op() for the format of xdr encoding. - * The size of a xdr packed nvpair without any data is 5 words. - * - * Using the size of the data directly as an estimate would be ok - * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY - * then the actual nvpair has space for an array of pointers to index - * the strings. These pointers are not encoded into the packed xdr buffer. - * - * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are - * of length 0, then each string is endcoded in xdr format as a single word. - * Therefore when expanded to an nvpair there will be 2.25 word used for - * each string. (a int64_t allocated for pointer usage, and a single char - * for the null termination.) - * - * This is the calculation performed by the NVS_XDR_MAX_LEN macro. - */ -#define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) -#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? 
\ - 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) -#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ - (NVS_XDR_DATA_LEN(x) * 2) + \ - NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) - -static int -nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - XDR *xdr = nvs->nvs_private; - int32_t encode_len, decode_len; - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: { - size_t nvsize; - - if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) - return (EFAULT); - - decode_len = nvp->nvp_size; - encode_len = nvsize; - if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) - return (EFAULT); - - return (nvs_xdr_nvp_op(nvs, nvp)); - } - case NVS_OP_DECODE: { - struct xdr_bytesrec bytesrec; - - /* get the encode and decode size */ - if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) - return (EFAULT); - *size = decode_len; - - /* are we at the end of the stream? */ - if (*size == 0) - return (0); - - /* sanity check the size parameter */ - if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) - return (EFAULT); - - if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) - return (EFAULT); - break; - } - - default: - return (EINVAL); - } - return (0); -} - -static const struct nvs_ops nvs_xdr_ops = { - nvs_xdr_nvlist, - nvs_xdr_nvpair, - nvs_xdr_nvp_op, - nvs_xdr_nvp_size, - nvs_xdr_nvl_fini -}; - -static int -nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) -{ - XDR xdr; - int err; - - nvs->nvs_ops = &nvs_xdr_ops; - - if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), - *buflen - sizeof (nvs_header_t))) != 0) - return (err); - - err = nvs_operation(nvs, nvl, buflen); - - nvs_xdr_destroy(nvs); - - return (err); -} diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c deleted file mode 100644 index 620171e4ca4e..000000000000 --- a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c +++ /dev/null @@ -1,118 
+0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#if defined(_KERNEL) && !defined(_BOOT) -#include -#else -#include -#include -#endif - -/* - * This allocator is very simple. - * - it uses a pre-allocated buffer for memory allocations. - * - it does _not_ free memory in the pre-allocated buffer. - * - * The reason for the selected implemention is simplicity. - * This allocator is designed for the usage in interrupt context when - * the caller may not wait for free memory. - */ - -/* pre-allocated buffer for memory allocations */ -typedef struct nvbuf { - uintptr_t nvb_buf; /* address of pre-allocated buffer */ - uintptr_t nvb_lim; /* limit address in the buffer */ - uintptr_t nvb_cur; /* current address in the buffer */ -} nvbuf_t; - -/* - * Initialize the pre-allocated buffer allocator. The caller needs to supply - * - * buf address of pre-allocated buffer - * bufsz size of pre-allocated buffer - * - * nv_fixed_init() calculates the remaining members of nvbuf_t. 
- */ -static int -nv_fixed_init(nv_alloc_t *nva, va_list valist) -{ - uintptr_t base = va_arg(valist, uintptr_t); - uintptr_t lim = base + va_arg(valist, size_t); - nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t)); - - if (base == 0 || (uintptr_t)&nvb[1] > lim) - return (EINVAL); - - nvb->nvb_buf = (uintptr_t)&nvb[0]; - nvb->nvb_cur = (uintptr_t)&nvb[1]; - nvb->nvb_lim = lim; - nva->nva_arg = nvb; - - return (0); -} - -static void * -nv_fixed_alloc(nv_alloc_t *nva, size_t size) -{ - nvbuf_t *nvb = nva->nva_arg; - uintptr_t new = nvb->nvb_cur; - - if (size == 0 || new + size > nvb->nvb_lim) - return (NULL); - - nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t)); - - return ((void *)new); -} - -/*ARGSUSED*/ -static void -nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size) -{ - /* don't free memory in the pre-allocated buffer */ -} - -static void -nv_fixed_reset(nv_alloc_t *nva) -{ - nvbuf_t *nvb = nva->nva_arg; - - nvb->nvb_cur = (uintptr_t)&nvb[1]; -} - -const nv_alloc_ops_t nv_fixed_ops_def = { - nv_fixed_init, /* nv_ao_init() */ - NULL, /* nv_ao_fini() */ - nv_fixed_alloc, /* nv_ao_alloc() */ - nv_fixed_free, /* nv_ao_free() */ - nv_fixed_reset /* nv_ao_reset() */ -}; - -const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def; diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c deleted file mode 100644 index ba79eeaaefea..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ /dev/null @@ -1,310 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. - */ - -#ifdef _KERNEL -#include -#else -#include -#include -#endif -#include -#include -#include -#include "zfeature_common.h" - -/* - * Set to disable all feature checks while opening pools, allowing pools with - * unsupported features to be opened. Set for testing only. - */ -boolean_t zfeature_checks_disable = B_FALSE; - -zfeature_info_t spa_feature_table[SPA_FEATURES]; - -/* - * Valid characters for feature guids. This list is mainly for aesthetic - * purposes and could be expanded in the future. There are different allowed - * characters in the guids reverse dns portion (before the colon) and its - * short name (after the colon). - */ -static int -valid_char(char c, boolean_t after_colon) -{ - return ((c >= 'a' && c <= 'z') || - (c >= '0' && c <= '9') || - (after_colon && c == '_') || - (!after_colon && (c == '.' || c == '-'))); -} - -/* - * Every feature guid must contain exactly one colon which separates a reverse - * dns organization name from the feature's "short" name (e.g. - * "com.company:feature_name"). 
- */ -boolean_t -zfeature_is_valid_guid(const char *name) -{ - int i; - boolean_t has_colon = B_FALSE; - - i = 0; - while (name[i] != '\0') { - char c = name[i++]; - if (c == ':') { - if (has_colon) - return (B_FALSE); - has_colon = B_TRUE; - continue; - } - if (!valid_char(c, has_colon)) - return (B_FALSE); - } - - return (has_colon); -} - -boolean_t -zfeature_is_supported(const char *guid) -{ - if (zfeature_checks_disable) - return (B_TRUE); - - for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t *feature = &spa_feature_table[i]; - if (strcmp(guid, feature->fi_guid) == 0) - return (B_TRUE); - } - return (B_FALSE); -} - -int -zfeature_lookup_name(const char *name, spa_feature_t *res) -{ - for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t *feature = &spa_feature_table[i]; - if (strcmp(name, feature->fi_uname) == 0) { - if (res != NULL) - *res = i; - return (0); - } - } - - return (ENOENT); -} - -boolean_t -zfeature_depends_on(spa_feature_t fid, spa_feature_t check) -{ - zfeature_info_t *feature = &spa_feature_table[fid]; - - for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { - if (feature->fi_depends[i] == check) - return (B_TRUE); - } - return (B_FALSE); -} - -static void -zfeature_register(spa_feature_t fid, const char *guid, const char *name, - const char *desc, zfeature_flags_t flags, const spa_feature_t *deps) -{ - zfeature_info_t *feature = &spa_feature_table[fid]; - static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; - - ASSERT(name != NULL); - ASSERT(desc != NULL); - ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || - (flags & ZFEATURE_FLAG_MOS) == 0); - ASSERT3U(fid, <, SPA_FEATURES); - ASSERT(zfeature_is_valid_guid(guid)); - - if (deps == NULL) - deps = nodeps; - - feature->fi_feature = fid; - feature->fi_guid = guid; - feature->fi_uname = name; - feature->fi_desc = desc; - feature->fi_flags = flags; - feature->fi_depends = deps; -} - -void -zpool_feature_init(void) -{ - 
zfeature_register(SPA_FEATURE_ASYNC_DESTROY, - "com.delphix:async_destroy", "async_destroy", - "Destroy filesystems asynchronously.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, - "com.delphix:empty_bpobj", "empty_bpobj", - "Snapshots use less space.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_LZ4_COMPRESS, - "org.illumos:lz4_compress", "lz4_compress", - "LZ4 compression algorithm support.", - ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL); - - zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, - "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", - "Crash dumps to multiple vdev pools.", - 0, NULL); - - zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, - "com.delphix:spacemap_histogram", "spacemap_histogram", - "Spacemaps maintain space histograms.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_ENABLED_TXG, - "com.delphix:enabled_txg", "enabled_txg", - "Record txg at which a feature is enabled", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, - SPA_FEATURE_NONE }; - zfeature_register(SPA_FEATURE_HOLE_BIRTH, - "com.delphix:hole_birth", "hole_birth", - "Retain hole birth txg for more precise zfs send", - ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - hole_birth_deps); - - zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, - "com.delphix:extensible_dataset", "extensible_dataset", - "Enhanced dataset functionality, used by other features.", - 0, NULL); - - static const spa_feature_t bookmarks_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_BOOKMARKS, - "com.delphix:bookmarks", "bookmarks", - "\"zfs bookmark\" command", - ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps); - - static const spa_feature_t filesystem_limits_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_FS_SS_LIMIT, - 
"com.joyent:filesystem_limits", "filesystem_limits", - "Filesystem and snapshot limits.", - ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps); - - zfeature_register(SPA_FEATURE_EMBEDDED_DATA, - "com.delphix:embedded_data", "embedded_data", - "Blocks which compress very well use even less space.", - ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - NULL); - - zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, - "com.delphix:zpool_checkpoint", "zpool_checkpoint", - "Pool state can be checkpointed, allowing rewind later.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_SPACEMAP_V2, - "com.delphix:spacemap_v2", "spacemap_v2", - "Space maps representing large segments are more efficient.", - ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - NULL); - - static const spa_feature_t large_blocks_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_LARGE_BLOCKS, - "org.open-zfs:large_blocks", "large_blocks", - "Support for blocks larger than 128KB.", - ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); - - { - static const spa_feature_t large_dnode_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_LARGE_DNODE, - "org.zfsonlinux:large_dnode", "large_dnode", - "Variable on-disk size of dnodes.", - ZFEATURE_FLAG_PER_DATASET, large_dnode_deps); - } - - static const spa_feature_t sha512_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_SHA512, - "org.illumos:sha512", "sha512", - "SHA-512/256 hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, sha512_deps); - - static const spa_feature_t skein_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_SKEIN, - "org.illumos:skein", "skein", - "Skein hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, skein_deps); - -#ifdef illumos - static const spa_feature_t edonr_deps[] = { - 
SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_EDONR, - "org.illumos:edonr", "edonr", - "Edon-R hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, edonr_deps); -#endif - - zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, - "com.delphix:device_removal", "device_removal", - "Top-level vdevs can be removed, reducing logical pool size.", - ZFEATURE_FLAG_MOS, NULL); - - static const spa_feature_t obsolete_counts_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_DEVICE_REMOVAL, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS, - "com.delphix:obsolete_counts", "obsolete_counts", - "Reduce memory used by removed devices when their blocks are " - "freed or remapped.", - ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps); - - { - zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, - "org.zfsonlinux:allocation_classes", "allocation_classes", - "Support for separate allocation classes.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - } -} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h deleted file mode 100644 index d23a4e226e2d..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _ZFEATURE_COMMON_H -#define _ZFEATURE_COMMON_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct zfeature_info; - -typedef enum spa_feature { - SPA_FEATURE_NONE = -1, - SPA_FEATURE_ASYNC_DESTROY, - SPA_FEATURE_EMPTY_BPOBJ, - SPA_FEATURE_LZ4_COMPRESS, - SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, - SPA_FEATURE_SPACEMAP_HISTOGRAM, - SPA_FEATURE_ENABLED_TXG, - SPA_FEATURE_HOLE_BIRTH, - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_EMBEDDED_DATA, - SPA_FEATURE_BOOKMARKS, - SPA_FEATURE_FS_SS_LIMIT, - SPA_FEATURE_LARGE_BLOCKS, - SPA_FEATURE_LARGE_DNODE, - SPA_FEATURE_SHA512, - SPA_FEATURE_SKEIN, -#ifdef illumos - SPA_FEATURE_EDONR, -#endif - SPA_FEATURE_DEVICE_REMOVAL, - SPA_FEATURE_OBSOLETE_COUNTS, - SPA_FEATURE_POOL_CHECKPOINT, - SPA_FEATURE_SPACEMAP_V2, - SPA_FEATURE_ALLOCATION_CLASSES, - SPA_FEATURES -} spa_feature_t; - -#define SPA_FEATURE_DISABLED (-1ULL) - -typedef enum zfeature_flags { - /* Can open pool readonly even if this feature is not supported. */ - ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), - /* Is this feature necessary to read the MOS? */ - ZFEATURE_FLAG_MOS = (1 << 1), - /* Activate this feature at the same time it is enabled. */ - ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), - /* Each dataset has a field set if it has ever used this feature. 
*/ - ZFEATURE_FLAG_PER_DATASET = (1 << 3) -} zfeature_flags_t; - -typedef struct zfeature_info { - spa_feature_t fi_feature; - const char *fi_uname; /* User-facing feature name */ - const char *fi_guid; /* On-disk feature identifier */ - const char *fi_desc; /* Feature description */ - zfeature_flags_t fi_flags; - /* array of dependencies, terminated by SPA_FEATURE_NONE */ - const spa_feature_t *fi_depends; -} zfeature_info_t; - -typedef int (zfeature_func_t)(zfeature_info_t *, void *); - -#define ZFS_FEATURE_DEBUG - -extern zfeature_info_t spa_feature_table[SPA_FEATURES]; - -extern boolean_t zfeature_is_valid_guid(const char *); - -extern boolean_t zfeature_is_supported(const char *); -extern int zfeature_lookup_name(const char *, spa_feature_t *); -extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); - -extern void zpool_feature_init(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFEATURE_COMMON_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c deleted file mode 100644 index f18d82b507b2..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -/* - * This file is intended for functions that ought to be common between user - * land (libzfs) and the kernel. When many common routines need to be shared - * then a separate file should to be created. - */ - -#if defined(_KERNEL) -#include -#else -#include -#endif - -#include -#include -#include -#include "zfs_comutil.h" - -/* - * Are there allocatable vdevs? - */ -boolean_t -zfs_allocatable_devs(nvlist_t *nv) -{ - uint64_t is_log; - uint_t c; - nvlist_t **child; - uint_t children; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - return (B_FALSE); - } - for (c = 0; c < children; c++) { - is_log = 0; - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if (!is_log) - return (B_TRUE); - } - return (B_FALSE); -} - -void -zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp) -{ - nvlist_t *policy; - nvpair_t *elem; - char *nm; - - /* Defaults */ - zlpp->zlp_rewind = ZPOOL_NO_REWIND; - zlpp->zlp_maxmeta = 0; - zlpp->zlp_maxdata = UINT64_MAX; - zlpp->zlp_txg = UINT64_MAX; - - if (nvl == NULL) - return; - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - nm = nvpair_name(elem); - if (strcmp(nm, ZPOOL_LOAD_POLICY) == 0) { - if (nvpair_value_nvlist(elem, &policy) == 0) - zpool_get_load_policy(policy, zlpp); - return; - } else if (strcmp(nm, ZPOOL_LOAD_REWIND_POLICY) == 0) { - if (nvpair_value_uint32(elem, &zlpp->zlp_rewind) == 0) - if (zlpp->zlp_rewind & ~ZPOOL_REWIND_POLICIES) - zlpp->zlp_rewind = ZPOOL_NO_REWIND; - } else if (strcmp(nm, ZPOOL_LOAD_REQUEST_TXG) == 0) { - (void) 
nvpair_value_uint64(elem, &zlpp->zlp_txg); - } else if (strcmp(nm, ZPOOL_LOAD_META_THRESH) == 0) { - (void) nvpair_value_uint64(elem, &zlpp->zlp_maxmeta); - } else if (strcmp(nm, ZPOOL_LOAD_DATA_THRESH) == 0) { - (void) nvpair_value_uint64(elem, &zlpp->zlp_maxdata); - } - } - if (zlpp->zlp_rewind == 0) - zlpp->zlp_rewind = ZPOOL_NO_REWIND; -} - -typedef struct zfs_version_spa_map { - int version_zpl; - int version_spa; -} zfs_version_spa_map_t; - -/* - * Keep this table in monotonically increasing version number order. - */ -static zfs_version_spa_map_t zfs_version_table[] = { - {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL}, - {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL}, - {ZPL_VERSION_FUID, SPA_VERSION_FUID}, - {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, - {ZPL_VERSION_SA, SPA_VERSION_SA}, - {0, 0} -}; - -/* - * Return the max zpl version for a corresponding spa version - * -1 is returned if no mapping exists. - */ -int -zfs_zpl_version_map(int spa_version) -{ - int i; - int version = -1; - - for (i = 0; zfs_version_table[i].version_spa; i++) { - if (spa_version >= zfs_version_table[i].version_spa) - version = zfs_version_table[i].version_zpl; - } - - return (version); -} - -/* - * Return the min spa version for a corresponding spa version - * -1 is returned if no mapping exists. - */ -int -zfs_spa_version_map(int zpl_version) -{ - int i; - int version = -1; - - for (i = 0; zfs_version_table[i].version_zpl; i++) { - if (zfs_version_table[i].version_zpl >= zpl_version) - return (zfs_version_table[i].version_spa); - } - - return (version); -} - -/* - * This is the table of legacy internal event names; it should not be modified. - * The internal events are now stored in the history log as strings. 
- */ -const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = { - "invalid event", - "pool create", - "vdev add", - "pool remove", - "pool destroy", - "pool export", - "pool import", - "vdev attach", - "vdev replace", - "vdev detach", - "vdev online", - "vdev offline", - "vdev upgrade", - "pool clear", - "pool scrub", - "pool property set", - "create", - "clone", - "destroy", - "destroy_begin_sync", - "inherit", - "property set", - "quota set", - "permission update", - "permission remove", - "permission who remove", - "promote", - "receive", - "rename", - "reservation set", - "replay_inc_sync", - "replay_full_sync", - "rollback", - "snapshot", - "filesystem version upgrade", - "refquota set", - "refreservation set", - "pool scrub done", - "user hold", - "user release", - "pool split", -}; diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h deleted file mode 100644 index 1c828e41e29f..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright 2019 Joyent, Inc. - */ - -#ifndef _ZFS_COMUTIL_H -#define _ZFS_COMUTIL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* Needed for ZoL errno usage in MMP kernel and user code */ -#define EREMOTEIO EREMOTE - -extern boolean_t zfs_allocatable_devs(nvlist_t *); -extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *); - -extern int zfs_zpl_version_map(int spa_version); -extern int zfs_spa_version_map(int zpl_version); -#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41 -extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_COMUTIL_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c deleted file mode 100644 index a3383f4ccf2d..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - * Copyright 2016 Igor Kozhukhov - */ - -#include - -#if defined(_KERNEL) -#include -#include -#include -#else -#include -#include -#include -#include -#include -#endif -#include -#include "zfs_prop.h" -#include "zfs_deleg.h" -#include "zfs_namecheck.h" - -zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { - {ZFS_DELEG_PERM_ALLOW}, - {ZFS_DELEG_PERM_BOOKMARK}, - {ZFS_DELEG_PERM_CLONE}, - {ZFS_DELEG_PERM_CREATE}, - {ZFS_DELEG_PERM_DESTROY}, - {ZFS_DELEG_PERM_DIFF}, - {ZFS_DELEG_PERM_MOUNT}, - {ZFS_DELEG_PERM_PROMOTE}, - {ZFS_DELEG_PERM_RECEIVE}, - {ZFS_DELEG_PERM_REMAP}, - {ZFS_DELEG_PERM_RENAME}, - {ZFS_DELEG_PERM_ROLLBACK}, - {ZFS_DELEG_PERM_SNAPSHOT}, - {ZFS_DELEG_PERM_SHARE}, - {ZFS_DELEG_PERM_SEND}, - {ZFS_DELEG_PERM_USERPROP}, - {ZFS_DELEG_PERM_USERQUOTA}, - {ZFS_DELEG_PERM_GROUPQUOTA}, - {ZFS_DELEG_PERM_USERUSED}, - {ZFS_DELEG_PERM_GROUPUSED}, - {ZFS_DELEG_PERM_HOLD}, - {ZFS_DELEG_PERM_RELEASE}, - {NULL} -}; - -static int -zfs_valid_permission_name(const char *perm) -{ - if (zfs_deleg_canonicalize_perm(perm)) - return (0); - - return (permset_namecheck(perm, NULL, NULL)); -} - -const char * -zfs_deleg_canonicalize_perm(const char *perm) -{ - int i; - zfs_prop_t prop; - - for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { - if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0) - return (perm); - } - - prop = zfs_name_to_prop(perm); - if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop)) - return (zfs_prop_to_name(prop)); - return (NULL); - -} - -static int -zfs_validate_who(char *who) -{ - char *p; - - if (who[2] != ZFS_DELEG_FIELD_SEP_CHR) - return (-1); - - switch (who[0]) { - case ZFS_DELEG_USER: - case ZFS_DELEG_GROUP: - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_GROUP_SETS: - if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) - return (-1); - for (p = &who[3]; *p; p++) - if (!isdigit(*p)) - return (-1); - break; - - case ZFS_DELEG_NAMED_SET: - case ZFS_DELEG_NAMED_SET_SETS: - if 
(who[1] != ZFS_DELEG_NA) - return (-1); - return (permset_namecheck(&who[3], NULL, NULL)); - - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - if (who[1] != ZFS_DELEG_NA) - return (-1); - if (who[3] != '\0') - return (-1); - break; - - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) - return (-1); - if (who[3] != '\0') - return (-1); - break; - - default: - return (-1); - } - - return (0); -} - -int -zfs_deleg_verify_nvlist(nvlist_t *nvp) -{ - nvpair_t *who, *perm_name; - nvlist_t *perms; - int error; - - if (nvp == NULL) - return (-1); - - who = nvlist_next_nvpair(nvp, NULL); - if (who == NULL) - return (-1); - - do { - if (zfs_validate_who(nvpair_name(who))) - return (-1); - - error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms); - - if (error && error != ENOENT) - return (-1); - if (error == ENOENT) - continue; - - perm_name = nvlist_next_nvpair(perms, NULL); - if (perm_name == NULL) { - return (-1); - } - do { - error = zfs_valid_permission_name( - nvpair_name(perm_name)); - if (error) - return (-1); - } while ((perm_name = nvlist_next_nvpair(perms, perm_name)) - != NULL); - } while ((who = nvlist_next_nvpair(nvp, who)) != NULL); - return (0); -} - -/* - * Construct the base attribute name. The base attribute names - * are the "key" to locate the jump objects which contain the actual - * permissions. The base attribute names are encoded based on - * type of entry and whether it is a local or descendent permission. - * - * Arguments: - * attr - attribute name return string, attribute is assumed to be - * ZFS_MAX_DELEG_NAME long. - * type - type of entry to construct - * inheritchr - inheritance type (local,descendent, or NA for create and - * permission set definitions - * data - is either a permission set name or a 64 bit uid/gid. 
- */ -void -zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, - char inheritchr, void *data) -{ - int len = ZFS_MAX_DELEG_NAME; - uint64_t *id = data; - - switch (type) { - case ZFS_DELEG_USER: - case ZFS_DELEG_GROUP: - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_GROUP_SETS: - (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr, - ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id); - break; - case ZFS_DELEG_NAMED_SET_SETS: - case ZFS_DELEG_NAMED_SET: - (void) snprintf(attr, len, "%c-%c%s", type, - ZFS_DELEG_FIELD_SEP_CHR, (char *)data); - break; - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - (void) snprintf(attr, len, "%c-%c", type, - ZFS_DELEG_FIELD_SEP_CHR); - break; - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - (void) snprintf(attr, len, "%c%c%c", type, inheritchr, - ZFS_DELEG_FIELD_SEP_CHR); - break; - default: - ASSERT(!"bad zfs_deleg_who_type_t"); - } -} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h deleted file mode 100644 index 06d2df9bb80d..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2010 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#ifndef _ZFS_DELEG_H -#define _ZFS_DELEG_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */ -#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */ - -/* - * Max name length for a delegation attribute - */ -#define ZFS_MAX_DELEG_NAME 128 - -#define ZFS_DELEG_LOCAL 'l' -#define ZFS_DELEG_DESCENDENT 'd' -#define ZFS_DELEG_NA '-' - -typedef enum { - ZFS_DELEG_NOTE_CREATE, - ZFS_DELEG_NOTE_DESTROY, - ZFS_DELEG_NOTE_SNAPSHOT, - ZFS_DELEG_NOTE_ROLLBACK, - ZFS_DELEG_NOTE_CLONE, - ZFS_DELEG_NOTE_PROMOTE, - ZFS_DELEG_NOTE_RENAME, - ZFS_DELEG_NOTE_SEND, - ZFS_DELEG_NOTE_RECEIVE, - ZFS_DELEG_NOTE_ALLOW, - ZFS_DELEG_NOTE_USERPROP, - ZFS_DELEG_NOTE_MOUNT, - ZFS_DELEG_NOTE_SHARE, - ZFS_DELEG_NOTE_USERQUOTA, - ZFS_DELEG_NOTE_GROUPQUOTA, - ZFS_DELEG_NOTE_USERUSED, - ZFS_DELEG_NOTE_GROUPUSED, - ZFS_DELEG_NOTE_HOLD, - ZFS_DELEG_NOTE_RELEASE, - ZFS_DELEG_NOTE_DIFF, - ZFS_DELEG_NOTE_BOOKMARK, - ZFS_DELEG_NOTE_REMAP, - ZFS_DELEG_NOTE_NONE -} zfs_deleg_note_t; - -typedef struct zfs_deleg_perm_tab { - char *z_perm; - zfs_deleg_note_t z_note; -} zfs_deleg_perm_tab_t; - -extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[]; - -int zfs_deleg_verify_nvlist(nvlist_t *nvlist); -void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, - char checkflag, void *data); -const char *zfs_deleg_canonicalize_perm(const char *perm); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_DELEG_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c 
b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c deleted file mode 100644 index c889169b426b..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -/* - * Fletcher Checksums - * ------------------ - * - * ZFS's 2nd and 4th order Fletcher checksums are defined by the following - * recurrence relations: - * - * a = a + f - * i i-1 i-1 - * - * b = b + a - * i i-1 i - * - * c = c + b (fletcher-4 only) - * i i-1 i - * - * d = d + c (fletcher-4 only) - * i i-1 i - * - * Where - * a_0 = b_0 = c_0 = d_0 = 0 - * and - * f_0 .. f_(n-1) are the input data. 
- * - * Using standard techniques, these translate into the following series: - * - * __n_ __n_ - * \ | \ | - * a = > f b = > i * f - * n /___| n - i n /___| n - i - * i = 1 i = 1 - * - * - * __n_ __n_ - * \ | i*(i+1) \ | i*(i+1)*(i+2) - * c = > ------- f d = > ------------- f - * n /___| 2 n - i n /___| 6 n - i - * i = 1 i = 1 - * - * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. - * Since the additions are done mod (2^64), errors in the high bits may not - * be noticed. For this reason, fletcher-2 is deprecated. - * - * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. - * A conservative estimate of how big the buffer can get before we overflow - * can be estimated using f_i = 0xffffffff for all i: - * - * % bc - * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 - * 2264 - * quit - * % - * - * So blocks of up to 2k will not overflow. Our largest block size is - * 128k, which has 32k 4-byte words, so we can compute the largest possible - * accumulators, then divide by 2^64 to figure the max amount of overflow: - * - * % bc - * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } - * a/2^64;b/2^64;c/2^64;d/2^64 - * 0 - * 0 - * 1365 - * 11186858 - * quit - * % - * - * So a and b cannot overflow. To make sure each bit of input has some - * effect on the contents of c and d, we can look at what the factors of - * the coefficients in the equations for c_n and d_n are. The number of 2s - * in the factors determines the lowest set bit in the multiplier. Running - * through the cases for n*(n+1)/2 reveals that the highest power of 2 is - * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow - * the 64-bit accumulators, every bit of every f_i effects every accumulator, - * even for 128k blocks. 
- * - * If we wanted to make a stronger version of fletcher4 (fletcher4c?), - * we could do our calculations mod (2^32 - 1) by adding in the carries - * periodically, and store the number of carries in the top 32-bits. - * - * -------------------- - * Checksum Performance - * -------------------- - * - * There are two interesting components to checksum performance: cached and - * uncached performance. With cached data, fletcher-2 is about four times - * faster than fletcher-4. With uncached data, the performance difference is - * negligible, since the cost of a cache fill dominates the processing time. - * Even though fletcher-4 is slower than fletcher-2, it is still a pretty - * efficient pass over the data. - * - * In normal operation, the data which is being checksummed is in a buffer - * which has been filled either by: - * - * 1. a compression step, which will be mostly cached, or - * 2. a bcopy() or copyin(), which will be uncached (because the - * copy is cache-bypassing). - * - * For both cached and uncached data, both fletcher checksums are much faster - * than sha-256, and slower than 'off', which doesn't touch the data at all. 
- */ - -#include -#include -#include -#include -#include -#include - -void -fletcher_init(zio_cksum_t *zcp) -{ - ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); -} - -int -fletcher_2_incremental_native(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - a0 = zcp->zc_word[0]; - a1 = zcp->zc_word[1]; - b0 = zcp->zc_word[2]; - b1 = zcp->zc_word[3]; - - for (; ip < ipend; ip += 2) { - a0 += ip[0]; - a1 += ip[1]; - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); - return (0); -} - -/*ARGSUSED*/ -void -fletcher_2_native(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_2_incremental_native((void *) buf, size, zcp); -} - -int -fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - a0 = zcp->zc_word[0]; - a1 = zcp->zc_word[1]; - b0 = zcp->zc_word[2]; - b1 = zcp->zc_word[3]; - - for (; ip < ipend; ip += 2) { - a0 += BSWAP_64(ip[0]); - a1 += BSWAP_64(ip[1]); - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); - return (0); -} - -/*ARGSUSED*/ -void -fletcher_2_byteswap(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); -} - -int -fletcher_4_incremental_native(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); - return (0); -} - -/*ARGSUSED*/ 
-void -fletcher_4_native(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_4_incremental_native((void *) buf, size, zcp); -} - -int -fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); - return (0); -} - -/*ARGSUSED*/ -void -fletcher_4_byteswap(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_4_incremental_byteswap((void *) buf, size, zcp); -} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h deleted file mode 100644 index 33c6c728cf61..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _ZFS_FLETCHER_H -#define _ZFS_FLETCHER_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * fletcher checksum functions - */ - -void fletcher_init(zio_cksum_t *); -void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *); -void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *); -int fletcher_2_incremental_native(void *, size_t, void *); -int fletcher_2_incremental_byteswap(void *, size_t, void *); -void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *); -void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *); -int fletcher_4_incremental_native(void *, size_t, void *); -int fletcher_4_incremental_byteswap(void *, size_t, void *); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_FLETCHER_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c deleted file mode 100644 index e5ac73f96b98..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c +++ /dev/null @@ -1,1380 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2013 Xin Li . All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Portions Copyright 2005, 2010, Oracle and/or its affiliates. - * All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_namecheck.h" -#include "zfs_ioctl_compat.h" - -static int zfs_version_ioctl = ZFS_IOCVER_CURRENT; -SYSCTL_DECL(_vfs_zfs_version); -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, - 0, "ZFS_IOCTL_VERSION"); - -/* - * FreeBSD zfs_cmd compatibility with older binaries - * appropriately remap/extend the zfs_cmd_t structure - */ -void -zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) -{ - zfs_cmd_v15_t *zc_c; - zfs_cmd_v28_t *zc28_c; - zfs_cmd_deadman_t *zcdm_c; - zfs_cmd_zcmd_t *zcmd_c; - zfs_cmd_edbp_t *edbp_c; - zfs_cmd_resume_t *resume_c; - zfs_cmd_inlanes_t *inlanes_c; - - switch (cflag) { - case ZFS_CMD_COMPAT_INLANES: - inlanes_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, inlanes_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, inlanes_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, inlanes_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = inlanes_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - 
FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_resumable); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_RESUME: - resume_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, resume_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, resume_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, resume_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = resume_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - 
FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - resume_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_resumable); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_EDBP: - edbp_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = edbp_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - 
FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - edbp_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - zc->zc_resumable = B_FALSE; - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_ZCMD: - zcmd_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = zcmd_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - 
FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - zcmd_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - - /* boolean_t -> uint32_t */ - zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy); - zc->zc_flags = 0; - - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - zc->zc_resumable = B_FALSE; - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - - break; - - case ZFS_CMD_COMPAT_DEADMAN: - zcdm_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = zcdm_c->field - zc->zc_guid = zcdm_c->zc_guid; - zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf; - zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size; - zc->zc_nvlist_src = zcdm_c->zc_nvlist_src; - zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size; - zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst; - zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size; - zc->zc_cookie = zcdm_c->zc_cookie; - zc->zc_objset_type = zcdm_c->zc_objset_type; - zc->zc_perm_action = zcdm_c->zc_perm_action; - zc->zc_history = zcdm_c->zc_history; - zc->zc_history_len = zcdm_c->zc_history_len; - zc->zc_history_offset = zcdm_c->zc_history_offset; - zc->zc_obj = zcdm_c->zc_obj; - zc->zc_iflags = zcdm_c->zc_iflags; - zc->zc_share = zcdm_c->zc_share; - zc->zc_jailid = zcdm_c->zc_jailid; - zc->zc_objset_stats = zcdm_c->zc_objset_stats; - zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record; - zc->zc_defer_destroy = zcdm_c->zc_defer_destroy; - (void)zcdm_c->zc_temphold; - zc->zc_action_handle = 
zcdm_c->zc_action_handle; - zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd; - zc->zc_simple = zcdm_c->zc_simple; - zc->zc_resumable = B_FALSE; - zc->zc_sendobj = zcdm_c->zc_sendobj; - zc->zc_fromobj = zcdm_c->zc_fromobj; - zc->zc_createtxg = zcdm_c->zc_createtxg; - zc->zc_stat = zcdm_c->zc_stat; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - resume_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - - /* we always assume zc_nvlist_dst_filled is true */ - zc->zc_nvlist_dst_filled = B_TRUE; -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_V28: - zc28_c = (void *)addr; - - /* zc */ - strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN); - zc->zc_guid = zc28_c->zc_guid; - zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf; - zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size; - zc->zc_nvlist_src = zc28_c->zc_nvlist_src; - zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size; - zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst; - zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size; - zc->zc_cookie = zc28_c->zc_cookie; - zc->zc_objset_type = zc28_c->zc_objset_type; - zc->zc_perm_action = zc28_c->zc_perm_action; - zc->zc_history = zc28_c->zc_history; - zc->zc_history_len = zc28_c->zc_history_len; - zc->zc_history_offset = zc28_c->zc_history_offset; 
- zc->zc_obj = zc28_c->zc_obj; - zc->zc_iflags = zc28_c->zc_iflags; - zc->zc_share = zc28_c->zc_share; - zc->zc_jailid = zc28_c->zc_jailid; - zc->zc_objset_stats = zc28_c->zc_objset_stats; - zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record; - zc->zc_defer_destroy = zc28_c->zc_defer_destroy; - (void)zc28_c->zc_temphold; - zc->zc_action_handle = zc28_c->zc_action_handle; - zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd; - zc->zc_simple = zc28_c->zc_simple; - zc->zc_resumable = B_FALSE; - zc->zc_sendobj = zc28_c->zc_sendobj; - zc->zc_fromobj = zc28_c->zc_fromobj; - zc->zc_createtxg = zc28_c->zc_createtxg; - zc->zc_stat = zc28_c->zc_stat; - - /* zc->zc_inject_record */ - zc->zc_inject_record.zi_objset = - zc28_c->zc_inject_record.zi_objset; - zc->zc_inject_record.zi_object = - zc28_c->zc_inject_record.zi_object; - zc->zc_inject_record.zi_start = - zc28_c->zc_inject_record.zi_start; - zc->zc_inject_record.zi_end = - zc28_c->zc_inject_record.zi_end; - zc->zc_inject_record.zi_guid = - zc28_c->zc_inject_record.zi_guid; - zc->zc_inject_record.zi_level = - zc28_c->zc_inject_record.zi_level; - zc->zc_inject_record.zi_error = - zc28_c->zc_inject_record.zi_error; - zc->zc_inject_record.zi_type = - zc28_c->zc_inject_record.zi_type; - zc->zc_inject_record.zi_freq = - zc28_c->zc_inject_record.zi_freq; - zc->zc_inject_record.zi_failfast = - zc28_c->zc_inject_record.zi_failfast; - strlcpy(zc->zc_inject_record.zi_func, - zc28_c->zc_inject_record.zi_func, MAXNAMELEN); - zc->zc_inject_record.zi_iotype = - zc28_c->zc_inject_record.zi_iotype; - zc->zc_inject_record.zi_duration = - zc28_c->zc_inject_record.zi_duration; - zc->zc_inject_record.zi_timer = - zc28_c->zc_inject_record.zi_timer; - zc->zc_inject_record.zi_nlanes = 1; - zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED; - zc->zc_inject_record.zi_pad = 0; - break; - - case ZFS_CMD_COMPAT_V15: - zc_c = (void *)addr; - - /* zc */ - strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zc_c->zc_value, 
MAXPATHLEN); - strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN); - zc->zc_guid = zc_c->zc_guid; - zc->zc_nvlist_conf = zc_c->zc_nvlist_conf; - zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size; - zc->zc_nvlist_src = zc_c->zc_nvlist_src; - zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size; - zc->zc_nvlist_dst = zc_c->zc_nvlist_dst; - zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size; - zc->zc_cookie = zc_c->zc_cookie; - zc->zc_objset_type = zc_c->zc_objset_type; - zc->zc_perm_action = zc_c->zc_perm_action; - zc->zc_history = zc_c->zc_history; - zc->zc_history_len = zc_c->zc_history_len; - zc->zc_history_offset = zc_c->zc_history_offset; - zc->zc_obj = zc_c->zc_obj; - zc->zc_share = zc_c->zc_share; - zc->zc_jailid = zc_c->zc_jailid; - zc->zc_objset_stats = zc_c->zc_objset_stats; - zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record; - - /* zc->zc_inject_record */ - zc->zc_inject_record.zi_objset = - zc_c->zc_inject_record.zi_objset; - zc->zc_inject_record.zi_object = - zc_c->zc_inject_record.zi_object; - zc->zc_inject_record.zi_start = - zc_c->zc_inject_record.zi_start; - zc->zc_inject_record.zi_end = - zc_c->zc_inject_record.zi_end; - zc->zc_inject_record.zi_guid = - zc_c->zc_inject_record.zi_guid; - zc->zc_inject_record.zi_level = - zc_c->zc_inject_record.zi_level; - zc->zc_inject_record.zi_error = - zc_c->zc_inject_record.zi_error; - zc->zc_inject_record.zi_type = - zc_c->zc_inject_record.zi_type; - zc->zc_inject_record.zi_freq = - zc_c->zc_inject_record.zi_freq; - zc->zc_inject_record.zi_failfast = - zc_c->zc_inject_record.zi_failfast; - break; - } -} - -void -zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, - const int cflag) -{ - zfs_cmd_v15_t *zc_c; - zfs_cmd_v28_t *zc28_c; - zfs_cmd_deadman_t *zcdm_c; - zfs_cmd_zcmd_t *zcmd_c; - zfs_cmd_edbp_t *edbp_c; - zfs_cmd_resume_t *resume_c; - zfs_cmd_inlanes_t *inlanes_c; - - switch (cflag) { - case ZFS_CMD_COMPAT_INLANES: - inlanes_c = (void *)addr; - strlcpy(inlanes_c->zc_name, 
zc->zc_name, MAXPATHLEN); - strlcpy(inlanes_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(inlanes_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) inlanes_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_RESUME: - resume_c = (void *)addr; - strlcpy(resume_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(resume_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(resume_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) resume_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - 
FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_EDBP: - edbp_c = (void *)addr; - strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) edbp_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - FIELD_COPY(zc_inject_record.zi_objset); - 
FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_ZCMD: - zcmd_c = (void *)addr; - /* zc */ - strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zcmd_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - 
FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - - /* boolean_t -> uint32_t */ - zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy); - zcmd_c->zc_temphold = 0; - - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - - break; - - case ZFS_CMD_COMPAT_DEADMAN: - zcdm_c = (void *)addr; - - strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zcdm_c->field = zc->field - zcdm_c->zc_guid = zc->zc_guid; - zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf; - zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; - zcdm_c->zc_nvlist_src = zc->zc_nvlist_src; - zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; - zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst; - zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; - zcdm_c->zc_cookie = zc->zc_cookie; - zcdm_c->zc_objset_type = zc->zc_objset_type; - zcdm_c->zc_perm_action = zc->zc_perm_action; - zcdm_c->zc_history = zc->zc_history; - zcdm_c->zc_history_len = zc->zc_history_len; - zcdm_c->zc_history_offset = zc->zc_history_offset; - zcdm_c->zc_obj = zc->zc_obj; - zcdm_c->zc_iflags = zc->zc_iflags; - zcdm_c->zc_share = zc->zc_share; - zcdm_c->zc_jailid = zc->zc_jailid; - zcdm_c->zc_objset_stats = 
zc->zc_objset_stats; - zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - zcdm_c->zc_defer_destroy = zc->zc_defer_destroy; - zcdm_c->zc_temphold = 0; - zcdm_c->zc_action_handle = zc->zc_action_handle; - zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd; - zcdm_c->zc_simple = zc->zc_simple; - zcdm_c->zc_sendobj = zc->zc_sendobj; - zcdm_c->zc_fromobj = zc->zc_fromobj; - zcdm_c->zc_createtxg = zc->zc_createtxg; - zcdm_c->zc_stat = zc->zc_stat; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); -#undef FIELD_COPY -#ifndef _KERNEL - if (request == ZFS_IOC_RECV) - strlcpy(zcdm_c->zc_top_ds, - zc->zc_value + strlen(zc->zc_value) + 1, - (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1); -#endif - break; - - case ZFS_CMD_COMPAT_V28: - zc28_c = (void *)addr; - - strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN); - zc28_c->zc_guid = zc->zc_guid; - zc28_c->zc_nvlist_conf = zc->zc_nvlist_conf; - zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; - zc28_c->zc_nvlist_src = zc->zc_nvlist_src; - zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; - zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst; - zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; - zc28_c->zc_cookie = zc->zc_cookie; - zc28_c->zc_objset_type = 
zc->zc_objset_type; - zc28_c->zc_perm_action = zc->zc_perm_action; - zc28_c->zc_history = zc->zc_history; - zc28_c->zc_history_len = zc->zc_history_len; - zc28_c->zc_history_offset = zc->zc_history_offset; - zc28_c->zc_obj = zc->zc_obj; - zc28_c->zc_iflags = zc->zc_iflags; - zc28_c->zc_share = zc->zc_share; - zc28_c->zc_jailid = zc->zc_jailid; - zc28_c->zc_objset_stats = zc->zc_objset_stats; - zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - zc28_c->zc_defer_destroy = zc->zc_defer_destroy; - zc28_c->zc_temphold = 0; - zc28_c->zc_action_handle = zc->zc_action_handle; - zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd; - zc28_c->zc_simple = zc->zc_simple; - zc28_c->zc_sendobj = zc->zc_sendobj; - zc28_c->zc_fromobj = zc->zc_fromobj; - zc28_c->zc_createtxg = zc->zc_createtxg; - zc28_c->zc_stat = zc->zc_stat; -#ifndef _KERNEL - if (request == ZFS_IOC_RECV) - strlcpy(zc28_c->zc_top_ds, - zc->zc_value + strlen(zc->zc_value) + 1, - MAXPATHLEN * 2 - strlen(zc->zc_value) - 1); -#endif - /* zc_inject_record */ - zc28_c->zc_inject_record.zi_objset = - zc->zc_inject_record.zi_objset; - zc28_c->zc_inject_record.zi_object = - zc->zc_inject_record.zi_object; - zc28_c->zc_inject_record.zi_start = - zc->zc_inject_record.zi_start; - zc28_c->zc_inject_record.zi_end = - zc->zc_inject_record.zi_end; - zc28_c->zc_inject_record.zi_guid = - zc->zc_inject_record.zi_guid; - zc28_c->zc_inject_record.zi_level = - zc->zc_inject_record.zi_level; - zc28_c->zc_inject_record.zi_error = - zc->zc_inject_record.zi_error; - zc28_c->zc_inject_record.zi_type = - zc->zc_inject_record.zi_type; - zc28_c->zc_inject_record.zi_freq = - zc->zc_inject_record.zi_freq; - zc28_c->zc_inject_record.zi_failfast = - zc->zc_inject_record.zi_failfast; - strlcpy(zc28_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - zc28_c->zc_inject_record.zi_iotype = - zc->zc_inject_record.zi_iotype; - zc28_c->zc_inject_record.zi_duration = - zc->zc_inject_record.zi_duration; - 
zc28_c->zc_inject_record.zi_timer = - zc->zc_inject_record.zi_timer; - break; - - case ZFS_CMD_COMPAT_V15: - zc_c = (void *)addr; - - /* zc */ - strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN); - strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN); - zc_c->zc_guid = zc->zc_guid; - zc_c->zc_nvlist_conf = zc->zc_nvlist_conf; - zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; - zc_c->zc_nvlist_src = zc->zc_nvlist_src; - zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; - zc_c->zc_nvlist_dst = zc->zc_nvlist_dst; - zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; - zc_c->zc_cookie = zc->zc_cookie; - zc_c->zc_objset_type = zc->zc_objset_type; - zc_c->zc_perm_action = zc->zc_perm_action; - zc_c->zc_history = zc->zc_history; - zc_c->zc_history_len = zc->zc_history_len; - zc_c->zc_history_offset = zc->zc_history_offset; - zc_c->zc_obj = zc->zc_obj; - zc_c->zc_share = zc->zc_share; - zc_c->zc_jailid = zc->zc_jailid; - zc_c->zc_objset_stats = zc->zc_objset_stats; - zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - - /* zc_inject_record */ - zc_c->zc_inject_record.zi_objset = - zc->zc_inject_record.zi_objset; - zc_c->zc_inject_record.zi_object = - zc->zc_inject_record.zi_object; - zc_c->zc_inject_record.zi_start = - zc->zc_inject_record.zi_start; - zc_c->zc_inject_record.zi_end = - zc->zc_inject_record.zi_end; - zc_c->zc_inject_record.zi_guid = - zc->zc_inject_record.zi_guid; - zc_c->zc_inject_record.zi_level = - zc->zc_inject_record.zi_level; - zc_c->zc_inject_record.zi_error = - zc->zc_inject_record.zi_error; - zc_c->zc_inject_record.zi_type = - zc->zc_inject_record.zi_type; - zc_c->zc_inject_record.zi_freq = - zc->zc_inject_record.zi_freq; - zc_c->zc_inject_record.zi_failfast = - zc->zc_inject_record.zi_failfast; - - break; - } -} - -static int -zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag, - nvlist_t **nvp) -{ - char *packed; - int error; - nvlist_t *list = NULL; - - /* - * Read in 
and unpack the user-supplied nvlist. - */ - if (size == 0) - return (EINVAL); - -#ifdef _KERNEL - packed = kmem_alloc(size, KM_SLEEP); - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { - kmem_free(packed, size); - return (error); - } -#else - packed = (void *)(uintptr_t)nvl; -#endif - - error = nvlist_unpack(packed, size, &list, 0); - -#ifdef _KERNEL - kmem_free(packed, size); -#endif - - if (error != 0) - return (error); - - *nvp = list; - return (0); -} - -static int -zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) -{ - char *packed = NULL; - int error = 0; - size_t size; - - VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); - -#ifdef _KERNEL - packed = kmem_alloc(size, KM_SLEEP); - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - KM_SLEEP) == 0); - - if (ddi_copyout(packed, - (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) - error = EFAULT; - kmem_free(packed, size); -#else - packed = (void *)(uintptr_t)zc->zc_nvlist_dst; - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - 0) == 0); -#endif - - zc->zc_nvlist_dst_size = size; - return (error); -} - -static void -zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl) -{ - nvlist_t **child; - nvlist_t *nvroot = NULL; - vdev_stat_t *vs; - uint_t c, children, nelem; - - if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - zfs_ioctl_compat_fix_stats_nvlist(child[c]); - } - } - - if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0) - zfs_ioctl_compat_fix_stats_nvlist(nvroot); -#ifdef _KERNEL - if ((nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, -#else - if ((nvlist_lookup_uint64_array(nvl, "stats", -#endif - - (uint64_t **)&vs, &nelem) == 0)) { - nvlist_add_uint64_array(nvl, -#ifdef _KERNEL - "stats", -#else - ZPOOL_CONFIG_VDEV_STATS, -#endif - (uint64_t *)vs, nelem); -#ifdef _KERNEL - nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, -#else - 
nvlist_remove(nvl, "stats", -#endif - DATA_TYPE_UINT64_ARRAY); - } -} - -static int -zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc) -{ - nvlist_t *nv, *nvp = NULL; - nvpair_t *elem; - int error; - - if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) - return (error); - - if (nc == 5) { /* ZFS_IOC_POOL_STATS */ - elem = NULL; - while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { - if (nvpair_value_nvlist(elem, &nvp) == 0) - zfs_ioctl_compat_fix_stats_nvlist(nvp); - } - elem = NULL; - } else - zfs_ioctl_compat_fix_stats_nvlist(nv); - - error = zfs_ioctl_compat_put_nvlist(zc, nv); - - nvlist_free(nv); - - return (error); -} - -static int -zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) -{ - nvlist_t *nv, *nva = NULL; - int error; - - if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) - return (error); - -#ifdef _KERNEL - if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) { - nvlist_add_nvlist(nv, "used", nva); - nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST); - } - - if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) { - nvlist_add_nvlist(nv, "available", nva); - nvlist_remove(nv, "free", DATA_TYPE_NVLIST); - } -#else - if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) { - nvlist_add_nvlist(nv, "allocated", nva); - nvlist_remove(nv, "used", DATA_TYPE_NVLIST); - } - - if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) { - nvlist_add_nvlist(nv, "free", nva); - nvlist_remove(nv, "available", DATA_TYPE_NVLIST); - } -#endif - - error = zfs_ioctl_compat_put_nvlist(zc, nv); - - nvlist_free(nv); - - return (error); -} - -#ifndef _KERNEL -int -zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) -{ - int nc, ret; - void *zc_c; - unsigned long ncmd; - zfs_iocparm_t zp; - - switch (cflag) { - case ZFS_CMD_COMPAT_NONE: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = 
sizeof(zfs_cmd_t); - zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_INLANES: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_inlanes_t); - zp.zfs_ioctl_version = ZFS_IOCVER_INLANES; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_RESUME: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_resume_t); - zp.zfs_ioctl_version = ZFS_IOCVER_RESUME; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_EDBP: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t); - zp.zfs_ioctl_version = ZFS_IOCVER_EDBP; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_ZCMD: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_zcmd_t); - zp.zfs_ioctl_version = ZFS_IOCVER_ZCMD; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_LZC: - ncmd = _IOWR('Z', request, struct zfs_cmd); - return (ioctl(fd, ncmd, zc)); - case ZFS_CMD_COMPAT_DEADMAN: - zc_c = malloc(sizeof(zfs_cmd_deadman_t)); - ncmd = _IOWR('Z', request, struct zfs_cmd_deadman); - break; - case ZFS_CMD_COMPAT_V28: - zc_c = malloc(sizeof(zfs_cmd_v28_t)); - ncmd = _IOWR('Z', request, struct zfs_cmd_v28); - break; - case ZFS_CMD_COMPAT_V15: - nc = zfs_ioctl_v28_to_v15[request]; - zc_c = malloc(sizeof(zfs_cmd_v15_t)); - ncmd = _IOWR('Z', nc, struct zfs_cmd_v15); - break; - default: - return (EINVAL); - } - - if (ZFS_IOCREQ(ncmd) == ZFS_IOC_COMPAT_FAIL) - return (ENOTSUP); - - zfs_cmd_compat_put(zc, (caddr_t)zc_c, request, cflag); - - ret = ioctl(fd, ncmd, zc_c); - if (cflag == ZFS_CMD_COMPAT_V15 && - nc == ZFS_IOC_POOL_IMPORT) - ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS, - struct zfs_cmd_v15), zc_c); - zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag); - free(zc_c); - - if (cflag == ZFS_CMD_COMPAT_V15) { - switch 
(nc) { - case ZFS_IOC_POOL_IMPORT: - case ZFS_IOC_POOL_CONFIGS: - case ZFS_IOC_POOL_STATS: - case ZFS_IOC_POOL_TRYIMPORT: - zfs_ioctl_compat_fix_stats(zc, nc); - break; - case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ - zfs_ioctl_compat_pool_get_props(zc); - break; - } - } - - return (ret); -} -#else /* _KERNEL */ -int -zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag) -{ - int error = 0; - - /* are we creating a clone? */ - if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') - *vec = ZFS_IOC_CLONE; - - if (cflag == ZFS_CMD_COMPAT_V15) { - switch (*vec) { - - case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ - zc->zc_cookie = POOL_SCAN_SCRUB; - break; - } - } - - return (error); -} - -void -zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) -{ - if (cflag == ZFS_CMD_COMPAT_V15) { - switch (vec) { - case ZFS_IOC_POOL_CONFIGS: - case ZFS_IOC_POOL_STATS: - case ZFS_IOC_POOL_TRYIMPORT: - zfs_ioctl_compat_fix_stats(zc, vec); - break; - case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ - zfs_ioctl_compat_pool_get_props(zc); - break; - } - } -} - -nvlist_t * -zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t * innvl, const int vec, - const int cflag) -{ - nvlist_t *nvl, *tmpnvl, *hnvl; - nvpair_t *elem; - char *poolname, *snapname; - int err; - - if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || - cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) - goto out; - - switch (vec) { - case ZFS_IOC_CREATE: - nvl = fnvlist_alloc(); - fnvlist_add_int32(nvl, "type", zc->zc_objset_type); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "props", innvl); - nvlist_free(innvl); - } - return (nvl); - break; - case ZFS_IOC_CLONE: - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, "origin", zc->zc_value); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "props", innvl); - nvlist_free(innvl); - } - return (nvl); - break; - case ZFS_IOC_SNAPSHOT: - if (innvl == NULL) - goto out; - nvl = 
fnvlist_alloc(); - fnvlist_add_nvlist(nvl, "props", innvl); - tmpnvl = fnvlist_alloc(); - snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); - fnvlist_add_boolean(tmpnvl, snapname); - kmem_free(snapname, strlen(snapname + 1)); - /* check if we are doing a recursive snapshot */ - if (zc->zc_cookie) - dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, - tmpnvl); - fnvlist_add_nvlist(nvl, "snaps", tmpnvl); - fnvlist_free(tmpnvl); - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_SPACE_SNAPS: - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, "firstsnap", zc->zc_value); - if (innvl != NULL) - nvlist_free(innvl); - return (nvl); - break; - case ZFS_IOC_DESTROY_SNAPS: - if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) - goto out; - nvl = fnvlist_alloc(); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "snaps", innvl); - } else { - /* - * We are probably called by even older binaries, - * allocate and populate nvlist with recursive - * snapshots - */ - if (zfs_component_namecheck(zc->zc_value, NULL, - NULL) == 0) { - tmpnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, tmpnvl) == 0) - fnvlist_add_nvlist(nvl, "snaps", - tmpnvl); - nvlist_free(tmpnvl); - } - } - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_HOLD: - nvl = fnvlist_alloc(); - tmpnvl = fnvlist_alloc(); - if (zc->zc_cleanup_fd != -1) - fnvlist_add_int32(nvl, "cleanup_fd", - (int32_t)zc->zc_cleanup_fd); - if (zc->zc_cookie) { - hnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, hnvl) == 0) { - elem = NULL; - while ((elem = nvlist_next_nvpair(hnvl, - elem)) != NULL) { - nvlist_add_string(tmpnvl, - nvpair_name(elem), zc->zc_string); - } - } - nvlist_free(hnvl); - } else { - snapname = 
kmem_asprintf("%s@%s", zc->zc_name, - zc->zc_value); - nvlist_add_string(tmpnvl, snapname, zc->zc_string); - kmem_free(snapname, strlen(snapname + 1)); - } - fnvlist_add_nvlist(nvl, "holds", tmpnvl); - nvlist_free(tmpnvl); - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_RELEASE: - nvl = fnvlist_alloc(); - tmpnvl = fnvlist_alloc(); - if (zc->zc_cookie) { - hnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, hnvl) == 0) { - elem = NULL; - while ((elem = nvlist_next_nvpair(hnvl, - elem)) != NULL) { - fnvlist_add_boolean(tmpnvl, - zc->zc_string); - fnvlist_add_nvlist(nvl, - nvpair_name(elem), tmpnvl); - } - } - nvlist_free(hnvl); - } else { - snapname = kmem_asprintf("%s@%s", zc->zc_name, - zc->zc_value); - fnvlist_add_boolean(tmpnvl, zc->zc_string); - fnvlist_add_nvlist(nvl, snapname, tmpnvl); - kmem_free(snapname, strlen(snapname + 1)); - } - nvlist_free(tmpnvl); - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - } -out: - return (innvl); -} - -nvlist_t * -zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t * outnvl, const int vec, - const int cflag) -{ - nvlist_t *tmpnvl; - - if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || - cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) - return (outnvl); - - switch (vec) { - case ZFS_IOC_SPACE_SNAPS: - (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); - (void) nvlist_lookup_uint64(outnvl, "compressed", - &zc->zc_objset_type); - (void) nvlist_lookup_uint64(outnvl, "uncompressed", - &zc->zc_perm_action); - nvlist_free(outnvl); - /* return empty outnvl */ - tmpnvl = fnvlist_alloc(); - return (tmpnvl); - break; - case ZFS_IOC_CREATE: - case 
ZFS_IOC_CLONE: - case ZFS_IOC_HOLD: - case ZFS_IOC_RELEASE: - nvlist_free(outnvl); - /* return empty outnvl */ - tmpnvl = fnvlist_alloc(); - return (tmpnvl); - break; - } - - return (outnvl); -} -#endif /* KERNEL */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h deleted file mode 100644 index 61f1514e3ebd..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h +++ /dev/null @@ -1,543 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2014 Xin Li . All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_ZFS_IOCTL_COMPAT_H -#define _SYS_ZFS_IOCTL_COMPAT_H - -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif /* _KERNEL */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Backwards ioctl compatibility - */ - -/* ioctl versions for vfs.zfs.version.ioctl */ -#define ZFS_IOCVER_UNDEF -1 -#define ZFS_IOCVER_NONE 0 -#define ZFS_IOCVER_DEADMAN 1 -#define ZFS_IOCVER_LZC 2 -#define ZFS_IOCVER_ZCMD 3 -#define ZFS_IOCVER_EDBP 4 -#define ZFS_IOCVER_RESUME 5 -#define ZFS_IOCVER_INLANES 6 -#define ZFS_IOCVER_PAD 7 -#define ZFS_IOCVER_CURRENT ZFS_IOCVER_PAD - -/* compatibility conversion flag */ -#define ZFS_CMD_COMPAT_NONE 0 -#define ZFS_CMD_COMPAT_V15 1 -#define ZFS_CMD_COMPAT_V28 2 -#define ZFS_CMD_COMPAT_DEADMAN 3 -#define ZFS_CMD_COMPAT_LZC 4 -#define ZFS_CMD_COMPAT_ZCMD 5 -#define ZFS_CMD_COMPAT_EDBP 6 -#define ZFS_CMD_COMPAT_RESUME 7 -#define ZFS_CMD_COMPAT_INLANES 8 - -#define ZFS_IOC_COMPAT_PASS 254 -#define ZFS_IOC_COMPAT_FAIL 255 - -#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) - -typedef struct zfs_iocparm { - uint32_t zfs_ioctl_version; - uint64_t zfs_cmd; - uint64_t zfs_cmd_size; -} zfs_iocparm_t; - -typedef struct zinject_record_v15 { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; -} zinject_record_v15_t; - -typedef struct zfs_cmd_v15 { - char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; - uint64_t zc_history_offset; 
- uint64_t zc_obj; - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_v15_t zc_inject_record; -} zfs_cmd_v15_t; - -typedef struct zinject_record_v28 { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; - char zi_func[MAXNAMELEN]; - uint32_t zi_iotype; - int32_t zi_duration; - uint64_t zi_timer; -} zinject_record_v28_t; - -typedef struct zfs_cmd_v28 { - char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - char zc_top_ds[MAXPATHLEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_v28_t zc_inject_record; - boolean_t zc_defer_destroy; - boolean_t zc_temphold; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_v28_t; - -typedef struct zinject_record_deadman { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; - char zi_func[MAXNAMELEN]; - uint32_t zi_iotype; - int32_t zi_duration; - uint64_t 
zi_timer; - uint32_t zi_cmd; - uint32_t zi_pad; -} zinject_record_deadman_t; - -typedef struct zfs_cmd_deadman { - char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - char zc_top_ds[MAXPATHLEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - /* zc_inject_record doesn't change in libzfs_core */ - zinject_record_deadman_t zc_inject_record; - boolean_t zc_defer_destroy; - boolean_t zc_temphold; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_deadman_t; - -typedef struct zfs_cmd_zcmd { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. 
- */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_deadman_t zc_inject_record; - boolean_t zc_defer_destroy; - boolean_t zc_temphold; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_zcmd_t; - -typedef struct zfs_cmd_edbp { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. 
- */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_deadman_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_edbp_t; - -typedef struct zfs_cmd_resume { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. 
- */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - dmu_replay_record_t zc_begin_record; - zinject_record_deadman_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - boolean_t zc_resumable; - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_resume_t; - -typedef struct zfs_cmd_inlanes { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. 
- */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - dmu_replay_record_t zc_begin_record; - zinject_record_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - boolean_t zc_resumable; - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_inlanes_t; - -#ifdef _KERNEL -unsigned static long zfs_ioctl_v15_to_v28[] = { - 0, /* 0 ZFS_IOC_POOL_CREATE */ - 1, /* 1 ZFS_IOC_POOL_DESTROY */ - 2, /* 2 ZFS_IOC_POOL_IMPORT */ - 3, /* 3 ZFS_IOC_POOL_EXPORT */ - 4, /* 4 ZFS_IOC_POOL_CONFIGS */ - 5, /* 5 ZFS_IOC_POOL_STATS */ - 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ - 7, /* 7 ZFS_IOC_POOL_SCRUB */ - 8, /* 8 ZFS_IOC_POOL_FREEZE */ - 9, /* 9 ZFS_IOC_POOL_UPGRADE */ - 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ - 11, /* 11 ZFS_IOC_VDEV_ADD */ - 12, /* 12 ZFS_IOC_VDEV_REMOVE */ - 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ - 14, /* 14 ZFS_IOC_VDEV_ATTACH */ - 15, /* 15 ZFS_IOC_VDEV_DETACH */ - 16, /* 16 ZFS_IOC_VDEV_SETPATH */ - 18, /* 17 ZFS_IOC_OBJSET_STATS */ - 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */ - 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */ - 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */ - 22, /* 21 ZFS_IOC_SET_PROP */ - ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */ - ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */ - 23, /* 24 ZFS_IOC_CREATE */ - 24, /* 25 ZFS_IOC_DESTROY */ - 25, /* 26 ZFS_IOC_ROLLBACK */ - 26, /* 27 ZFS_IOC_RENAME */ - 27, /* 28 ZFS_IOC_RECV */ - 28, /* 29 ZFS_IOC_SEND */ - 29, /* 30 ZFS_IOC_INJECT_FAULT */ 
- 30, /* 31 ZFS_IOC_CLEAR_FAULT */ - 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */ - 32, /* 33 ZFS_IOC_ERROR_LOG */ - 33, /* 34 ZFS_IOC_CLEAR */ - 34, /* 35 ZFS_IOC_PROMOTE */ - 35, /* 36 ZFS_IOC_DESTROY_SNAPS */ - 36, /* 37 ZFS_IOC_SNAPSHOT */ - 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */ - 38, /* 39 ZFS_IOC_OBJ_TO_PATH */ - 39, /* 40 ZFS_IOC_POOL_SET_PROPS */ - 40, /* 41 ZFS_IOC_POOL_GET_PROPS */ - 41, /* 42 ZFS_IOC_SET_FSACL */ - 42, /* 43 ZFS_IOC_GET_FSACL */ - ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */ - 43, /* 45 ZFS_IOC_SHARE */ - 44, /* 46 ZFS_IOC_IHNERIT_PROP */ - 58, /* 47 ZFS_IOC_JAIL */ - 59, /* 48 ZFS_IOC_UNJAIL */ - 45, /* 49 ZFS_IOC_SMB_ACL */ - 46, /* 50 ZFS_IOC_USERSPACE_ONE */ - 47, /* 51 ZFS_IOC_USERSPACE_MANY */ - 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */ - 17, /* 53 ZFS_IOC_SETFRU */ -}; - -#else /* KERNEL */ -unsigned static long zfs_ioctl_v28_to_v15[] = { - 0, /* 0 ZFS_IOC_POOL_CREATE */ - 1, /* 1 ZFS_IOC_POOL_DESTROY */ - 2, /* 2 ZFS_IOC_POOL_IMPORT */ - 3, /* 3 ZFS_IOC_POOL_EXPORT */ - 4, /* 4 ZFS_IOC_POOL_CONFIGS */ - 5, /* 5 ZFS_IOC_POOL_STATS */ - 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ - 7, /* 7 ZFS_IOC_POOL_SCAN */ - 8, /* 8 ZFS_IOC_POOL_FREEZE */ - 9, /* 9 ZFS_IOC_POOL_UPGRADE */ - 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ - 11, /* 11 ZFS_IOC_VDEV_ADD */ - 12, /* 12 ZFS_IOC_VDEV_REMOVE */ - 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ - 14, /* 14 ZFS_IOC_VDEV_ATTACH */ - 15, /* 15 ZFS_IOC_VDEV_DETACH */ - 16, /* 16 ZFS_IOC_VDEV_SETPATH */ - 53, /* 17 ZFS_IOC_VDEV_SETFRU */ - 17, /* 18 ZFS_IOC_OBJSET_STATS */ - 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */ - 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */ - 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */ - 21, /* 22 ZFS_IOC_SET_PROP */ - 24, /* 23 ZFS_IOC_CREATE */ - 25, /* 24 ZFS_IOC_DESTROY */ - 26, /* 25 ZFS_IOC_ROLLBACK */ - 27, /* 26 ZFS_IOC_RENAME */ - 28, /* 27 ZFS_IOC_RECV */ - 29, /* 28 ZFS_IOC_SEND */ - 30, /* 39 ZFS_IOC_INJECT_FAULT */ - 31, /* 30 ZFS_IOC_CLEAR_FAULT */ - 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */ - 33, /* 32 
ZFS_IOC_ERROR_LOG */ - 34, /* 33 ZFS_IOC_CLEAR */ - 35, /* 34 ZFS_IOC_PROMOTE */ - 36, /* 35 ZFS_IOC_DESTROY_SNAPS */ - 37, /* 36 ZFS_IOC_SNAPSHOT */ - 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */ - 39, /* 38 ZFS_IOC_OBJ_TO_PATH */ - 40, /* 39 ZFS_IOC_POOL_SET_PROPS */ - 41, /* 40 ZFS_IOC_POOL_GET_PROPS */ - 42, /* 41 ZFS_IOC_SET_FSACL */ - 43, /* 42 ZFS_IOC_GET_FSACL */ - 45, /* 43 ZFS_IOC_SHARE */ - 46, /* 44 ZFS_IOC_IHNERIT_PROP */ - 49, /* 45 ZFS_IOC_SMB_ACL */ - 50, /* 46 ZFS_IOC_USERSPACE_ONE */ - 51, /* 47 ZFS_IOC_USERSPACE_MANY */ - 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */ - ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */ - ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */ - ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */ - ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */ - ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */ - ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */ - ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */ - ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */ - ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */ - 47, /* 58 ZFS_IOC_JAIL */ - 48, /* 59 ZFS_IOC_UNJAIL */ -}; -#endif /* ! 
_KERNEL */ - -#ifdef _KERNEL -int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); -void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); -nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, - const int); -nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, - const int); -#else -int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int); -#endif /* _KERNEL */ -void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); -void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c deleted file mode 100644 index bad8f20e6917..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c +++ /dev/null @@ -1,399 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -/* - * Common name validation routines for ZFS. 
These routines are shared by the - * userland code as well as the ioctl() layer to ensure that we don't - * inadvertently expose a hole through direct ioctl()s that never gets tested. - * In userland, however, we want significantly more information about _why_ the - * name is invalid. In the kernel, we only care whether it's valid or not. - * Each routine therefore takes a 'namecheck_err_t' which describes exactly why - * the name failed to validate. - */ - -#if defined(_KERNEL) -#include -#else -#include -#endif - -#include -#include -#include -#include "zfs_namecheck.h" -#include "zfs_deleg.h" - -/* - * Deeply nested datasets can overflow the stack, so we put a limit - * in the amount of nesting a path can have. zfs_max_dataset_nesting - * can be tuned temporarily to fix existing datasets that exceed our - * predefined limit. - */ -int zfs_max_dataset_nesting = 50; - -static int -valid_char(char c) -{ - return ((c >= 'a' && c <= 'z') || - (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || - c == '-' || c == '_' || c == '.' || c == ':' || c == ' '); -} - -/* - * Looks at a path and returns its level of nesting (depth). - */ -int -get_dataset_depth(const char *path) -{ - const char *loc = path; - int nesting = 0; - - /* - * Keep track of nesting until you hit the end of the - * path or found the snapshot/bookmark seperator. - */ - for (int i = 0; loc[i] != '\0' && - loc[i] != '@' && - loc[i] != '#'; i++) { - if (loc[i] == '/') - nesting++; - } - - return (nesting); -} - -/* - * Snapshot names must be made up of alphanumeric characters plus the following - * characters: - * - * [-_.: ] - * - * Returns 0 on success, -1 on error. 
- */ -int -zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - const char *loc; - - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - if (path[0] == '\0') { - if (why) - *why = NAME_ERR_EMPTY_COMPONENT; - return (-1); - } - - for (loc = path; *loc; loc++) { - if (!valid_char(*loc)) { - if (why) { - *why = NAME_ERR_INVALCHAR; - *what = *loc; - } - return (-1); - } - } - return (0); -} - - -/* - * Permissions set name must start with the letter '@' followed by the - * same character restrictions as snapshot names, except that the name - * cannot exceed 64 characters. - * - * Returns 0 on success, -1 on error. - */ -int -permset_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - if (strlen(path) >= ZFS_PERMSET_MAXLEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - if (path[0] != '@') { - if (why) { - *why = NAME_ERR_NO_AT; - *what = path[0]; - } - return (-1); - } - - return (zfs_component_namecheck(&path[1], why, what)); -} - -/* - * Dataset paths should not be deeper than zfs_max_dataset_nesting - * in terms of nesting. - * - * Returns 0 on success, -1 on error. - */ -int -dataset_nestcheck(const char *path) -{ - return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1); -} - -/* - * Entity names must be of the following form: - * - * [component/]*[component][(@|#)component]? - * - * Where each component is made up of alphanumeric characters plus the following - * characters: - * - * [-_.:%] - * - * We allow '%' here as we use that character internally to create unique - * names for temporary clones (for online recv). - * - * Returns 0 on success, -1 on error. - */ -int -entity_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - const char *end; - - /* - * Make sure the name is not too long. 
- */ - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - /* Explicitly check for a leading slash. */ - if (path[0] == '/') { - if (why) - *why = NAME_ERR_LEADING_SLASH; - return (-1); - } - - if (path[0] == '\0') { - if (why) - *why = NAME_ERR_EMPTY_COMPONENT; - return (-1); - } - - const char *start = path; - boolean_t found_delim = B_FALSE; - for (;;) { - /* Find the end of this component */ - end = start; - while (*end != '/' && *end != '@' && *end != '#' && - *end != '\0') - end++; - - if (*end == '\0' && end[-1] == '/') { - /* trailing slashes are not allowed */ - if (why) - *why = NAME_ERR_TRAILING_SLASH; - return (-1); - } - - /* Validate the contents of this component */ - for (const char *loc = start; loc != end; loc++) { - if (!valid_char(*loc) && *loc != '%') { - if (why) { - *why = NAME_ERR_INVALCHAR; - *what = *loc; - } - return (-1); - } - } - - /* Snapshot or bookmark delimiter found */ - if (*end == '@' || *end == '#') { - /* Multiple delimiters are not allowed */ - if (found_delim != 0) { - if (why) - *why = NAME_ERR_MULTIPLE_DELIMITERS; - return (-1); - } - - found_delim = B_TRUE; - } - - /* Zero-length components are not allowed */ - if (start == end) { - if (why) - *why = NAME_ERR_EMPTY_COMPONENT; - return (-1); - } - - /* If we've reached the end of the string, we're OK */ - if (*end == '\0') - return (0); - - /* - * If there is a '/' in a snapshot or bookmark name - * then report an error - */ - if (*end == '/' && found_delim != 0) { - if (why) - *why = NAME_ERR_TRAILING_SLASH; - return (-1); - } - - /* Update to the next component */ - start = end + 1; - } -} - -/* - * Dataset is any entity, except bookmark - */ -int -dataset_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - int ret = entity_namecheck(path, why, what); - - if (ret == 0 && strchr(path, '#') != NULL) { - if (why != NULL) { - *why = NAME_ERR_INVALCHAR; - *what = '#'; - } - return (-1); - } - - return 
(ret); -} - -/* - * mountpoint names must be of the following form: - * - * /[component][/]*[component][/] - * - * Returns 0 on success, -1 on error. - */ -int -mountpoint_namecheck(const char *path, namecheck_err_t *why) -{ - const char *start, *end; - - /* - * Make sure none of the mountpoint component names are too long. - * If a component name is too long then the mkdir of the mountpoint - * will fail but then the mountpoint property will be set to a value - * that can never be mounted. Better to fail before setting the prop. - * Extra slashes are OK, they will be tossed by the mountpoint mkdir. - */ - - if (path == NULL || *path != '/') { - if (why) - *why = NAME_ERR_LEADING_SLASH; - return (-1); - } - - /* Skip leading slash */ - start = &path[1]; - do { - end = start; - while (*end != '/' && *end != '\0') - end++; - - if (end - start >= ZFS_MAX_DATASET_NAME_LEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - start = end + 1; - - } while (*end != '\0'); - - return (0); -} - -/* - * For pool names, we have the same set of valid characters as described in - * dataset names, with the additional restriction that the pool name must begin - * with a letter. The pool names 'raidz' and 'mirror' are also reserved names - * that cannot be used. - * - * Returns 0 on success, -1 on error. - */ -int -pool_namecheck(const char *pool, namecheck_err_t *why, char *what) -{ - const char *c; - - /* - * Make sure the name is not too long. - * If we're creating a pool with version >= SPA_VERSION_DSL_SCRUB (v11) - * we need to account for additional space needed by the origin ds which - * will also be snapshotted: "poolname"+"/"+"$ORIGIN"+"@"+"$ORIGIN". - * Play it safe and enforce this limit even if the pool version is < 11 - * so it can be upgraded without issues. 
- */ - if (strlen(pool) >= (ZFS_MAX_DATASET_NAME_LEN - 2 - - strlen(ORIGIN_DIR_NAME) * 2)) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - c = pool; - while (*c != '\0') { - if (!valid_char(*c)) { - if (why) { - *why = NAME_ERR_INVALCHAR; - *what = *c; - } - return (-1); - } - c++; - } - - if (!(*pool >= 'a' && *pool <= 'z') && - !(*pool >= 'A' && *pool <= 'Z')) { - if (why) - *why = NAME_ERR_NOLETTER; - return (-1); - } - - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { - if (why) - *why = NAME_ERR_RESERVED; - return (-1); - } - - if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) { - if (why) - *why = NAME_ERR_DISKLIKE; - return (-1); - } - - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h deleted file mode 100644 index 527db92b0cfa..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. 
All rights reserved. - */ - -#ifndef _ZFS_NAMECHECK_H -#define _ZFS_NAMECHECK_H - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - NAME_ERR_LEADING_SLASH, /* name begins with leading slash */ - NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */ - NAME_ERR_TRAILING_SLASH, /* name ends with a slash */ - NAME_ERR_INVALCHAR, /* invalid character found */ - NAME_ERR_MULTIPLE_DELIMITERS, /* multiple '@'/'#' delimiters found */ - NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */ - NAME_ERR_RESERVED, /* entire name is reserved */ - NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ - NAME_ERR_TOOLONG, /* name is too long */ - NAME_ERR_NO_AT, /* permission set is missing '@' */ -} namecheck_err_t; - -#define ZFS_PERMSET_MAXLEN 64 - -extern int zfs_max_dataset_nesting; - -int get_dataset_depth(const char *); -int pool_namecheck(const char *, namecheck_err_t *, char *); -int entity_namecheck(const char *, namecheck_err_t *, char *); -int dataset_namecheck(const char *, namecheck_err_t *, char *); -int dataset_nestcheck(const char *); -int mountpoint_namecheck(const char *, namecheck_err_t *); -int zfs_component_namecheck(const char *, namecheck_err_t *, char *); -int permset_namecheck(const char *, namecheck_err_t *, char *); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_NAMECHECK_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c deleted file mode 100644 index ac8da491a9ec..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ /dev/null @@ -1,718 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" -#include "zfs_deleg.h" - -#if defined(_KERNEL) -#include -#else -#include -#include -#include -#endif - -static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; - -/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ -const char *zfs_userquota_prop_prefixes[] = { - "userused@", - "userquota@", - "groupused@", - "groupquota@" -}; - -zprop_desc_t * -zfs_prop_get_table(void) -{ - return (zfs_prop_table); -} - -void -zfs_prop_init(void) -{ - static zprop_index_t checksum_table[] = { - { "on", ZIO_CHECKSUM_ON }, - { "off", ZIO_CHECKSUM_OFF }, - { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, - { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, - { "sha256", ZIO_CHECKSUM_SHA256 }, - { "noparity", ZIO_CHECKSUM_NOPARITY }, - { "sha512", ZIO_CHECKSUM_SHA512 }, - { "skein", ZIO_CHECKSUM_SKEIN }, -#ifdef illumos - { "edonr", ZIO_CHECKSUM_EDONR }, -#endif - { NULL } - }; - - static zprop_index_t dedup_table[] = { - { "on", ZIO_CHECKSUM_ON }, - { "off", ZIO_CHECKSUM_OFF }, - { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, - { "sha256", 
ZIO_CHECKSUM_SHA256 }, - { "sha256,verify", - ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, - { "sha512", ZIO_CHECKSUM_SHA512 }, - { "sha512,verify", - ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY }, - { "skein", ZIO_CHECKSUM_SKEIN }, - { "skein,verify", - ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, -#ifdef illumos - { "edonr,verify", - ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, -#endif - { NULL } - }; - - static zprop_index_t compress_table[] = { - { "on", ZIO_COMPRESS_ON }, - { "off", ZIO_COMPRESS_OFF }, - { "lzjb", ZIO_COMPRESS_LZJB }, - { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ - { "gzip-1", ZIO_COMPRESS_GZIP_1 }, - { "gzip-2", ZIO_COMPRESS_GZIP_2 }, - { "gzip-3", ZIO_COMPRESS_GZIP_3 }, - { "gzip-4", ZIO_COMPRESS_GZIP_4 }, - { "gzip-5", ZIO_COMPRESS_GZIP_5 }, - { "gzip-6", ZIO_COMPRESS_GZIP_6 }, - { "gzip-7", ZIO_COMPRESS_GZIP_7 }, - { "gzip-8", ZIO_COMPRESS_GZIP_8 }, - { "gzip-9", ZIO_COMPRESS_GZIP_9 }, - { "zle", ZIO_COMPRESS_ZLE }, - { "lz4", ZIO_COMPRESS_LZ4 }, - { NULL } - }; - - static zprop_index_t snapdir_table[] = { - { "hidden", ZFS_SNAPDIR_HIDDEN }, - { "visible", ZFS_SNAPDIR_VISIBLE }, - { NULL } - }; - - static zprop_index_t acl_mode_table[] = { - { "discard", ZFS_ACL_DISCARD }, - { "groupmask", ZFS_ACL_GROUPMASK }, - { "passthrough", ZFS_ACL_PASSTHROUGH }, - { "restricted", ZFS_ACL_RESTRICTED }, - { NULL } - }; - - static zprop_index_t acl_inherit_table[] = { - { "discard", ZFS_ACL_DISCARD }, - { "noallow", ZFS_ACL_NOALLOW }, - { "restricted", ZFS_ACL_RESTRICTED }, - { "passthrough", ZFS_ACL_PASSTHROUGH }, - { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */ - { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, - { NULL } - }; - - static zprop_index_t case_table[] = { - { "sensitive", ZFS_CASE_SENSITIVE }, - { "insensitive", ZFS_CASE_INSENSITIVE }, - { "mixed", ZFS_CASE_MIXED }, - { NULL } - }; - - static zprop_index_t copies_table[] = { - { "1", 1 }, - { "2", 2 }, - { "3", 3 }, - { NULL } - }; - - /* - * Use the unique flags we have to 
send to u8_strcmp() and/or - * u8_textprep() to represent the various normalization property - * values. - */ - static zprop_index_t normalize_table[] = { - { "none", 0 }, - { "formD", U8_TEXTPREP_NFD }, - { "formKC", U8_TEXTPREP_NFKC }, - { "formC", U8_TEXTPREP_NFC }, - { "formKD", U8_TEXTPREP_NFKD }, - { NULL } - }; - - static zprop_index_t version_table[] = { - { "1", 1 }, - { "2", 2 }, - { "3", 3 }, - { "4", 4 }, - { "5", 5 }, - { "current", ZPL_VERSION }, - { NULL } - }; - - static zprop_index_t boolean_table[] = { - { "off", 0 }, - { "on", 1 }, - { NULL } - }; - - static zprop_index_t logbias_table[] = { - { "latency", ZFS_LOGBIAS_LATENCY }, - { "throughput", ZFS_LOGBIAS_THROUGHPUT }, - { NULL } - }; - - static zprop_index_t canmount_table[] = { - { "off", ZFS_CANMOUNT_OFF }, - { "on", ZFS_CANMOUNT_ON }, - { "noauto", ZFS_CANMOUNT_NOAUTO }, - { NULL } - }; - - static zprop_index_t cache_table[] = { - { "none", ZFS_CACHE_NONE }, - { "metadata", ZFS_CACHE_METADATA }, - { "all", ZFS_CACHE_ALL }, - { NULL } - }; - - static zprop_index_t sync_table[] = { - { "standard", ZFS_SYNC_STANDARD }, - { "always", ZFS_SYNC_ALWAYS }, - { "disabled", ZFS_SYNC_DISABLED }, - { NULL } - }; - - static zprop_index_t volmode_table[] = { - { "default", ZFS_VOLMODE_DEFAULT }, - { "geom", ZFS_VOLMODE_GEOM }, - { "dev", ZFS_VOLMODE_DEV }, - { "none", ZFS_VOLMODE_NONE }, - { NULL } - }; - - static zprop_index_t dnsize_table[] = { - { "legacy", ZFS_DNSIZE_LEGACY }, - { "auto", ZFS_DNSIZE_AUTO }, - { "1k", ZFS_DNSIZE_1K }, - { "2k", ZFS_DNSIZE_2K }, - { "4k", ZFS_DNSIZE_4K }, - { "8k", ZFS_DNSIZE_8K }, - { "16k", ZFS_DNSIZE_16K }, - { NULL } - }; - - static zprop_index_t redundant_metadata_table[] = { - { "all", ZFS_REDUNDANT_METADATA_ALL }, - { "most", ZFS_REDUNDANT_METADATA_MOST }, - { NULL } - }; - - /* inherit index properties */ - zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", - ZFS_REDUNDANT_METADATA_ALL, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | 
ZFS_TYPE_VOLUME, - "all | most", "REDUND_MD", - redundant_metadata_table); - zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "standard | always | disabled", "SYNC", - sync_table); - zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", - ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME, - "on | off | fletcher2 | fletcher4 | sha256 | sha512 | " - "skein", "CHECKSUM", checksum_table); - zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | verify | sha256[,verify], sha512[,verify], " - "skein[,verify]", "DEDUP", dedup_table); - zprop_register_index(ZFS_PROP_COMPRESSION, "compression", - ZIO_COMPRESS_DEFAULT, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", - "COMPRESS", compress_table); - zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "hidden | visible", "SNAPDIR", snapdir_table); - zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "discard | groupmask | passthrough | restricted", "ACLMODE", - acl_mode_table); - zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", - ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "discard | noallow | restricted | passthrough | passthrough-x", - "ACLINHERIT", acl_inherit_table); - zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "1 | 2 | 3", "COPIES", copies_table); - zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", - ZFS_CACHE_ALL, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, - "all | none | metadata", "PRIMARYCACHE", cache_table); - zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", - ZFS_CACHE_ALL, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | 
ZFS_TYPE_VOLUME, - "all | none | metadata", "SECONDARYCACHE", cache_table); - zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "latency | throughput", "LOGBIAS", logbias_table); - zprop_register_index(ZFS_PROP_VOLMODE, "volmode", - ZFS_VOLMODE_DEFAULT, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, - "default | geom | dev | none", "VOLMODE", volmode_table); - - zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", - ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); - - /* inherit index (boolean) properties */ - zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); - zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", - boolean_table); - zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", - boolean_table); - zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", - boolean_table); - zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", - boolean_table); - zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); - zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", - boolean_table); - zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", - boolean_table); - zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", - boolean_table); - - /* default index 
properties */ - zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); - zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, - PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", - "CANMOUNT", canmount_table); - - /* readonly index (boolean) properties */ - zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, - ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); - zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, - PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", - boolean_table); - - /* set once index properties */ - zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, - PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "none | formC | formD | formKC | formKD", "NORMALIZATION", - normalize_table); - zprop_register_index(ZFS_PROP_CASE, "casesensitivity", - ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_SNAPSHOT, - "sensitive | insensitive | mixed", "CASE", case_table); - - /* set once index (boolean) properties */ - zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "on | off", "UTF8ONLY", boolean_table); - - /* string properties */ - zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); - zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, - ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); - zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", - "MOUNTPOINT"); - zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", - "SHARENFS"); - zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, - "filesystem | volume | 
snapshot | bookmark", "TYPE"); - zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "on | off | sharemgr(1M) options", "SHARESMB"); - zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", - ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, - "", "MLSLABEL"); - zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, - "receive_resume_token", - NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "", "RESUMETOK"); - - /* readonly number properties */ - zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, - ZFS_TYPE_DATASET, "", "USED"); - zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); - zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, - PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); - zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, - PROP_READONLY, ZFS_TYPE_DATASET, - "<1.00x or higher if compressed>", "RATIO"); - zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, - PROP_READONLY, ZFS_TYPE_DATASET, - "<1.00x or higher if compressed>", "REFRATIO"); - zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", - ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, - ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); - zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "USEDSNAP"); - zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "USEDDS"); - zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "USEDCHILD"); - zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, - PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); - zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, - ZFS_TYPE_SNAPSHOT, "", "USERREFS"); - 
zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, - ZFS_TYPE_DATASET, "", "WRITTEN"); - zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "LUSED"); - zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", - 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); - - /* default number properties */ - zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, - ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); - zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, - PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - " | none", "RESERV"); - zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, - ZFS_TYPE_VOLUME, "", "VOLSIZE"); - zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, - ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); - zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, - PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - " | none", "REFRESERV"); - zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, - " | none", "FSLIMIT"); - zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - " | none", "SSLIMIT"); - zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, - "", "FSCOUNT"); - zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "", "SSCOUNT"); - zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "GUID"); - zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATETXG"); - - /* inherit number properties */ - zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", - SPA_OLD_MAXBLOCKSIZE, 
PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); - zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, - "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS"); - - /* hidden properties */ - zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); - zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); - zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, - PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME"); - zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", - PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); - zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", - PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, - "STMF_SBD_LU"); - zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, - "USERACCOUNTING"); - zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); - zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); - zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); - zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); - - /* oddball properties */ - zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, - NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, - "", "CREATION", B_FALSE, B_TRUE, NULL); -} - -boolean_t -zfs_prop_delegatable(zfs_prop_t prop) -{ - zprop_desc_t *pd = &zfs_prop_table[prop]; - - /* The mlslabel property is never delegatable. 
*/ - if (prop == ZFS_PROP_MLSLABEL) - return (B_FALSE); - - return (pd->pd_attr != PROP_READONLY); -} - -/* - * Given a zfs dataset property name, returns the corresponding property ID. - */ -zfs_prop_t -zfs_name_to_prop(const char *propname) -{ - return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); -} - -/* - * For user property names, we allow all lowercase alphanumeric characters, plus - * a few useful punctuation characters. - */ -static int -valid_char(char c) -{ - return ((c >= 'a' && c <= 'z') || - (c >= '0' && c <= '9') || - c == '-' || c == '_' || c == '.' || c == ':'); -} - -/* - * Returns true if this is a valid user-defined property (one with a ':'). - */ -boolean_t -zfs_prop_user(const char *name) -{ - int i; - char c; - boolean_t foundsep = B_FALSE; - - for (i = 0; i < strlen(name); i++) { - c = name[i]; - if (!valid_char(c)) - return (B_FALSE); - if (c == ':') - foundsep = B_TRUE; - } - - if (!foundsep) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Returns true if this is a valid userspace-type property (one with a '@'). - * Note that after the @, any character is valid (eg, another @, for SID - * user@domain). - */ -boolean_t -zfs_prop_userquota(const char *name) -{ - zfs_userquota_prop_t prop; - - for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { - if (strncmp(name, zfs_userquota_prop_prefixes[prop], - strlen(zfs_userquota_prop_prefixes[prop])) == 0) { - return (B_TRUE); - } - } - - return (B_FALSE); -} - -/* - * Returns true if this is a valid written@ property. - * Note that after the @, any character is valid (eg, another @, for - * written@pool/fs@origin). - */ -boolean_t -zfs_prop_written(const char *name) -{ - static const char *prefix = "written@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); -} - -/* - * Tables of index types, plus functions to convert between the user view - * (strings) and internal representation (uint64_t). 
- */ -int -zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) -{ - return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); -} - -int -zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) -{ - return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); -} - -uint64_t -zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) -{ - return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); -} - -/* - * Returns TRUE if the property applies to any of the given dataset types. - */ -boolean_t -zfs_prop_valid_for_type(int prop, zfs_type_t types) -{ - return (zprop_valid_for_type(prop, types)); -} - -zprop_type_t -zfs_prop_get_type(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_proptype); -} - -/* - * Returns TRUE if the property is readonly. - */ -boolean_t -zfs_prop_readonly(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_attr == PROP_READONLY || - zfs_prop_table[prop].pd_attr == PROP_ONETIME); -} - -/* - * Returns TRUE if the property is visible (not hidden). - */ -boolean_t -zfs_prop_visible(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_visible); -} - -/* - * Returns TRUE if the property is only allowed to be set once. - */ -boolean_t -zfs_prop_setonce(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_attr == PROP_ONETIME); -} - -const char * -zfs_prop_default_string(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_strdefault); -} - -uint64_t -zfs_prop_default_numeric(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_numdefault); -} - -/* - * Given a dataset property ID, returns the corresponding name. - * Assuming the zfs dataset property ID is valid. - */ -const char * -zfs_prop_to_name(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_name); -} - -/* - * Returns TRUE if the property is inheritable. 
- */ -boolean_t -zfs_prop_inheritable(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || - zfs_prop_table[prop].pd_attr == PROP_ONETIME); -} - -#ifndef _KERNEL - -/* - * Returns a string describing the set of acceptable values for the given - * zfs property, or NULL if it cannot be set. - */ -const char * -zfs_prop_values(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_values); -} - -/* - * Returns TRUE if this property is a string type. Note that index types - * (compression, checksum) are treated as strings in userland, even though they - * are stored numerically on disk. - */ -int -zfs_prop_is_string(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || - zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); -} - -/* - * Returns the column header for the given property. Used only in - * 'zfs list -o', but centralized here with the other property information. - */ -const char * -zfs_prop_column_name(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_colname); -} - -/* - * Returns whether the given property should be displayed right-justified for - * 'zfs list'. - */ -boolean_t -zfs_prop_align_right(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_rightalign); -} - -#endif diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h deleted file mode 100644 index e604abda131d..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _ZFS_PROP_H -#define _ZFS_PROP_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * For index types (e.g. compression and checksum), we want the numeric value - * in the kernel, but the string value in userland. - */ -typedef enum { - PROP_TYPE_NUMBER, /* numeric value */ - PROP_TYPE_STRING, /* string value */ - PROP_TYPE_INDEX /* numeric value indexed by string */ -} zprop_type_t; - -typedef enum { - PROP_DEFAULT, - PROP_READONLY, - PROP_INHERIT, - /* - * ONETIME properties are a sort of conglomeration of READONLY - * and INHERIT. They can be set only during object creation, - * after that they are READONLY. If not explicitly set during - * creation, they can be inherited. 
- */ - PROP_ONETIME -} zprop_attr_t; - -typedef struct zfs_index { - const char *pi_name; - uint64_t pi_value; -} zprop_index_t; - -typedef struct { - const char *pd_name; /* human-readable property name */ - int pd_propnum; /* property number */ - zprop_type_t pd_proptype; /* string, boolean, index, number */ - const char *pd_strdefault; /* default for strings */ - uint64_t pd_numdefault; /* for boolean / index / number */ - zprop_attr_t pd_attr; /* default, readonly, inherit */ - int pd_types; /* bitfield of valid dataset types */ - /* fs | vol | snap; or pool */ - const char *pd_values; /* string telling acceptable values */ - const char *pd_colname; /* column header for "zfs list" */ - boolean_t pd_rightalign; /* column alignment for "zfs list" */ - boolean_t pd_visible; /* do we list this property with the */ - /* "zfs get" help message */ - const zprop_index_t *pd_table; /* for index properties, a table */ - /* defining the possible values */ - size_t pd_table_size; /* number of entries in pd_table[] */ -} zprop_desc_t; - -/* - * zfs dataset property functions - */ -void zfs_prop_init(void); -zprop_type_t zfs_prop_get_type(zfs_prop_t); -boolean_t zfs_prop_delegatable(zfs_prop_t prop); -zprop_desc_t *zfs_prop_get_table(void); - -/* - * zpool property functions - */ -void zpool_prop_init(void); -zprop_type_t zpool_prop_get_type(zpool_prop_t); -zprop_desc_t *zpool_prop_get_table(void); - -/* - * Common routines to initialize property tables - */ -void zprop_register_impl(int, const char *, zprop_type_t, uint64_t, - const char *, zprop_attr_t, int, const char *, const char *, - boolean_t, boolean_t, const zprop_index_t *); -void zprop_register_string(int, const char *, const char *, - zprop_attr_t attr, int, const char *, const char *); -void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int, - const char *, const char *); -void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int, - const char *, const char *, const 
zprop_index_t *); -void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t, - int, const char *); - -/* - * Common routines for zfs and zpool property management - */ -int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); -int zprop_name_to_prop(const char *, zfs_type_t); -int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); -int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); -uint64_t zprop_random_value(int, uint64_t, zfs_type_t); -const char *zprop_values(int, zfs_type_t); -size_t zprop_width(int, boolean_t *, zfs_type_t); -boolean_t zprop_valid_for_type(int, zfs_type_t); -boolean_t zfs_prop_written(const char *name); - - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_PROP_H */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c deleted file mode 100644 index d17c7fd98043..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include - -#include "zfs_prop.h" - -#if defined(_KERNEL) -#include -#else -#include -#include -#include -#endif - -static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; - -zprop_desc_t * -zpool_prop_get_table(void) -{ - return (zpool_prop_table); -} - -void -zpool_prop_init(void) -{ - static zprop_index_t boolean_table[] = { - { "off", 0}, - { "on", 1}, - { NULL } - }; - - static zprop_index_t failuremode_table[] = { - { "wait", ZIO_FAILURE_MODE_WAIT }, - { "continue", ZIO_FAILURE_MODE_CONTINUE }, - { "panic", ZIO_FAILURE_MODE_PANIC }, - { NULL } - }; - - /* string properties */ - zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, - ZFS_TYPE_POOL, "", "ALTROOT"); - zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, - ZFS_TYPE_POOL, "", "BOOTFS"); - zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, - PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE"); - zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, - PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT"); - - /* readonly number properties */ - zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "SIZE"); - zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "FREE"); - zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "FREEING"); - zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, - PROP_READONLY, ZFS_TYPE_POOL, "", "CKPOINT"); - zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "LEAKED"); - zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, - PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); - zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, - PROP_READONLY, 
ZFS_TYPE_POOL, "", "EXPANDSZ"); - zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, - PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG"); - zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "CAP"); - zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "GUID"); - zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "HEALTH"); - zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, - PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", - "DEDUP"); - - /* system partition size */ - zprop_register_number(ZPOOL_PROP_BOOTSIZE, "bootsize", 0, PROP_ONETIME, - ZFS_TYPE_POOL, "", "BOOTSIZE"); - - /* default number properties */ - zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, - PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); - zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); - - /* default index (boolean) properties */ - zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", - boolean_table); - zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); - zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", - boolean_table); - zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); - zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table); - zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST", - boolean_table); - - /* default index properties */ - zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", - ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, - 
"wait | continue | panic", "FAILMODE", failuremode_table); - - /* hidden properties */ - zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, - PROP_READONLY, ZFS_TYPE_POOL, "NAME"); - zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); - zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, - PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); - zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE"); -} - -/* - * Given a property name and its type, returns the corresponding property ID. - */ -zpool_prop_t -zpool_name_to_prop(const char *propname) -{ - return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); -} - -/* - * Given a pool property ID, returns the corresponding name. - * Assuming the pool propety ID is valid. - */ -const char * -zpool_prop_to_name(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_name); -} - -zprop_type_t -zpool_prop_get_type(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_proptype); -} - -boolean_t -zpool_prop_readonly(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_attr == PROP_READONLY); -} - -const char * -zpool_prop_default_string(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_strdefault); -} - -uint64_t -zpool_prop_default_numeric(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_numdefault); -} - -/* - * Returns true if this is a valid feature@ property. - */ -boolean_t -zpool_prop_feature(const char *name) -{ - static const char *prefix = "feature@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); -} - -/* - * Returns true if this is a valid unsupported@ property. 
- */ -boolean_t -zpool_prop_unsupported(const char *name) -{ - static const char *prefix = "unsupported@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); -} - -int -zpool_prop_string_to_index(zpool_prop_t prop, const char *string, - uint64_t *index) -{ - return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); -} - -int -zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, - const char **string) -{ - return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); -} - -uint64_t -zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) -{ - return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); -} - -#ifndef _KERNEL - -const char * -zpool_prop_values(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_values); -} - -const char * -zpool_prop_column_name(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_colname); -} - -boolean_t -zpool_prop_align_right(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_rightalign); -} -#endif diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c deleted file mode 100644 index ca2e72c5daa4..000000000000 --- a/sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -/* - * Common routines used by zfs and zpool property management. - */ - -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" -#include "zfs_deleg.h" - -#if defined(_KERNEL) -#include -#include -#else -#include -#include -#include -#endif - -static zprop_desc_t * -zprop_get_proptable(zfs_type_t type) -{ - if (type == ZFS_TYPE_POOL) - return (zpool_prop_get_table()); - else - return (zfs_prop_get_table()); -} - -static int -zprop_get_numprops(zfs_type_t type) -{ - if (type == ZFS_TYPE_POOL) - return (ZPOOL_NUM_PROPS); - else - return (ZFS_NUM_PROPS); -} - -void -zprop_register_impl(int prop, const char *name, zprop_type_t type, - uint64_t numdefault, const char *strdefault, zprop_attr_t attr, - int objset_types, const char *values, const char *colname, - boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl) -{ - zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types); - zprop_desc_t *pd; - - pd = &prop_tbl[prop]; - - ASSERT(pd->pd_name == NULL || pd->pd_name == name); - ASSERT(name != NULL); - ASSERT(colname != NULL); - - pd->pd_name = name; - pd->pd_propnum = prop; - pd->pd_proptype = type; - pd->pd_numdefault = numdefault; - pd->pd_strdefault = strdefault; - pd->pd_attr = attr; - pd->pd_types = objset_types; - pd->pd_values = values; - pd->pd_colname = colname; - pd->pd_rightalign = rightalign; - pd->pd_visible = visible; - pd->pd_table = idx_tbl; - pd->pd_table_size = 0; - while (idx_tbl && (idx_tbl++)->pi_name != NULL) - pd->pd_table_size++; -} - -void -zprop_register_string(int prop, const char *name, 
const char *def, - zprop_attr_t attr, int objset_types, const char *values, - const char *colname) -{ - zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, - objset_types, values, colname, B_FALSE, B_TRUE, NULL); - -} - -void -zprop_register_number(int prop, const char *name, uint64_t def, - zprop_attr_t attr, int objset_types, const char *values, - const char *colname) -{ - zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr, - objset_types, values, colname, B_TRUE, B_TRUE, NULL); -} - -void -zprop_register_index(int prop, const char *name, uint64_t def, - zprop_attr_t attr, int objset_types, const char *values, - const char *colname, const zprop_index_t *idx_tbl) -{ - zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, - objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl); -} - -void -zprop_register_hidden(int prop, const char *name, zprop_type_t type, - zprop_attr_t attr, int objset_types, const char *colname) -{ - zprop_register_impl(prop, name, type, 0, NULL, attr, - objset_types, NULL, colname, - type == PROP_TYPE_NUMBER, B_FALSE, NULL); -} - - -/* - * A comparison function we can use to order indexes into property tables. - */ -static int -zprop_compare(const void *arg1, const void *arg2) -{ - const zprop_desc_t *p1 = *((zprop_desc_t **)arg1); - const zprop_desc_t *p2 = *((zprop_desc_t **)arg2); - boolean_t p1ro, p2ro; - - p1ro = (p1->pd_attr == PROP_READONLY); - p2ro = (p2->pd_attr == PROP_READONLY); - - if (p1ro == p2ro) - return (strcmp(p1->pd_name, p2->pd_name)); - - return (p1ro ? -1 : 1); -} - -/* - * Iterate over all properties in the given property table, calling back - * into the specified function for each property. We will continue to - * iterate until we either reach the end or the callback function returns - * something other than ZPROP_CONT. 
- */ -int -zprop_iter_common(zprop_func func, void *cb, boolean_t show_all, - boolean_t ordered, zfs_type_t type) -{ - int i, j, num_props, size, prop; - zprop_desc_t *prop_tbl; - zprop_desc_t **order; - - prop_tbl = zprop_get_proptable(type); - num_props = zprop_get_numprops(type); - size = num_props * sizeof (zprop_desc_t *); - -#if defined(_KERNEL) - order = kmem_alloc(size, KM_SLEEP); -#else - if ((order = malloc(size)) == NULL) - return (ZPROP_CONT); -#endif - - for (j = 0; j < num_props; j++) - order[j] = &prop_tbl[j]; - - if (ordered) { - qsort((void *)order, num_props, sizeof (zprop_desc_t *), - zprop_compare); - } - - prop = ZPROP_CONT; - for (i = 0; i < num_props; i++) { - if ((order[i]->pd_visible || show_all) && - (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) { - prop = order[i]->pd_propnum; - break; - } - } - -#if defined(_KERNEL) - kmem_free(order, size); -#else - free(order); -#endif - return (prop); -} - -static boolean_t -propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) -{ - const char *propname = prop_entry->pd_name; -#ifndef _KERNEL - const char *colname = prop_entry->pd_colname; - int c; -#endif - - if (len == strlen(propname) && - strncmp(p, propname, len) == 0) - return (B_TRUE); - -#ifndef _KERNEL - if (colname == NULL || len != strlen(colname)) - return (B_FALSE); - - for (c = 0; c < len; c++) - if (p[c] != tolower(colname[c])) - break; - - return (colname[c] == '\0'); -#else - return (B_FALSE); -#endif -} - -typedef struct name_to_prop_cb { - const char *propname; - zprop_desc_t *prop_tbl; -} name_to_prop_cb_t; - -static int -zprop_name_to_prop_cb(int prop, void *cb_data) -{ - name_to_prop_cb_t *data = cb_data; - - if (propname_match(data->propname, strlen(data->propname), - &data->prop_tbl[prop])) - return (prop); - - return (ZPROP_CONT); -} - -int -zprop_name_to_prop(const char *propname, zfs_type_t type) -{ - int prop; - name_to_prop_cb_t cb_data; - - cb_data.propname = propname; - cb_data.prop_tbl = 
zprop_get_proptable(type); - - prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data, - B_TRUE, B_FALSE, type); - - return (prop == ZPROP_CONT ? ZPROP_INVAL : prop); -} - -int -zprop_string_to_index(int prop, const char *string, uint64_t *index, - zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - const zprop_index_t *idx_tbl; - int i; - - if (prop == ZPROP_INVAL || prop == ZPROP_CONT) - return (-1); - - ASSERT(prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) - return (-1); - - for (i = 0; idx_tbl[i].pi_name != NULL; i++) { - if (strcmp(string, idx_tbl[i].pi_name) == 0) { - *index = idx_tbl[i].pi_value; - return (0); - } - } - - return (-1); -} - -int -zprop_index_to_string(int prop, uint64_t index, const char **string, - zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - const zprop_index_t *idx_tbl; - int i; - - if (prop == ZPROP_INVAL || prop == ZPROP_CONT) - return (-1); - - ASSERT(prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) - return (-1); - - for (i = 0; idx_tbl[i].pi_name != NULL; i++) { - if (idx_tbl[i].pi_value == index) { - *string = idx_tbl[i].pi_name; - return (0); - } - } - - return (-1); -} - -/* - * Return a random valid property value. Used by ztest. 
- */ -uint64_t -zprop_random_value(int prop, uint64_t seed, zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - const zprop_index_t *idx_tbl; - - ASSERT((uint_t)prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - idx_tbl = prop_tbl[prop].pd_table; - - if (idx_tbl == NULL) - return (seed); - - return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value); -} - -const char * -zprop_values(int prop, zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - - ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); - ASSERT(prop < zprop_get_numprops(type)); - - prop_tbl = zprop_get_proptable(type); - - return (prop_tbl[prop].pd_values); -} - -/* - * Returns TRUE if the property applies to any of the given dataset types. - */ -boolean_t -zprop_valid_for_type(int prop, zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - - if (prop == ZPROP_INVAL || prop == ZPROP_CONT) - return (B_FALSE); - - ASSERT(prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - return ((prop_tbl[prop].pd_types & type) != 0); -} - -#ifndef _KERNEL - -/* - * Determines the minimum width for the column, and indicates whether it's fixed - * or not. Only string columns are non-fixed. - */ -size_t -zprop_width(int prop, boolean_t *fixed, zfs_type_t type) -{ - zprop_desc_t *prop_tbl, *pd; - const zprop_index_t *idx; - size_t ret; - int i; - - ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); - ASSERT(prop < zprop_get_numprops(type)); - - prop_tbl = zprop_get_proptable(type); - pd = &prop_tbl[prop]; - - *fixed = B_TRUE; - - /* - * Start with the width of the column name. - */ - ret = strlen(pd->pd_colname); - - /* - * For fixed-width values, make sure the width is large enough to hold - * any possible value. - */ - switch (pd->pd_proptype) { - case PROP_TYPE_NUMBER: - /* - * The maximum length of a human-readable number is 5 characters - * ("20.4M", for example). 
- */ - if (ret < 5) - ret = 5; - /* - * 'creation' is handled specially because it's a number - * internally, but displayed as a date string. - */ - if (prop == ZFS_PROP_CREATION) - *fixed = B_FALSE; - break; - case PROP_TYPE_INDEX: - idx = prop_tbl[prop].pd_table; - for (i = 0; idx[i].pi_name != NULL; i++) { - if (strlen(idx[i].pi_name) > ret) - ret = strlen(idx[i].pi_name); - } - break; - - case PROP_TYPE_STRING: - *fixed = B_FALSE; - break; - } - - return (ret); -} - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c index 8399be770bb0..3d68a68ba819 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -67,13 +67,15 @@ * on capital-f functions. */ #include +#include +#include #ifndef illumos #include #endif #include -#include #include #include +#include #ifdef illumos #include #include @@ -96,7 +98,6 @@ #include #include #endif -#include #ifdef illumos #include #include @@ -119,6 +120,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +131,13 @@ #include #include + +#include +#undef AT_UID +#undef AT_GID +#include +#include + #include #include @@ -299,8 +308,10 @@ static kmutex_t dtrace_meta_lock; /* meta-provider state lock */ #define ipaddr_t in_addr_t #define mod_modname pathname #define vuprintf vprintf +#ifndef crgetzoneid +#define crgetzoneid(_a) 0 +#endif #define ttoproc(_a) ((_a)->td_proc) -#define crgetzoneid(_a) 0 #define SNOCD 0 #define CPU_ON_INTR(_a) 0 @@ -491,7 +502,7 @@ do { \ if ((remp) != NULL) { \ *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c index d5be43f0c3d1..4771a67a9f09 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c +++ 
b/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,8 @@ #include #include #include +#undef AT_UID +#undef AT_GID #include #ifdef illumos #include diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c deleted file mode 100644 index 6d82470d220a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ - -/* - * University Copyright- Copyright (c) 1982, 1986, 1988 - * The Regents of the University of California - * All Rights Reserved - * - * University Acknowledgment- Portions of this document are derived from - * software developed by the University of California, Berkeley, and its - * contributors. - */ - -#include -#include -#include -#include -#include - -/* Extensible attribute (xva) routines. 
*/ - -/* - * Zero out the structure, set the size of the requested/returned bitmaps, - * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer - * to the returned attributes array. - */ -void -xva_init(xvattr_t *xvap) -{ - bzero(xvap, sizeof (xvattr_t)); - xvap->xva_mapsize = XVA_MAPSIZE; - xvap->xva_magic = XVA_MAGIC; - xvap->xva_vattr.va_mask = AT_XVATTR; - xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; -} - -/* - * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t - * structure. Otherwise, returns NULL. - */ -xoptattr_t * -xva_getxoptattr(xvattr_t *xvap) -{ - xoptattr_t *xoap = NULL; - if (xvap->xva_vattr.va_mask & AT_XVATTR) - xoap = &xvap->xva_xoptattrs; - return (xoap); -} - -/* - * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it - * asynchronously using a taskq. This can avoid deadlocks caused by re-entering - * the file system as a result of releasing the vnode. Note, file systems - * already have to handle the race where the vnode is incremented before the - * inactive routine is called and does its locking. - * - * Warning: Excessive use of this routine can lead to performance problems. - * This is because taskqs throttle back allocation if too many are created. - */ -void -vn_rele_async(vnode_t *vp, taskq_t *taskq) -{ - VERIFY(vp->v_count > 0); - if (refcount_release_if_not_last(&vp->v_usecount)) { - return; - } - VERIFY(taskq_dispatch((taskq_t *)taskq, - (task_func_t *)vrele, vp, TQ_SLEEP) != 0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash deleted file mode 100644 index e558b2a50358..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2011 Google, Inc. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip deleted file mode 100644 index f98cb76dfc91..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip +++ /dev/null @@ -1 +0,0 @@ -CITYHASH CHECKSUM FUNCTIONALITY IN ZFS diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 deleted file mode 100644 index 722cc75f01e9..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 +++ /dev/null @@ -1,30 +0,0 @@ -LZ4 - Fast LZ compression algorithm -Copyright (C) 2011-2013, Yann Collet. 
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER -OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -You can contact the author at : -- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html -- LZ4 source repository : http://code.google.com/p/lz4/ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip deleted file mode 100644 index 211f679b5749..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip +++ /dev/null @@ -1 +0,0 @@ -LZ4 COMPRESSION FUNCTIONALITY IN ZFS diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c deleted file mode 100644 index 1843c8161038..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c +++ /dev/null @@ -1,960 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -/* - * ARC buffer data (ABD). - * - * ABDs are an abstract data structure for the ARC which can use two - * different ways of storing the underlying data: - * - * (a) Linear buffer. In this case, all the data in the ABD is stored in one - * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). - * - * +-------------------+ - * | ABD (linear) | - * | abd_flags = ... | - * | abd_size = ... | +--------------------------------+ - * | abd_buf ------------->| raw buffer of size abd_size | - * +-------------------+ +--------------------------------+ - * no abd_chunks - * - * (b) Scattered buffer. 
In this case, the data in the ABD is split into - * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers - * to the chunks recorded in an array at the end of the ABD structure. - * - * +-------------------+ - * | ABD (scattered) | - * | abd_flags = ... | - * | abd_size = ... | - * | abd_offset = 0 | +-----------+ - * | abd_chunks[0] ----------------------------->| chunk 0 | - * | abd_chunks[1] ---------------------+ +-----------+ - * | ... | | +-----------+ - * | abd_chunks[N-1] ---------+ +------->| chunk 1 | - * +-------------------+ | +-----------+ - * | ... - * | +-----------+ - * +----------------->| chunk N-1 | - * +-----------+ - * - * Using a large proportion of scattered ABDs decreases ARC fragmentation since - * when we are at the limit of allocatable space, using equal-size chunks will - * allow us to quickly reclaim enough space for a new large allocation (assuming - * it is also scattered). - * - * In addition to directly allocating a linear or scattered ABD, it is also - * possible to create an ABD by requesting the "sub-ABD" starting at an offset - * within an existing ABD. In linear buffers this is simple (set abd_buf of - * the new ABD to the starting point within the original raw buffer), but - * scattered ABDs are a little more complex. The new ABD makes a copy of the - * relevant abd_chunks pointers (but not the underlying data). However, to - * provide arbitrary rather than only chunk-aligned starting offsets, it also - * tracks an abd_offset field which represents the starting point of the data - * within the first chunk in abd_chunks. For both linear and scattered ABDs, - * creating an offset ABD marks the original ABD as the offset's parent, and the - * original ABD's abd_children refcount is incremented. This data allows us to - * ensure the root ABD isn't deleted before its children. 
- * - * Most consumers should never need to know what type of ABD they're using -- - * the ABD public API ensures that it's possible to transparently switch from - * using a linear ABD to a scattered one when doing so would be beneficial. - * - * If you need to use the data within an ABD directly, if you know it's linear - * (because you allocated it) you can use abd_to_buf() to access the underlying - * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions - * which will allocate a raw buffer if necessary. Use the abd_return_buf* - * functions to return any raw buffers that are no longer necessary when you're - * done using them. - * - * There are a variety of ABD APIs that implement basic buffer operations: - * compare, copy, read, write, and fill with zeroes. If you need a custom - * function which progressively accesses the whole ABD, use the abd_iterate_* - * functions. - */ - -#include -#include -#include -#include -#include - -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. 
- */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. - */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -/* - * It is possible to make all future ABDs be linear by setting this to B_FALSE. - * Otherwise, ABDs are allocated scattered by default unless the caller uses - * abd_alloc_linear(). - */ -boolean_t zfs_abd_scatter_enabled = B_TRUE; - -/* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. 
- */ -size_t zfs_abd_chunk_size = 4096; - -#if defined(__FreeBSD__) && defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, - &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, - &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); -#endif - -#ifdef _KERNEL -extern vmem_t *zio_alloc_arena; -#endif - -kmem_cache_t *abd_chunk_cache; -static kstat_t *abd_ksp; - -extern inline boolean_t abd_is_linear(abd_t *abd); -extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); -extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); -extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_zero(abd_t *abd, size_t size); - -static void * -abd_alloc_chunk() -{ - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); - ASSERT3P(c, !=, NULL); - return (c); -} - -static void -abd_free_chunk(void *c) -{ - kmem_cache_free(abd_chunk_cache, c); -} - -void -abd_init(void) -{ -#ifdef illumos - vmem_t *data_alloc_arena = NULL; - -#ifdef _KERNEL - data_alloc_arena = zio_alloc_arena; -#endif - - /* - * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH - * so that no allocator metadata is stored with the buffers. 
- */ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); -#else - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); -#endif - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - kmem_cache_destroy(abd_chunk_cache); - abd_chunk_cache = NULL; -} - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); -} - -static inline size_t -abd_scatter_chunkcnt(abd_t *abd) -{ - ASSERT(!abd_is_linear(abd)); - return (abd_chunkcnt_for_bytes( - abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); -} - -static inline void -abd_verify(abd_t *abd) -{ - ASSERT3U(abd->abd_size, >, 0); - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META)); - IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); - IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, - zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); - } - } -} - -static inline abd_t * -abd_alloc_struct(size_t chunkcnt) -{ - size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, size); - - return (abd); -} - -static inline void 
-abd_free_struct(abd_t *abd) -{ - size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); - int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - kmem_free(abd, size); - ABDSTAT_INCR(abdstat_struct_size, -size); -} - -/* - * Allocate an ABD, along with its own underlying data buffers. Use this if you - * don't care whether the ABD is linear or not. - */ -abd_t * -abd_alloc(size_t size, boolean_t is_metadata) -{ - if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) - return (abd_alloc_linear(size, is_metadata)); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - size_t n = abd_chunkcnt_for_bytes(size); - abd_t *abd = abd_alloc_struct(n); - - abd->abd_flags = ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_scatter.abd_offset = 0; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - for (int i = 0; i < n; i++) { - void *c = abd_alloc_chunk(); - ASSERT3P(c, !=, NULL); - abd->abd_u.abd_scatter.abd_chunks[i] = c; - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - size); - - return (abd); -} - -static void -abd_free_scatter(abd_t *abd) -{ - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); - - abd_free_struct(abd); -} - -/* - * Allocate an ABD that must be linear, along with its own underlying data - * buffer. Only use this when it would be very annoying to write your ABD - * consumer with a scattered ABD. 
- */ -abd_t * -abd_alloc_linear(size_t size, boolean_t is_metadata) -{ - abd_t *abd = abd_alloc_struct(0); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); - } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); - - return (abd); -} - -static void -abd_free_linear(abd_t *abd) -{ - if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); - - abd_free_struct(abd); -} - -/* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). - */ -void -abd_free(abd_t *abd) -{ - abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); -} - -/* - * Allocate an ABD of the same format (same metadata flag, same scatterize - * setting) as another ABD. 
- */ -abd_t * -abd_alloc_sametype(abd_t *sabd, size_t size) -{ - boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd)) { - return (abd_alloc_linear(size, is_metadata)); - } else { - return (abd_alloc(size, is_metadata)); - } -} - -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc_linear(size, is_metadata)); -} - -/* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. - */ -abd_t * -abd_get_offset(abd_t *sabd, size_t off) -{ - abd_t *abd; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(0); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; - } else { - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); - - abd = abd_alloc_struct(chunkcnt); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. 
- */ - abd->abd_flags = 0; - - abd->abd_u.abd_scatter.abd_offset = - new_offset % zfs_abd_chunk_size; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - /* Copy the scatterlist starting at the correct offset */ - (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, - &sabd->abd_u.abd_scatter.abd_chunks[new_offset / - zfs_abd_chunk_size], - chunkcnt * sizeof (void *)); - } - - abd->abd_size = sabd->abd_size - off; - abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); - (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - - return (abd); -} - -/* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. - */ -abd_t * -abd_get_from_buf(void *buf, size_t size) -{ - abd_t *abd = abd_alloc_struct(0); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - /* - * Even if this buf is filesystem metadata, we only track that if we - * own the underlying data buffer, which is not true in this case. - * Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_linear.abd_buf = buf; - - return (abd); -} - -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - -/* - * Get the raw buffer associated with a linear ABD. - */ -void * -abd_to_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); -} - -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. 
If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). 
- */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -struct abd_iter { - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; /* position (relative to abd_offset) */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ -}; - -static inline size_t -abd_iter_scatter_chunk_offset(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); -} - -static inline size_t -abd_iter_scatter_chunk_index(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); -} - -/* - * Initialize the abd_iter. - */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd) -{ - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. 
- */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - offset = aiter->iter_pos; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - size_t index = abd_iter_scatter_chunk_index(aiter); - offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = zfs_abd_chunk_size - offset; - paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; - } - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. 
- */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -int -abd_iterate_func(abd_t *abd, size_t off, size_t size, - abd_iter_func_t *func, void *private) -{ - int ret = 0; - struct abd_iter aiter; - - abd_verify(abd); - ASSERT3U(off + size, <=, abd->abd_size); - - abd_iter_init(&aiter, abd); - abd_iter_advance(&aiter, off); - - while (size > 0) { - abd_iter_map(&aiter); - - size_t len = MIN(aiter.iter_mapsize, size); - ASSERT3U(len, >, 0); - - ret = func(aiter.iter_mapaddr, len, private); - - abd_iter_unmap(&aiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&aiter, len); - } - - return (ret); -} - -struct buf_arg { - void *arg_buf; -}; - -static int -abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(ba_ptr->arg_buf, buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy abd to buf. (off is the offset in abd.) - */ -void -abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, - &ba_ptr); -} - -static int -abd_cmp_buf_off_cb(void *buf, size_t size, void *private) -{ - int ret; - struct buf_arg *ba_ptr = private; - - ret = memcmp(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (ret); -} - -/* - * Compare the contents of abd to buf. (off is the offset in abd.) 
- */ -int -abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); -} - -static int -abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy from buf to abd. (off is the offset in abd.) - */ -void -abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, - &ba_ptr); -} - -/*ARGSUSED*/ -static int -abd_zero_off_cb(void *buf, size_t size, void *private) -{ - (void) memset(buf, 0, size); - return (0); -} - -/* - * Zero out the abd from a particular offset to the end. - */ -void -abd_zero_off(abd_t *abd, size_t off, size_t size) -{ - (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); -} - -/* - * Iterate over two ABDs and call func incrementally on the two ABDs' data in - * equal-sized chunks (passed to func as raw buffers). func could be called many - * times during this iteration. 
- */ -int -abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, - size_t size, abd_iter_func2_t *func, void *private) -{ - int ret = 0; - struct abd_iter daiter, saiter; - - abd_verify(dabd); - abd_verify(sabd); - - ASSERT3U(doff + size, <=, dabd->abd_size); - ASSERT3U(soff + size, <=, sabd->abd_size); - - abd_iter_init(&daiter, dabd); - abd_iter_init(&saiter, sabd); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); - - while (size > 0) { - abd_iter_map(&daiter); - abd_iter_map(&saiter); - - size_t dlen = MIN(daiter.iter_mapsize, size); - size_t slen = MIN(saiter.iter_mapsize, size); - size_t len = MIN(dlen, slen); - ASSERT(dlen > 0 || slen > 0); - - ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, - private); - - abd_iter_unmap(&saiter); - abd_iter_unmap(&daiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); - } - - return (ret); -} - -/*ARGSUSED*/ -static int -abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) -{ - (void) memcpy(dbuf, sbuf, size); - return (0); -} - -/* - * Copy from sabd to dabd starting from soff and doff. - */ -void -abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) -{ - (void) abd_iterate_func2(dabd, sabd, doff, soff, size, - abd_copy_off_cb, NULL); -} - -/*ARGSUSED*/ -static int -abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) -{ - return (memcmp(bufa, bufb, size)); -} - -/* - * Compares the first size bytes of two ABDs. 
- */ -int -abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) -{ - return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c deleted file mode 100644 index 713ff2b0116c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c +++ /dev/null @@ -1,234 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2017, 2018 by Delphix. All rights reserved. - */ - -#include -#include - -/* - * Aggregate-sum counters are a form of fanned-out counter, used when atomic - * instructions on a single field cause enough CPU cache line contention to - * slow system performance. Due to their increased overhead and the expense - * involved with precisely reading from them, they should only be used in cases - * where the write rate (increment/decrement) is much higher than the read rate - * (get value). - * - * Aggregate sum counters are comprised of two basic parts, the core and the - * buckets. The core counter contains a lock for the entire counter, as well - * as the current upper and lower bounds on the value of the counter. The - * aggsum_bucket structure contains a per-bucket lock to protect the contents of - * the bucket, the current amount that this bucket has changed from the global - * counter (called the delta), and the amount of increment and decrement we have - * "borrowed" from the core counter. - * - * The basic operation of an aggsum is simple. 
Threads that wish to modify the - * counter will modify one bucket's counter (determined by their current CPU, to - * help minimize lock and cache contention). If the bucket already has - * sufficient capacity borrowed from the core structure to handle their request, - * they simply modify the delta and return. If the bucket does not, we clear - * the bucket's current state (to prevent the borrowed amounts from getting too - * large), and borrow more from the core counter. Borrowing is done by adding to - * the upper bound (or subtracting from the lower bound) of the core counter, - * and setting the borrow value for the bucket to the amount added (or - * subtracted). Clearing the bucket is the opposite; we add the current delta - * to both the lower and upper bounds of the core counter, subtract the borrowed - * incremental from the upper bound, and add the borrowed decrement from the - * lower bound. Note that only borrowing and clearing require access to the - * core counter; since all other operations access CPU-local resources, - * performance can be much higher than a traditional counter. - * - * Threads that wish to read from the counter have a slightly more challenging - * task. It is fast to determine the upper and lower bounds of the aggum; this - * does not require grabbing any locks. This suffices for cases where an - * approximation of the aggsum's value is acceptable. However, if one needs to - * know whether some specific value is above or below the current value in the - * aggsum, they invoke aggsum_compare(). This function operates by repeatedly - * comparing the target value to the upper and lower bounds of the aggsum, and - * then clearing a bucket. This proceeds until the target is outside of the - * upper and lower bounds and we return a response, or the last bucket has been - * cleared and we know that the target is equal to the aggsum's value. Finally, - * the most expensive operation is determining the precise value of the aggsum. 
- * To do this, we clear every bucket and then return the upper bound (which must - * be equal to the lower bound). What makes aggsum_compare() and aggsum_value() - * expensive is clearing buckets. This involves grabbing the global lock - * (serializing against themselves and borrow operations), grabbing a bucket's - * lock (preventing threads on those CPUs from modifying their delta), and - * zeroing out the borrowed value (forcing that thread to borrow on its next - * request, which will also be expensive). This is what makes aggsums well - * suited for write-many read-rarely operations. - */ - -/* - * We will borrow aggsum_borrow_multiplier times the current request, so we will - * have to get the as_lock approximately every aggsum_borrow_multiplier calls to - * aggsum_delta(). - */ -static uint_t aggsum_borrow_multiplier = 10; - -void -aggsum_init(aggsum_t *as, uint64_t value) -{ - bzero(as, sizeof (*as)); - as->as_lower_bound = as->as_upper_bound = value; - mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); - as->as_numbuckets = boot_ncpus; - as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), - KM_SLEEP); - for (int i = 0; i < as->as_numbuckets; i++) { - mutex_init(&as->as_buckets[i].asc_lock, - NULL, MUTEX_DEFAULT, NULL); - } -} - -void -aggsum_fini(aggsum_t *as) -{ - for (int i = 0; i < as->as_numbuckets; i++) - mutex_destroy(&as->as_buckets[i].asc_lock); - kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t)); - mutex_destroy(&as->as_lock); -} - -int64_t -aggsum_lower_bound(aggsum_t *as) -{ - return (as->as_lower_bound); -} - -int64_t -aggsum_upper_bound(aggsum_t *as) -{ - return (as->as_upper_bound); -} - -static void -aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) -{ - ASSERT(MUTEX_HELD(&as->as_lock)); - ASSERT(MUTEX_HELD(&asb->asc_lock)); - - /* - * We use atomic instructions for this because we read the upper and - * lower bounds without the lock, so we need stores to be atomic. 
- */ - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - asb->asc_delta + asb->asc_borrowed); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - asb->asc_delta - asb->asc_borrowed); - asb->asc_delta = 0; - asb->asc_borrowed = 0; -} - -uint64_t -aggsum_value(aggsum_t *as) -{ - int64_t rv; - - mutex_enter(&as->as_lock); - if (as->as_lower_bound == as->as_upper_bound) { - rv = as->as_lower_bound; - for (int i = 0; i < as->as_numbuckets; i++) { - ASSERT0(as->as_buckets[i].asc_delta); - ASSERT0(as->as_buckets[i].asc_borrowed); - } - mutex_exit(&as->as_lock); - return (rv); - } - for (int i = 0; i < as->as_numbuckets; i++) { - struct aggsum_bucket *asb = &as->as_buckets[i]; - mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); - mutex_exit(&asb->asc_lock); - } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - rv = as->as_lower_bound; - mutex_exit(&as->as_lock); - - return (rv); -} - -void -aggsum_add(aggsum_t *as, int64_t delta) -{ - struct aggsum_bucket *asb = - &as->as_buckets[CPU_SEQID % as->as_numbuckets]; - int64_t borrow; - - /* Try fast path if we already borrowed enough before. */ - mutex_enter(&asb->asc_lock); - if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && - asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { - asb->asc_delta += delta; - mutex_exit(&asb->asc_lock); - return; - } - mutex_exit(&asb->asc_lock); - - /* - * We haven't borrowed enough. Take the global lock and borrow - * considering what is requested now and what we borrowed before. - */ - borrow = (delta < 0 ? 
-delta : delta) * aggsum_borrow_multiplier; - mutex_enter(&as->as_lock); - mutex_enter(&asb->asc_lock); - delta += asb->asc_delta; - asb->asc_delta = 0; - if (borrow >= asb->asc_borrowed) - borrow -= asb->asc_borrowed; - else - borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; - asb->asc_borrowed += borrow; - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - delta - borrow); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - delta + borrow); - mutex_exit(&asb->asc_lock); - mutex_exit(&as->as_lock); -} - -/* - * Compare the aggsum value to target efficiently. Returns -1 if the value - * represented by the aggsum is less than target, 1 if it's greater, and 0 if - * they are equal. - */ -int -aggsum_compare(aggsum_t *as, uint64_t target) -{ - if (as->as_upper_bound < target) - return (-1); - if (as->as_lower_bound > target) - return (1); - mutex_enter(&as->as_lock); - for (int i = 0; i < as->as_numbuckets; i++) { - struct aggsum_bucket *asb = &as->as_buckets[i]; - mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); - mutex_exit(&asb->asc_lock); - if (as->as_upper_bound < target) { - mutex_exit(&as->as_lock); - return (-1); - } - if (as->as_lower_bound > target) { - mutex_exit(&as->as_lock); - return (1); - } - } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - ASSERT3U(as->as_lower_bound, ==, target); - mutex_exit(&as->as_lock); - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c deleted file mode 100644 index 592fb02cfac1..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ /dev/null @@ -1,8569 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. - */ - -/* - * DVA-based Adjustable Replacement Cache - * - * While much of the theory of operation used here is - * based on the self-tuning, low overhead replacement cache - * presented by Megiddo and Modha at FAST 2003, there are some - * significant differences: - * - * 1. The Megiddo and Modha model assumes any page is evictable. - * Pages in its cache cannot be "locked" into memory. This makes - * the eviction algorithm simple: evict the last page in the list. - * This also make the performance characteristics easy to reason - * about. Our cache is not so simple. At any given moment, some - * subset of the blocks in the cache are un-evictable because we - * have handed out a reference to them. Blocks are only evictable - * when there are no external references active. This makes - * eviction far more problematic: we choose to evict the evictable - * blocks that are the "lowest" in the list. - * - * There are times when it is not possible to evict the requested - * space. In these circumstances we are unable to adjust the cache - * size. 
To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slows the flow of new data - * into the cache until we can make space available. - * - * 2. The Megiddo and Modha model assumes a fixed cache size. - * Pages are evicted when the cache is full and there is a cache - * miss. Our model has a variable sized cache. It grows with - * high use, but also tries to react to memory pressure from the - * operating system: decreasing its size when system memory is - * tight. - * - * 3. The Megiddo and Modha model assumes a fixed page size. All - * elements of the cache are therefore exactly the same size. So - * when adjusting the cache size following a cache miss, its simply - * a matter of choosing a single page to evict. In our model, we - * have variable sized cache blocks (rangeing from 512 bytes to - * 128K bytes). We therefore choose a set of blocks to evict to make - * space for a cache miss that approximates as closely as possible - * the space used by the new block. - * - * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" - * by N. Megiddo & D. Modha, FAST 2003 - */ - -/* - * The locking model: - * - * A new reference to a cache buffer can be obtained in two - * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() interface - * uses method 1, while the internal ARC algorithms for - * adjusting the cache use method 2. We therefore provide two - * types of locks: 1) the hash table lock array, and 2) the - * ARC list locks. - * - * Buffers do not have their own mutexes, rather they rely on the - * hash table mutexes for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexes). - * - * buf_hash_find() returns the appropriate mutex (held) when it - * locates the requested buffer in the hash table. It returns - * NULL for the mutex if the buffer was not in the table. 
- * - * buf_hash_remove() expects the appropriate hash mutex to be - * already held before it is invoked. - * - * Each ARC state also has a mutex which is used to protect the - * buffer list associated with the state. When attempting to - * obtain a hash table lock while holding an ARC list lock you - * must use: mutex_tryenter() to avoid deadlock. Also note that - * the active state mutex must be held before the ghost state mutex. - * - * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In - * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. - * - * Note that the majority of the performance stats are manipulated - * with atomic operations. - * - * The L2ARC uses the l2ad_mtx on each vdev for the following: - * - * - L2ARC buflist creation - * - L2ARC buflist eviction - * - L2ARC write completion, which walks L2ARC buflists - * - ARC header destruction, as it removes from L2ARC buflists - * - ARC header release, as it removes from L2ARC buflists - */ - -/* - * ARC operation: - * - * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. - * This structure can point either to a block that is still in the cache or to - * one that is only accessible in an L2 ARC device, or it can provide - * information about a block that was recently evicted. If a block is - * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough - * information to retrieve it from the L2ARC device. This information is - * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block - * that is in this state cannot access the data directly. 
- * - * Blocks that are actively being referenced or have not been evicted - * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within - * the arc_buf_hdr_t that will point to the data block in memory. A block can - * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC - * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and - * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). - * - * The L1ARC's data pointer may or may not be uncompressed. The ARC has the - * ability to store the physical data (b_pabd) associated with the DVA of the - * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, - * it will match its on-disk compression characteristics. This behavior can be - * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pabd will point to an - * uncompressed version of the on-disk data. - * - * Data in the L1ARC is not accessed by consumers of the ARC directly. Each - * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. - * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC - * consumer. The ARC will provide references to this data and will keep it - * cached until it is no longer in use. The ARC caches only the L1ARC's physical - * data block and will evict any arc_buf_t that is no longer referenced. The - * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the - * "overhead_size" kstat. - * - * Depending on the consumer, an arc_buf_t can be requested in uncompressed or - * compressed form. The typical case is that consumers will want uncompressed - * data, and when that happens a new data buffer is allocated where the data is - * decompressed for them to use. Currently the only consumer who wants - * compressed arc_buf_t's is "zfs send", when it streams data exactly as it - * exists on disk. 
When this happens, the arc_buf_t's data buffer is shared - * with the arc_buf_hdr_t. - * - * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The - * first one is owned by a compressed send consumer (and therefore references - * the same compressed data buffer as the arc_buf_hdr_t) and the second could be - * used by any other consumer (and has its own uncompressed copy of the data - * buffer). - * - * arc_buf_hdr_t - * +-----------+ - * | fields | - * | common to | - * | L1- and | - * | L2ARC | - * +-----------+ - * | l2arc_buf_hdr_t - * | | - * +-----------+ - * | l1arc_buf_hdr_t - * | | arc_buf_t - * | b_buf +------------>+-----------+ arc_buf_t - * | b_pabd +-+ |b_next +---->+-----------+ - * +-----------+ | |-----------| |b_next +-->NULL - * | |b_comp = T | +-----------+ - * | |b_data +-+ |b_comp = F | - * | +-----------+ | |b_data +-+ - * +->+------+ | +-----------+ | - * compressed | | | | - * data | |<--------------+ | uncompressed - * +------+ compressed, | data - * shared +-->+------+ - * data | | - * | | - * +------+ - * - * When a consumer reads a block, the ARC must first look to see if the - * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new - * arc_buf_t and either copies uncompressed data into a new data buffer from an - * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a - * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the - * hdr is compressed and the desired compression characteristics of the - * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the - * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be - * the last buffer in the hdr's b_buf list, however a shared compressed buf can - * be anywhere in the hdr's list. 
- * - * The diagram below shows an example of an uncompressed ARC hdr that is - * sharing its data with an arc_buf_t (note that the shared uncompressed buf is - * the last element in the buf list): - * - * arc_buf_hdr_t - * +-----------+ - * | | - * | | - * | | - * +-----------+ - * l2arc_buf_hdr_t| | - * | | - * +-----------+ - * l1arc_buf_hdr_t| | - * | | arc_buf_t (shared) - * | b_buf +------------>+---------+ arc_buf_t - * | | |b_next +---->+---------+ - * | b_pabd +-+ |---------| |b_next +-->NULL - * +-----------+ | | | +---------+ - * | |b_data +-+ | | - * | +---------+ | |b_data +-+ - * +->+------+ | +---------+ | - * | | | | - * uncompressed | | | | - * data +------+ | | - * ^ +->+------+ | - * | uncompressed | | | - * | data | | | - * | +------+ | - * +---------------------------------+ - * - * Writing to the ARC requires that the ARC first discard the hdr's b_pabd - * since the physical block is about to be rewritten. The new data contents - * will be contained in the arc_buf_t. As the I/O pipeline performs the write, - * it may compress the data before writing it to disk. The ARC will be called - * with the transformed data and will bcopy the transformed on-disk block into - * a newly allocated b_pabd. Writes are always done into buffers which have - * either been loaned (and hence are new and don't have other readers) or - * buffers which have been released (and hence have their own hdr, if there - * were originally other readers of the buf's original hdr). This ensures that - * the ARC only needs to update a single buf and its hdr after a write occurs. - * - * When the L2ARC is in use, it will also take advantage of the b_pabd. The - * L2ARC will always write the contents of b_pabd to the L2ARC. This means - * that when compressed ARC is enabled that the L2ARC blocks are identical - * to the on-disk block in the main data pool. 
This provides a significant - * advantage since the ARC can leverage the bp's checksum when reading from the - * L2ARC to determine if the contents are valid. However, if the compressed - * ARC is disabled, then the L2ARC's block must be transformed to look - * like the physical block in the main data pool before comparing the - * checksum and determining its validity. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef illumos -#ifndef _KERNEL -/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ -boolean_t arc_watch = B_FALSE; -int arc_procfd; -#endif -#endif /* illumos */ - -/* - * This thread's job is to keep enough free memory in the system, by - * calling arc_kmem_reap_now() plus arc_shrink(), which improves - * arc_available_memory(). - */ -static zthr_t *arc_reap_zthr; - -/* - * This thread's job is to keep arc_size under arc_c, by calling - * arc_adjust(), which improves arc_is_overflowing(). - */ -static zthr_t *arc_adjust_zthr; - -static kmutex_t arc_adjust_lock; -static kcondvar_t arc_adjust_waiters_cv; -static boolean_t arc_adjust_needed = B_FALSE; - -static kmutex_t arc_dnlc_evicts_lock; -static kcondvar_t arc_dnlc_evicts_cv; -static boolean_t arc_dnlc_evicts_thread_exit; - -uint_t arc_reduce_dnlc_percent = 3; - -/* - * The number of headers to evict in arc_evict_state_impl() before - * dropping the sublist lock and evicting from another sublist. A lower - * value means we're more likely to evict the "correct" header (i.e. the - * oldest header in the arc state), but comes with higher overhead - * (i.e. more invocations of arc_evict_state_impl()). 
- */ -int zfs_arc_evict_batch_limit = 10; - -/* number of seconds before growing cache again */ -int arc_grow_retry = 60; - -/* - * Minimum time between calls to arc_kmem_reap_soon(). Note that this will - * be converted to ticks, so with the default hz=100, a setting of 15 ms - * will actually wait 2 ticks, or 20ms. - */ -int arc_kmem_cache_reap_retry_ms = 1000; - -/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; - -/* shift of arc_c for calculating both min and max arc_p */ -int arc_p_min_shift = 4; - -/* log2(fraction of arc to reclaim) */ -int arc_shrink_shift = 7; - -/* - * log2(fraction of ARC which must be free to allow growing). - * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, - * when reading a new block into the ARC, we will evict an equal-sized block - * from the ARC. - * - * This must be less than arc_shrink_shift, so that when we shrink the ARC, - * we will still not allow it to grow. - */ -int arc_no_grow_shift = 5; - - -/* - * minimum lifespan of a prefetch block in clock ticks - * (initialized in arc_init()) - */ -static int zfs_arc_min_prefetch_ms = 1; -static int zfs_arc_min_prescient_prefetch_ms = 6; - -/* - * If this percent of memory is free, don't throttle. - */ -int arc_lotsfree_percent = 10; - -static boolean_t arc_initialized; -extern boolean_t zfs_prefetch_disable; - -/* - * The arc has filled available memory and has now warmed up. - */ -static boolean_t arc_warm; - -/* - * log2 fraction of the zio arena to keep free. - */ -int arc_zio_arena_free_shift = 2; - -/* - * These tunables are for performance analysis. 
- */ -uint64_t zfs_arc_max; -uint64_t zfs_arc_min; -uint64_t zfs_arc_meta_limit = 0; -uint64_t zfs_arc_meta_min = 0; -uint64_t zfs_arc_dnode_limit = 0; -uint64_t zfs_arc_dnode_reduce_percent = 10; -int zfs_arc_grow_retry = 0; -int zfs_arc_shrink_shift = 0; -int zfs_arc_no_grow_shift = 0; -int zfs_arc_p_min_shift = 0; -uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ -u_int zfs_arc_free_target = 0; - -/* Absolute min for arc min / max is 16MB. */ -static uint64_t arc_abs_min = 16 << 20; - -/* - * ARC dirty data constraints for arc_tempreserve_space() throttle - */ -uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ -uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ -uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ - -boolean_t zfs_compressed_arc_enabled = B_TRUE; - -static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); - -#if defined(__FreeBSD__) && defined(_KERNEL) -static void -arc_free_target_init(void *unused __unused) -{ - - zfs_arc_free_target = vm_cnt.v_free_target; -} -SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, - arc_free_target_init, NULL); - -TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); -TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); -TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); -TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry); -TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift); -SYSCTL_DECL(_vfs_zfs); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, - CTLTYPE_U64 | CTLFLAG_MPSAFE | 
CTLFLAG_RWTUN, - 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_U32 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", - "log2(fraction of ARC which must be free to allow growing)"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, - &zfs_arc_average_blocksize, 0, - "ARC average blocksize"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, - &arc_shrink_shift, 0, - "log2(fraction of arc to reclaim)"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW, - &arc_grow_retry, 0, - "Wait in seconds before considering growing ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, - &zfs_compressed_arc_enabled, 0, - "Enable compressed ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN, - &arc_kmem_cache_reap_retry_ms, 0, - "Interval between ARC kmem_cache reapings"); - -/* - * We don't have a tunable for arc_free_target due to the dependency on - * pagedaemon initialisation. - */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), - sysctl_vfs_zfs_arc_free_target, "IU", - "Desired number of free pages below which ARC triggers reclaim"); - -static int -sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) -{ - u_int val; - int err; - - val = zfs_arc_free_target; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < minfree) - return (EINVAL); - if (val > vm_cnt.v_page_count) - return (EINVAL); - - zfs_arc_free_target = val; - - return (0); -} - -/* - * Must be declared here, before the definition of corresponding kstat - * macro which uses the same names will confuse the compiler. 
- */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_vfs_zfs_arc_meta_limit, "QU", - "ARC metadata limit"); -#endif - -/* - * Note that buffers can be in one of 6 states: - * ARC_anon - anonymous (discussed below) - * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache - * ARC_mfu - frequently used, currently cached - * ARC_mfu_ghost - frequently used, no longer in cache - * ARC_l2c_only - exists in L2ARC but not other states - * When there are no active references to the buffer, they are - * are linked onto a list in one of these arc states. These are - * the only buffers that can be evicted or deleted. Within each - * state there are multiple lists, one for meta-data and one for - * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, - * etc.) is tracked separately so that it can be managed more - * explicitly: favored over data, limited explicitly. - * - * Anonymous buffers are buffers that are not associated with - * a DVA. These are buffers that hold dirty block copies - * before they are written to stable storage. By definition, - * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA - * as they are written and migrate onto the arc_mru list. - * - * The ARC_l2c_only state is for buffers that are in the second - * level ARC but no longer in any of the ARC_m* lists. The second - * level ARC itself may also contain buffers that are in any of - * the ARC_m* states - meaning that a buffer can exist in two - * places. The reason for the ARC_l2c_only state is to keep the - * buffer header in the hash table, so that reads that hit the - * second level ARC benefit from these fast lookups. 
- */ - -typedef struct arc_state { - /* - * list of evictable buffers - */ - multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; - /* - * total amount of evictable data in this state - */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; - /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. - */ - zfs_refcount_t arcs_size; - /* - * supports the "dbufs" kstat - */ - arc_state_type_t arcs_state; -} arc_state_t; - -/* - * Percentage that can be consumed by dnodes of ARC meta buffers. - */ -int zfs_arc_meta_prune = 10000; -unsigned long zfs_arc_dnode_limit_percent = 10; -int zfs_arc_meta_strategy = ARC_STRATEGY_META_ONLY; -int zfs_arc_meta_adjust_restarts = 4096; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_meta_strategy, CTLFLAG_RWTUN, - &zfs_arc_meta_strategy, 0, - "ARC metadata reclamation strategy " - "(0 = metadata only, 1 = balance data and metadata)"); - -/* The 6 states: */ -static arc_state_t ARC_anon; -static arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -static arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; -static arc_state_t ARC_l2c_only; - -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_allocated; - kstat_named_t arcstat_deleted; - /* - * Number of buffers that could not be evicted because the hash lock - * was held by another thread. 
The lock may not necessarily be held - * by something using the same buffer, since hash locks are shared - * by multiple buffers. - */ - kstat_named_t arcstat_mutex_miss; - /* - * Number of buffers skipped when updating the access state due to the - * header having already been released after acquiring the hash lock. - */ - kstat_named_t arcstat_access_skip; - /* - * Number of buffers skipped because they have I/O in progress, are - * indirect prefetch buffers that have not lived long enough, or are - * not from the spa we're trying to evict from. - */ - kstat_named_t arcstat_evict_skip; - /* - * Number of times arc_evict_state() was unable to evict enough - * buffers to reach it's target amount. - */ - kstat_named_t arcstat_evict_not_enough; - kstat_named_t arcstat_evict_l2_cached; - kstat_named_t arcstat_evict_l2_eligible; - kstat_named_t arcstat_evict_l2_ineligible; - kstat_named_t arcstat_evict_l2_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_size; - /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. - * Note that the compressed bytes may match the uncompressed bytes - * if the block is either not compressed or compressed arc is disabled. - */ - kstat_named_t arcstat_compressed_size; - /* - * Uncompressed size of the data stored in b_pabd. If compressed - * arc is disabled then this value will be identical to the stat - * above. - */ - kstat_named_t arcstat_uncompressed_size; - /* - * Number of bytes stored in all the arc_buf_t's. 
This is classified - * as "overhead" since this data is typically short-lived and will - * be evicted from the arc when it becomes unreferenced unless the - * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level - * values have been set (see comment in dbuf.c for more information). - */ - kstat_named_t arcstat_overhead_size; - /* - * Number of bytes consumed by internal ARC structures necessary - * for tracking purposes; these structures are not actually - * backed by ARC buffers. This includes arc_buf_hdr_t structures - * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only - * caches), and arc_buf_t structures (allocated via arc_buf_t - * cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_hdr_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_DATA. This is generally consumed by buffers backing - * on disk user data (e.g. plain file contents). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_data_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_METADATA. This is generally consumed by buffers - * backing on disk data that is used for internal ZFS - * structures (e.g. ZAP, dnode, indirect blocks, etc). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_metadata_size; - /* - * Number of bytes consumed by dmu_buf_impl_t objects. - */ - kstat_named_t arcstat_dbuf_size; - /* - * Number of bytes consumed by dnode_t objects. - */ - kstat_named_t arcstat_dnode_size; - /* - * Number of bytes consumed by bonus buffers. - */ - kstat_named_t arcstat_bonus_size; -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - /* - * Sum of the previous three counters, provided for compatibility. - */ - kstat_named_t arcstat_other_size; -#endif - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_anon state. 
This includes *all* buffers in the arc_anon - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mru state. This includes *all* buffers in the arc_mru - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_mru_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mru_ghost state. The key thing to note - * here, is the fact that this size doesn't actually indicate - * RAM consumption. The ghost lists only consist of headers and - * don't actually have ARC buffers linked off of these headers. - * Thus, *if* the headers had associated ARC buffers, these - * buffers *would have* consumed this number of bytes. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mfu state. This includes *all* buffers in the arc_mfu - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_size; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu - * state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_data; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_METADATA, and reside in the - * arc_mfu state. - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_mfu_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mfu_ghost state. See the comment above - * arcstat_mru_ghost_size for more details. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_metadata; - kstat_named_t arcstat_l2_hits; - kstat_named_t arcstat_l2_misses; - kstat_named_t arcstat_l2_feeds; - kstat_named_t arcstat_l2_rw_clash; - kstat_named_t arcstat_l2_read_bytes; - kstat_named_t arcstat_l2_write_bytes; - kstat_named_t arcstat_l2_writes_sent; - kstat_named_t arcstat_l2_writes_done; - kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_evict_lock_retry; - kstat_named_t arcstat_l2_evict_reading; - kstat_named_t arcstat_l2_evict_l1cached; - kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_abort_lowmem; - kstat_named_t arcstat_l2_cksum_bad; - kstat_named_t arcstat_l2_io_error; - kstat_named_t arcstat_l2_lsize; - kstat_named_t arcstat_l2_psize; - /* Not updated directly; only synced in arc_kstat_update. 
*/ - kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_write_trylock_fail; - kstat_named_t arcstat_l2_write_passed_headroom; - kstat_named_t arcstat_l2_write_spa_mismatch; - kstat_named_t arcstat_l2_write_in_l2; - kstat_named_t arcstat_l2_write_hdr_io_in_progress; - kstat_named_t arcstat_l2_write_not_cacheable; - kstat_named_t arcstat_l2_write_full; - kstat_named_t arcstat_l2_write_buffer_iter; - kstat_named_t arcstat_l2_write_pios; - kstat_named_t arcstat_l2_write_buffer_bytes_scanned; - kstat_named_t arcstat_l2_write_buffer_list_iter; - kstat_named_t arcstat_l2_write_buffer_list_null_iter; - kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_memory_direct_count; - kstat_named_t arcstat_memory_indirect_count; - kstat_named_t arcstat_memory_all_bytes; - kstat_named_t arcstat_memory_free_bytes; - kstat_named_t arcstat_memory_available_bytes; - kstat_named_t arcstat_no_grow; - kstat_named_t arcstat_tempreserve; - kstat_named_t arcstat_loaned_bytes; - kstat_named_t arcstat_prune; - /* Not updated directly; only synced in arc_kstat_update. 
*/ - kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; - kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_async_upgrade_sync; - kstat_named_t arcstat_demand_hit_predictive_prefetch; - kstat_named_t arcstat_demand_hit_prescient_prefetch; -} arc_stats_t; - -static arc_stats_t arc_stats = { - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 }, - { "demand_data_hits", KSTAT_DATA_UINT64 }, - { "demand_data_misses", KSTAT_DATA_UINT64 }, - { "demand_metadata_hits", KSTAT_DATA_UINT64 }, - { "demand_metadata_misses", KSTAT_DATA_UINT64 }, - { "prefetch_data_hits", KSTAT_DATA_UINT64 }, - { "prefetch_data_misses", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, - { "mru_hits", KSTAT_DATA_UINT64 }, - { "mru_ghost_hits", KSTAT_DATA_UINT64 }, - { "mfu_hits", KSTAT_DATA_UINT64 }, - { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, - { "allocated", KSTAT_DATA_UINT64 }, - { "deleted", KSTAT_DATA_UINT64 }, - { "mutex_miss", KSTAT_DATA_UINT64 }, - { "access_skip", KSTAT_DATA_UINT64 }, - { "evict_skip", KSTAT_DATA_UINT64 }, - { "evict_not_enough", KSTAT_DATA_UINT64 }, - { "evict_l2_cached", KSTAT_DATA_UINT64 }, - { "evict_l2_eligible", KSTAT_DATA_UINT64 }, - { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, - { "evict_l2_skip", KSTAT_DATA_UINT64 }, - { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, - { "hash_collisions", KSTAT_DATA_UINT64 }, - { "hash_chains", KSTAT_DATA_UINT64 }, - { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, - { "c", KSTAT_DATA_UINT64 }, - { "c_min", KSTAT_DATA_UINT64 }, - { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 }, - { "compressed_size", KSTAT_DATA_UINT64 }, - { "uncompressed_size", KSTAT_DATA_UINT64 }, - { "overhead_size", KSTAT_DATA_UINT64 }, - { "hdr_size", KSTAT_DATA_UINT64 }, - { "data_size", 
KSTAT_DATA_UINT64 }, - { "metadata_size", KSTAT_DATA_UINT64 }, - { "dbuf_size", KSTAT_DATA_UINT64 }, - { "dnode_size", KSTAT_DATA_UINT64 }, - { "bonus_size", KSTAT_DATA_UINT64 }, -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - { "other_size", KSTAT_DATA_UINT64 }, -#endif - { "anon_size", KSTAT_DATA_UINT64 }, - { "anon_evictable_data", KSTAT_DATA_UINT64 }, - { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mru_size", KSTAT_DATA_UINT64 }, - { "mru_evictable_data", KSTAT_DATA_UINT64 }, - { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mru_ghost_size", KSTAT_DATA_UINT64 }, - { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, - { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mfu_size", KSTAT_DATA_UINT64 }, - { "mfu_evictable_data", KSTAT_DATA_UINT64 }, - { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mfu_ghost_size", KSTAT_DATA_UINT64 }, - { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, - { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, - { "l2_hits", KSTAT_DATA_UINT64 }, - { "l2_misses", KSTAT_DATA_UINT64 }, - { "l2_feeds", KSTAT_DATA_UINT64 }, - { "l2_rw_clash", KSTAT_DATA_UINT64 }, - { "l2_read_bytes", KSTAT_DATA_UINT64 }, - { "l2_write_bytes", KSTAT_DATA_UINT64 }, - { "l2_writes_sent", KSTAT_DATA_UINT64 }, - { "l2_writes_done", KSTAT_DATA_UINT64 }, - { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_evict_reading", KSTAT_DATA_UINT64 }, - { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, - { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, - { "l2_cksum_bad", KSTAT_DATA_UINT64 }, - { "l2_io_error", KSTAT_DATA_UINT64 }, - { "l2_size", KSTAT_DATA_UINT64 }, - { "l2_asize", KSTAT_DATA_UINT64 }, - { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, - { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, - { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, - { 
"l2_write_in_l2", KSTAT_DATA_UINT64 }, - { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, - { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, - { "l2_write_full", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, - { "l2_write_pios", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "memory_direct_count", KSTAT_DATA_UINT64 }, - { "memory_indirect_count", KSTAT_DATA_UINT64 }, - { "memory_all_bytes", KSTAT_DATA_UINT64 }, - { "memory_free_bytes", KSTAT_DATA_UINT64 }, - { "memory_available_bytes", KSTAT_DATA_UINT64 }, - { "arc_no_grow", KSTAT_DATA_UINT64 }, - { "arc_tempreserve", KSTAT_DATA_UINT64 }, - { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, - { "arc_prune", KSTAT_DATA_UINT64 }, - { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, - { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, - { "async_upgrade_sync", KSTAT_DATA_UINT64 }, - { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, - { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, -}; - -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) - -#define ARCSTAT_MAX(stat, val) { \ - uint64_t m; \ - while ((val) > (m = arc_stats.stat.value.ui64) && \ - (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ - continue; \ -} - -#define ARCSTAT_MAXSTAT(stat) \ - ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) - -/* - * We define a macro to allow ARC hits/misses to be easily broken down by - * two separate conditions, giving a total of four different subtypes for - * each of hits and misses (so 
eight statistics total). - */ -#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ - if (cond1) { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ - } \ - } else { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ - } \ - } - -kstat_t *arc_ksp; -static arc_state_t *arc_anon; -static arc_state_t *arc_mru; -static arc_state_t *arc_mru_ghost; -static arc_state_t *arc_mfu; -static arc_state_t *arc_mfu_ghost; -static arc_state_t *arc_l2c_only; - -/* - * There are several ARC variables that are critical to export as kstats -- - * but we don't want to have to grovel around in the kstat whenever we wish to - * manipulate them. For these variables, we therefore define them to be in - * terms of the statistic variable. This assures that we are not introducing - * the possibility of inconsistency by having shadow copies of the variables, - * while still allowing the code to be readable. 
- */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ -#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */ -#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */ -#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */ - -/* compressed size of entire arc */ -#define arc_compressed_size ARCSTAT(arcstat_compressed_size) -/* uncompressed size of entire arc */ -#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) -/* number of bytes in the arc from arc_buf_t's */ -#define arc_overhead_size ARCSTAT(arcstat_overhead_size) - -/* - * There are also some ARC variables that we want to export, but that are - * updated so often that having the canonical representation be the statistic - * variable causes a performance bottleneck. We want to use aggsum_t's for these - * instead, but still be able to export the kstat in the same way as before. - * The solution is to always use the aggsum version, except in the kstat update - * callback. 
- */ -aggsum_t arc_size; -aggsum_t arc_meta_used; -aggsum_t astat_data_size; -aggsum_t astat_metadata_size; -aggsum_t astat_hdr_size; -aggsum_t astat_bonus_size; -aggsum_t astat_dnode_size; -aggsum_t astat_dbuf_size; -aggsum_t astat_l2_hdr_size; - -static list_t arc_prune_list; -static kmutex_t arc_prune_mtx; -static taskq_t *arc_prune_taskq; - -static int arc_no_grow; /* Don't try to grow cache size */ -static hrtime_t arc_growtime; -static uint64_t arc_tempreserve; -static uint64_t arc_loaned_bytes; - -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_read_done_func_t *acb_done; - arc_buf_t *acb_buf; - boolean_t acb_compressed; - zio_t *acb_zio_dummy; - zio_t *acb_zio_head; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_write_done_func_t *awcb_ready; - arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; - arc_write_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -/* - * ARC buffers are separated into multiple structs as a memory saving measure: - * - Common fields struct, always defined, and embedded within it: - * - L2-only fields, always allocated but undefined when not in L2ARC - * - L1-only fields, only allocated when in L1ARC - * - * Buffer in L1 Buffer only in L2 - * +------------------------+ +------------------------+ - * | arc_buf_hdr_t | | arc_buf_hdr_t | - * | | | | - * | | | | - * | | | | - * +------------------------+ +------------------------+ - * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | - * | (undefined if L1-only) | | | - * +------------------------+ +------------------------+ - * | l1arc_buf_hdr_t | - * | | - * | | - * | | - * | | - * +------------------------+ - * - * Because it's possible for the L2ARC to become extremely large, we can wind - * up eating a lot of memory in L2ARC buffer headers, so the size of a header - * is minimized by only allocating the 
fields necessary for an L1-cached buffer - * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and - * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple - * words in pointers. arc_hdr_realloc() is used to switch a header between - * these two allocation states. - */ -typedef struct l1arc_buf_hdr { - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; -#ifdef ZFS_DEBUG - /* - * Used for debugging with kmem_flags - by allocating and freeing - * b_thawed when the buffer is thawed, we get a record of the stack - * trace that thawed it. - */ - void *b_thawed; -#endif - - arc_buf_t *b_buf; - uint32_t b_bufcnt; - /* for waiting on writes to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - - /* protected by arc state mutex */ - arc_state_t *b_state; - multilist_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - uint32_t b_mru_hits; - uint32_t b_mru_ghost_hits; - uint32_t b_mfu_hits; - uint32_t b_mfu_ghost_hits; - uint32_t b_l2_hits; - - /* self protecting */ - zfs_refcount_t b_refcnt; - - arc_callback_t *b_acb; - abd_t *b_pabd; -} l1arc_buf_hdr_t; - -typedef struct l2arc_dev l2arc_dev_t; - -typedef struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - uint32_t b_hits; - - list_node_t b_l2node; -} l2arc_buf_hdr_t; - -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - - arc_buf_contents_t b_type; - arc_buf_hdr_t *b_hash_next; - arc_flags_t b_flags; - - /* - * This field stores the size of the data buffer after - * compression, and is set in the arc's zio completion handlers. - * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). - * - * While the block pointers can store up to 32MB in their psize - * field, we can only store up to 32MB minus 512B. This is due - * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. 
- * a field of zeros represents 512B in the bp). We can't use a - * bias of 1 since we need to reserve a psize of zero, here, to - * represent holes and embedded blocks. - * - * This isn't a problem in practice, since the maximum size of a - * buffer is limited to 16MB, so we never need to store 32MB in - * this field. Even in the upstream illumos code base, the - * maximum size of a buffer is limited to 16MB. - */ - uint16_t b_psize; - - /* - * This field stores the size of the data buffer before - * compression, and cannot change once set. It is in units - * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) - */ - uint16_t b_lsize; /* immutable */ - uint64_t b_spa; /* immutable */ - - /* L2ARC fields. Undefined when not in L2ARC. */ - l2arc_buf_hdr_t b_l2hdr; - /* L1ARC fields. Undefined when in l2arc_only state */ - l1arc_buf_hdr_t b_l1hdr; -}; - -#if defined(__FreeBSD__) && defined(_KERNEL) -static int -sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = arc_meta_limit; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val <= 0 || val > arc_c_max) - return (EINVAL); - - arc_meta_limit = val; - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - - return (0); -} - -static int -sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) -{ - uint32_t val; - int err; - - val = arc_no_grow_shift; - err = sysctl_handle_32(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val >= arc_shrink_shift) - return (EINVAL); - - arc_no_grow_shift = val; - return (0); -} - -static int -sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_max; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (zfs_arc_max == 0) { - /* Loader tunable so blindly set */ - zfs_arc_max = val; - return (0); - } - 
- if (val < arc_abs_min || val > kmem_size()) - return (EINVAL); - if (val < arc_c_min) - return (EINVAL); - if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) - return (EINVAL); - - arc_c_max = val; - - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - if (zfs_arc_meta_limit == 0) { - /* limit meta-data to 1/4 of the arc capacity */ - arc_meta_limit = arc_c_max / 4; - } - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - - zfs_arc_max = arc_c; - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - - return (0); -} - -static int -sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_min; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (zfs_arc_min == 0) { - /* Loader tunable so blindly set */ - zfs_arc_min = val; - return (0); - } - - if (val < arc_abs_min || val > arc_c_max) - return (EINVAL); - - arc_c_min = val; - - if (zfs_arc_meta_min == 0) - arc_meta_min = arc_c_min / 2; - - if (arc_c < arc_c_min) - arc_c = arc_c_min; - - zfs_arc_min = arc_c_min; - - return (0); -} -#endif - -#define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ - (state) == arc_l2c_only) - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_PRESCIENT_PREFETCH(hdr) \ - ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) -#define HDR_COMPRESSION_ENABLED(hdr) \ - ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) - -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) -#define 
HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) -#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) - -#define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) -#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) - -#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) -#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) - -/* For storing compression mode in b_flags */ -#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) - -#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ - HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) -#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ - HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); - -#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) -#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) -#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) - -/* - * Other sizes - */ - -#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) - -/* - * Hash table routines - */ - -#define HT_LOCK_PAD CACHE_LINE_SIZE - -struct ht_lock { - kmutex_t ht_lock; -#ifdef _KERNEL - unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; -#endif -}; - -#define BUF_LOCKS 256 -typedef struct buf_hash_table { - uint64_t ht_mask; - arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); -} buf_hash_table_t; - -static buf_hash_table_t buf_hash_table; - -#define BUF_HASH_INDEX(spa, dva, birth) \ - (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) -#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(hdr) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, 
&hdr->b_dva, hdr->b_birth))) - -uint64_t zfs_crc64_table[256]; - -/* - * Level 2 ARC - */ - -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ -/* - * If we discover during ARC scan any buffers to be compressed, we boost - * our headroom for the next scanning cycle by this percentage multiple. - */ -#define L2ARC_HEADROOM_BOOST 200 -#define L2ARC_FEED_SECS 1 /* caching interval secs */ -#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ - -#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) -#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) - -/* L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ -uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ -boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, - &l2arc_write_max, 0, "max write size"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, - &l2arc_write_boost, 0, "extra write during warmup"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, - &l2arc_headroom, 0, "number of dev writes"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, - &l2arc_feed_secs, 0, "interval seconds"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, - &l2arc_feed_min_ms, 0, "min interval milliseconds"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, - &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 
-SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, - &l2arc_feed_again, 0, "turbo warmup"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, - &l2arc_norw, 0, "no reads during writes"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of anonymous state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru ghost state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, - 
&ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu ghost state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, - &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); - -SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW, - &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW, - &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms"); - -/* - * L2ARC Internals - */ -struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ - uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_start; /* first addr on device */ - uint64_t l2ad_end; /* last addr on device */ - boolean_t l2ad_first; /* first sweep through */ - boolean_t l2ad_writing; /* currently writing */ - kmutex_t l2ad_mtx; /* lock for buffer list */ - list_t l2ad_buflist; /* buffer list */ - list_node_t l2ad_node; /* device list node */ - zfs_refcount_t l2ad_alloc; /* allocated bytes */ -}; - -static list_t L2ARC_dev_list; /* device list */ -static list_t *l2arc_dev_list; /* device list pointer */ -static kmutex_t l2arc_dev_mtx; /* device list mutex */ -static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static list_t L2ARC_free_on_write; /* free after write buf list */ -static list_t *l2arc_free_on_write; /* free after write list ptr */ -static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ -static uint64_t l2arc_ndev; /* number of devices */ - -typedef struct l2arc_read_callback { - arc_buf_hdr_t *l2rcb_hdr; /* read header */ - blkptr_t l2rcb_bp; /* original blkptr */ - 
zbookmark_phys_t l2rcb_zb; /* original bookmark */ - int l2rcb_flags; /* original flags */ - abd_t *l2rcb_abd; /* temporary buffer */ -} l2arc_read_callback_t; - -typedef struct l2arc_write_callback { - l2arc_dev_t *l2wcb_dev; /* device info */ - arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ -} l2arc_write_callback_t; - -typedef struct l2arc_data_free { - /* protected by l2arc_free_on_write_mtx */ - abd_t *l2df_abd; - size_t l2df_size; - arc_buf_contents_t l2df_type; - list_node_t l2df_list_node; -} l2arc_data_free_t; - -static kmutex_t l2arc_feed_thr_lock; -static kcondvar_t l2arc_feed_thr_cv; -static uint8_t l2arc_thread_exit; - -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); -static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); -static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); -static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); -static void arc_hdr_free_pabd(arc_buf_hdr_t *); -static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); -static void arc_access(arc_buf_hdr_t *, kmutex_t *); -static boolean_t arc_is_overflowing(); -static void arc_buf_watch(arc_buf_t *); -static void arc_prune_async(int64_t); - -static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); -static uint32_t arc_bufc_to_flags(arc_buf_contents_t); -static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); -static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); - -static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); -static void l2arc_read_done(zio_t *); - -static void -l2arc_trim(const arc_buf_hdr_t *hdr) -{ - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - - if (HDR_GET_PSIZE(hdr) != 0) { - 
trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, - HDR_GET_PSIZE(hdr), 0); - } -} - -/* - * We use Cityhash for this. It's fast, and has good hash properties without - * requiring any large static buffers. - */ -static uint64_t -buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) -{ - return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); -} - -#define HDR_EMPTY(hdr) \ - ((hdr)->b_dva.dva_word[0] == 0 && \ - (hdr)->b_dva.dva_word[1] == 0) - -#define HDR_EQUAL(spa, dva, birth, hdr) \ - ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) - -static void -buf_discard_identity(arc_buf_hdr_t *hdr) -{ - hdr->b_dva.dva_word[0] = 0; - hdr->b_dva.dva_word[1] = 0; - hdr->b_birth = 0; -} - -static arc_buf_hdr_t * -buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) -{ - const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_PHYSICAL_BIRTH(bp); - uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *hdr; - - mutex_enter(hash_lock); - for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; - hdr = hdr->b_hash_next) { - if (HDR_EQUAL(spa, dva, birth, hdr)) { - *lockp = hash_lock; - return (hdr); - } - } - mutex_exit(hash_lock); - *lockp = NULL; - return (NULL); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. - * If lockp == NULL, the caller is assumed to already hold the hash lock. 
- */ -static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) -{ - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fhdr; - uint32_t i; - - ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); - ASSERT(hdr->b_birth != 0); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - - if (lockp != NULL) { - *lockp = hash_lock; - mutex_enter(hash_lock); - } else { - ASSERT(MUTEX_HELD(hash_lock)); - } - - for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; - fhdr = fhdr->b_hash_next, i++) { - if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) - return (fhdr); - } - - hdr->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = hdr; - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - - /* collect some hash table performance data */ - if (i > 0) { - ARCSTAT_BUMP(arcstat_hash_collisions); - if (i == 1) - ARCSTAT_BUMP(arcstat_hash_chains); - - ARCSTAT_MAX(arcstat_hash_chain_max, i); - } - - ARCSTAT_BUMP(arcstat_hash_elements); - ARCSTAT_MAXSTAT(arcstat_hash_elements); - - return (NULL); -} - -static void -buf_hash_remove(arc_buf_hdr_t *hdr) -{ - arc_buf_hdr_t *fhdr, **hdrp; - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - - ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - - hdrp = &buf_hash_table.ht_table[idx]; - while ((fhdr = *hdrp) != hdr) { - ASSERT3P(fhdr, !=, NULL); - hdrp = &fhdr->b_hash_next; - } - *hdrp = hdr->b_hash_next; - hdr->b_hash_next = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - - /* collect some hash table performance data */ - ARCSTAT_BUMPDOWN(arcstat_hash_elements); - - if (buf_hash_table.ht_table[idx] && - buf_hash_table.ht_table[idx]->b_hash_next == NULL) - ARCSTAT_BUMPDOWN(arcstat_hash_chains); -} - -/* - * Global data structures and functions for the buf kmem cache. 
- */ -static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_l2only_cache; -static kmem_cache_t *buf_cache; - -static void -buf_fini(void) -{ - int i; - - kmem_free(buf_hash_table.ht_table, - (buf_hash_table.ht_mask + 1) * sizeof (void *)); - for (i = 0; i < BUF_LOCKS; i++) - mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_l2only_cache); - kmem_cache_destroy(buf_cache); -} - -/* - * Constructor callback - called when the cache is empty - * and a new buf is requested. - */ -/* ARGSUSED */ -static int -hdr_full_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - bzero(hdr, HDR_FULL_SIZE); - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); - zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); - mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - multilist_link_init(&hdr->b_l1hdr.b_arc_node); - arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); - - return (0); -} - -/* ARGSUSED */ -static int -hdr_l2only_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - bzero(hdr, HDR_L2ONLY_SIZE); - arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); - - return (0); -} - -/* ARGSUSED */ -static int -buf_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_t *buf = vbuf; - - bzero(buf, sizeof (arc_buf_t)); - mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); - arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); - - return (0); -} - -/* - * Destructor callback - called when a cached buf is - * no longer required. 
- */ -/* ARGSUSED */ -static void -hdr_full_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); - zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); - mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); -} - -/* ARGSUSED */ -static void -hdr_l2only_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - ASSERT(HDR_EMPTY(hdr)); - arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); -} - -/* ARGSUSED */ -static void -buf_dest(void *vbuf, void *unused) -{ - arc_buf_t *buf = vbuf; - - mutex_destroy(&buf->b_evict_lock); - arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); -} - -/* - * Reclaim callback -- invoked when memory is low. - */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - dprintf("hdr_recl called\n"); - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (arc_initialized) - zthr_wakeup(arc_reap_zthr); -} - -static void -buf_init(void) -{ - uint64_t *ct; - uint64_t hsize = 1ULL << 12; - int i, j; - - /* - * The hash table is big enough to fill all of physical memory - * with an average block size of zfs_arc_average_blocksize (default 8K). - * By default, the table will take up - * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
- */ - while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; -retry: - buf_hash_table.ht_mask = hsize - 1; - buf_hash_table.ht_table = - kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); - if (buf_hash_table.ht_table == NULL) { - ASSERT(hsize > (1ULL << 8)); - hsize >>= 1; - goto retry; - } - - hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, - 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); - hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", - HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, - NULL, NULL, 0); - buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < 256; i++) - for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) - *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); - - for (i = 0; i < BUF_LOCKS; i++) { - mutex_init(&buf_hash_table.ht_locks[i].ht_lock, - NULL, MUTEX_DEFAULT, NULL); - } -} - -/* - * This is the size that the buf occupies in memory. If the buf is compressed, - * it will correspond to the compressed size. You should use this method of - * getting the buf size unless you explicitly need the logical size. - */ -int32_t -arc_buf_size(arc_buf_t *buf) -{ - return (ARC_BUF_COMPRESSED(buf) ? - HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); -} - -int32_t -arc_buf_lsize(arc_buf_t *buf) -{ - return (HDR_GET_LSIZE(buf->b_hdr)); -} - -enum zio_compress -arc_get_compression(arc_buf_t *buf) -{ - return (ARC_BUF_COMPRESSED(buf) ? 
- HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); -} - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - -static inline boolean_t -arc_buf_is_shared(arc_buf_t *buf) -{ - boolean_t shared = (buf->b_data != NULL && - buf->b_hdr->b_l1hdr.b_pabd != NULL && - abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && - buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); - IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); - IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); - - /* - * It would be nice to assert arc_can_share() too, but the "hdr isn't - * already being shared" requirement prevents us from doing that. - */ - - return (shared); -} - -/* - * Free the checksum associated with this header. If there is no checksum, this - * is a no-op. - */ -static inline void -arc_cksum_free(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_l1hdr.b_freeze_cksum = NULL; - } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -} - -/* - * Return true iff at least one of the bufs on hdr is not compressed. - */ -static boolean_t -arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) -{ - for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { - if (!ARC_BUF_COMPRESSED(b)) { - return (B_TRUE); - } - } - return (B_FALSE); -} - -/* - * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data - * matches the checksum that is stored in the hdr. If there is no checksum, - * or if the buf is compressed, this is a no-op. 
- */ -static void -arc_cksum_verify(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - zio_cksum_t zc; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } - - fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); - if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) - panic("buffer modified while frozen!"); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -} - -static boolean_t -arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) -{ - enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); - boolean_t valid_cksum; - - ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); - VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); - - /* - * We rely on the blkptr's checksum to determine if the block - * is valid or not. When compressed arc is enabled, the l2arc - * writes the block to the l2arc just as it appears in the pool. - * This allows us to use the blkptr's checksum to validate the - * data that we just read off of the l2arc without having to store - * a separate checksum in the arc_buf_hdr_t. However, if compressed - * arc is disabled, then the data written to the l2arc is always - * uncompressed and won't match the block as it exists in the main - * pool. When this is the case, we must first compress it if it is - * compressed on the main pool before we can validate the checksum. 
- */ - if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - uint64_t lsize = HDR_GET_LSIZE(hdr); - uint64_t csize; - - abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); - csize = zio_compress_data(compress, zio->io_abd, - abd_to_buf(cdata), lsize); - - ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); - if (csize < HDR_GET_PSIZE(hdr)) { - /* - * Compressed blocks are always a multiple of the - * smallest ashift in the pool. Ideally, we would - * like to round up the csize to the next - * spa_min_ashift but that value may have changed - * since the block was last written. Instead, - * we rely on the fact that the hdr's psize - * was set to the psize of the block when it was - * last written. We set the csize to that value - * and zero out any part that should not contain - * data. - */ - abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); - csize = HDR_GET_PSIZE(hdr); - } - zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); - } - - /* - * Block pointers always store the checksum for the logical data. - * If the block pointer has the gang bit set, then the checksum - * it represents is for the reconstituted data and not for an - * individual gang member. The zio pipeline, however, must be able to - * determine the checksum of each of the gang constituents so it - * treats the checksum comparison differently than what we need - * for l2arc blocks. This prevents us from using the - * zio_checksum_error() interface directly. Instead we must call the - * zio_checksum_error_impl() so that we can ensure the checksum is - * generated using the correct checksum algorithm and accounts for the - * logical I/O size and not just a gang fragment. 
- */ - valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, - zio->io_offset, NULL) == 0); - zio_pop_transforms(zio); - return (valid_cksum); -} - -/* - * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a - * checksum and attaches it to the buf's hdr so that we can ensure that the buf - * isn't modified later on. If buf is compressed or there is already a checksum - * on the hdr, this is a no-op (we only checksum uncompressed bufs). - */ -static void -arc_cksum_compute(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - ASSERT(arc_hdr_has_uncompressed_buf(hdr)); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } else if (ARC_BUF_COMPRESSED(buf)) { - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } - - ASSERT(!ARC_BUF_COMPRESSED(buf)); - hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), - KM_SLEEP); - fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, - hdr->b_l1hdr.b_freeze_cksum); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -#ifdef illumos - arc_buf_watch(buf); -#endif -} - -#ifdef illumos -#ifndef _KERNEL -typedef struct procctl { - long cmd; - prwatch_t prwatch; -} procctl_t; -#endif - -/* ARGSUSED */ -static void -arc_buf_unwatch(arc_buf_t *buf) -{ -#ifndef _KERNEL - if (arc_watch) { - int result; - procctl_t ctl; - ctl.cmd = PCWATCH; - ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = 0; - ctl.prwatch.pr_wflags = 0; - result = write(arc_procfd, &ctl, sizeof (ctl)); - ASSERT3U(result, ==, sizeof (ctl)); - } -#endif -} - -/* ARGSUSED */ -static void -arc_buf_watch(arc_buf_t *buf) -{ -#ifndef _KERNEL - if (arc_watch) { - int result; - procctl_t ctl; - ctl.cmd = PCWATCH; - ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = 
arc_buf_size(buf); - ctl.prwatch.pr_wflags = WA_WRITE; - result = write(arc_procfd, &ctl, sizeof (ctl)); - ASSERT3U(result, ==, sizeof (ctl)); - } -#endif -} -#endif /* illumos */ - -static arc_buf_contents_t -arc_buf_type(arc_buf_hdr_t *hdr) -{ - arc_buf_contents_t type; - if (HDR_ISTYPE_METADATA(hdr)) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - } - VERIFY3U(hdr->b_type, ==, type); - return (type); -} - -boolean_t -arc_is_metadata(arc_buf_t *buf) -{ - return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); -} - -static uint32_t -arc_bufc_to_flags(arc_buf_contents_t type) -{ - switch (type) { - case ARC_BUFC_DATA: - /* metadata field is 0 if buffer contains normal data */ - return (0); - case ARC_BUFC_METADATA: - return (ARC_FLAG_BUFC_METADATA); - default: - break; - } - panic("undefined ARC buffer type!"); - return ((uint32_t)-1); -} - -void -arc_buf_thaw(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - - arc_cksum_verify(buf); - - /* - * Compressed buffers do not manipulate the b_freeze_cksum or - * allocate b_thawed. 
- */ - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - ASSERT(HDR_HAS_L1HDR(hdr)); - arc_cksum_free(hdr); - - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (hdr->b_l1hdr.b_thawed != NULL) - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); - } -#endif - - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - -#ifdef illumos - arc_buf_unwatch(buf); -#endif -} - -void -arc_buf_freeze(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || - hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf); - mutex_exit(hash_lock); -} - -/* - * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, - * the following functions should be used to ensure that the flags are - * updated in a thread-safe way. When manipulating the flags either - * the hash_lock must be held or the hdr must be undiscoverable. This - * ensures that we're not racing with any other threads when updating - * the flags. - */ -static inline void -arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - hdr->b_flags |= flags; -} - -static inline void -arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - hdr->b_flags &= ~flags; -} - -/* - * Setting the compression bits in the arc_buf_hdr_t's b_flags is - * done in a special way since we have to clear and set bits - * at the same time. 
Consumers that wish to set the compression bits - * must use this function to ensure that the flags are updated in - * thread-safe manner. - */ -static void -arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Holes and embedded blocks will always have a psize = 0 so - * we ignore the compression of the blkptr and set the - * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. - * Holes and embedded blocks remain anonymous so we don't - * want to uncompress them. Mark them as uncompressed. - */ - if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { - arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else { - arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, cmp); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); - ASSERT(HDR_COMPRESSION_ENABLED(hdr)); - } -} - -/* - * Looks for another buf on the same hdr which has the data decompressed, copies - * from it, and returns true. If no such buf exists, returns false. - */ -static boolean_t -arc_buf_try_copy_decompressed_data(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t copied = B_FALSE; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(!ARC_BUF_COMPRESSED(buf)); - - for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; - from = from->b_next) { - /* can't use our own data buffer */ - if (from == buf) { - continue; - } - - if (!ARC_BUF_COMPRESSED(from)) { - bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); - copied = B_TRUE; - break; - } - } - - /* - * There were no decompressed bufs, so there should not be a - * checksum on the hdr either. 
- */ - EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); - - return (copied); -} - -/* - * Given a buf that has a data buffer attached to it, this function will - * efficiently fill the buf with data of the specified compression setting from - * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr - * are already sharing a data buf, no copy is performed. - * - * If the buf is marked as compressed but uncompressed data was requested, this - * will allocate a new data buffer for the buf, remove that flag, and fill the - * buf with uncompressed data. You can't request a compressed buf on a hdr with - * uncompressed data, and (since we haven't added support for it yet) if you - * want compressed data your buf must already be marked as compressed and have - * the correct-sized data buffer. - */ -static int -arc_buf_fill(arc_buf_t *buf, boolean_t compressed) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; - - ASSERT3P(buf->b_data, !=, NULL); - IMPLY(compressed, hdr_compressed); - IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); - - if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { - abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, - arc_buf_size(buf)); - } - } else { - ASSERT(hdr_compressed); - ASSERT(!compressed); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); - - /* - * If the buf is sharing its data with the hdr, unlink it and - * allocate a new data buffer for the buf. 
- */ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_COMPRESSED(buf)); - - /* We need to give the buf it's own b_data */ - buf->b_flags &= ~ARC_BUF_FLAG_SHARED; - buf->b_data = - arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - - /* Previously overhead was 0; just add new overhead */ - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); - } else if (ARC_BUF_COMPRESSED(buf)) { - /* We need to reallocate the buf's b_data */ - arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), - buf); - buf->b_data = - arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - - /* We increased the size of b_data; update overhead */ - ARCSTAT_INCR(arcstat_overhead_size, - HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); - } - - /* - * Regardless of the buf's previous compression settings, it - * should not be compressed at the end of this function. - */ - buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - - /* - * Try copying the data from another buf which already has a - * decompressed version. If that's not possible, it's time to - * bite the bullet and decompress the data from the hdr. - */ - if (arc_buf_try_copy_decompressed_data(buf)) { - /* Skip byteswapping and checksumming (already done) */ - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); - return (0); - } else { - int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, buf->b_data, - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); - - /* - * Absent hardware errors or software bugs, this should - * be impossible, but log it anyway so we can debug it. 
- */ - if (error != 0) { - zfs_dbgmsg( - "hdr %p, compress %d, psize %d, lsize %d", - hdr, HDR_GET_COMPRESS(hdr), - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); - return (SET_ERROR(EIO)); - } - } - } - - /* Byteswap the buf's data if necessary */ - if (bswap != DMU_BSWAP_NUMFUNCS) { - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); - dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); - } - - /* Compute the hdr's checksum if necessary */ - arc_cksum_compute(buf); - - return (0); -} - -int -arc_decompress(arc_buf_t *buf) -{ - return (arc_buf_fill(buf, B_FALSE)); -} - -/* - * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. - */ -static uint64_t -arc_hdr_size(arc_buf_hdr_t *hdr) -{ - uint64_t size; - - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - HDR_GET_PSIZE(hdr) > 0) { - size = HDR_GET_PSIZE(hdr); - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); - size = HDR_GET_LSIZE(hdr); - } - return (size); -} - -/* - * Increment the amount of evictable space in the arc_state_t's refcount. - * We account for the space used by the hdr and the arc buf individually - * so that we can add and remove them from the refcount individually. 
- */ -static void -arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - (void) zfs_refcount_add_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), hdr); - return; - } - - ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_add_many(&state->arcs_esize[type], - arc_hdr_size(hdr), hdr); - } - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - if (arc_buf_is_shared(buf)) - continue; - (void) zfs_refcount_add_many(&state->arcs_esize[type], - arc_buf_size(buf), buf); - } -} - -/* - * Decrement the amount of evictable space in the arc_state_t's refcount. - * We account for the space used by the hdr and the arc buf individually - * so that we can add and remove them from the refcount individually. - */ -static void -arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), hdr); - return; - } - - ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - arc_hdr_size(hdr), hdr); - } - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - if (arc_buf_is_shared(buf)) - continue; - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - arc_buf_size(buf), buf); - } -} - -/* - * Add a reference to this hdr indicating that someone is actively - * referencing that memory. 
When the refcount transitions from 0 to 1, - * we remove it from the respective arc_state_t list to indicate that - * it is not evictable. - */ -static void -add_reference(arc_buf_hdr_t *hdr, void *tag) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - if (!MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - } - - arc_state_t *state = hdr->b_l1hdr.b_state; - - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { - /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - multilist_remove(state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - } -} - -/* - * Remove a reference from this hdr. When the reference transitions from - * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's - * list making it eligible for eviction. - */ -static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) -{ - int cnt; - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); - ASSERT(!GHOST_STATE(state)); - - /* - * arc_l2c_only counts as a ghost state so we don't need to explicitly - * check to prevent usage of the arc_l2c_only list. - */ - if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && - (state != arc_anon)) { - multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - arc_evictable_space_increment(hdr, state); - } - return (cnt); -} - -/* - * Returns detailed information about a specific arc buffer. When the - * state_index argument is set the function will calculate the arc header - * list position for its arc state. Since this requires a linear traversal - * callers are strongly encourage not to do this. 
However, it can be helpful - * for targeted analysis so the functionality is provided. - */ -void -arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) -{ - arc_buf_hdr_t *hdr = ab->b_hdr; - l1arc_buf_hdr_t *l1hdr = NULL; - l2arc_buf_hdr_t *l2hdr = NULL; - arc_state_t *state = NULL; - - memset(abi, 0, sizeof (arc_buf_info_t)); - - if (hdr == NULL) - return; - - abi->abi_flags = hdr->b_flags; - - if (HDR_HAS_L1HDR(hdr)) { - l1hdr = &hdr->b_l1hdr; - state = l1hdr->b_state; - } - if (HDR_HAS_L2HDR(hdr)) - l2hdr = &hdr->b_l2hdr; - - if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; - abi->abi_access = l1hdr->b_arc_access; - abi->abi_mru_hits = l1hdr->b_mru_hits; - abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; - abi->abi_mfu_hits = l1hdr->b_mfu_hits; - abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; - abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); - } - - if (l2hdr) { - abi->abi_l2arc_dattr = l2hdr->b_daddr; - abi->abi_l2arc_hits = l2hdr->b_hits; - } - - abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; - abi->abi_state_contents = arc_buf_type(hdr); - abi->abi_size = arc_hdr_size(hdr); -} - -/* - * Move the supplied buffer to the indicated state. The hash lock - * for the buffer must be held by the caller. - */ -static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) -{ - arc_state_t *old_state; - int64_t refcnt; - uint32_t bufcnt; - boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); - - /* - * We almost always have an L1 hdr here, since we call arc_hdr_realloc() - * in arc_read() when bringing a buffer out of the L2ARC. However, the - * L1 hdr doesn't always exist when we change state to arc_anon before - * destroying a header, in which case reallocating to add the L1 hdr is - * pointless. 
- */ - if (HDR_HAS_L1HDR(hdr)) { - old_state = hdr->b_l1hdr.b_state; - refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); - } else { - old_state = arc_l2c_only; - refcnt = 0; - bufcnt = 0; - update_old = B_FALSE; - } - update_new = update_old; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT3P(new_state, !=, old_state); - ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); - ASSERT(old_state != arc_anon || bufcnt <= 1); - - /* - * If this buffer is evictable, transfer it from the - * old state list to the new state list. - */ - if (refcnt == 0) { - if (old_state != arc_anon && old_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_remove(old_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_old = B_TRUE; - } - arc_evictable_space_decrement(hdr, old_state); - } - if (new_state != arc_anon && new_state != arc_l2c_only) { - - /* - * An L1 header always exists here, since if we're - * moving to some L1-cached state (i.e. not l2c_only or - * anonymous), we realloc the header to add an L1hdr - * beforehand. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(new_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_new = B_TRUE; - } - arc_evictable_space_increment(hdr, new_state); - } - } - - ASSERT(!HDR_EMPTY(hdr)); - if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) - buf_hash_remove(hdr); - - /* adjust state sizes (ignore arc_l2c_only) */ - - if (update_new && new_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - - /* - * When moving a header to a ghost state, we first - * remove all arc buffers. Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. 
As a result, we use the arc - * header pointer for the reference. - */ - (void) zfs_refcount_add_many(&new_state->arcs_size, - HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - } else { - uint32_t buffers = 0; - - /* - * Each individual buffer holds a unique reference, - * thus we must remove each of these references one - * at a time. - */ - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; - - /* - * When the arc_buf_t is sharing the data - * block with the hdr, the owner of the - * reference belongs to the hdr. Only - * add to the refcount if the arc_buf_t is - * not shared. - */ - if (arc_buf_is_shared(buf)) - continue; - - (void) zfs_refcount_add_many( - &new_state->arcs_size, - arc_buf_size(buf), buf); - } - ASSERT3U(bufcnt, ==, buffers); - - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_add_many( - &new_state->arcs_size, - arc_hdr_size(hdr), hdr); - } else { - ASSERT(GHOST_STATE(old_state)); - } - } - } - - if (update_old && old_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - - /* - * When moving a header off of a ghost state, - * the header will not contain any arc buffers. - * We use the arc header pointer for the reference - * which is exactly what we did when we put the - * header on the ghost state. - */ - - (void) zfs_refcount_remove_many(&old_state->arcs_size, - HDR_GET_LSIZE(hdr), hdr); - } else { - uint32_t buffers = 0; - - /* - * Each individual buffer holds a unique reference, - * thus we must remove each of these references one - * at a time. - */ - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; - - /* - * When the arc_buf_t is sharing the data - * block with the hdr, the owner of the - * reference belongs to the hdr. 
Only - * add to the refcount if the arc_buf_t is - * not shared. - */ - if (arc_buf_is_shared(buf)) - continue; - - (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); - } - ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), hdr); - } - } - - if (HDR_HAS_L1HDR(hdr)) - hdr->b_l1hdr.b_state = new_state; - - /* - * L2 headers should never be on the L2 state list since they don't - * have L1 headers allocated. - */ - ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); -} - -void -arc_space_consume(uint64_t space, arc_space_type_t type) -{ - ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); - - switch (type) { - case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, space); - break; - case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, space); - break; - case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, space); - break; - case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, space); - break; - case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, space); - break; - case ARC_SPACE_HDRS: - aggsum_add(&astat_hdr_size, space); - break; - case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, space); - break; - } - - if (type != ARC_SPACE_DATA) - aggsum_add(&arc_meta_used, space); - - aggsum_add(&arc_size, space); -} - -void -arc_space_return(uint64_t space, arc_space_type_t type) -{ - ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); - - switch (type) { - case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, -space); - break; - case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, -space); - break; - case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, -space); - break; - case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, -space); - break; - case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, -space); - break; - case ARC_SPACE_HDRS: - 
aggsum_add(&astat_hdr_size, -space); - break; - case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, -space); - break; - } - - if (type != ARC_SPACE_DATA) { - ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); - /* - * We use the upper bound here rather than the precise value - * because the arc_meta_max value doesn't need to be - * precise. It's only consumed by humans via arcstats. - */ - if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) - arc_meta_max = aggsum_upper_bound(&arc_meta_used); - aggsum_add(&arc_meta_used, -space); - } - - ASSERT(aggsum_compare(&arc_size, space) >= 0); - aggsum_add(&arc_size, -space); -} - -/* - * Given a hdr and a buf, returns whether that buf can share its b_data buffer - * with the hdr's b_pabd. - */ -static boolean_t -arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - /* - * The criteria for sharing a hdr's data are: - * 1. the hdr's compression matches the buf's compression - * 2. the hdr doesn't need to be byteswapped - * 3. the hdr isn't already being shared - * 4. the buf is either compressed or it is the last buf in the hdr list - * - * Criterion #4 maintains the invariant that shared uncompressed - * bufs must be the final buf in the hdr's b_buf list. Reading this, you - * might ask, "if a compressed buf is allocated first, won't that be the - * last thing in the list?", but in that case it's impossible to create - * a shared uncompressed buf anyway (because the hdr must be compressed - * to have the compressed buf). You might also think that #3 is - * sufficient to make this guarantee, however it's possible - * (specifically in the rare L2ARC write race mentioned in - * arc_buf_alloc_impl()) there will be an existing uncompressed buf that - * is sharable, but wasn't at the time of its allocation. Rather than - * allow a new shared uncompressed buf to be created and then shuffle - * the list around to make it the last element, this simply disallows - * sharing if the new buf isn't the first to be added. 
- */ - ASSERT3P(buf->b_hdr, ==, hdr); - boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; - boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; - return (buf_compressed == hdr_compressed && - hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && - !HDR_SHARED_DATA(hdr) && - (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); -} - -/* - * Allocate a buf for this hdr. If you care about the data that's in the hdr, - * or if you want a compressed buffer, pass those flags in. Returns 0 if the - * copy was made successfully, or an error code otherwise. - */ -static int -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, - boolean_t fill, arc_buf_t **ret) -{ - arc_buf_t *buf; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); - VERIFY(hdr->b_type == ARC_BUFC_DATA || - hdr->b_type == ARC_BUFC_METADATA); - ASSERT3P(ret, !=, NULL); - ASSERT3P(*ret, ==, NULL); - - buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - buf->b_flags = 0; - - add_reference(hdr, tag); - - /* - * We're about to change the hdr's b_flags. We must either - * hold the hash_lock or be undiscoverable. - */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Only honor requests for compressed bufs if the hdr is actually - * compressed. - */ - if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; - - /* - * If the hdr's data can be shared then we share the data buffer and - * set the appropriate bit in the hdr's b_flags to indicate the hdr is - * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new - * buffer to store the buf's data. - * - * There are two additional restrictions here because we're sharing - * hdr -> buf instead of the usual buf -> hdr. 
First, the hdr can't be - * actively involved in an L2ARC write, because if this buf is used by - * an arc_write() then the hdr's data buffer will be released when the - * write completes, even though the L2ARC write might still be using it. - * Second, the hdr's ABD must be linear so that the buf's user doesn't - * need to be ABD-aware. - */ - boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && - abd_is_linear(hdr->b_l1hdr.b_pabd); - - /* Set up b_data and sharing */ - if (can_share) { - buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); - buf->b_flags |= ARC_BUF_FLAG_SHARED; - arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - } else { - buf->b_data = - arc_get_data_buf(hdr, arc_buf_size(buf), buf); - ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); - } - VERIFY3P(buf->b_data, !=, NULL); - - hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - - /* - * If the user wants the data from the hdr, we need to either copy or - * decompress the data. - */ - if (fill) { - return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); - } - - return (0); -} - -static char *arc_onloan_tag = "onloan"; - -static inline void -arc_loaned_bytes_update(int64_t delta) -{ - atomic_add_64(&arc_loaned_bytes, delta); - - /* assert that it did not wrap around */ - ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); -} - -/* - * Loan out an anonymous arc buffer. Loaned buffers are not counted as in - * flight data by arc_tempreserve_space() until they are "returned". Loaned - * buffers must be returned to the arc before they can be used by the DMU or - * freed. - */ -arc_buf_t * -arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) -{ - arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, - is_metadata ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA, size); - - arc_loaned_bytes_update(arc_buf_size(buf)); - - return (buf); -} - -arc_buf_t * -arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) -{ - arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, - psize, lsize, compression_type); - - arc_loaned_bytes_update(arc_buf_size(buf)); - - return (buf); -} - - -/* - * Return a loaned arc buffer to the arc. - */ -void -arc_return_buf(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(HDR_HAS_L1HDR(hdr)); - (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); - (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - - arc_loaned_bytes_update(-arc_buf_size(buf)); -} - -/* Detach an arc_buf from a dbuf (tag) */ -void -arc_loan_inuse_buf(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(HDR_HAS_L1HDR(hdr)); - (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - - arc_loaned_bytes_update(arc_buf_size(buf)); -} - -static void -l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) -{ - l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - - df->l2df_abd = abd; - df->l2df_size = size; - df->l2df_type = type; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); -} - -static void -arc_hdr_free_on_write(arc_buf_hdr_t *hdr) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t size = arc_hdr_size(hdr); - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - (void) 
zfs_refcount_remove_many(&state->arcs_esize[type], - size, hdr); - } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); - if (type == ARC_BUFC_METADATA) { - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_space_return(size, ARC_SPACE_DATA); - } - - l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); -} - -/* - * Share the arc_buf_t's data with the hdr. Whenever we are sharing the - * data buffer, we transfer the refcount ownership to the hdr and update - * the appropriate kstats. - */ -static void -arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Start sharing the data buffer. We transfer the - * refcount ownership to the hdr since it always owns - * the refcount whenever an arc_buf_t is shared. - */ - zfs_refcount_transfer_ownership(&state->arcs_size, buf, hdr); - hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); - abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, - HDR_ISTYPE_METADATA(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - buf->b_flags |= ARC_BUF_FLAG_SHARED; - - /* - * Since we've transferred ownership to the hdr we need - * to increment its compressed and uncompressed kstats and - * decrement the overhead size. - */ - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); -} - -static void -arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * We are no longer sharing this buffer so we need - * to transfer its ownership to the rightful owner. 
- */ - zfs_refcount_transfer_ownership(&state->arcs_size, hdr, buf); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); - abd_put(hdr->b_l1hdr.b_pabd); - hdr->b_l1hdr.b_pabd = NULL; - buf->b_flags &= ~ARC_BUF_FLAG_SHARED; - - /* - * Since the buffer is no longer shared between - * the arc buf and the hdr, count it as overhead. - */ - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); -} - -/* - * Remove an arc_buf_t from the hdr's buf list and return the last - * arc_buf_t on the list. If no buffers remain on the list then return - * NULL. - */ -static arc_buf_t * -arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; - arc_buf_t *lastbuf = NULL; - - /* - * Remove the buf from the hdr list and locate the last - * remaining buffer on the list. - */ - while (*bufp != NULL) { - if (*bufp == buf) - *bufp = buf->b_next; - - /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. - */ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); - IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); - - return (lastbuf); -} - -/* - * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's - * list and free it. - */ -static void -arc_buf_destroy_impl(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * Free up the data associated with the buf but only if we're not - * sharing this with the hdr. If we are sharing it with the hdr, the - * hdr is responsible for doing the free. 
- */ - if (buf->b_data != NULL) { - /* - * We're about to change the hdr's b_flags. We must either - * hold the hash_lock or be undiscoverable. - */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - arc_cksum_verify(buf); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - - if (arc_buf_is_shared(buf)) { - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - } else { - uint64_t size = arc_buf_size(buf); - arc_free_data_buf(hdr, buf->b_data, size, buf); - ARCSTAT_INCR(arcstat_overhead_size, -size); - } - buf->b_data = NULL; - - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - } - - arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); - - if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { - /* - * If the current arc_buf_t is sharing its data buffer with the - * hdr, then reassign the hdr's b_pabd to share it with the new - * buffer at the end of the list. The shared buffer is always - * the last one on the hdr's buffer list. - * - * There is an equivalent case for compressed bufs, but since - * they aren't guaranteed to be the last buf in the list and - * that is an exceedingly rare case, we just allow that space be - * wasted temporarily. - */ - if (lastbuf != NULL) { - /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); - /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); - - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - arc_hdr_free_pabd(hdr); - - /* - * We must setup a new shared block between the - * last buffer and the hdr. The data would have - * been allocated by the arc buf so we need to transfer - * ownership to the hdr since it's now being shared. - */ - arc_share_buf(hdr, lastbuf); - } - } else if (HDR_SHARED_DATA(hdr)) { - /* - * Uncompressed shared buffers are always at the end - * of the list. Compressed buffers don't have the - * same requirements. 
This makes it hard to - * simply assert that the lastbuf is shared so - * we rely on the hdr's compression flags to determine - * if we have a compressed, shared buffer. - */ - ASSERT3P(lastbuf, !=, NULL); - ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - } - - /* - * Free the checksum if we're removing the last uncompressed buf from - * this hdr. - */ - if (!arc_hdr_has_uncompressed_buf(hdr)) { - arc_cksum_free(hdr); - } - - /* clean up the buf */ - buf->b_hdr = NULL; - kmem_cache_free(buf_cache, buf); -} - -static void -arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t do_adapt) -{ - ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); - - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, do_adapt); - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); -} - -static void -arc_hdr_free_pabd(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - - /* - * If the hdr is currently being written to the l2arc then - * we defer freeing the data by adding it to the l2arc_free_on_write - * list. The l2arc will free the data once it's finished - * writing it to the l2arc device. 
- */ - if (HDR_L2_WRITING(hdr)) { - arc_hdr_free_on_write(hdr); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, - arc_hdr_size(hdr), hdr); - } - hdr->b_l1hdr.b_pabd = NULL; - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); -} - -static arc_buf_hdr_t * -arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compression_type, arc_buf_contents_t type) -{ - arc_buf_hdr_t *hdr; - - VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(HDR_EMPTY(hdr)); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); - HDR_SET_PSIZE(hdr, psize); - HDR_SET_LSIZE(hdr, lsize); - hdr->b_spa = spa; - hdr->b_type = type; - hdr->b_flags = 0; - arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); - arc_hdr_set_compress(hdr, compression_type); - - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_bufcnt = 0; - hdr->b_l1hdr.b_buf = NULL; - - /* - * Allocate the hdr's buffer. This will contain either - * the compressed or uncompressed data depending on the block - * it references and compressed arc enablement. - */ - arc_hdr_alloc_pabd(hdr, B_TRUE); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - - return (hdr); -} - -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. 
- */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); - } else { - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_pabd field - * might try to be accessed, even though it was removed. - */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); - -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif - - arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); - } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. 
- */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. - */ - - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), - hdr); - (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), - nhdr); - - buf_discard_identity(hdr); - kmem_cache_free(old, hdr); - - return (nhdr); -} - -/* - * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. - * The buf is returned thawed since we expect the consumer to modify it. - */ -arc_buf_t * -arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) -{ - arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - ZIO_COMPRESS_OFF, type); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - - arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); - arc_buf_thaw(buf); - - return (buf); -} - -/* - * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this - * for bufs containing metadata. 
- */ -arc_buf_t * -arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) -{ - ASSERT3U(lsize, >, 0); - ASSERT3U(lsize, >=, psize); - ASSERT(compression_type > ZIO_COMPRESS_OFF); - ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); - - arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - compression_type, ARC_BUFC_DATA); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - - arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); - arc_buf_thaw(buf); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - if (!arc_buf_is_shared(buf)) { - /* - * To ensure that the hdr has the correct data in it if we call - * arc_decompress() on this buf before it's been written to - * disk, it's easiest if we just set up sharing between the - * buf and the hdr. - */ - ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); - arc_hdr_free_pabd(hdr); - arc_share_buf(hdr, buf); - } - - return (buf); -} - -static void -arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) -{ - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - l2arc_dev_t *dev = l2hdr->b_dev; - uint64_t psize = arc_hdr_size(hdr); - - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - ASSERT(HDR_HAS_L2HDR(hdr)); - - list_remove(&dev->l2ad_buflist, hdr); - - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - - vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); - - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, psize, hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); -} - -static void -arc_hdr_destroy(arc_buf_hdr_t *hdr) -{ - if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - } - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); - - if (HDR_HAS_L2HDR(hdr)) { - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - 
boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); - - if (!buflist_held) - mutex_enter(&dev->l2ad_mtx); - - /* - * Even though we checked this conditional above, we - * need to check this again now that we have the - * l2ad_mtx. This is because we could be racing with - * another thread calling l2arc_evict() which might have - * destroyed this header's L2 portion as we were waiting - * to acquire the l2ad_mtx. If that happens, we don't - * want to re-destroy the header's L2 portion. - */ - if (HDR_HAS_L2HDR(hdr)) { - l2arc_trim(hdr); - arc_hdr_l2hdr_destroy(hdr); - } - - if (!buflist_held) - mutex_exit(&dev->l2ad_mtx); - } - - if (HDR_HAS_L1HDR(hdr)) { - arc_cksum_free(hdr); - - while (hdr->b_l1hdr.b_buf != NULL) - arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif - - if (hdr->b_l1hdr.b_pabd != NULL) { - arc_hdr_free_pabd(hdr); - } - } - - ASSERT3P(hdr->b_hash_next, ==, NULL); - if (HDR_HAS_L1HDR(hdr)) { - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_l2only_cache, hdr); - } -} - -void -arc_buf_destroy(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); - - if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); - arc_hdr_destroy(hdr); - return; - } - - mutex_enter(hash_lock); - ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); - ASSERT3P(buf->b_data, !=, NULL); - - (void) remove_reference(hdr, hash_lock, tag); - arc_buf_destroy_impl(buf); - mutex_exit(hash_lock); -} - -/* - * Evict the arc_buf_hdr that is provided as a parameter. 
The resultant - * state of the header is dependent on its state prior to entering this - * function. The following transitions are possible: - * - * - arc_mru -> arc_mru_ghost - * - arc_mfu -> arc_mfu_ghost - * - arc_mru_ghost -> arc_l2c_only - * - arc_mru_ghost -> deleted - * - arc_mfu_ghost -> arc_l2c_only - * - arc_mfu_ghost -> deleted - */ -static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) -{ - arc_state_t *evicted_state, *state; - int64_t bytes_evicted = 0; - int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? - zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(HDR_HAS_L1HDR(hdr)); - - state = hdr->b_l1hdr.b_state; - if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - - /* - * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pabd field) during it's write phase. - * Thus, we cannot push a header onto the arc_l2c_only - * state (removing it's L1 piece) until the header is - * done being written to the l2arc. - */ - if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { - ARCSTAT_BUMP(arcstat_evict_l2_skip); - return (bytes_evicted); - } - - ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += HDR_GET_LSIZE(hdr); - - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - if (HDR_HAS_L2HDR(hdr)) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, hdr, hash_lock); - /* - * dropping from L1+L2 cached to L2-only, - * realloc to remove the L1 header. - */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, - hdr_l2only_cache); - } else { - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - } - return (bytes_evicted); - } - - ASSERT(state == arc_mru || state == arc_mfu); - evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; - - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { - ARCSTAT_BUMP(arcstat_evict_skip); - return (bytes_evicted); - } - - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - ARCSTAT_BUMP(arcstat_mutex_miss); - break; - } - if (buf->b_data != NULL) - bytes_evicted += HDR_GET_LSIZE(hdr); - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf); - } - - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); - } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - HDR_GET_LSIZE(hdr)); - } else { - ARCSTAT_INCR(arcstat_evict_l2_ineligible, - HDR_GET_LSIZE(hdr)); - } - } - - if (hdr->b_l1hdr.b_bufcnt == 0) { - arc_cksum_free(hdr); - - bytes_evicted += arc_hdr_size(hdr); - - /* - * If this hdr is being evicted and has a compressed - * buffer then we discard it here before we change states. - * This ensures that the accounting is updated correctly - * in arc_free_data_impl(). 
- */ - arc_hdr_free_pabd(hdr); - - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); - } - - return (bytes_evicted); -} - -static uint64_t -arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, int64_t bytes) -{ - multilist_sublist_t *mls; - uint64_t bytes_evicted = 0; - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - int evict_count = 0; - - ASSERT3P(marker, !=, NULL); - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - - mls = multilist_sublist_lock(ml, idx); - - for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; - hdr = multilist_sublist_prev(mls, marker)) { - if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || - (evict_count >= zfs_arc_evict_batch_limit)) - break; - - /* - * To keep our iteration location, move the marker - * forward. Since we're not holding hdr's hash lock, we - * must be very careful and not remove 'hdr' from the - * sublist. Otherwise, other consumers might mistake the - * 'hdr' as not being on a sublist when they call the - * multilist_link_active() function (they all rely on - * the hash lock protecting concurrent insertions and - * removals). multilist_sublist_move_forward() was - * specifically implemented to ensure this is the case - * (only 'marker' will be removed and re-inserted). - */ - multilist_sublist_move_forward(mls, marker); - - /* - * The only case where the b_spa field should ever be - * zero, is the marker headers inserted by - * arc_evict_state(). It's possible for multiple threads - * to be calling arc_evict_state() concurrently (e.g. - * dsl_pool_close() and zio_inject_fault()), so we must - * skip any markers we see from these other threads. 
- */ - if (hdr->b_spa == 0) - continue; - - /* we're only interested in evicting buffers of a certain spa */ - if (spa != 0 && hdr->b_spa != spa) { - ARCSTAT_BUMP(arcstat_evict_skip); - continue; - } - - hash_lock = HDR_LOCK(hdr); - - /* - * We aren't calling this function from any code path - * that would already be holding a hash lock, so we're - * asserting on this assumption to be defensive in case - * this ever changes. Without this check, it would be - * possible to incorrectly increment arcstat_mutex_miss - * below (e.g. if the code changed such that we called - * this function with a hash lock held). - */ - ASSERT(!MUTEX_HELD(hash_lock)); - - if (mutex_tryenter(hash_lock)) { - uint64_t evicted = arc_evict_hdr(hdr, hash_lock); - mutex_exit(hash_lock); - - bytes_evicted += evicted; - - /* - * If evicted is zero, arc_evict_hdr() must have - * decided to skip this header, don't increment - * evict_count in this case. - */ - if (evicted != 0) - evict_count++; - - /* - * If arc_size isn't overflowing, signal any - * threads that might happen to be waiting. - * - * For each header evicted, we wake up a single - * thread. If we used cv_broadcast, we could - * wake up "too many" threads causing arc_size - * to significantly overflow arc_c; since - * arc_get_data_impl() doesn't check for overflow - * when it's woken up (it doesn't because it's - * possible for the ARC to be overflowing while - * full of un-evictable buffers, and the - * function should proceed in this case). - * - * If threads are left sleeping, due to not - * using cv_broadcast here, they will be woken - * up via cv_broadcast in arc_adjust_cb() just - * before arc_adjust_zthr sleeps. 
- */ - mutex_enter(&arc_adjust_lock); - if (!arc_is_overflowing()) - cv_signal(&arc_adjust_waiters_cv); - mutex_exit(&arc_adjust_lock); - } else { - ARCSTAT_BUMP(arcstat_mutex_miss); - } - } - - multilist_sublist_unlock(mls); - - return (bytes_evicted); -} - -/* - * Evict buffers from the given arc state, until we've removed the - * specified number of bytes. Move the removed buffers to the - * appropriate evict state. - * - * This function makes a "best effort". It skips over any buffers - * it can't get a hash_lock on, and so, may not catch all candidates. - * It may also return without evicting as much space as requested. - * - * If bytes is specified using the special value ARC_EVICT_ALL, this - * will evict all available (i.e. unlocked and evictable) buffers from - * the given arc state; which is used by arc_flush(). - */ -static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) -{ - uint64_t total_evicted = 0; - multilist_t *ml = state->arcs_list[type]; - int num_sublists; - arc_buf_hdr_t **markers; - - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - - num_sublists = multilist_get_num_sublists(ml); - - /* - * If we've tried to evict from each sublist, made some - * progress, but still have not hit the target number of bytes - * to evict, we want to keep trying. The markers allow us to - * pick up where we left off for each individual sublist, rather - * than starting from the tail each time. - */ - markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); - for (int i = 0; i < num_sublists; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_adjust_type() and - * arc_evict_state_impl(). 
- */ - markers[i]->b_spa = 0; - - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); - multilist_sublist_insert_tail(mls, markers[i]); - multilist_sublist_unlock(mls); - } - - /* - * While we haven't hit our target number of bytes to evict, or - * we're evicting all available buffers. - */ - while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { - int sublist_idx = multilist_get_random_index(ml); - uint64_t scan_evicted = 0; - - /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. - */ - if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size, - arc_dnode_limit) > 0) { - arc_prune_async((aggsum_upper_bound(&astat_dnode_size) - - arc_dnode_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - - /* - * Start eviction using a randomly selected sublist, - * this is to try and evenly balance eviction across all - * sublists. Always starting at the same sublist - * (e.g. index 0) would cause evictions to favor certain - * sublists over others. - */ - for (int i = 0; i < num_sublists; i++) { - uint64_t bytes_remaining; - uint64_t bytes_evicted; - - if (bytes == ARC_EVICT_ALL) - bytes_remaining = ARC_EVICT_ALL; - else if (total_evicted < bytes) - bytes_remaining = bytes - total_evicted; - else - break; - - bytes_evicted = arc_evict_state_impl(ml, sublist_idx, - markers[sublist_idx], spa, bytes_remaining); - - scan_evicted += bytes_evicted; - total_evicted += bytes_evicted; - - /* we've reached the end, wrap to the beginning */ - if (++sublist_idx >= num_sublists) - sublist_idx = 0; - } - - /* - * If we didn't evict anything during this scan, we have - * no reason to believe we'll evict more during another - * scan, so break the loop. - */ - if (scan_evicted == 0) { - /* This isn't possible, let's make that obvious */ - ASSERT3S(bytes, !=, 0); - - /* - * When bytes is ARC_EVICT_ALL, the only way to - * break the loop is when scan_evicted is zero. 
- * In that case, we actually have evicted enough, - * so we don't want to increment the kstat. - */ - if (bytes != ARC_EVICT_ALL) { - ASSERT3S(total_evicted, <, bytes); - ARCSTAT_BUMP(arcstat_evict_not_enough); - } - - break; - } - } - - for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); - multilist_sublist_remove(mls, markers[i]); - multilist_sublist_unlock(mls); - - kmem_cache_free(hdr_full_cache, markers[i]); - } - kmem_free(markers, sizeof (*markers) * num_sublists); - - return (total_evicted); -} - -/* - * Flush all "evictable" data of the given type from the arc state - * specified. This will not evict any "active" buffers (i.e. referenced). - * - * When 'retry' is set to B_FALSE, the function will make a single pass - * over the state and evict any buffers that it can. Since it doesn't - * continually retry the eviction, it might end up leaving some buffers - * in the ARC due to lock misses. - * - * When 'retry' is set to B_TRUE, the function will continually retry the - * eviction until *all* evictable buffers have been removed from the - * state. As a result, if concurrent insertions into the state are - * allowed (e.g. if the ARC isn't shutting down), this function might - * wind up in an infinite loop, continually trying to evict buffers. - */ -static uint64_t -arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, - boolean_t retry) -{ - uint64_t evicted = 0; - - while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { - evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); - - if (!retry) - break; - } - - return (evicted); -} - -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. 
- */ -static void -arc_prune_task(void *ptr) -{ - arc_prune_t *ap = (arc_prune_t *)ptr; - arc_prune_func_t *func = ap->p_pfunc; - - if (func != NULL) - func(ap->p_adjust, ap->p_private); - - zfs_refcount_remove(&ap->p_refcnt, func); -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -static void -arc_prune_async(int64_t adjust) -{ - arc_prune_t *ap; - - mutex_enter(&arc_prune_mtx); - for (ap = list_head(&arc_prune_list); ap != NULL; - ap = list_next(&arc_prune_list, ap)) { - - if (zfs_refcount_count(&ap->p_refcnt) >= 2) - continue; - - zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); - ap->p_adjust = adjust; - if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == TASKQID_INVALID) { - zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); - continue; - } - ARCSTAT_BUMP(arcstat_prune); - } - mutex_exit(&arc_prune_mtx); -} - -/* - * Evict the specified number of bytes from the state specified, - * restricting eviction to the spa and type given. This function - * prevents us from trying to evict more from a state's list than - * is "evictable", and to skip evicting altogether when passed a - * negative value for "bytes". In contrast, arc_evict_state() will - * evict everything it can, when passed a negative value for "bytes". 
- */ -static uint64_t -arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) -{ - int64_t delta; - - if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), - bytes); - return (arc_evict_state(state, spa, delta, type)); - } - - return (0); -} - -/* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. - */ -static uint64_t -arc_adjust_meta_balanced(uint64_t meta_used) -{ - int64_t delta, prune = 0, adjustmnt; - uint64_t total_evicted = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); - -restart: - /* - * This slightly differs than the way we evict from the mru in - * arc_adjust because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. 
- */ - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), - adjustmnt); - total_evicted += arc_adjust_impl(arc_mru, 0, delta, type); - adjustmnt -= delta; - } - - /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. - */ - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); - } - - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); - } - - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. 
- */ - if (meta_used > arc_meta_limit) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arc_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_adjust_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -static uint64_t -arc_adjust_meta(uint64_t meta_used) -{ - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_adjust_meta_only(meta_used)); - else - return (arc_adjust_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. 
The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. - */ -static arc_buf_contents_t -arc_adjust_type(arc_state_t *state) -{ - multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; - - /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). - */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). - */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; - } - - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); - - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } - - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); - - return (type); -} - 
-/* - * Evict buffers from the cache, such that arc_size is capped by arc_c. - */ -static uint64_t -arc_adjust(void) -{ - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_size); - uint64_t ameta = aggsum_value(&arc_meta_used); - - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_adjust_meta(ameta); - - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); - - /* - * If we're below arc_meta_min, always prefer to evict data. - * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. - */ - if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from metadata. 
- */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - } - - /* - * Re-sum ARC stats after the first round of evictions. - */ - asize = aggsum_value(&arc_size); - ameta = aggsum_value(&arc_meta_used); - - /* - * Adjust MFU size - * - * Now that we've tried to evict enough from the MRU to get its - * size back to arc_p, if we're still above the target cache - * size, we evict the rest from the MFU. - */ - target = asize - arc_c; - - if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - } - - /* - * Adjust ghost lists - * - * In addition to the above, the ARC also defines target values - * for the ghost lists. The sum of the mru list and mru ghost - * list should never exceed the target size of the cache, and - * the sum of the mru list, mfu list, mru ghost list, and mfu - * ghost list should never exceed twice the target size of the - * cache. The following logic enforces these limits on the ghost - * caches, and evicts from them as needed. 
- */ - target = zfs_refcount_count(&arc_mru->arcs_size) + - zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - - bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); - - /* - * We assume the sum of the mru list and mfu list is less than - * or equal to arc_c (we enforced this above), which means we - * can use the simpler of the two equations below: - * - * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c - * mru ghost + mfu ghost <= arc_c - */ - target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + - zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; - - bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -void -arc_flush(spa_t *spa, boolean_t retry) -{ - uint64_t guid = 0; - - /* - * If retry is B_TRUE, a spa must not be specified since we have - * no good way to determine if all of a spa's buffers have been - * evicted from an arc state. 
- */ - ASSERT(!retry || spa == 0); - - if (spa != NULL) - guid = spa_load_guid(spa); - - (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); -} - -static void -arc_reduce_target_size(int64_t to_free) -{ - uint64_t asize = aggsum_value(&arc_size); - if (arc_c > arc_c_min) { - DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, - arc_c_min, uint64_t, arc_p, uint64_t, to_free); - if (arc_c > arc_c_min + to_free) - atomic_add_64(&arc_c, -to_free); - else - arc_c = arc_c_min; - - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (asize < arc_c) - arc_c = MAX(asize, arc_c_min); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - - DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, - arc_p); - - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } - - if (asize > arc_c) { - DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, - uint64_t, arc_c); - /* See comment in arc_adjust_cb_check() on why lock+flag */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - } -} - -typedef enum free_memory_reason_t { - FMR_UNKNOWN, - FMR_NEEDFREE, - FMR_LOTSFREE, - FMR_SWAPFS_MINFREE, - FMR_PAGES_PP_MAXIMUM, - FMR_HEAP_ARENA, - FMR_ZIO_ARENA, -} free_memory_reason_t; - -int64_t last_free_memory; -free_memory_reason_t last_free_reason; - -/* - * Additional reserve of pages for pp_reserve. - */ -int64_t arc_pages_pp_reserve = 64; - -/* - * Additional reserve of pages for swapfs. 
- */ -int64_t arc_swapfs_reserve = 64; - -/* - * Return the amount of memory that can be consumed before reclaim will be - * needed. Positive if there is sufficient free memory, negative indicates - * the amount of memory that needs to be freed up. - */ -static int64_t -arc_available_memory(void) -{ - int64_t lowest = INT64_MAX; - int64_t n; - free_memory_reason_t r = FMR_UNKNOWN; - -#ifdef _KERNEL -#ifdef __FreeBSD__ - /* - * Cooperate with pagedaemon when it's time for it to scan - * and reclaim some pages. - */ - n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - -#else - if (needfree > 0) { - n = PAGESIZE * (-needfree); - if (n < lowest) { - lowest = n; - r = FMR_NEEDFREE; - } - } - - /* - * check that we're out of range of the pageout scanner. It starts to - * schedule paging if freemem is less than lotsfree and needfree. - * lotsfree is the high-water mark for pageout, and needfree is the - * number of needed free pages. We add extra pages here to make sure - * the scanner doesn't start up while we're freeing memory. - */ - n = PAGESIZE * (freemem - lotsfree - needfree - desfree); - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - - /* - * check to make sure that swapfs has enough space so that anon - * reservations can still succeed. anon_resvmem() checks that the - * availrmem is greater than swapfs_minfree, and the number of reserved - * swap pages. We also add a bit of extra here just to prevent - * circumstances from getting really dire. - */ - n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - - desfree - arc_swapfs_reserve); - if (n < lowest) { - lowest = n; - r = FMR_SWAPFS_MINFREE; - } - - - /* - * Check that we have enough availrmem that memory locking (e.g., via - * mlock(3C) or memcntl(2)) can still succeed. 
(pages_pp_maximum - * stores the number of pages that cannot be locked; when availrmem - * drops below pages_pp_maximum, page locking mechanisms such as - * page_pp_lock() will fail.) - */ - n = PAGESIZE * (availrmem - pages_pp_maximum - - arc_pages_pp_reserve); - if (n < lowest) { - lowest = n; - r = FMR_PAGES_PP_MAXIMUM; - } - -#endif /* __FreeBSD__ */ -#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) - /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) - */ - n = uma_avail() - (long)(uma_limit() / 4); - if (n < lowest) { - lowest = n; - r = FMR_HEAP_ARENA; - } -#endif - - /* - * If zio data pages are being allocated out of a separate heap segment, - * then enforce that the size of available vmem for this arena remains - * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. - * - * Note that reducing the arc_zio_arena_free_shift keeps more virtual - * memory (in the zio_arena) free, which can avoid memory - * fragmentation issues. 
- */ - if (zio_arena != NULL) { - n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - - (vmem_size(zio_arena, VMEM_ALLOC) >> - arc_zio_arena_free_shift); - if (n < lowest) { - lowest = n; - r = FMR_ZIO_ARENA; - } - } - -#else /* _KERNEL */ - /* Every 100 calls, free a small amount */ - if (spa_get_random(100) == 0) - lowest = -1024; -#endif /* _KERNEL */ - - last_free_memory = lowest; - last_free_reason = r; - DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); - return (lowest); -} - - -/* - * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of B_TRUE indicates that the system - * is under memory pressure and that the arc should adjust accordingly. - */ -static boolean_t -arc_reclaim_needed(void) -{ - return (arc_available_memory() < 0); -} - -extern kmem_cache_t *zio_buf_cache[]; -extern kmem_cache_t *zio_data_buf_cache[]; -extern kmem_cache_t *range_seg_cache; -extern kmem_cache_t *abd_chunk_cache; - -static __noinline void -arc_kmem_reap_soon(void) -{ - size_t i; - kmem_cache_t *prev_cache = NULL; - kmem_cache_t *prev_data_cache = NULL; - - DTRACE_PROBE(arc__kmem_reap_start); -#ifdef _KERNEL - if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { - /* - * We are exceeding our meta-data cache limit. - * Purge some DNLC entries to release holds on meta-data. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - } -#if defined(__i386) - /* - * Reclaim unused memory from all kmem caches. 
- */ - kmem_reap(); -#endif -#endif - - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { - if (zio_buf_cache[i] != prev_cache) { - prev_cache = zio_buf_cache[i]; - kmem_cache_reap_soon(zio_buf_cache[i]); - } - if (zio_data_buf_cache[i] != prev_data_cache) { - prev_data_cache = zio_data_buf_cache[i]; - kmem_cache_reap_soon(zio_data_buf_cache[i]); - } - } - kmem_cache_reap_soon(abd_chunk_cache); - kmem_cache_reap_soon(buf_cache); - kmem_cache_reap_soon(hdr_full_cache); - kmem_cache_reap_soon(hdr_l2only_cache); - kmem_cache_reap_soon(range_seg_cache); - -#ifdef illumos - if (zio_arena != NULL) { - /* - * Ask the vmem arena to reclaim unused memory from its - * quantum caches. - */ - vmem_qcache_reap(zio_arena); - } -#endif - DTRACE_PROBE(arc__kmem_reap_end); -} - -/* ARGSUSED */ -static boolean_t -arc_adjust_cb_check(void *arg, zthr_t *zthr) -{ - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date(the arc_adjust_zthr has a maximum sleep - * time of 1 second); but that should suffice. The - * arc_state_t structures can be queried directly if more - * accurate information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - /* - * We have to rely on arc_get_data_impl() to tell us when to adjust, - * rather than checking if we are overflowing here, so that we are - * sure to not leave arc_get_data_impl() waiting on - * arc_adjust_waiters_cv. If we have become "not overflowing" since - * arc_get_data_impl() checked, we need to wake it up. We could - * broadcast the CV here, but arc_get_data_impl() may have not yet - * gone to sleep. 
We would need to use a mutex to ensure that this - * function doesn't broadcast until arc_get_data_impl() has gone to - * sleep (e.g. the arc_adjust_lock). However, the lock ordering of - * such a lock would necessarily be incorrect with respect to the - * zthr_lock, which is held before this function is called, and is - * held by arc_get_data_impl() when it calls zthr_wakeup(). - */ - return (arc_adjust_needed); -} - -/* - * Keep arc_size under arc_c by running arc_adjust which evicts data - * from the ARC. */ -/* ARGSUSED */ -static void -arc_adjust_cb(void *arg, zthr_t *zthr) -{ - uint64_t evicted = 0; - - /* Evict from cache */ - evicted = arc_adjust(); - - /* - * If evicted is zero, we couldn't evict anything - * via arc_adjust(). This could be due to hash lock - * collisions, but more likely due to the majority of - * arc buffers being unevictable. Therefore, even if - * arc_size is above arc_c, another pass is unlikely to - * be helpful and could potentially cause us to enter an - * infinite loop. Additionally, zthr_iscancelled() is - * checked here so that if the arc is shutting down, the - * broadcast will wake any remaining arc adjust waiters. - */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && - evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; - if (!arc_adjust_needed) { - /* - * We're either no longer overflowing, or we - * can't evict anything more, so we should wake - * up any waiters. - */ - cv_broadcast(&arc_adjust_waiters_cv); - } - mutex_exit(&arc_adjust_lock); -} - -/* ARGSUSED */ -static boolean_t -arc_reap_cb_check(void *arg, zthr_t *zthr) -{ - int64_t free_memory = arc_available_memory(); - - /* - * If a kmem reap is already active, don't schedule more. 
We must - * check for this because kmem_cache_reap_soon() won't actually - * block on the cache being reaped (this is to prevent callers from - * becoming implicitly blocked by a system-wide kmem reap -- which, - * on a system with many, many full magazines, can take minutes). - */ - if (!kmem_cache_reap_active() && - free_memory < 0) { - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; - /* - * Wait at least zfs_grow_retry (default 60) seconds - * before considering growing. - */ - arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); - return (B_TRUE); - } else if (free_memory < arc_c >> arc_no_grow_shift) { - arc_no_grow = B_TRUE; - } else if (gethrtime() >= arc_growtime) { - arc_no_grow = B_FALSE; - } - - return (B_FALSE); -} - -/* - * Keep enough free memory in the system by reaping the ARC's kmem - * caches. To cause more slabs to be reapable, we may reduce the - * target size of the cache (arc_c), causing the arc_adjust_cb() - * to free more buffers. - */ -/* ARGSUSED */ -static void -arc_reap_cb(void *arg, zthr_t *zthr) -{ - int64_t free_memory; - - /* - * Kick off asynchronous kmem_reap()'s of all our caches. - */ - arc_kmem_reap_soon(); - - /* - * Wait at least arc_kmem_cache_reap_retry_ms between - * arc_kmem_reap_soon() calls. Without this check it is possible to - * end up in a situation where we spend lots of time reaping - * caches, while we're near arc_c_min. Waiting here also gives the - * subsequent free memory check a chance of finding that the - * asynchronous reap has already freed enough memory, and we don't - * need to call arc_reduce_target_size(). - */ - delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); - - /* - * Reduce the target size as needed to maintain the amount of free - * memory in the system at a fraction of the arc_size (1/128th by - * default). If oversubscribed (free_memory < 0) then reduce the - * target arc_size by the deficit amount plus the fractional - * amount. 
If free memory is positive but less then the fractional - * amount, reduce by what is needed to hit the fractional amount. - */ - free_memory = arc_available_memory(); - - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { -#ifdef _KERNEL -#ifdef illumos - to_free = MAX(to_free, ptob(needfree)); -#endif -#endif - arc_reduce_target_size(to_free); - } -} - -static u_int arc_dnlc_evicts_arg; -extern struct vfsops zfs_vfsops; - -static void -arc_dnlc_evicts_thread(void *dummy __unused) -{ - callb_cpr_t cpr; - u_int percent; - - CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_dnlc_evicts_lock); - while (!arc_dnlc_evicts_thread_exit) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); - CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); - if (arc_dnlc_evicts_arg != 0) { - percent = arc_dnlc_evicts_arg; - mutex_exit(&arc_dnlc_evicts_lock); -#ifdef _KERNEL - vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); -#endif - mutex_enter(&arc_dnlc_evicts_lock); - /* - * Clear our token only after vnlru_free() - * pass is done, to avoid false queueing of - * the requests. - */ - arc_dnlc_evicts_arg = 0; - } - } - arc_dnlc_evicts_thread_exit = FALSE; - cv_broadcast(&arc_dnlc_evicts_cv); - CALLB_CPR_EXIT(&cpr); - thread_exit(); -} - -void -dnlc_reduce_cache(void *arg) -{ - u_int percent; - - percent = (u_int)(uintptr_t)arg; - mutex_enter(&arc_dnlc_evicts_lock); - if (arc_dnlc_evicts_arg == 0) { - arc_dnlc_evicts_arg = percent; - cv_broadcast(&arc_dnlc_evicts_cv); - } - mutex_exit(&arc_dnlc_evicts_lock); -} - -/* - * Adapt arc info given the number of bytes we are trying to add and - * the state that we are comming from. This function is only called - * when we are adding new content to the cache. 
- */ -static void -arc_adapt(int bytes, arc_state_t *state) -{ - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - if (state == arc_l2c_only) - return; - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - - /* - * Wake reap thread if we do not have any available memory - */ - if (arc_reclaim_needed()) { - zthr_wakeup(arc_reap_zthr); - return; - } - - if (arc_no_grow) - return; - - if (arc_c >= arc_c_max) - return; - - /* - * If we're within (2 * maxblocksize) bytes of the target - * cache size, increment the target cache size - */ - if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > - 0) { - DTRACE_PROBE1(arc__inc_adapt, int, bytes); - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) - arc_c = arc_c_max; - else if (state == arc_anon) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; - } - ASSERT((int64_t)arc_p >= 0); -} - -/* - * Check if arc_size has grown past our upper threshold, determined by - * zfs_arc_overflow_shift. 
- */ -static boolean_t -arc_is_overflowing(void) -{ - /* Always allow at least one block of overflow */ - int64_t overflow = MAX(SPA_MAXBLOCKSIZE, - arc_c >> zfs_arc_overflow_shift); - - /* - * We just compare the lower bound here for performance reasons. Our - * primary goals are to make sure that the arc never grows without - * bound, and that it can reach its maximum size. This check - * accomplishes both goals. The maximum amount we could run over by is - * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block - * in the ARC. In practice, that's in the tens of MB, which is low - * enough to be safe. - */ - return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow); -} - -static abd_t * -arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_get_data_impl(hdr, size, tag, do_adapt); - if (type == ARC_BUFC_METADATA) { - return (abd_alloc(size, B_TRUE)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (abd_alloc(size, B_FALSE)); - } -} - -static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_get_data_impl(hdr, size, tag, B_TRUE); - if (type == ARC_BUFC_METADATA) { - return (zio_buf_alloc(size)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (zio_data_buf_alloc(size)); - } -} - -/* - * Allocate a block and return it to the caller. If we are hitting the - * hard limit for the cache size, we must sleep, waiting for the eviction - * thread to catch up. If we're past the target size but below the hard - * limit, we'll only signal the reclaim thread and continue on. 
- */ -static void -arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (do_adapt) - arc_adapt(size, state); - - /* - * If arc_size is currently overflowing, and has grown past our - * upper limit, we must be adding data faster than the evict - * thread can evict. Thus, to ensure we don't compound the - * problem by adding more data and forcing arc_size to grow even - * further past it's target size, we halt and wait for the - * eviction thread to catch up. - * - * It's also possible that the reclaim thread is unable to evict - * enough buffers to get arc_size below the overflow limit (e.g. - * due to buffers being un-evictable, or hash lock collisions). - * In this case, we want to proceed regardless if we're - * overflowing; thus we don't use a while loop here. - */ - if (arc_is_overflowing()) { - mutex_enter(&arc_adjust_lock); - - /* - * Now that we've acquired the lock, we may no longer be - * over the overflow limit, lets check. - * - * We're ignoring the case of spurious wake ups. If that - * were to happen, it'd let this thread consume an ARC - * buffer before it should have (i.e. before we're under - * the overflow limit and were signalled by the reclaim - * thread). As long as that is a rare occurrence, it - * shouldn't cause any harm. - */ - if (arc_is_overflowing()) { - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - (void) cv_wait(&arc_adjust_waiters_cv, - &arc_adjust_lock); - } - mutex_exit(&arc_adjust_lock); - } - - VERIFY3U(hdr->b_type, ==, type); - if (type == ARC_BUFC_METADATA) { - arc_space_consume(size, ARC_SPACE_META); - } else { - arc_space_consume(size, ARC_SPACE_DATA); - } - - /* - * Update the state size. Note that ghost states have a - * "ghost size" and so don't need to be updated. 
- */ - if (!GHOST_STATE(state)) { - - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); - - /* - * If this is reached via arc_read, the link is - * protected by the hash lock. If reached via - * arc_buf_alloc, the header should not be accessed by - * any other thread. And, if reached via arc_read_done, - * the hash lock will protect it if it's found in the - * hash table; otherwise no other thread should be - * trying to [add|remove]_reference it. - */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) zfs_refcount_add_many(&state->arcs_esize[type], - size, tag); - } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) - arc_p = MIN(arc_c, arc_p + size); - } - ARCSTAT_BUMP(arcstat_allocated); -} - -static void -arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) -{ - arc_free_data_impl(hdr, size, tag); - abd_free(abd); -} - -static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_free_data_impl(hdr, size, tag); - if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf, size); - } else { - ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf, size); - } -} - -/* - * Free the arc data buffer. 
- */ -static void -arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - size, tag); - } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); - - VERIFY3U(hdr->b_type, ==, type); - if (type == ARC_BUFC_METADATA) { - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_space_return(size, ARC_SPACE_DATA); - } -} - -/* - * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. - */ -static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) -{ - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (hdr->b_l1hdr.b_state == arc_anon) { - /* - * This buffer is not in the cache, and does not - * appear in our "ghost" list. Add the new buffer - * to the MRU state. - */ - - ASSERT0(hdr->b_l1hdr.b_arc_access); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mru, hdr, hash_lock); - - } else if (hdr->b_l1hdr.b_state == arc_mru) { - now = ddi_get_lbolt(); - - /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). - * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). 
- */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - /* link protected by hash lock */ - ASSERT(multilist_link_active( - &hdr->b_l1hdr.b_arc_node)); - } else { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - ARCSTAT_BUMP(arcstat_mru_hits); - } - hdr->b_l1hdr.b_arc_access = now; - return; - } - - /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. - */ - if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. - */ - hdr->b_l1hdr.b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); - } - atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); - ARCSTAT_BUMP(arcstat_mru_hits); - } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { - arc_state_t *new_state; - /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - } - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - } else { - new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, hdr, hash_lock); - - atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); - ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (hdr->b_l1hdr.b_state == arc_mfu) { - /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. 
- * If it was a prefetch, we will explicitly move it to - * the head of the list now. - */ - - atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); - ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; - /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - /* - * This is a prefetch access... - * move this block back to the MRU state. - */ - new_state = arc_mru; - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(new_state, hdr, hash_lock); - - atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); - ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { - /* - * This buffer is on the 2nd Level ARC. - */ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); - } else { - ASSERT(!"invalid arc state"); - } -} - -/* - * This routine is called by dbuf_hold() to update the arc_access() state - * which otherwise would be skipped for entries in the dbuf cache. - */ -void -arc_buf_access(arc_buf_t *buf) -{ - mutex_enter(&buf->b_evict_lock); - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * Avoid taking the hash_lock when possible as an optimization. - * The header must be checked again under the hash_lock in order - * to handle the case where it is concurrently being released. 
- */ - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(&buf->b_evict_lock); - ARCSTAT_BUMP(arcstat_access_skip); - return; - } - - kmutex_t *hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(hash_lock); - mutex_exit(&buf->b_evict_lock); - ARCSTAT_BUMP(arcstat_access_skip); - return; - } - - mutex_exit(&buf->b_evict_lock); - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); -} - -/* a generic arc_read_done_func_t which you can use */ -/* ARGSUSED */ -void -arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *arg) -{ - if (buf == NULL) - return; - - bcopy(buf->b_data, arg, arc_buf_size(buf)); - arc_buf_destroy(buf, arg); -} - -/* a generic arc_read_done_func_t */ -/* ARGSUSED */ -void -arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *arg) -{ - arc_buf_t **bufp = arg; - if (buf == NULL) { - ASSERT(zio == NULL || zio->io_error != 0); - *bufp = NULL; - } else { - ASSERT(zio == NULL || zio->io_error == 0); - *bufp = buf; - ASSERT(buf->b_data != NULL); - } -} - -static void -arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) -{ - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { - ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else { - if (HDR_COMPRESSION_ENABLED(hdr)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, - BP_GET_COMPRESS(bp)); - } - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); - ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); - } -} - -static void -arc_read_done(zio_t *zio) -{ - arc_buf_hdr_t *hdr = zio->io_private; - kmutex_t *hash_lock = NULL; - 
arc_callback_t *callback_list; - arc_callback_t *acb; - boolean_t freeable = B_FALSE; - boolean_t no_zio_error = (zio->io_error == 0); - - /* - * The hdr was inserted into hash-table and removed from lists - * prior to starting I/O. We should find this header, since - * it's in the hash table, and it should be legit since it's - * not possible to evict it during the I/O. The only possible - * reason for it not to be found is if we were freed during the - * read. - */ - if (HDR_IN_HASH_TABLE(hdr)) { - ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); - ASSERT3U(hdr->b_dva.dva_word[0], ==, - BP_IDENTITY(zio->io_bp)->dva_word[0]); - ASSERT3U(hdr->b_dva.dva_word[1], ==, - BP_IDENTITY(zio->io_bp)->dva_word[1]); - - arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, - &hash_lock); - - ASSERT((found == hdr && - DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || - (found == hdr && HDR_L2_READING(hdr))); - ASSERT3P(hash_lock, !=, NULL); - } - - if (no_zio_error) { - /* byteswap if necessary */ - if (BP_SHOULD_BYTESWAP(zio->io_bp)) { - if (BP_GET_LEVEL(zio->io_bp) > 0) { - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; - } else { - hdr->b_l1hdr.b_byteswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - } - } else { - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - } - } - - arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); - if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - - callback_list = hdr->b_l1hdr.b_acb; - ASSERT3P(callback_list, !=, NULL); - - if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } - - /* - * If a read request has a callback (i.e. 
acb_done is not NULL), then we - * make a buf containing the data according to the parameters which were - * passed in. The implementation of arc_buf_alloc_impl() ensures that we - * aren't needlessly decompressing the data multiple times. - */ - int callback_cnt = 0; - for (acb = callback_list; acb != NULL; acb = acb->acb_next) { - if (!acb->acb_done) - continue; - - callback_cnt++; - - if (no_zio_error) { - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, zio->io_error == 0, - &acb->acb_buf); - if (error != 0) { - /* - * Decompression failed. Set io_error - * so that when we call acb_done (below), - * we will indicate that the read failed. - * Note that in the unusual case where one - * callback is compressed and another - * uncompressed, we will mark all of them - * as failed, even though the uncompressed - * one can't actually fail. In this case, - * the hdr will not be anonymous, because - * if there are multiple callbacks, it's - * because multiple threads found the same - * arc buf in the hash table. - */ - zio->io_error = error; - } - } - } - /* - * If there are multiple callbacks, we must have the hash lock, - * because the only way for multiple threads to find this hdr is - * in the hash table. This ensures that if there are multiple - * callbacks, the hdr is not anonymous. If it were anonymous, - * we couldn't use arc_buf_destroy() in the error case below. 
- */ - ASSERT(callback_cnt < 2 || hash_lock != NULL); - - hdr->b_l1hdr.b_acb = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) { - ASSERT(HDR_PREFETCH(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - } - - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || - callback_list != NULL); - - if (no_zio_error) { - arc_hdr_verify(hdr, zio->io_bp); - } else { - arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); - if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - if (HDR_IN_HASH_TABLE(hdr)) - buf_hash_remove(hdr); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } - - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_l1hdr.b_cv); - - if (hash_lock != NULL) { - mutex_exit(hash_lock); - } else { - /* - * This block was freed while we waited for the read to - * complete. It has been removed from the hash table and - * moved to the anonymous state (so that it won't show up - * in the cache). - */ - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } - - /* execute each callback and free its structure */ - while ((acb = callback_list) != NULL) { - if (acb->acb_done != NULL) { - if (zio->io_error != 0 && acb->acb_buf != NULL) { - /* - * If arc_buf_alloc_impl() fails during - * decompression, the buf will still be - * allocated, and needs to be freed here. 
- */ - arc_buf_destroy(acb->acb_buf, acb->acb_private); - acb->acb_buf = NULL; - } - acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, - acb->acb_buf, acb->acb_private); - } - - if (acb->acb_zio_dummy != NULL) { - acb->acb_zio_dummy->io_error = zio->io_error; - zio_nowait(acb->acb_zio_dummy); - } - - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); - } - - if (freeable) - arc_hdr_destroy(hdr); -} - -/* - * "Read" the block at the specified DVA (in bp) via the - * cache. If the block is found in the cache, invoke the provided - * callback immediately and return. Note that the `zio' parameter - * in the callback will be NULL in this case, since no IO was - * required. If the block is not in the cache pass the read request - * on to the spa with a substitute callback function, so that the - * requested block will be added to the cache. - * - * If a read request arrives for a block that has a read in-progress, - * either wait for the in-progress read to complete (and return the - * results); or, if this is a read with a "done" func, add a record - * to the read to invoke the "done" func when the read completes, - * and return; or just return. - * - * arc_read_done() will invoke all the requested "done" functions - * for readers of this block. - */ -int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, - arc_flags_t *arc_flags, const zbookmark_phys_t *zb) -{ - arc_buf_hdr_t *hdr = NULL; - kmutex_t *hash_lock = NULL; - zio_t *rzio; - uint64_t guid = spa_load_guid(spa); - boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; - int rc = 0; - - ASSERT(!BP_IS_EMBEDDED(bp) || - BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); - -top: - if (!BP_IS_EMBEDDED(bp)) { - /* - * Embedded BP's have no DVA and require no I/O to "read". - * Create an anonymous arc buf to back it. 
- */ - hdr = buf_hash_find(guid, bp, &hash_lock); - } - - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { - arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; - - if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - - ASSERT3P(head_zio, !=, NULL); - if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && - priority == ZIO_PRIORITY_SYNC_READ) { - /* - * This is a sync read that needs to wait for - * an in-flight async read. Request that the - * zio have its priority upgraded. - */ - zio_change_priority(head_zio, priority); - DTRACE_PROBE1(arc__async__upgrade__sync, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_async_upgrade_sync); - } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; - - acb = kmem_zalloc(sizeof (arc_callback_t), - KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_compressed = compressed_read; - if (pio != NULL) - acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); - acb->acb_zio_head = head_zio; - acb->acb_next = hdr->b_l1hdr.b_acb; - hdr->b_l1hdr.b_acb = acb; - mutex_exit(hash_lock); - return (0); - } - mutex_exit(hash_lock); - return (0); - } - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - if (done) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. 
- */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } - - ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); - /* Get a buf with the desired data in it. */ - rc = arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf); - if (rc != 0) { - arc_buf_destroy(buf, private); - buf = NULL; - } - ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || - rc == 0 || rc != ENOENT); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); - - if (done) - done(NULL, zb, bp, buf, private); - } else { - uint64_t lsize = BP_GET_LSIZE(bp); - uint64_t psize = BP_GET_PSIZE(bp); - arc_callback_t *acb; - vdev_t *vd = NULL; - uint64_t addr = 0; - boolean_t devw = B_FALSE; - uint64_t size; - - if (hdr == NULL) { - /* this block is not in the cache */ - arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_GET_COMPRESS(bp), type); - - if (!BP_IS_EMBEDDED(bp)) { - hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - exists = buf_hash_insert(hdr, &hash_lock); - } - if (exists != NULL) { - /* somebody beat us to the hash insert */ - 
mutex_exit(hash_lock); - buf_discard_identity(hdr); - arc_hdr_destroy(hdr); - goto top; /* restart the IO request */ - } - } else { - /* - * This block is in the ghost cache. If it was L2-only - * (and thus didn't have an L1 hdr), we realloc the - * header to add an L1 hdr. - */ - if (!HDR_HAS_L1HDR(hdr)) { - hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, - hdr_full_cache); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - /* - * This is a delicate dance that we play here. - * This hdr is in the ghost list so we access it - * to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); - arc_hdr_alloc_pabd(hdr, B_FALSE); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - size = arc_hdr_size(hdr); - - /* - * If compression is enabled on the hdr, then will do - * RAW I/O and will store the compressed data in the hdr's - * data block. Otherwise, the hdr's data block will contain - * the uncompressed data. 
- */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - zio_flags |= ZIO_FLAG_RAW; - } - - if (*arc_flags & ARC_FLAG_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (BP_GET_LEVEL(bp) > 0) - arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); - ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); - - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_compressed = compressed_read; - - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - if (HDR_HAS_L2HDR(hdr) && - (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr.b_dev->l2ad_writing; - addr = hdr->b_l2hdr.b_daddr; - /* - * Lock out L2ARC device removal. - */ - if (vdev_is_dead(vd) || - !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) - vd = NULL; - } - - /* - * We count both async reads and scrub IOs as asynchronous so - * that both can be upgraded in the event of a cache hit while - * the read IO is still in-flight. - */ - if (priority == ZIO_PRIORITY_ASYNC_READ || - priority == ZIO_PRIORITY_SCRUB) - arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - else - arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - - /* - * At this point, we have a level 1 cache miss. Try again in - * L2ARC if possible. 
- */ - ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); - - DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, lsize, zbookmark_phys_t *, zb); - ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, misses); -#ifdef _KERNEL -#ifdef RACCT - if (racct_enable) { - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_READBPS, size); - racct_add_force(curproc, RACCT_READIOPS, 1); - PROC_UNLOCK(curproc); - } -#endif /* RACCT */ - curthread->td_ru.ru_inblock++; -#endif - - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { - /* - * Read from the L2ARC if the following are true: - * 1. The L2ARC vdev was previously cached. - * 2. This buffer still has L2ARC metadata. - * 3. This buffer isn't currently writing to the L2ARC. - * 4. The L2ARC entry wasn't evicted, which may - * also have invalidated the vdev. - * 5. This isn't prefetch and l2arc_noprefetch is set. - */ - if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { - l2arc_read_callback_t *cb; - abd_t *abd; - uint64_t asize; - - DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr.b_hits); - - cb = kmem_zalloc(sizeof (l2arc_read_callback_t), - KM_SLEEP); - cb->l2rcb_hdr = hdr; - cb->l2rcb_bp = *bp; - cb->l2rcb_zb = *zb; - cb->l2rcb_flags = zio_flags; - - asize = vdev_psize_to_asize(vd, size); - if (asize != size) { - abd = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - cb->l2rcb_abd = abd; - } else { - abd = hdr->b_l1hdr.b_pabd; - } - - ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + asize <= vd->vdev_psize - - VDEV_LABEL_END_SIZE); - - /* - * l2arc read. The SCL_L2ARC lock will be - * released by l2arc_read_done(). - * Issue a null zio if the underlying buffer - * was squashed to zero size by compression. 
- */ - ASSERT3U(HDR_GET_COMPRESS(hdr), !=, - ZIO_COMPRESS_EMPTY); - rzio = zio_read_phys(pio, vd, addr, - asize, abd, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - acb->acb_zio_head = rzio; - - if (hash_lock != NULL) - mutex_exit(hash_lock); - - DTRACE_PROBE2(l2arc__read, vdev_t *, vd, - zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, size); - - if (*arc_flags & ARC_FLAG_NOWAIT) { - zio_nowait(rzio); - return (0); - } - - ASSERT(*arc_flags & ARC_FLAG_WAIT); - if (zio_wait(rzio) == 0) - return (0); - - /* l2arc read error; goto zio_read() */ - if (hash_lock != NULL) - mutex_enter(hash_lock); - } else { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); - if (HDR_L2_WRITING(hdr)) - ARCSTAT_BUMP(arcstat_l2_rw_clash); - spa_config_exit(spa, SCL_L2ARC, vd); - } - } else { - if (vd != NULL) - spa_config_exit(spa, SCL_L2ARC, vd); - if (l2arc_ndev != 0) { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); - } - } - - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, - arc_read_done, hdr, priority, zio_flags, zb); - acb->acb_zio_head = rzio; - - if (hash_lock != NULL) - mutex_exit(hash_lock); - - if (*arc_flags & ARC_FLAG_WAIT) - return (zio_wait(rzio)); - - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - zio_nowait(rzio); - } - return (0); -} - -arc_prune_t * -arc_add_prune_callback(arc_prune_func_t *func, void *private) -{ - arc_prune_t *p; - - p = kmem_alloc(sizeof (*p), KM_SLEEP); - p->p_pfunc = func; - p->p_private = private; - list_link_init(&p->p_node); - zfs_refcount_create(&p->p_refcnt); - - mutex_enter(&arc_prune_mtx); - zfs_refcount_add(&p->p_refcnt, &arc_prune_list); - list_insert_head(&arc_prune_list, p); - mutex_exit(&arc_prune_mtx); - - return (p); -} - -void -arc_remove_prune_callback(arc_prune_t *p) -{ - boolean_t wait = B_FALSE; - 
mutex_enter(&arc_prune_mtx); - list_remove(&arc_prune_list, p); - if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) - wait = B_TRUE; - mutex_exit(&arc_prune_mtx); - - /* wait for arc_prune_task to finish */ - if (wait) - taskq_wait(arc_prune_taskq); - ASSERT0(zfs_refcount_count(&p->p_refcnt)); - zfs_refcount_destroy(&p->p_refcnt); - kmem_free(p, sizeof (*p)); -} - -/* - * Notify the arc that a block was freed, and thus will never be used again. - */ -void -arc_freed(spa_t *spa, const blkptr_t *bp) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - uint64_t guid = spa_load_guid(spa); - - ASSERT(!BP_IS_EMBEDDED(bp)); - - hdr = buf_hash_find(guid, bp, &hash_lock); - if (hdr == NULL) - return; - - /* - * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is - * part of a dedup-ed, dmu_synced write. The dmu_sync() function would - * have written the new block to its final resting place on disk but - * without the dedup flag set. This would have left the hdr in the MRU - * state and discoverable. When the txg finally syncs it detects that - * the block was overridden in open context and issues an override I/O. - * Since this is a dedup block, the override I/O will determine if the - * block is already in the DDT. If so, then it will replace the io_bp - * with the bp from the DDT and allow the I/O to finish. When the I/O - * reaches the done callback, dbuf_write_override_done, it will - * check to see if the io_bp and io_bp_override are identical. - * If they are not, then it indicates that the bp was replaced with - * the bp in the DDT and the override bp is freed. This allows - * us to arrive here with a reference on a block that is being - * freed. 
So if we have an I/O in progress, or a reference to - * this hdr, then we don't destroy the hdr. - */ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - mutex_exit(hash_lock); - } else { - mutex_exit(hash_lock); - } - -} - -/* - * Release this buffer from the cache, making it an anonymous buffer. This - * must be done after a read and prior to modifying the buffer contents. - * If the buffer has more than one reference, we must make - * a new hdr for the buffer. - */ -void -arc_release(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * It would be nice to assert that if it's DMU metadata (level > - * 0 || it's the dnode file), then it must be syncing context. - * But we don't know that information at this level. - */ - - mutex_enter(&buf->b_evict_lock); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We don't grab the hash lock prior to this check, because if - * the buffer's header is in the arc_anon state, it won't be - * linked into the hash table. - */ - if (hdr->b_l1hdr.b_state == arc_anon) { - mutex_exit(&buf->b_evict_lock); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(HDR_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - - hdr->b_l1hdr.b_arc_access = 0; - - /* - * If the buf is being overridden then it may already - * have a hdr that is not empty. - */ - buf_discard_identity(hdr); - arc_buf_thaw(buf); - - return; - } - - kmutex_t *hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - /* - * This assignment is only valid as long as the hash_lock is - * held, we must be careful not to reference state or the - * b_state field after dropping the lock. 
- */ - arc_state_t *state = hdr->b_l1hdr.b_state; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(state, !=, arc_anon); - - /* this buffer is not on any list */ - ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); - - if (HDR_HAS_L2HDR(hdr)) { - mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); - - /* - * We have to recheck this conditional again now that - * we're holding the l2ad_mtx to prevent a race with - * another thread which might be concurrently calling - * l2arc_evict(). In that case, l2arc_evict() might have - * destroyed the header's L2 portion as we were waiting - * to acquire the l2ad_mtx. - */ - if (HDR_HAS_L2HDR(hdr)) { - l2arc_trim(hdr); - arc_hdr_l2hdr_destroy(hdr); - } - - mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); - } - - /* - * Do we have more than one buf? - */ - if (hdr->b_l1hdr.b_bufcnt > 1) { - arc_buf_hdr_t *nhdr; - uint64_t spa = hdr->b_spa; - uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); - enum zio_compress compress = HDR_GET_COMPRESS(hdr); - arc_buf_contents_t type = arc_buf_type(hdr); - VERIFY3U(hdr->b_type, ==, type); - - ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); - - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { - ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(buf)); - } - - /* - * Pull the data off of this hdr and attach it to - * a new anonymous hdr. Also find the last buffer - * in the hdr's buffer list. - */ - arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); - ASSERT3P(lastbuf, !=, NULL); - - /* - * If the current arc_buf_t and the hdr are sharing their data - * buffer, then we must stop sharing that block. - */ - if (arc_buf_is_shared(buf)) { - VERIFY(!arc_buf_is_shared(lastbuf)); - - /* - * First, sever the block sharing relationship between - * buf and the arc_buf_hdr_t. - */ - arc_unshare_buf(hdr, buf); - - /* - * Now we need to recreate the hdr's b_pabd. 
Since we - * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pabd and copy the - * data from buf into it. - */ - if (arc_can_share(hdr, lastbuf)) { - arc_share_buf(hdr, lastbuf); - } else { - arc_hdr_alloc_pabd(hdr, B_TRUE); - abd_copy_from_buf(hdr->b_l1hdr.b_pabd, - buf->b_data, psize); - } - VERIFY3P(lastbuf->b_data, !=, NULL); - } else if (HDR_SHARED_DATA(hdr)) { - /* - * Uncompressed shared buffers are always at the end - * of the list. Compressed buffers don't have the - * same requirements. This makes it hard to - * simply assert that the lastbuf is shared so - * we rely on the hdr's compression flags to determine - * if we have a compressed, shared buffer. - */ - ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT3P(state, !=, arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_size, - arc_buf_size(buf), buf); - - if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many( - &state->arcs_esize[type], - arc_buf_size(buf), buf); - } - - hdr->b_l1hdr.b_bufcnt -= 1; - arc_cksum_verify(buf); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - - mutex_exit(hash_lock); - - /* - * Allocate a new hdr. The new hdr will contain a b_pabd - * buffer which will be freed in arc_write(). 
- */ - nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); - ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); - ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); - VERIFY3U(nhdr->b_type, ==, type); - ASSERT(!HDR_SHARED_DATA(nhdr)); - - nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); - buf->b_hdr = nhdr; - - mutex_exit(&buf->b_evict_lock); - (void) zfs_refcount_add_many(&arc_anon->arcs_size, - arc_buf_size(buf), buf); - } else { - mutex_exit(&buf->b_evict_lock); - ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); - /* protected by hash lock, or hdr is on arc_anon */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_l1hdr.b_arc_access = 0; - mutex_exit(hash_lock); - - buf_discard_identity(hdr); - arc_buf_thaw(buf); - } -} - -int -arc_released(arc_buf_t *buf) -{ - int released; - - mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && - buf->b_hdr->b_l1hdr.b_state == arc_anon); - mutex_exit(&buf->b_evict_lock); - return (released); -} - -#ifdef ZFS_DEBUG -int -arc_referenced(arc_buf_t *buf) -{ - int referenced; - - mutex_enter(&buf->b_evict_lock); - referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); - mutex_exit(&buf->b_evict_lock); - return (referenced); -} -#endif - -static void -arc_write_ready(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - - /* - * If we're reexecuting this zio because the pool suspended, then - * cleanup any state that was previously set the first time the - * callback was invoked. 
- */ - if (zio->io_flags & ZIO_FLAG_REEXECUTED) { - arc_cksum_free(hdr); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { - arc_unshare_buf(hdr, buf); - } else { - arc_hdr_free_pabd(hdr); - } - } - } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(!arc_buf_is_shared(buf)); - - callback->awcb_ready(zio, buf, callback->awcb_private); - - if (HDR_IO_IN_PROGRESS(hdr)) - ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_cksum_compute(buf); - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - enum zio_compress compress; - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { - compress = ZIO_COMPRESS_OFF; - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); - compress = BP_GET_COMPRESS(zio->io_bp); - } - HDR_SET_PSIZE(hdr, psize); - arc_hdr_set_compress(hdr, compress); - - - /* - * Fill the hdr with data. If the hdr is compressed, the data we want - * is available from the zio, otherwise we can take it from the buf. - * - * We might be able to share the buf's data with the hdr here. However, - * doing so would cause the ARC to be full of linear ABDs if we write a - * lot of shareable data. As a compromise, we check whether scattered - * ABDs are allowed, and assume that if they are then the user wants - * the ARC to be primarily filled with them regardless of the data being - * written. Therefore, if they're allowed then we allocate one and copy - * the data into it; otherwise, we share the data directly if we can. - */ - if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { - arc_hdr_alloc_pabd(hdr, B_TRUE); - - /* - * Ideally, we would always copy the io_abd into b_pabd, but the - * user may have disabled compressed ARC, thus we must check the - * hdr's compression setting rather than the io_bp's. 
- */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, - ZIO_COMPRESS_OFF); - ASSERT3U(psize, >, 0); - - abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); - } else { - ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - - abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, - arc_buf_size(buf)); - } - } else { - ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); - ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - - arc_share_buf(hdr, buf); - } - - arc_hdr_verify(hdr, zio->io_bp); -} - -static void -arc_write_children_ready(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - - callback->awcb_children_ready(zio, buf, callback->awcb_private); -} - -/* - * The SPA calls this callback for each physical write that happens on behalf - * of a logical write. See the comment in dbuf_write_physdone() for details. - */ -static void -arc_write_physdone(zio_t *zio) -{ - arc_write_callback_t *cb = zio->io_private; - if (cb->awcb_physdone != NULL) - cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -} - -static void -arc_write_done(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - - if (zio->io_error == 0) { - arc_hdr_verify(hdr, zio->io_bp); - - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { - buf_discard_identity(hdr); - } else { - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - } - } else { - ASSERT(HDR_EMPTY(hdr)); - } - - /* - * If the block to be written was all-zero or compressed enough to be - * embedded in the BP, no write was performed so there will be no - * dva/birth/checksum. The buffer must therefore remain anonymous - * (and uncached). 
- */ - if (!HDR_EMPTY(hdr)) { - arc_buf_hdr_t *exists; - kmutex_t *hash_lock; - - ASSERT3U(zio->io_error, ==, 0); - - arc_cksum_verify(buf); - - exists = buf_hash_insert(hdr, &hash_lock); - if (exists != NULL) { - /* - * This can only happen if we overwrite for - * sync-to-convergence, because we remove - * buffers from the hash table when we arc_free(). - */ - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) - panic("bad overwrite, hdr=%p exists=%p", - (void *)hdr, (void *)exists); - ASSERT(zfs_refcount_is_zero( - &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); - } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { - /* nopwrite */ - ASSERT(zio->io_prop.zp_nopwrite); - if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) - panic("bad nopwrite, hdr=%p exists=%p", - (void *)hdr, (void *)exists); - } else { - /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); - ASSERT(hdr->b_l1hdr.b_state == arc_anon); - ASSERT(BP_GET_DEDUP(zio->io_bp)); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - } - } - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - /* if it's not anon, we are doing a scrub */ - if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - } else { - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - } - - ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - - abd_put(zio->io_abd); - kmem_free(callback, sizeof (arc_write_callback_t)); -} - -zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, - arc_write_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const 
zbookmark_phys_t *zb) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_write_callback_t *callback; - zio_t *zio; - zio_prop_t localprop = *zp; - - ASSERT3P(ready, !=, NULL); - ASSERT3P(done, !=, NULL); - ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - if (l2arc) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (ARC_BUF_COMPRESSED(buf)) { - /* - * We're writing a pre-compressed buffer. Make the - * compression algorithm requested by the zio_prop_t match - * the pre-compressed buffer's compression algorithm. - */ - localprop.zp_compress = HDR_GET_COMPRESS(hdr); - - ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); - zio_flags |= ZIO_FLAG_RAW; - } - callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); - callback->awcb_ready = ready; - callback->awcb_children_ready = children_ready; - callback->awcb_physdone = physdone; - callback->awcb_done = done; - callback->awcb_private = private; - callback->awcb_buf = buf; - - /* - * The hdr's b_pabd is now stale, free it now. A new data block - * will be allocated when the zio pipeline calls arc_write_ready(). - */ - if (hdr->b_l1hdr.b_pabd != NULL) { - /* - * If the buf is currently sharing the data block with - * the hdr then we need to break that relationship here. - * The hdr will remain with a NULL data pointer and the - * buf will take sole ownership of the block. - */ - if (arc_buf_is_shared(buf)) { - arc_unshare_buf(hdr, buf); - } else { - arc_hdr_free_pabd(hdr); - } - VERIFY3P(buf->b_data, !=, NULL); - arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); - } - ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - - zio = zio_write(pio, spa, txg, bp, - abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), - HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, - (children_ready != NULL) ? 
arc_write_children_ready : NULL, - arc_write_physdone, arc_write_done, callback, - priority, zio_flags, zb); - - return (zio); -} - -static int -arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) -{ -#ifdef _KERNEL - uint64_t available_memory = ptob(freemem); - -#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) - available_memory = MIN(available_memory, uma_avail()); -#endif - - if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) - return (0); - - if (txg > spa->spa_lowmem_last_txg) { - spa->spa_lowmem_last_txg = txg; - spa->spa_lowmem_page_load = 0; - } - /* - * If we are in pageout, we know that memory is already tight, - * the arc is already going to be evicting, so we just want to - * continue to let page writes occur as quickly as possible. - */ - if (curproc == pageproc) { - if (spa->spa_lowmem_page_load > - MAX(ptob(minfree), available_memory) / 4) - return (SET_ERROR(ERESTART)); - /* Note: reserve is inflated, so we deflate */ - atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); - return (0); - } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { - /* memory is low, delay before restarting */ - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - return (SET_ERROR(EAGAIN)); - } - spa->spa_lowmem_page_load = 0; -#endif /* _KERNEL */ - return (0); -} - -void -arc_tempreserve_clear(uint64_t reserve) -{ - atomic_add_64(&arc_tempreserve, -reserve); - ASSERT((int64_t)arc_tempreserve >= 0); -} - -int -arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) -{ - int error; - uint64_t anon_size; - - if (reserve > arc_c/4 && !arc_no_grow) { - arc_c = MIN(arc_c_max, reserve * 4); - DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); - } - if (reserve > arc_c) - return (SET_ERROR(ENOMEM)); - - /* - * Don't count loaned bufs as in flight dirty data to prevent long - * network delays from blocking transactions that are ready to be - * assigned to a txg. 
- */ - - /* assert that it has not wrapped around */ - ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - - arc_loaned_bytes), 0); - - /* - * Writes will, almost always, require additional memory allocations - * in order to compress/encrypt/etc the data. We therefore need to - * make sure that there is sufficient available memory for this. - */ - error = arc_memory_throttle(spa, reserve, txg); - if (error != 0) - return (error); - - /* - * Throttle writes when the amount of dirty data in the cache - * gets too large. We try to keep the cache less than half full - * of dirty blocks so that our sync times don't grow too large. - * - * In the case of one pool being built on another pool, we want - * to make sure we don't end up throttling the lower (backing) - * pool when the upper pool is the majority contributor to dirty - * data. To insure we make forward progress during throttling, we - * also check the current pool's net dirty data and only throttle - * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty - * data in the cache. - * - * Note: if two requests come in concurrently, we might let them - * both succeed, when one of them should fail. Not a huge deal. 
- */ - uint64_t total_dirty = reserve + arc_tempreserve + anon_size; - uint64_t spa_dirty_anon = spa_dirty_data(spa); - - if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && - anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && - spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { - uint64_t meta_esize = - zfs_refcount_count( - &arc_anon->arcs_esize[ARC_BUFC_METADATA]); - uint64_t data_esize = - zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " - "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve >> 10, meta_esize >> 10, - data_esize >> 10, reserve >> 10, arc_c >> 10); - return (SET_ERROR(ERESTART)); - } - atomic_add_64(&arc_tempreserve, reserve); - return (0); -} - -static void -arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, - kstat_named_t *evict_data, kstat_named_t *evict_metadata) -{ - size->value.ui64 = zfs_refcount_count(&state->arcs_size); - evict_data->value.ui64 = - zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); - evict_metadata->value.ui64 = - zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); -} - -static int -arc_kstat_update(kstat_t *ksp, int rw) -{ - arc_stats_t *as = ksp->ks_data; - - if (rw == KSTAT_WRITE) { - return (EACCES); - } else { - arc_kstat_update_state(arc_anon, - &as->arcstat_anon_size, - &as->arcstat_anon_evictable_data, - &as->arcstat_anon_evictable_metadata); - arc_kstat_update_state(arc_mru, - &as->arcstat_mru_size, - &as->arcstat_mru_evictable_data, - &as->arcstat_mru_evictable_metadata); - arc_kstat_update_state(arc_mru_ghost, - &as->arcstat_mru_ghost_size, - &as->arcstat_mru_ghost_evictable_data, - &as->arcstat_mru_ghost_evictable_metadata); - arc_kstat_update_state(arc_mfu, - &as->arcstat_mfu_size, - &as->arcstat_mfu_evictable_data, - &as->arcstat_mfu_evictable_metadata); - arc_kstat_update_state(arc_mfu_ghost, - &as->arcstat_mfu_ghost_size, - &as->arcstat_mfu_ghost_evictable_data, - 
&as->arcstat_mfu_ghost_evictable_metadata); - - ARCSTAT(arcstat_size) = aggsum_value(&arc_size); - ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); - ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); - ARCSTAT(arcstat_metadata_size) = - aggsum_value(&astat_metadata_size); - ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); - ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); - ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); - ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) + - aggsum_value(&astat_dnode_size) + - aggsum_value(&astat_dbuf_size); -#endif - ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); - } - - return (0); -} - -/* - * This function *must* return indices evenly distributed between all - * sublists of the multilist. This is needed due to how the ARC eviction - * code is laid out; arc_evict_state() assumes ARC buffers are evenly - * distributed between all sublists and uses this assumption when - * deciding which sublist to evict from and how much to evict from it. - */ -unsigned int -arc_state_multilist_index_func(multilist_t *ml, void *obj) -{ - arc_buf_hdr_t *hdr = obj; - - /* - * We rely on b_dva to generate evenly distributed index - * numbers using buf_hash below. So, as an added precaution, - * let's make sure we never add empty buffers to the arc lists. - */ - ASSERT(!HDR_EMPTY(hdr)); - - /* - * The assumption here, is the hash value for a given - * arc_buf_hdr_t will remain constant throughout it's lifetime - * (i.e. it's b_spa, b_dva, and b_birth fields don't change). - * Thus, we don't need to store the header's sublist index - * on insertion, as this index can be recalculated on removal. - * - * Also, the low order bits of the hash value are thought to be - * distributed evenly. 
Otherwise, in the case that the multilist - * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. - */ - return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % - multilist_get_num_sublists(ml)); -} - -#ifdef _KERNEL -static eventhandler_tag arc_event_lowmem = NULL; - -static void -arc_lowmem(void *arg __unused, int howto __unused) -{ - int64_t free_memory, to_free; - - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; - arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); - free_memory = arc_available_memory(); - to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); - DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); - arc_reduce_target_size(to_free); - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - - /* - * It is unsafe to block here in arbitrary threads, because we can come - * here from ARC itself and may hold ARC locks and thus risk a deadlock - * with ARC reclaim thread. 
- */ - if (curproc == pageproc) - (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock); - mutex_exit(&arc_adjust_lock); -} -#endif - -static void -arc_state_init(void) -{ - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - - arc_mru->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - 
arc_state_multilist_index_func); - - zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); - - aggsum_init(&arc_meta_used, 0); - aggsum_init(&arc_size, 0); - aggsum_init(&astat_data_size, 0); - aggsum_init(&astat_metadata_size, 0); - aggsum_init(&astat_hdr_size, 0); - aggsum_init(&astat_bonus_size, 0); - aggsum_init(&astat_dnode_size, 0); - aggsum_init(&astat_dbuf_size, 0); - aggsum_init(&astat_l2_hdr_size, 0); -} - -static void -arc_state_fini(void) -{ - zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); - 
zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - zfs_refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - aggsum_fini(&arc_meta_used); - aggsum_fini(&arc_size); - aggsum_fini(&astat_data_size); - aggsum_fini(&astat_metadata_size); - aggsum_fini(&astat_hdr_size); - aggsum_fini(&astat_bonus_size); - aggsum_fini(&astat_dnode_size); - aggsum_fini(&astat_dbuf_size); - aggsum_fini(&astat_l2_hdr_size); -} - -uint64_t -arc_max_bytes(void) -{ - return (arc_c_max); -} - -void -arc_init(void) -{ - int i, prefetch_tunable_set = 0; - - /* - * allmem is "all memory that we could possibly use". 
- */ -#ifdef illumos -#ifdef _KERNEL - uint64_t allmem = ptob(physmem - swapfs_minfree); -#else - uint64_t allmem = (physmem * PAGESIZE) / 2; -#endif -#else - uint64_t allmem = kmem_size(); -#endif - mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); - - mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); - - /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ - arc_c_min = MAX(allmem / 32, arc_abs_min); - /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ - if (allmem >= 1 << 30) - arc_c_max = allmem - (1 << 30); - else - arc_c_max = arc_c_min; - arc_c_max = MAX(allmem * 5 / 8, arc_c_max); - - /* - * In userland, there's only the memory pressure that we artificially - * create (see arc_available_memory()). Don't let arc_c get too - * small, because it can cause transactions to be larger than - * arc_c, causing arc_tempreserve_space() to fail. - */ -#ifndef _KERNEL - arc_c_min = arc_c_max / 2; -#endif - -#ifdef _KERNEL - /* - * Allow the tunables to override our calculations if they are - * reasonable. - */ - if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { - arc_c_max = zfs_arc_max; - arc_c_min = MIN(arc_c_min, arc_c_max); - } - if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) - arc_c_min = zfs_arc_min; -#endif - - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - /* limit meta-data to 1/4 of the arc capacity */ - arc_meta_limit = arc_c_max / 4; - -#ifdef _KERNEL - /* - * Metadata is stored in the kernel's heap. Don't let us - * use more than half the heap for the ARC. 
- */ -#ifdef __FreeBSD__ - arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2); - arc_dnode_limit = arc_meta_limit / 10; -#else - arc_meta_limit = MIN(arc_meta_limit, - vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); -#endif -#endif - - /* Allow the tunable to override if it is reasonable */ - if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) - arc_meta_limit = zfs_arc_meta_limit; - - if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) - arc_c_min = arc_meta_limit / 2; - - if (zfs_arc_meta_min > 0) { - arc_meta_min = zfs_arc_meta_min; - } else { - arc_meta_min = arc_c_min / 2; - } - - /* Valid range: - */ - if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) && - (zfs_arc_dnode_limit >= zfs_arc_meta_min) && - (zfs_arc_dnode_limit <= arc_c_max)) - arc_dnode_limit = zfs_arc_dnode_limit; - - if (zfs_arc_grow_retry > 0) - arc_grow_retry = zfs_arc_grow_retry; - - if (zfs_arc_shrink_shift > 0) - arc_shrink_shift = zfs_arc_shrink_shift; - - if (zfs_arc_no_grow_shift > 0) - arc_no_grow_shift = zfs_arc_no_grow_shift; - /* - * Ensure that arc_no_grow_shift is less than arc_shrink_shift. - */ - if (arc_no_grow_shift >= arc_shrink_shift) - arc_no_grow_shift = arc_shrink_shift - 1; - - if (zfs_arc_p_min_shift > 0) - arc_p_min_shift = zfs_arc_p_min_shift; - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - if (arc_c < arc_c_min) - arc_c = arc_c_min; - - zfs_arc_min = arc_c_min; - zfs_arc_max = arc_c_max; - - arc_state_init(); - - /* - * The arc must be "uninitialized", so that hdr_recl() (which is - * registered by buf_init()) will not access arc_reap_zthr before - * it is created. 
- */ - ASSERT(!arc_initialized); - buf_init(); - - list_create(&arc_prune_list, sizeof (arc_prune_t), - offsetof(arc_prune_t, p_node)); - mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - - arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri, - max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - - arc_dnlc_evicts_thread_exit = FALSE; - - arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, - sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - - if (arc_ksp != NULL) { - arc_ksp->ks_data = &arc_stats; - arc_ksp->ks_update = arc_kstat_update; - kstat_install(arc_ksp); - } - - arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check, - arc_adjust_cb, NULL, SEC2NSEC(1)); - arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, - arc_reap_cb, NULL, SEC2NSEC(1)); - -#ifdef _KERNEL - arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, - EVENTHANDLER_PRI_FIRST); -#endif - - (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - - arc_initialized = B_TRUE; - arc_warm = B_FALSE; - - /* - * Calculate maximum amount of dirty data per pool. - * - * If it has been set by /etc/system, take that. - * Otherwise, use a percentage of physical memory defined by - * zfs_dirty_data_max_percent (default 10%) with a cap at - * zfs_dirty_data_max_max (default 4GB). 
- */ - if (zfs_dirty_data_max == 0) { - zfs_dirty_data_max = ptob(physmem) * - zfs_dirty_data_max_percent / 100; - zfs_dirty_data_max = MIN(zfs_dirty_data_max, - zfs_dirty_data_max_max); - } - -#ifdef _KERNEL - if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) - prefetch_tunable_set = 1; - -#ifdef __i386__ - if (prefetch_tunable_set == 0) { - printf("ZFS NOTICE: Prefetch is disabled by default on i386 " - "-- to enable,\n"); - printf(" add \"vfs.zfs.prefetch_disable=0\" " - "to /boot/loader.conf.\n"); - zfs_prefetch_disable = 1; - } -#else - if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && - prefetch_tunable_set == 0) { - printf("ZFS NOTICE: Prefetch is disabled by default if less " - "than 4GB of RAM is present;\n" - " to enable, add \"vfs.zfs.prefetch_disable=0\" " - "to /boot/loader.conf.\n"); - zfs_prefetch_disable = 1; - } -#endif - /* Warn about ZFS memory and address space requirements. */ - if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " - "expect unstable behavior.\n"); - } - if (allmem < 512 * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " - "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " - "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); - } -#endif -} - -void -arc_fini(void) -{ - arc_prune_t *p; - -#ifdef _KERNEL - if (arc_event_lowmem != NULL) - EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); -#endif - - /* Use B_TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, B_TRUE); - - mutex_enter(&arc_dnlc_evicts_lock); - arc_dnlc_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. 
- */ - while (arc_dnlc_evicts_thread_exit) { - cv_signal(&arc_dnlc_evicts_cv); - cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); - } - mutex_exit(&arc_dnlc_evicts_lock); - - arc_initialized = B_FALSE; - - if (arc_ksp != NULL) { - kstat_delete(arc_ksp); - arc_ksp = NULL; - } - - taskq_wait(arc_prune_taskq); - taskq_destroy(arc_prune_taskq); - - mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); - zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); - zfs_refcount_destroy(&p->p_refcnt); - kmem_free(p, sizeof (*p)); - } - mutex_exit(&arc_prune_mtx); - - list_destroy(&arc_prune_list); - mutex_destroy(&arc_prune_mtx); - - (void) zthr_cancel(arc_adjust_zthr); - zthr_destroy(arc_adjust_zthr); - - mutex_destroy(&arc_dnlc_evicts_lock); - cv_destroy(&arc_dnlc_evicts_cv); - - (void) zthr_cancel(arc_reap_zthr); - zthr_destroy(arc_reap_zthr); - - mutex_destroy(&arc_adjust_lock); - cv_destroy(&arc_adjust_waiters_cv); - - /* - * buf_fini() must proceed arc_state_fini() because buf_fin() may - * trigger the release of kmem magazines, which can callback to - * arc_space_return() which accesses aggsums freed in act_state_fini(). - */ - buf_fini(); - arc_state_fini(); - - ASSERT0(arc_loaned_bytes); -} - -/* - * Level 2 ARC - * - * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. - * It uses dedicated storage devices to hold cached data, which are populated - * using large infrequent writes. The main role of this cache is to boost - * the performance of random read workloads. The intended L2ARC devices - * include short-stroked disks, solid state disks, and other media with - * substantially faster read latency than disk. 
- * - * +-----------------------+ - * | ARC | - * +-----------------------+ - * | ^ ^ - * | | | - * l2arc_feed_thread() arc_read() - * | | | - * | l2arc read | - * V | | - * +---------------+ | - * | L2ARC | | - * +---------------+ | - * | ^ | - * l2arc_write() | | - * | | | - * V | | - * +-------+ +-------+ - * | vdev | | vdev | - * | cache | | cache | - * +-------+ +-------+ - * +=========+ .-----. - * : L2ARC : |-_____-| - * : devices : | Disks | - * +=========+ `-_____-' - * - * Read requests are satisfied from the following sources, in order: - * - * 1) ARC - * 2) vdev cache of L2ARC devices - * 3) L2ARC devices - * 4) vdev cache of disks - * 5) disks - * - * Some L2ARC device types exhibit extremely slow write performance. - * To accommodate for this there are some significant differences between - * the L2ARC and traditional cache design: - * - * 1. There is no eviction path from the ARC to the L2ARC. Evictions from - * the ARC behave as usual, freeing buffers and placing headers on ghost - * lists. The ARC does not send buffers to the L2ARC during eviction as - * this would add inflated write latencies for all ARC memory pressure. - * - * 2. The L2ARC attempts to cache data from the ARC before it is evicted. - * It does this by periodically scanning buffers from the eviction-end of - * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are - * not already there. It scans until a headroom of buffers is satisfied, - * which itself is a buffer for ARC eviction. If a compressible buffer is - * found during scanning and selected for writing to an L2ARC device, we - * temporarily boost scanning headroom during the next scan cycle to make - * sure we adapt to compression effects (which might significantly reduce - * the data volume we write to L2ARC). 
The thread that does this is - * l2arc_feed_thread(), illustrated below; example sizes are included to - * provide a better sense of ratio than this diagram: - * - * head --> tail - * +---------------------+----------+ - * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC - * +---------------------+----------+ | o L2ARC eligible - * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer - * +---------------------+----------+ | - * 15.9 Gbytes ^ 32 Mbytes | - * headroom | - * l2arc_feed_thread() - * | - * l2arc write hand <--[oooo]--' - * | 8 Mbyte - * | write max - * V - * +==============================+ - * L2ARC dev |####|#|###|###| |####| ... | - * +==============================+ - * 32 Gbytes - * - * 3. If an ARC buffer is copied to the L2ARC but then hit instead of - * evicted, then the L2ARC has cached a buffer much sooner than it probably - * needed to, potentially wasting L2ARC device bandwidth and storage. It is - * safe to say that this is an uncommon case, since buffers at the end of - * the ARC lists have moved there due to inactivity. - * - * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, - * then the L2ARC simply misses copying some buffers. This serves as a - * pressure valve to prevent heavy read workloads from both stalling the ARC - * with waits and clogging the L2ARC with writes. This also helps prevent - * the potential for the L2ARC to churn if it attempts to cache content too - * quickly, such as during backups of the entire pool. - * - * 5. After system boot and before the ARC has filled main memory, there are - * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru - * lists can remain mostly static. Instead of searching from tail of these - * lists as pictured, the l2arc_feed_thread() will search from the list heads - * for eligible buffers, greatly increasing its chance of finding them. 
- * - * The L2ARC device write speed is also boosted during this time so that - * the L2ARC warms up faster. Since there have been no ARC evictions yet, - * there are no L2ARC reads, and no fear of degrading read performance - * through increased writes. - * - * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that - * the vdev queue can aggregate them into larger and fewer writes. Each - * device is written to in a rotor fashion, sweeping writes through - * available space then repeating. - * - * 7. The L2ARC does not store dirty content. It never needs to flush - * write buffers back to disk based storage. - * - * 8. If an ARC buffer is written (and dirtied) which also exists in the - * L2ARC, the now stale L2ARC buffer is immediately dropped. - * - * The performance of the L2ARC can be tweaked by a number of tunables, which - * may be necessary for different workloads: - * - * l2arc_write_max max write bytes per interval - * l2arc_write_boost extra write bytes during device warmup - * l2arc_noprefetch skip caching prefetched buffers - * l2arc_headroom number of max device writes to precache - * l2arc_headroom_boost when we find compressed buffers during ARC - * scanning, we multiply headroom by this - * percentage factor for the next scan cycle, - * since more compressed buffers are likely to - * be present - * l2arc_feed_secs seconds between L2ARC writing - * - * Tunables may be removed or added as future performance improvements are - * integrated, and also may become zpool properties. - * - * There are three key functions that control how the L2ARC warms up: - * - * l2arc_write_eligible() check if a buffer is eligible to cache - * l2arc_write_size() calculate how much to write - * l2arc_write_interval() calculate sleep delay between writes - * - * These three functions determine what to write, how much, and how quickly - * to send writes. 
- */ - -static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) -{ - /* - * A buffer is *not* eligible for the L2ARC if it: - * 1. belongs to a different spa. - * 2. is already cached on the L2ARC. - * 3. has an I/O in progress (it may be an incomplete read). - * 4. is flagged not eligible (zfs property). - */ - if (hdr->b_spa != spa_guid) { - ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); - return (B_FALSE); - } - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_in_l2); - return (B_FALSE); - } - if (HDR_IO_IN_PROGRESS(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); - return (B_FALSE); - } - if (!HDR_L2CACHE(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); - return (B_FALSE); - } - - return (B_TRUE); -} - -static uint64_t -l2arc_write_size(void) -{ - uint64_t size; - - /* - * Make sure our globals have meaningful values in case the user - * altered them. - */ - size = l2arc_write_max; - if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); - size = l2arc_write_max = L2ARC_WRITE_SIZE; - } - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - - return (size); - -} - -static clock_t -l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) -{ - clock_t interval, next, now; - - /* - * If the ARC lists are busy, increase our write rate; if the - * lists are stale, idle back. This is achieved by checking - * how much we previously wrote - if it was more than half of - * what we wanted, schedule the next write much sooner. - */ - if (l2arc_feed_again && wrote > (wanted / 2)) - interval = (hz * l2arc_feed_min_ms) / 1000; - else - interval = hz * l2arc_feed_secs; - - now = ddi_get_lbolt(); - next = MAX(now, MIN(now + interval, began + interval)); - - return (next); -} - -/* - * Cycle through L2ARC devices. This is how L2ARC load balances. 
- * If a device is returned, this also returns holding the spa config lock. - */ -static l2arc_dev_t * -l2arc_dev_get_next(void) -{ - l2arc_dev_t *first, *next = NULL; - - /* - * Lock out the removal of spas (spa_namespace_lock), then removal - * of cache devices (l2arc_dev_mtx). Once a device has been selected, - * both locks will be dropped and a spa config lock held instead. - */ - mutex_enter(&spa_namespace_lock); - mutex_enter(&l2arc_dev_mtx); - - /* if there are no vdevs, there is nothing to do */ - if (l2arc_ndev == 0) - goto out; - - first = NULL; - next = l2arc_dev_last; - do { - /* loop around the list looking for a non-faulted vdev */ - if (next == NULL) { - next = list_head(l2arc_dev_list); - } else { - next = list_next(l2arc_dev_list, next); - if (next == NULL) - next = list_head(l2arc_dev_list); - } - - /* if we have come back to the start, bail out */ - if (first == NULL) - first = next; - else if (next == first) - break; - - } while (vdev_is_dead(next->l2ad_vdev)); - - /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev)) - next = NULL; - - l2arc_dev_last = next; - -out: - mutex_exit(&l2arc_dev_mtx); - - /* - * Grab the config lock to prevent the 'next' device from being - * removed while we are writing to it. - */ - if (next != NULL) - spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); - mutex_exit(&spa_namespace_lock); - - return (next); -} - -/* - * Free buffers that were tagged for destruction. - */ -static void -l2arc_do_free_on_write() -{ - list_t *buflist; - l2arc_data_free_t *df, *df_prev; - - mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); - ASSERT3P(df->l2df_abd, !=, NULL); - abd_free(df->l2df_abd); - list_remove(buflist, df); - kmem_free(df, sizeof (l2arc_data_free_t)); - } - - mutex_exit(&l2arc_free_on_write_mtx); -} - -/* - * A write to a cache device has completed. 
Update all headers to allow - * reads from these buffers to begin. - */ -static void -l2arc_write_done(zio_t *zio) -{ - l2arc_write_callback_t *cb; - l2arc_dev_t *dev; - list_t *buflist; - arc_buf_hdr_t *head, *hdr, *hdr_prev; - kmutex_t *hash_lock; - int64_t bytes_dropped = 0; - - cb = zio->io_private; - ASSERT3P(cb, !=, NULL); - dev = cb->l2wcb_dev; - ASSERT3P(dev, !=, NULL); - head = cb->l2wcb_head; - ASSERT3P(head, !=, NULL); - buflist = &dev->l2ad_buflist; - ASSERT3P(buflist, !=, NULL); - DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, - l2arc_write_callback_t *, cb); - - if (zio->io_error != 0) - ARCSTAT_BUMP(arcstat_l2_writes_error); - - /* - * All writes completed, or an error was hit. - */ -top: - mutex_enter(&dev->l2ad_mtx); - for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(buflist, hdr); - - hash_lock = HDR_LOCK(hdr); - - /* - * We cannot use mutex_enter or else we can deadlock - * with l2arc_write_buffers (due to swapping the order - * the hash lock and l2ad_mtx are taken). - */ - if (!mutex_tryenter(hash_lock)) { - /* - * Missed the hash lock. We must retry so we - * don't leave the ARC_FLAG_L2_WRITING bit set. - */ - ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); - - /* - * We don't want to rescan the headers we've - * already marked as having been written out, so - * we reinsert the head node so we can pick up - * where we left off. - */ - list_remove(buflist, head); - list_insert_after(buflist, hdr, head); - - mutex_exit(&dev->l2ad_mtx); - - /* - * We wait for the hash lock to become available - * to try and prevent busy waiting, and increase - * the chance we'll be able to acquire the lock - * the next time around. - */ - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - - /* - * We could not have been moved into the arc_l2c_only - * state while in-flight due to our ARC_FLAG_L2_WRITING - * bit being set. Let's just ensure that's being enforced. 
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (zio->io_error != 0) { - /* - * Error - drop L2ARC entry. - */ - list_remove(buflist, hdr); - l2arc_trim(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - - ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - - bytes_dropped += arc_hdr_size(hdr); - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - } - - /* - * Allow ARC to begin reads and ghost list evictions to - * this L2ARC entry. - */ - arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); - - mutex_exit(hash_lock); - } - - atomic_inc_64(&l2arc_writes_done); - list_remove(buflist, head); - ASSERT(!HDR_HAS_L1HDR(head)); - kmem_cache_free(hdr_l2only_cache, head); - mutex_exit(&dev->l2ad_mtx); - - vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); - - l2arc_do_free_on_write(); - - kmem_free(cb, sizeof (l2arc_write_callback_t)); -} - -/* - * A read to a cache device completed. Validate buffer contents before - * handing over to the regular ARC routines. - */ -static void -l2arc_read_done(zio_t *zio) -{ - l2arc_read_callback_t *cb; - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - boolean_t valid_cksum; - - ASSERT3P(zio->io_vd, !=, NULL); - ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); - - spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); - - cb = zio->io_private; - ASSERT3P(cb, !=, NULL); - hdr = cb->l2rcb_hdr; - ASSERT3P(hdr, !=, NULL); - - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - /* - * If the data was read into a temporary buffer, - * move it and free the buffer. 
- */ - if (cb->l2rcb_abd != NULL) { - ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); - if (zio->io_error == 0) { - abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, - arc_hdr_size(hdr)); - } - - /* - * The following must be done regardless of whether - * there was an error: - * - free the temporary buffer - * - point zio to the real ARC buffer - * - set zio size accordingly - * These are required because zio is either re-used for - * an I/O of the block in the case of the error - * or the zio is passed to arc_read_done() and it - * needs real data. - */ - abd_free(cb->l2rcb_abd); - zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); - zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; - } - - ASSERT3P(zio->io_abd, !=, NULL); - - /* - * Check this survived the L2ARC journey. - */ - ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ - - valid_cksum = arc_cksum_is_equal(hdr, zio); - if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { - mutex_exit(hash_lock); - zio->io_private = hdr; - arc_read_done(zio); - } else { - /* - * Buffer didn't survive caching. Increment stats and - * reissue to the original storage device. - */ - if (zio->io_error != 0) { - ARCSTAT_BUMP(arcstat_l2_io_error); - } else { - zio->io_error = SET_ERROR(EIO); - } - if (!valid_cksum) - ARCSTAT_BUMP(arcstat_l2_cksum_bad); - - /* - * If there's no waiter, issue an async i/o to the primary - * storage now. If there *is* a waiter, the caller must - * issue the i/o in a context where it's OK to block. 
- */ - if (zio->io_waiter == NULL) { - zio_t *pio = zio_unique_parent(zio); - - ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - - zio = zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, - hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb); - for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; - acb != NULL; acb = acb->acb_next) - acb->acb_zio_head = zio; - mutex_exit(hash_lock); - zio_nowait(zio); - } else - mutex_exit(hash_lock); - } - - kmem_free(cb, sizeof (l2arc_read_callback_t)); -} - -/* - * This is the list priority from which the L2ARC will search for pages to - * cache. This is used within loops (0..3) to cycle through lists in the - * desired order. This order can have a significant effect on cache - * performance. - * - * Currently the metadata lists are hit first, MFU then MRU, followed by - * the data lists. This function returns a locked list, and also returns - * the lock pointer. - */ -static multilist_sublist_t * -l2arc_sublist_lock(int list_num) -{ - multilist_t *ml = NULL; - unsigned int idx; - - ASSERT(list_num >= 0 && list_num <= 3); - - switch (list_num) { - case 0: - ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; - break; - case 1: - ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; - break; - case 2: - ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; - break; - case 3: - ml = arc_mru->arcs_list[ARC_BUFC_DATA]; - break; - } - - /* - * Return a randomly-selected sublist. This is acceptable - * because the caller feeds only a little bit of data for each - * call (8MB). Subsequent calls will result in different - * sublists being selected. - */ - idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); -} - -/* - * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. - * This is clearing a region on the L2ARC device ready for writing. - * If the 'all' boolean is set, every buffer is evicted. 
- */ -static void -l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) -{ - list_t *buflist; - arc_buf_hdr_t *hdr, *hdr_prev; - kmutex_t *hash_lock; - uint64_t taddr; - - buflist = &dev->l2ad_buflist; - - if (!all && dev->l2ad_first) { - /* - * This is the first sweep through the device. There is - * nothing to evict. - */ - return; - } - - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { - /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. - */ - taddr = dev->l2ad_end; - } else { - taddr = dev->l2ad_hand + distance; - } - DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, - uint64_t, taddr, boolean_t, all); - -top: - mutex_enter(&dev->l2ad_mtx); - for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(buflist, hdr); - - hash_lock = HDR_LOCK(hdr); - - /* - * We cannot use mutex_enter or else we can deadlock - * with l2arc_write_buffers (due to swapping the order - * the hash lock and l2ad_mtx are taken). - */ - if (!mutex_tryenter(hash_lock)) { - /* - * Missed the hash lock. Retry. - */ - ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); - mutex_exit(&dev->l2ad_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - - /* - * A header can't be on this list if it doesn't have L2 header. - */ - ASSERT(HDR_HAS_L2HDR(hdr)); - - /* Ensure this header has finished being written. */ - ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || - hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { - /* - * We've evicted to the target address, - * or the end of the device. - */ - mutex_exit(hash_lock); - break; - } - - if (!HDR_HAS_L1HDR(hdr)) { - ASSERT(!HDR_L2_READING(hdr)); - /* - * This doesn't exist in the ARC. Destroy. - * arc_hdr_destroy() will call list_remove() - * and decrement arcstat_l2_lsize. 
- */ - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - } else { - ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); - ARCSTAT_BUMP(arcstat_l2_evict_l1cached); - /* - * Invalidate issued or about to be issued - * reads, since we may be about to write - * over this location. - */ - if (HDR_L2_READING(hdr)) { - ARCSTAT_BUMP(arcstat_l2_evict_reading); - arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); - } - - arc_hdr_l2hdr_destroy(hdr); - } - mutex_exit(hash_lock); - } - mutex_exit(&dev->l2ad_mtx); -} - -/* - * Find and write ARC buffers to the L2ARC device. - * - * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid - * for reading until they have completed writing. - * The headroom_boost is an in-out parameter used to maintain headroom boost - * state between calls to this function. - * - * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). - */ -static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) -{ - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); - int try; - - ASSERT3P(dev->l2ad_vdev, !=, NULL); - - pio = NULL; - write_lsize = write_asize = write_psize = 0; - full = B_FALSE; - head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); - - ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); - /* - * Copy buffers for L2ARC writing. - */ - for (try = 0; try <= 3; try++) { - multilist_sublist_t *mls = l2arc_sublist_lock(try); - uint64_t passed_sz = 0; - - ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); - - /* - * L2ARC fast warmup. - * - * Until the ARC is warm and starts to evict, read from the - * head of the ARC lists rather than the tail. 
- */ - if (arc_warm == B_FALSE) - hdr = multilist_sublist_head(mls); - else - hdr = multilist_sublist_tail(mls); - if (hdr == NULL) - ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); - - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { - kmutex_t *hash_lock; - - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, - HDR_GET_LSIZE(hdr)); - - hash_lock = HDR_LOCK(hdr); - if (!mutex_tryenter(hash_lock)) { - ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); - /* - * Skip this buffer rather than waiting. - */ - continue; - } - - passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { - /* - * Searched too far. - */ - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); - break; - } - - if (!l2arc_write_eligible(guid, hdr)) { - mutex_exit(hash_lock); - continue; - } - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT3U(arc_hdr_size(hdr), >, 0); - uint64_t psize = arc_hdr_size(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, - psize); - - if ((write_asize + asize) > target_sz) { - full = B_TRUE; - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_l2_write_full); - break; - } - - if (pio == NULL) { - /* - * Insert a dummy header on the buflist so - * l2arc_write_done() can find where the - * write buffers begin without searching. 
- */ - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); - - cb = kmem_alloc( - sizeof (l2arc_write_callback_t), KM_SLEEP); - cb->l2wcb_dev = dev; - cb->l2wcb_head = head; - pio = zio_root(spa, l2arc_write_done, cb, - ZIO_FLAG_CANFAIL); - ARCSTAT_BUMP(arcstat_l2_write_pios); - } - - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - arc_hdr_set_flags(hdr, - ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, psize, - hdr); - - /* - * Normally the L2ARC can use the hdr's data, but if - * we're sharing data between the hdr and one of its - * bufs, L2ARC needs its own copy of the data so that - * the ZIO below can't race with the buf consumer. - * Another case where we need to create a copy of the - * data is when the buffer size is not device-aligned - * and we need to pad the block to make it such. - * That also keeps the clock hand suitably aligned. - * - * To ensure that the copy will be available for the - * lifetime of the ZIO and be cleaned up afterwards, we - * add it to the l2arc_free_on_write queue. 
- */ - abd_t *to_write; - if (!HDR_SHARED_DATA(hdr) && psize == asize) { - to_write = hdr->b_l1hdr.b_pabd; - } else { - to_write = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (asize != psize) { - abd_zero_off(to_write, psize, - asize - psize); - } - l2arc_free_abd_on_write(to_write, asize, - arc_buf_type(hdr)); - } - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, - ZIO_CHECKSUM_OFF, NULL, hdr, - ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - write_lsize += HDR_GET_LSIZE(hdr); - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - - write_psize += psize; - write_asize += asize; - dev->l2ad_hand += asize; - - mutex_exit(hash_lock); - - (void) zio_nowait(wzio); - } - - multilist_sublist_unlock(mls); - - if (full == B_TRUE) - break; - } - - /* No buffers selected for writing? */ - if (pio == NULL) { - ASSERT0(write_lsize); - ASSERT(!HDR_HAS_L1HDR(head)); - kmem_cache_free(hdr_l2only_cache, head); - return (0); - } - - ASSERT3U(write_psize, <=, target_sz); - ARCSTAT_BUMP(arcstat_l2_writes_sent); - ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); - ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); - ARCSTAT_INCR(arcstat_l2_psize, write_psize); - vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); - - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } - - dev->l2ad_writing = B_TRUE; - (void) zio_wait(pio); - dev->l2ad_writing = B_FALSE; - - return (write_asize); -} - -/* - * This thread feeds the L2ARC at regular intervals. This is the beating - * heart of the L2ARC. 
- */ -/* ARGSUSED */ -static void -l2arc_feed_thread(void *unused __unused) -{ - callb_cpr_t cpr; - l2arc_dev_t *dev; - spa_t *spa; - uint64_t size, wrote; - clock_t begin, next = ddi_get_lbolt(); - - CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); - - mutex_enter(&l2arc_feed_thr_lock); - - while (l2arc_thread_exit == 0) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - next - ddi_get_lbolt()); - CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); - next = ddi_get_lbolt() + hz; - - /* - * Quick check for L2ARC devices. - */ - mutex_enter(&l2arc_dev_mtx); - if (l2arc_ndev == 0) { - mutex_exit(&l2arc_dev_mtx); - continue; - } - mutex_exit(&l2arc_dev_mtx); - begin = ddi_get_lbolt(); - - /* - * This selects the next l2arc device to write to, and in - * doing so the next spa to feed from: dev->l2ad_spa. This - * will return NULL if there are now no l2arc devices or if - * they are all faulted. - * - * If a device is returned, its spa's config lock is also - * held to prevent device removal. l2arc_dev_get_next() - * will grab and release l2arc_dev_mtx. - */ - if ((dev = l2arc_dev_get_next()) == NULL) - continue; - - spa = dev->l2ad_spa; - ASSERT3P(spa, !=, NULL); - - /* - * If the pool is read-only then force the feed thread to - * sleep a little longer. - */ - if (!spa_writeable(spa)) { - next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; - spa_config_exit(spa, SCL_L2ARC, dev); - continue; - } - - /* - * Avoid contributing to memory pressure. - */ - if (arc_reclaim_needed()) { - ARCSTAT_BUMP(arcstat_l2_abort_lowmem); - spa_config_exit(spa, SCL_L2ARC, dev); - continue; - } - - ARCSTAT_BUMP(arcstat_l2_feeds); - - size = l2arc_write_size(); - - /* - * Evict L2ARC buffers that will be overwritten. - */ - l2arc_evict(dev, size, B_FALSE); - - /* - * Write ARC buffers. - */ - wrote = l2arc_write_buffers(spa, dev, size); - - /* - * Calculate interval between writes. 
- */ - next = l2arc_write_interval(begin, size, wrote); - spa_config_exit(spa, SCL_L2ARC, dev); - } - - l2arc_thread_exit = 0; - cv_broadcast(&l2arc_feed_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ - thread_exit(); -} - -boolean_t -l2arc_vdev_present(vdev_t *vd) -{ - l2arc_dev_t *dev; - - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev != NULL; - dev = list_next(l2arc_dev_list, dev)) { - if (dev->l2ad_vdev == vd) - break; - } - mutex_exit(&l2arc_dev_mtx); - - return (dev != NULL); -} - -/* - * Add a vdev for use by the L2ARC. By this point the spa has already - * validated the vdev and opened it. - */ -void -l2arc_add_vdev(spa_t *spa, vdev_t *vd) -{ - l2arc_dev_t *adddev; - - ASSERT(!l2arc_vdev_present(vd)); - - vdev_ashift_optimize(vd); - - /* - * Create a new l2arc device entry. - */ - adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); - adddev->l2ad_spa = spa; - adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; - adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); - adddev->l2ad_hand = adddev->l2ad_start; - adddev->l2ad_first = B_TRUE; - adddev->l2ad_writing = B_FALSE; - - mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); - /* - * This is a list of all ARC buffers that are still valid on the - * device. - */ - list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); - - vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); - zfs_refcount_create(&adddev->l2ad_alloc); - - /* - * Add device to global list - */ - mutex_enter(&l2arc_dev_mtx); - list_insert_head(l2arc_dev_list, adddev); - atomic_inc_64(&l2arc_ndev); - mutex_exit(&l2arc_dev_mtx); -} - -/* - * Remove a vdev from the L2ARC. 
- */ -void -l2arc_remove_vdev(vdev_t *vd) -{ - l2arc_dev_t *dev, *nextdev, *remdev = NULL; - - /* - * Find the device by vdev - */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } - ASSERT3P(remdev, !=, NULL); - - /* - * Remove device from global list - */ - list_remove(l2arc_dev_list, remdev); - l2arc_dev_last = NULL; /* may have been invalidated */ - atomic_dec_64(&l2arc_ndev); - mutex_exit(&l2arc_dev_mtx); - - /* - * Clear all buflists and ARC references. L2ARC device flush. - */ - l2arc_evict(remdev, 0, B_TRUE); - list_destroy(&remdev->l2ad_buflist); - mutex_destroy(&remdev->l2ad_mtx); - zfs_refcount_destroy(&remdev->l2ad_alloc); - kmem_free(remdev, sizeof (l2arc_dev_t)); -} - -void -l2arc_init(void) -{ - l2arc_thread_exit = 0; - l2arc_ndev = 0; - l2arc_writes_sent = 0; - l2arc_writes_done = 0; - - mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); - - l2arc_dev_list = &L2ARC_dev_list; - l2arc_free_on_write = &L2ARC_free_on_write; - list_create(l2arc_dev_list, sizeof (l2arc_dev_t), - offsetof(l2arc_dev_t, l2ad_node)); - list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), - offsetof(l2arc_data_free_t, l2df_list_node)); -} - -void -l2arc_fini(void) -{ - /* - * This is called from dmu_fini(), which is called from spa_fini(); - * Because of this, we can assume that all l2arc devices have - * already been removed when the pools themselves were removed. 
- */ - - l2arc_do_free_on_write(); - - mutex_destroy(&l2arc_feed_thr_lock); - cv_destroy(&l2arc_feed_thr_cv); - mutex_destroy(&l2arc_dev_mtx); - mutex_destroy(&l2arc_free_on_write_mtx); - - list_destroy(l2arc_dev_list); - list_destroy(l2arc_free_on_write); -} - -void -l2arc_start(void) -{ - if (!(spa_mode_global & FWRITE)) - return; - - (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); -} - -void -l2arc_stop(void) -{ - if (!(spa_mode_global & FWRITE)) - return; - - mutex_enter(&l2arc_feed_thr_lock); - cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ - l2arc_thread_exit = 1; - while (l2arc_thread_exit != 0) - cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); - mutex_exit(&l2arc_feed_thr_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c deleted file mode 100644 index d7a7fdb0e1b1..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include - -/* - * Embedded-data Block Pointers - * - * Normally, block pointers point (via their DVAs) to a block which holds data. - * If the data that we need to store is very small, this is an inefficient - * use of space, because a block must be at minimum 1 sector (typically 512 - * bytes or 4KB). Additionally, reading these small blocks tends to generate - * more random reads. 
- * - * Embedded-data Block Pointers allow small pieces of data (the "payload", - * up to 112 bytes) to be stored in the block pointer itself, instead of - * being pointed to. The "Pointer" part of this name is a bit of a - * misnomer, as nothing is pointed to. - * - * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to - * be embedded in the block pointer. The logic for this is handled in - * the SPA, by the zio pipeline. Therefore most code outside the zio - * pipeline doesn't need special-cases to handle these block pointers. - * - * See spa.h for details on the exact layout of embedded block pointers. - */ - -void -encode_embedded_bp_compressed(blkptr_t *bp, void *data, - enum zio_compress comp, int uncompressed_size, int compressed_size) -{ - uint64_t *bp64 = (uint64_t *)bp; - uint64_t w = 0; - uint8_t *data8 = data; - - ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); - ASSERT(uncompressed_size == compressed_size || - comp != ZIO_COMPRESS_OFF); - ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); - ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - - bzero(bp, sizeof (*bp)); - BP_SET_EMBEDDED(bp, B_TRUE); - BP_SET_COMPRESS(bp, comp); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - BPE_SET_LSIZE(bp, uncompressed_size); - BPE_SET_PSIZE(bp, compressed_size); - - /* - * Encode the byte array into the words of the block pointer. - * First byte goes into low bits of first word (little endian). - */ - for (int i = 0; i < compressed_size; i++) { - BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); - if (i % sizeof (w) == sizeof (w) - 1) { - /* we've reached the end of a word */ - ASSERT3P(bp64, <, bp + 1); - *bp64 = w; - bp64++; - if (!BPE_IS_PAYLOADWORD(bp, bp64)) - bp64++; - w = 0; - } - } - /* write last partial word */ - if (bp64 < (uint64_t *)(bp + 1)) - *bp64 = w; -} - -/* - * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be - * more than BPE_PAYLOAD_SIZE bytes). 
- */ -void -decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) -{ - int psize; - uint8_t *buf8 = buf; - uint64_t w = 0; - const uint64_t *bp64 = (const uint64_t *)bp; - - ASSERT(BP_IS_EMBEDDED(bp)); - - psize = BPE_GET_PSIZE(bp); - - /* - * Decode the words of the block pointer into the byte array. - * Low bits of first word are the first byte (little endian). - */ - for (int i = 0; i < psize; i++) { - if (i % sizeof (w) == 0) { - /* beginning of a word */ - ASSERT3P(bp64, <, bp + 1); - w = *bp64; - bp64++; - if (!BPE_IS_PAYLOADWORD(bp, bp64)) - bp64++; - } - buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); - } -} - -/* - * Fill in the buffer with the (decompressed) payload of the embedded - * blkptr_t. Takes into account compression and byteorder (the payload is - * treated as a stream of bytes). - * Return 0 on success, or ENOSPC if it won't fit in the buffer. - */ -int -decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) -{ - int lsize, psize; - - ASSERT(BP_IS_EMBEDDED(bp)); - - lsize = BPE_GET_LSIZE(bp); - psize = BPE_GET_PSIZE(bp); - - if (lsize > buflen) - return (ENOSPC); - ASSERT3U(lsize, ==, buflen); - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint8_t dstbuf[BPE_PAYLOAD_SIZE]; - decode_embedded_bp_compressed(bp, dstbuf); - VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), - dstbuf, buf, psize, buflen)); - } else { - ASSERT3U(lsize, ==, psize); - decode_embedded_bp_compressed(bp, buf); - } - - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c deleted file mode 100644 index ee12db3a266d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#include - - -void -bplist_create(bplist_t *bpl) -{ - mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&bpl->bpl_list, sizeof (bplist_entry_t), - offsetof(bplist_entry_t, bpe_node)); -} - -void -bplist_destroy(bplist_t *bpl) -{ - list_destroy(&bpl->bpl_list); - mutex_destroy(&bpl->bpl_lock); -} - -void -bplist_append(bplist_t *bpl, const blkptr_t *bp) -{ - bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); - - mutex_enter(&bpl->bpl_lock); - bpe->bpe_blk = *bp; - list_insert_tail(&bpl->bpl_list, bpe); - mutex_exit(&bpl->bpl_lock); -} - -/* - * To aid debugging, we keep the most recently removed entry. This way if - * we are in the callback, we can easily locate the entry. 
- */ -static bplist_entry_t *bplist_iterate_last_removed; - -void -bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) -{ - bplist_entry_t *bpe; - - mutex_enter(&bpl->bpl_lock); - while (bpe = list_head(&bpl->bpl_list)) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); - mutex_exit(&bpl->bpl_lock); - func(arg, &bpe->bpe_blk, tx); - kmem_free(bpe, sizeof (*bpe)); - mutex_enter(&bpl->bpl_lock); - } - mutex_exit(&bpl->bpl_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c deleted file mode 100644 index bbdd765214fc..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c +++ /dev/null @@ -1,606 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 
- */ -uint64_t -bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - dsl_pool_t *dp = dmu_objset_pool(os); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { - if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { - ASSERT0(dp->dp_empty_bpobj); - dp->dp_empty_bpobj = - bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY(zap_add(os, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, - &dp->dp_empty_bpobj, tx) == 0); - } - spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); - ASSERT(dp->dp_empty_bpobj != 0); - return (dp->dp_empty_bpobj); - } else { - return (bpobj_alloc(os, blocksize, tx)); - } -} - -void -bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_objset_pool(os); - - spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); - if (!spa_feature_is_active(dmu_objset_spa(os), - SPA_FEATURE_EMPTY_BPOBJ)) { - VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, tx)); - VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); - dp->dp_empty_bpobj = 0; - } -} - -uint64_t -bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - int size; - - if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) - size = BPOBJ_SIZE_V0; - else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) - size = BPOBJ_SIZE_V1; - else - size = sizeof (bpobj_phys_t); - - return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, - DMU_OT_BPOBJ_HDR, size, tx)); -} - -void -bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) -{ - int64_t i; - bpobj_t bpo; - dmu_object_info_t doi; - int epb; - dmu_buf_t *dbuf = NULL; - - ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); - VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); - - mutex_enter(&bpo.bpo_lock); - - if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) - goto out; - - VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); - epb = 
doi.doi_data_block_size / sizeof (uint64_t); - - for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - VERIFY3U(0, ==, dmu_buf_hold(os, - bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - objarray = dbuf->db_data; - bpobj_free(os, objarray[blkoff], tx); - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); - -out: - mutex_exit(&bpo.bpo_lock); - bpobj_close(&bpo); - - VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); -} - -int -bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) -{ - dmu_object_info_t doi; - int err; - - err = dmu_object_info(os, object, &doi); - if (err) - return (err); - - bzero(bpo, sizeof (*bpo)); - mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); - - ASSERT(bpo->bpo_dbuf == NULL); - ASSERT(bpo->bpo_phys == NULL); - ASSERT(object != 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); - - err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); - if (err) - return (err); - - bpo->bpo_os = os; - bpo->bpo_object = object; - bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; - bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); - bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); - bpo->bpo_phys = bpo->bpo_dbuf->db_data; - return (0); -} - -boolean_t -bpobj_is_open(const bpobj_t *bpo) -{ - return (bpo->bpo_object != 0); -} - -void -bpobj_close(bpobj_t *bpo) -{ - /* Lame workaround for closing a bpobj that was never opened. 
*/ - if (bpo->bpo_object == 0) - return; - - dmu_buf_rele(bpo->bpo_dbuf, bpo); - if (bpo->bpo_cached_dbuf != NULL) - dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); - bpo->bpo_dbuf = NULL; - bpo->bpo_phys = NULL; - bpo->bpo_cached_dbuf = NULL; - bpo->bpo_object = 0; - - mutex_destroy(&bpo->bpo_lock); -} - -boolean_t -bpobj_is_empty(bpobj_t *bpo) -{ - return (bpo->bpo_phys->bpo_num_blkptrs == 0 && - (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); -} - -static int -bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, - boolean_t free) -{ - dmu_object_info_t doi; - int epb; - int64_t i; - int err = 0; - dmu_buf_t *dbuf = NULL; - - ASSERT(bpobj_is_open(bpo)); - mutex_enter(&bpo->bpo_lock); - - if (free) - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - - for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { - blkptr_t *bparray; - blkptr_t *bp; - uint64_t offset, blkoff; - - offset = i * sizeof (blkptr_t); - blkoff = P2PHASE(i, bpo->bpo_epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, - FTAG, &dbuf, 0); - if (err) - break; - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - bparray = dbuf->db_data; - bp = &bparray[blkoff]; - err = func(arg, bp, tx); - if (err) - break; - if (free) { - bpo->bpo_phys->bpo_bytes -= - bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); - } - bpo->bpo_phys->bpo_num_blkptrs--; - ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); - } - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, - (i + 1) * sizeof (blkptr_t), -1ULL, tx)); - } - if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) 
- goto out; - - ASSERT(bpo->bpo_havecomp); - err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); - if (err) { - mutex_exit(&bpo->bpo_lock); - return (err); - } - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - epb = doi.doi_data_block_size / sizeof (uint64_t); - - for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - bpobj_t sublist; - uint64_t used_before, comp_before, uncomp_before; - uint64_t used_after, comp_after, uncomp_after; - - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); - if (err) - break; - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - objarray = dbuf->db_data; - err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); - if (err) - break; - if (free) { - err = bpobj_space(&sublist, - &used_before, &comp_before, &uncomp_before); - if (err != 0) { - bpobj_close(&sublist); - break; - } - } - err = bpobj_iterate_impl(&sublist, func, arg, tx, free); - if (free) { - VERIFY3U(0, ==, bpobj_space(&sublist, - &used_after, &comp_after, &uncomp_after)); - bpo->bpo_phys->bpo_bytes -= used_before - used_after; - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - bpo->bpo_phys->bpo_comp -= comp_before - comp_after; - bpo->bpo_phys->bpo_uncomp -= - uncomp_before - uncomp_after; - } - - bpobj_close(&sublist); - if (err) - break; - if (free) { - err = dmu_object_free(bpo->bpo_os, - objarray[blkoff], tx); - if (err) - break; - bpo->bpo_phys->bpo_num_subobjs--; - ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); - } - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, - (i + 1) * sizeof (uint64_t), -1ULL, tx)); - } - -out: - /* If there are no 
entries, there should be no bytes. */ - if (bpobj_is_empty(bpo)) { - ASSERT0(bpo->bpo_phys->bpo_bytes); - ASSERT0(bpo->bpo_phys->bpo_comp); - ASSERT0(bpo->bpo_phys->bpo_uncomp); - } - - mutex_exit(&bpo->bpo_lock); - return (err); -} - -/* - * Iterate and remove the entries. If func returns nonzero, iteration - * will stop and that entry will not be removed. - */ -int -bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) -{ - return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); -} - -/* - * Iterate the entries. If func returns nonzero, iteration will stop. - */ -int -bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) -{ - return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); -} - -void -bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) -{ - bpobj_t subbpo; - uint64_t used, comp, uncomp, subsubobjs; - - ASSERT(bpobj_is_open(bpo)); - ASSERT(subobj != 0); - ASSERT(bpo->bpo_havesubobj); - ASSERT(bpo->bpo_havecomp); - ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); - - if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { - bpobj_decr_empty(bpo->bpo_os, tx); - return; - } - - VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - - if (bpobj_is_empty(&subbpo)) { - /* No point in having an empty subobj. 
*/ - bpobj_close(&subbpo); - bpobj_free(bpo->bpo_os, subobj, tx); - return; - } - - mutex_enter(&bpo->bpo_lock); - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - if (bpo->bpo_phys->bpo_subobjs == 0) { - bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, - DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, - DMU_OT_NONE, 0, tx); - } - - dmu_object_info_t doi; - ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx); - bpo->bpo_phys->bpo_num_subobjs++; - - /* - * If subobj has only one block of subobjs, then move subobj's - * subobjs to bpo's subobj list directly. This reduces - * recursion in bpobj_iterate due to nested subobjs. - */ - subsubobjs = subbpo.bpo_phys->bpo_subobjs; - if (subsubobjs != 0) { - dmu_object_info_t doi; - - VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); - if (doi.doi_max_offset == doi.doi_data_block_size) { - dmu_buf_t *subdb; - uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; - - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, - 0, FTAG, &subdb, 0)); - /* - * Make sure that we are not asking dmu_write() - * to write more data than we have in our buffer. 
- */ - VERIFY3U(subdb->db_size, >=, - numsubsub * sizeof (subobj)); - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); - dmu_buf_rele(subdb, FTAG); - bpo->bpo_phys->bpo_num_subobjs += numsubsub; - - dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); - subbpo.bpo_phys->bpo_subobjs = 0; - VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, - subsubobjs, tx)); - } - } - bpo->bpo_phys->bpo_bytes += used; - bpo->bpo_phys->bpo_comp += comp; - bpo->bpo_phys->bpo_uncomp += uncomp; - mutex_exit(&bpo->bpo_lock); - - bpobj_close(&subbpo); -} - -void -bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) -{ - blkptr_t stored_bp = *bp; - uint64_t offset; - int blkoff; - blkptr_t *bparray; - - ASSERT(bpobj_is_open(bpo)); - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); - - if (BP_IS_EMBEDDED(bp)) { - /* - * The bpobj will compress better without the payload. - * - * Note that we store EMBEDDED bp's because they have an - * uncompressed size, which must be accounted for. An - * alternative would be to add their size to bpo_uncomp - * without storing the bp, but that would create additional - * complications: bpo_uncomp would be inconsistent with the - * set of BP's stored, and bpobj_iterate() wouldn't visit - * all the space accounted for in the bpobj. - */ - bzero(&stored_bp, sizeof (stored_bp)); - stored_bp.blk_prop = bp->blk_prop; - stored_bp.blk_birth = bp->blk_birth; - } else if (!BP_GET_DEDUP(bp)) { - /* The bpobj will compress better without the checksum */ - bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); - } - - /* We never need the fill count. 
*/ - stored_bp.blk_fill = 0; - - mutex_enter(&bpo->bpo_lock); - - offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); - blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); - - if (bpo->bpo_cached_dbuf == NULL || - offset < bpo->bpo_cached_dbuf->db_offset || - offset >= bpo->bpo_cached_dbuf->db_offset + - bpo->bpo_cached_dbuf->db_size) { - if (bpo->bpo_cached_dbuf) - dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, bpo, &bpo->bpo_cached_dbuf, 0)); - } - - dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); - bparray = bpo->bpo_cached_dbuf->db_data; - bparray[blkoff] = stored_bp; - - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - bpo->bpo_phys->bpo_num_blkptrs++; - bpo->bpo_phys->bpo_bytes += - bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); - if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); - } - mutex_exit(&bpo->bpo_lock); -} - -struct space_range_arg { - spa_t *spa; - uint64_t mintxg; - uint64_t maxtxg; - uint64_t used; - uint64_t comp; - uint64_t uncomp; -}; - -/* ARGSUSED */ -static int -space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - struct space_range_arg *sra = arg; - - if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { - if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) - sra->used += bp_get_dsize_sync(sra->spa, bp); - else - sra->used += bp_get_dsize(sra->spa, bp); - sra->comp += BP_GET_PSIZE(bp); - sra->uncomp += BP_GET_UCSIZE(bp); - } - return (0); -} - -int -bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - ASSERT(bpobj_is_open(bpo)); - mutex_enter(&bpo->bpo_lock); - - *usedp = bpo->bpo_phys->bpo_bytes; - if (bpo->bpo_havecomp) { - *compp = bpo->bpo_phys->bpo_comp; - *uncompp = bpo->bpo_phys->bpo_uncomp; - mutex_exit(&bpo->bpo_lock); - return (0); - } else { - mutex_exit(&bpo->bpo_lock); - return (bpobj_space_range(bpo, 0, UINT64_MAX, - 
usedp, compp, uncompp)); - } -} - -/* - * Return the amount of space in the bpobj which is: - * mintxg < blk_birth <= maxtxg - */ -int -bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - struct space_range_arg sra = { 0 }; - int err; - - ASSERT(bpobj_is_open(bpo)); - - /* - * As an optimization, if they want the whole txg range, just - * get bpo_bytes rather than iterating over the bps. - */ - if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) - return (bpobj_space(bpo, usedp, compp, uncompp)); - - sra.spa = dmu_objset_spa(bpo->bpo_os); - sra.mintxg = mintxg; - sra.maxtxg = maxtxg; - - err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); - *usedp = sra.used; - *compp = sra.comp; - *uncompp = sra.uncomp; - return (err); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c deleted file mode 100644 index c74d07236c1b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A bptree is a queue of root block pointers from destroyed datasets. When a - * dataset is destroyed its root block pointer is put on the end of the pool's - * bptree queue so the dataset's blocks can be freed asynchronously by - * dsl_scan_sync. This allows the delete operation to finish without traversing - * all the dataset's blocks. - * - * Note that while bt_begin and bt_end are only ever incremented in this code, - * they are effectively reset to 0 every time the entire bptree is freed because - * the bptree's object is destroyed and re-created. - */ - -struct bptree_args { - bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ - boolean_t ba_free; /* true if freeing during traversal */ - - bptree_itor_t *ba_func; /* function to call for each blockpointer */ - void *ba_arg; /* caller supplied argument to ba_func */ - dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ -} bptree_args_t; - -uint64_t -bptree_alloc(objset_t *os, dmu_tx_t *tx) -{ - uint64_t obj; - dmu_buf_t *db; - bptree_phys_t *bt; - - obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, - SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, - sizeof (bptree_phys_t), tx); - - /* - * Bonus buffer contents are already initialized to 0, but for - * readability we make it explicit. 
- */ - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - bt = db->db_data; - bt->bt_begin = 0; - bt->bt_end = 0; - bt->bt_bytes = 0; - bt->bt_comp = 0; - bt->bt_uncomp = 0; - dmu_buf_rele(db, FTAG); - - return (obj); -} - -int -bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - ASSERT3U(bt->bt_begin, ==, bt->bt_end); - ASSERT0(bt->bt_bytes); - ASSERT0(bt->bt_comp); - ASSERT0(bt->bt_uncomp); - dmu_buf_rele(db, FTAG); - - return (dmu_object_free(os, obj, tx)); -} - -boolean_t -bptree_is_empty(objset_t *os, uint64_t obj) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - boolean_t rv; - - VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - rv = (bt->bt_begin == bt->bt_end); - dmu_buf_rele(db, FTAG); - return (rv); -} - -void -bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, - uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - bptree_entry_phys_t bte = { 0 }; - - /* - * bptree objects are in the pool mos, therefore they can only be - * modified in syncing context. Furthermore, this is only modified - * by the sync thread, so no locking is necessary. 
- */ - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - - bte.be_birth_txg = birth_txg; - bte.be_bp = *bp; - dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); - - dmu_buf_will_dirty(db, tx); - bt->bt_end++; - bt->bt_bytes += bytes; - bt->bt_comp += comp; - bt->bt_uncomp += uncomp; - dmu_buf_rele(db, FTAG); -} - -/* ARGSUSED */ -static int -bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - int err; - struct bptree_args *ba = arg; - - if (bp == NULL || BP_IS_HOLE(bp)) - return (0); - - err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); - if (err == 0 && ba->ba_free) { - ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); - ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); - ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); - } - return (err); -} - -/* - * If "free" is set: - * - It is assumed that "func" will be freeing the block pointers. - * - If "func" returns nonzero, the bookmark will be remembered and - * iteration will be restarted from this point on next invocation. - * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), - * bptree_iterate will remember the bookmark, continue traversing - * any additional entries, and return 0. - * - * If "free" is not set, traversal will stop and return an error if - * an i/o error is encountered. - * - * In either case, if zfs_free_leak_on_eio is set, i/o errors will be - * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to - * traverse_dataset_destroyed()). 
- */ -int -bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - void *arg, dmu_tx_t *tx) -{ - boolean_t ioerr = B_FALSE; - int err; - uint64_t i; - dmu_buf_t *db; - struct bptree_args ba; - - ASSERT(!free || dmu_tx_is_syncing(tx)); - - err = dmu_bonus_hold(os, obj, FTAG, &db); - if (err != 0) - return (err); - - if (free) - dmu_buf_will_dirty(db, tx); - - ba.ba_phys = db->db_data; - ba.ba_free = free; - ba.ba_func = func; - ba.ba_arg = arg; - ba.ba_tx = tx; - - err = 0; - for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { - bptree_entry_phys_t bte; - int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; - - err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), - &bte, DMU_READ_NO_PREFETCH); - if (err != 0) - break; - - if (zfs_free_leak_on_eio) - flags |= TRAVERSE_HARD; - zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " - "bookmark %lld/%lld/%lld/%lld", - (longlong_t)i, - (longlong_t)bte.be_birth_txg, - (longlong_t)bte.be_zb.zb_objset, - (longlong_t)bte.be_zb.zb_object, - (longlong_t)bte.be_zb.zb_level, - (longlong_t)bte.be_zb.zb_blkid); - err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, - bte.be_birth_txg, &bte.be_zb, flags, - bptree_visit_cb, &ba); - if (free) { - /* - * The callback has freed the visited block pointers. - * Record our traversal progress on disk, either by - * updating this record's bookmark, or by logically - * removing this record by advancing bt_begin. - */ - if (err != 0) { - /* save bookmark for future resume */ - ASSERT3U(bte.be_zb.zb_objset, ==, - ZB_DESTROYED_OBJSET); - ASSERT0(bte.be_zb.zb_level); - dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); - if (err == EIO || err == ECKSUM || - err == ENXIO) { - /* - * Skip the rest of this tree and - * continue on to the next entry. 
- */ - err = 0; - ioerr = B_TRUE; - } else { - break; - } - } else if (ioerr) { - /* - * This entry is finished, but there were - * i/o errors on previous entries, so we - * can't adjust bt_begin. Set this entry's - * be_birth_txg such that it will be - * treated as a no-op in future traversals. - */ - bte.be_birth_txg = UINT64_MAX; - dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); - } - - if (!ioerr) { - ba.ba_phys->bt_begin++; - (void) dmu_free_range(os, obj, - i * sizeof (bte), sizeof (bte), tx); - } - } else if (err != 0) { - break; - } - } - - ASSERT(!free || err != 0 || ioerr || - ba.ba_phys->bt_begin == ba.ba_phys->bt_end); - - /* if all blocks are free there should be no used space */ - if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { - if (zfs_free_leak_on_eio) { - ba.ba_phys->bt_bytes = 0; - ba.ba_phys->bt_comp = 0; - ba.ba_phys->bt_uncomp = 0; - } - - ASSERT0(ba.ba_phys->bt_bytes); - ASSERT0(ba.ba_phys->bt_comp); - ASSERT0(ba.ba_phys->bt_uncomp); - } - - dmu_buf_rele(db, FTAG); - - return (err); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c deleted file mode 100644 index 1ddc697b5424..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Delphix. All rights reserved. 
- */ - -#include -#include - -static inline bqueue_node_t * -obj2node(bqueue_t *q, void *data) -{ - return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); -} - -/* - * Initialize a blocking queue The maximum capacity of the queue is set to - * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, - * and offset should give its offset from the start of the struct. Return 0 on - * success, or -1 on failure. - */ -int -bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) -{ - list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), - node_offset + offsetof(bqueue_node_t, bqn_node)); - cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); - cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); - q->bq_node_offset = node_offset; - q->bq_size = 0; - q->bq_maxsize = size; - return (0); -} - -/* - * Destroy a blocking queue. This function asserts that there are no - * elements in the queue, and no one is blocked on the condition - * variables. - */ -void -bqueue_destroy(bqueue_t *q) -{ - ASSERT0(q->bq_size); - cv_destroy(&q->bq_add_cv); - cv_destroy(&q->bq_pop_cv); - mutex_destroy(&q->bq_lock); - list_destroy(&q->bq_list); -} - -/* - * Add data to q, consuming size units of capacity. If there is insufficient - * capacity to consume size units, block until capacity exists. Asserts size is - * > 0. - */ -void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) -{ - ASSERT3U(item_size, >, 0); - ASSERT3U(item_size, <, q->bq_maxsize); - mutex_enter(&q->bq_lock); - obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { - cv_wait(&q->bq_add_cv, &q->bq_lock); - } - q->bq_size += item_size; - list_insert_tail(&q->bq_list, data); - cv_signal(&q->bq_pop_cv); - mutex_exit(&q->bq_lock); -} -/* - * Take the first element off of q. If there are no elements on the queue, wait - * until one is put there. Return the removed element. 
- */ -void * -bqueue_dequeue(bqueue_t *q) -{ - void *ret; - uint64_t item_size; - mutex_enter(&q->bq_lock); - while (q->bq_size == 0) { - cv_wait(&q->bq_pop_cv, &q->bq_lock); - } - ret = list_remove_head(&q->bq_list); - item_size = obj2node(q, ret)->bqn_size; - q->bq_size -= item_size; - mutex_exit(&q->bq_lock); - cv_signal(&q->bq_add_cv); - return (ret); -} - -/* - * Returns true if the space used is 0. - */ -boolean_t -bqueue_empty(bqueue_t *q) -{ - return (q->bq_size == 0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c deleted file mode 100644 index 2b62edad0342..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2011 Google, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. 
- */ - -#include - -#define HASH_K1 0xb492b66fbe98f273ULL -#define HASH_K2 0x9ae16a3b2f90404fULL - -/* - * Bitwise right rotate. Normally this will compile to a single - * instruction. - */ -static inline uint64_t -rotate(uint64_t val, int shift) -{ - // Avoid shifting by 64: doing so yields an undefined result. - return (shift == 0 ? val : (val >> shift) | (val << (64 - shift))); -} - -static inline uint64_t -cityhash_helper(uint64_t u, uint64_t v, uint64_t mul) -{ - uint64_t a = (u ^ v) * mul; - a ^= (a >> 47); - uint64_t b = (v ^ a) * mul; - b ^= (b >> 47); - b *= mul; - return (b); -} - -uint64_t -cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) -{ - uint64_t mul = HASH_K2 + 64; - uint64_t a = w1 * HASH_K1; - uint64_t b = w2; - uint64_t c = w4 * mul; - uint64_t d = w3 * HASH_K2; - return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d, - a + rotate(b + HASH_K2, 18) + c, mul)); - -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c deleted file mode 100644 index 1974ff2197c2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ /dev/null @@ -1,4248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -kstat_t *dbuf_ksp; - -typedef struct dbuf_stats { - /* - * Various statistics about the size of the dbuf cache. - */ - kstat_named_t cache_count; - kstat_named_t cache_size_bytes; - kstat_named_t cache_size_bytes_max; - /* - * Statistics regarding the bounds on the dbuf cache size. - */ - kstat_named_t cache_target_bytes; - kstat_named_t cache_lowater_bytes; - kstat_named_t cache_hiwater_bytes; - /* - * Total number of dbuf cache evictions that have occurred. - */ - kstat_named_t cache_total_evicts; - /* - * The distribution of dbuf levels in the dbuf cache and - * the total size of all dbufs at each level. - */ - kstat_named_t cache_levels[DN_MAX_LEVELS]; - kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; - /* - * Statistics about the dbuf hash table. - */ - kstat_named_t hash_hits; - kstat_named_t hash_misses; - kstat_named_t hash_collisions; - kstat_named_t hash_elements; - kstat_named_t hash_elements_max; - /* - * Number of sublists containing more than one dbuf in the dbuf - * hash table. Keep track of the longest hash chain. 
- */ - kstat_named_t hash_chains; - kstat_named_t hash_chain_max; - /* - * Number of times a dbuf_create() discovers that a dbuf was - * already created and in the dbuf hash table. - */ - kstat_named_t hash_insert_race; - /* - * Statistics about the size of the metadata dbuf cache. - */ - kstat_named_t metadata_cache_count; - kstat_named_t metadata_cache_size_bytes; - kstat_named_t metadata_cache_size_bytes_max; - /* - * For diagnostic purposes, this is incremented whenever we can't add - * something to the metadata cache because it's full, and instead put - * the data in the regular dbuf cache. - */ - kstat_named_t metadata_cache_overflow; -} dbuf_stats_t; - -dbuf_stats_t dbuf_stats = { - { "cache_count", KSTAT_DATA_UINT64 }, - { "cache_size_bytes", KSTAT_DATA_UINT64 }, - { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, - { "cache_target_bytes", KSTAT_DATA_UINT64 }, - { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, - { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, - { "cache_total_evicts", KSTAT_DATA_UINT64 }, - { { "cache_levels_N", KSTAT_DATA_UINT64 } }, - { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, - { "hash_hits", KSTAT_DATA_UINT64 }, - { "hash_misses", KSTAT_DATA_UINT64 }, - { "hash_collisions", KSTAT_DATA_UINT64 }, - { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, - { "hash_chains", KSTAT_DATA_UINT64 }, - { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "hash_insert_race", KSTAT_DATA_UINT64 }, - { "metadata_cache_count", KSTAT_DATA_UINT64 }, - { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, - { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, - { "metadata_cache_overflow", KSTAT_DATA_UINT64 } -}; - -#define DBUF_STAT_INCR(stat, val) \ - atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); -#define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); -#define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); -#define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); -#define DBUF_STAT_MAX(stat, v) { \ - 
uint64_t _m; \ - while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ - (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ - continue; \ -} - -struct dbuf_hold_impl_data { - /* Function arguments */ - dnode_t *dh_dn; - uint8_t dh_level; - uint64_t dh_blkid; - boolean_t dh_fail_sparse; - boolean_t dh_fail_uncached; - void *dh_tag; - dmu_buf_impl_t **dh_dbp; - /* Local variables */ - dmu_buf_impl_t *dh_db; - dmu_buf_impl_t *dh_parent; - blkptr_t *dh_bp; - int dh_err; - dbuf_dirty_record_t *dh_dr; - int dh_depth; -}; - -static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, - boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp, int depth); -static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); - -static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); - -/* - * Global data structures and functions for the dbuf cache. - */ -static kmem_cache_t *dbuf_kmem_cache; -static taskq_t *dbu_evict_taskq; - -static kthread_t *dbuf_cache_evict_thread; -static kmutex_t dbuf_evict_lock; -static kcondvar_t dbuf_evict_cv; -static boolean_t dbuf_evict_thread_exit; - -/* - * There are two dbuf caches; each dbuf can only be in one of them at a time. - * - * 1. Cache of metadata dbufs, to help make read-heavy administrative commands - * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs - * that represent the metadata that describes filesystems/snapshots/ - * bookmarks/properties/etc. We only evict from this cache when we export a - * pool, to short-circuit as much I/O as possible for all administrative - * commands that need the metadata. There is no eviction policy for this - * cache, because we try to only include types in it which would occupy a - * very small amount of space per object but create a large impact on the - * performance of these commands. 
Instead, after it reaches a maximum size - * (which should only happen on very small memory systems with a very large - * number of filesystem objects), we stop taking new dbufs into the - * metadata cache, instead putting them in the normal dbuf cache. - * - * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that - * are not currently held but have been recently released. These dbufs - * are not eligible for arc eviction until they are aged out of the cache. - * Dbufs that are aged out of the cache will be immediately destroyed and - * become eligible for arc eviction. - * - * Dbufs are added to these caches once the last hold is released. If a dbuf is - * later accessed and still exists in the dbuf cache, then it will be removed - * from the cache and later re-added to the head of the cache. - * - * If a given dbuf meets the requirements for the metadata cache, it will go - * there, otherwise it will be considered for the generic LRU dbuf cache. The - * caches and the refcounts tracking their sizes are stored in an array indexed - * by those caches' matching enum values (from dbuf_cached_state_t). - */ -typedef struct dbuf_cache { - multilist_t *cache; - zfs_refcount_t size; -} dbuf_cache_t; -dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; - -/* Size limits for the caches */ -uint64_t dbuf_cache_max_bytes = 0; -uint64_t dbuf_metadata_cache_max_bytes = 0; -/* Set the default sizes of the caches to log2 fraction of arc size */ -int dbuf_cache_shift = 5; -int dbuf_metadata_cache_shift = 6; - -/* - * For diagnostic purposes, this is incremented whenever we can't add - * something to the metadata cache because it's full, and instead put - * the data in the regular dbuf cache. - */ -uint64_t dbuf_metadata_cache_overflow; - -/* - * The LRU dbuf cache uses a three-stage eviction policy: - * - A low water marker designates when the dbuf eviction thread - * should stop evicting from the dbuf cache. 
- * - When we reach the maximum size (aka mid water mark), we - * signal the eviction thread to run. - * - The high water mark indicates when the eviction thread - * is unable to keep up with the incoming load and eviction must - * happen in the context of the calling thread. - * - * The dbuf cache: - * (max size) - * low water mid water hi water - * +----------------------------------------+----------+----------+ - * | | | | - * | | | | - * | | | | - * | | | | - * +----------------------------------------+----------+----------+ - * stop signal evict - * evicting eviction directly - * thread - * - * The high and low water marks indicate the operating range for the eviction - * thread. The low water mark is, by default, 90% of the total size of the - * cache and the high water mark is at 110% (both of these percentages can be - * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, - * respectively). The eviction thread will try to ensure that the cache remains - * within this range by waking up every second and checking if the cache is - * above the low water mark. The thread can also be woken up by callers adding - * elements into the cache if the cache is larger than the mid water (i.e max - * cache size). Once the eviction thread is woken up and eviction is required, - * it will continue evicting buffers until it's able to reduce the cache size - * to the low water mark. If the cache size continues to grow and hits the high - * water mark, then callers adding elments to the cache will begin to evict - * directly from the cache until the cache is no longer above the high water - * mark. - */ - -/* - * The percentage above and below the maximum cache size. 
- */ -uint_t dbuf_cache_hiwater_pct = 10; -uint_t dbuf_cache_lowater_pct = 10; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, - &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN, - &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, - &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN, - &dbuf_metadata_cache_shift, 0, - "dbuf metadata cache size as log2 fraction of ARC"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD, - &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, - &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, - &dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size"); - -/* ARGSUSED */ -static int -dbuf_cons(void *vdb, void *unused, int kmflag) -{ - dmu_buf_impl_t *db = vdb; - bzero(db, sizeof (dmu_buf_impl_t)); - - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); - multilist_link_init(&db->db_cache_link); - zfs_refcount_create(&db->db_holds); - - return (0); -} - -/* ARGSUSED */ -static void -dbuf_dest(void *vdb, void *unused) -{ - dmu_buf_impl_t *db = vdb; - mutex_destroy(&db->db_mtx); - cv_destroy(&db->db_changed); - ASSERT(!multilist_link_active(&db->db_cache_link)); - zfs_refcount_destroy(&db->db_holds); -} - -/* - * dbuf hash table routines - */ -static dbuf_hash_table_t dbuf_hash_table; - -static uint64_t dbuf_hash_count; - -/* - * We use Cityhash for this. It's fast, and has good hash properties without - * requiring any large static buffers. 
- */ -static uint64_t -dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) -{ - return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); -} - -#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ - ((dbuf)->db.db_object == (obj) && \ - (dbuf)->db_objset == (os) && \ - (dbuf)->db_level == (level) && \ - (dbuf)->db_blkid == (blkid)) - -dmu_buf_impl_t * -dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = dbuf_hash(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *db; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { - if (DBUF_EQUAL(db, os, obj, level, blkid)) { - mutex_enter(&db->db_mtx); - if (db->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (db); - } - mutex_exit(&db->db_mtx); - } - } - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (NULL); -} - -static dmu_buf_impl_t * -dbuf_find_bonus(objset_t *os, uint64_t object) -{ - dnode_t *dn; - dmu_buf_impl_t *db = NULL; - - if (dnode_hold(os, object, FTAG, &dn) == 0) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_bonus != NULL) { - db = dn->dn_bonus; - mutex_enter(&db->db_mtx); - } - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - } - return (db); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. 
- */ -static dmu_buf_impl_t * -dbuf_hash_insert(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - objset_t *os = db->db_objset; - uint64_t obj = db->db.db_object; - int level = db->db_level; - uint64_t blkid, hv, idx; - dmu_buf_impl_t *dbf; - uint32_t i; - - blkid = db->db_blkid; - hv = dbuf_hash(os, obj, level, blkid); - idx = hv & h->hash_table_mask; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (dbf = h->hash_table[idx], i = 0; dbf != NULL; - dbf = dbf->db_hash_next, i++) { - if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { - mutex_enter(&dbf->db_mtx); - if (dbf->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (dbf); - } - mutex_exit(&dbf->db_mtx); - } - } - - if (i > 0) { - DBUF_STAT_BUMP(hash_collisions); - if (i == 1) - DBUF_STAT_BUMP(hash_chains); - - DBUF_STAT_MAX(hash_chain_max, i); - } - - mutex_enter(&db->db_mtx); - db->db_hash_next = h->hash_table[idx]; - h->hash_table[idx] = db; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_inc_64(&dbuf_hash_count); - DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); - - return (NULL); -} - -/* - * Remove an entry from the hash table. It must be in the EVICTING state. - */ -static void -dbuf_hash_remove(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv, idx; - dmu_buf_impl_t *dbf, **dbp; - - hv = dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid); - idx = hv & h->hash_table_mask; - - /* - * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_MUTEX > db_mtx. 
- */ - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_state == DB_EVICTING); - ASSERT(!MUTEX_HELD(&db->db_mtx)); - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - dbp = &h->hash_table[idx]; - while ((dbf = *dbp) != db) { - dbp = &dbf->db_hash_next; - ASSERT(dbf != NULL); - } - *dbp = db->db_hash_next; - db->db_hash_next = NULL; - if (h->hash_table[idx] && - h->hash_table[idx]->db_hash_next == NULL) - DBUF_STAT_BUMPDOWN(hash_chains); - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_dec_64(&dbuf_hash_count); -} - -typedef enum { - DBVU_EVICTING, - DBVU_NOT_EVICTING -} dbvu_verify_type_t; - -static void -dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) -{ -#ifdef ZFS_DEBUG - int64_t holds; - - if (db->db_user == NULL) - return; - - /* Only data blocks support the attachment of user data. */ - ASSERT(db->db_level == 0); - - /* Clients must resolve a dbuf before attaching user data. */ - ASSERT(db->db.db_data != NULL); - ASSERT3U(db->db_state, ==, DB_CACHED); - - holds = zfs_refcount_count(&db->db_holds); - if (verify_type == DBVU_EVICTING) { - /* - * Immediate eviction occurs when holds == dirtycnt. - * For normal eviction buffers, holds is zero on - * eviction, except when dbuf_fix_old_data() calls - * dbuf_clear_data(). However, the hold count can grow - * during eviction even though db_mtx is held (see - * dmu_bonus_hold() for an example), so we can only - * test the generic invariant that holds >= dirtycnt. 
- */ - ASSERT3U(holds, >=, db->db_dirtycnt); - } else { - if (db->db_user_immediate_evict == TRUE) - ASSERT3U(holds, >=, db->db_dirtycnt); - else - ASSERT3U(holds, >, 0); - } -#endif -} - -static void -dbuf_evict_user(dmu_buf_impl_t *db) -{ - dmu_buf_user_t *dbu = db->db_user; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (dbu == NULL) - return; - - dbuf_verify_user(db, DBVU_EVICTING); - db->db_user = NULL; - -#ifdef ZFS_DEBUG - if (dbu->dbu_clear_on_evict_dbufp != NULL) - *dbu->dbu_clear_on_evict_dbufp = NULL; -#endif - - /* - * There are two eviction callbacks - one that we call synchronously - * and one that we invoke via a taskq. The async one is useful for - * avoiding lock order reversals and limiting stack depth. - * - * Note that if we have a sync callback but no async callback, - * it's likely that the sync callback will free the structure - * containing the dbu. In that case we need to take care to not - * dereference dbu after calling the sync evict func. - */ - boolean_t has_async = (dbu->dbu_evict_func_async != NULL); - - if (dbu->dbu_evict_func_sync != NULL) - dbu->dbu_evict_func_sync(dbu); - - if (has_async) { - taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, - dbu, 0, &dbu->dbu_tqent); - } -} - -boolean_t -dbuf_is_metadata(dmu_buf_impl_t *db) -{ - if (db->db_level > 0) { - return (B_TRUE); - } else { - boolean_t is_metadata; - - DB_DNODE_ENTER(db); - is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); - DB_DNODE_EXIT(db); - - return (is_metadata); - } -} - -/* - * This returns whether this dbuf should be stored in the metadata cache, which - * is based on whether it's from one of the dnode types that store data related - * to traversing dataset hierarchies. 
- */ -static boolean_t -dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) -{ - DB_DNODE_ENTER(db); - dmu_object_type_t type = DB_DNODE(db)->dn_type; - DB_DNODE_EXIT(db); - - /* Check if this dbuf is one of the types we care about */ - if (DMU_OT_IS_METADATA_CACHED(type)) { - /* If we hit this, then we set something up wrong in dmu_ot */ - ASSERT(DMU_OT_IS_METADATA(type)); - - /* - * Sanity check for small-memory systems: don't allocate too - * much memory for this purpose. - */ - if (zfs_refcount_count( - &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > - dbuf_metadata_cache_max_bytes) { - dbuf_metadata_cache_overflow++; - DTRACE_PROBE1(dbuf__metadata__cache__overflow, - dmu_buf_impl_t *, db); - return (B_FALSE); - } - - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * This function *must* return indices evenly distributed between all - * sublists of the multilist. This is needed due to how the dbuf eviction - * code is laid out; dbuf_evict_thread() assumes dbufs are evenly - * distributed between all sublists and uses this assumption when - * deciding which sublist to evict from and how much to evict from it. - */ -unsigned int -dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) -{ - dmu_buf_impl_t *db = obj; - - /* - * The assumption here, is the hash value for a given - * dmu_buf_impl_t will remain constant throughout it's lifetime - * (i.e. it's objset, object, level and blkid fields don't change). - * Thus, we don't need to store the dbuf's sublist index - * on insertion, as this index can be recalculated on removal. - * - * Also, the low order bits of the hash value are thought to be - * distributed evenly. Otherwise, in the case that the multilist - * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. 
- */ - return (dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid) % - multilist_get_num_sublists(ml)); -} - -static inline unsigned long -dbuf_cache_target_bytes(void) -{ - return MIN(dbuf_cache_max_bytes, - arc_max_bytes() >> dbuf_cache_shift); -} - -static inline uint64_t -dbuf_cache_hiwater_bytes(void) -{ - uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); - return (dbuf_cache_target + - (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); -} - -static inline uint64_t -dbuf_cache_lowater_bytes(void) -{ - uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); - return (dbuf_cache_target - - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); -} - -static inline boolean_t -dbuf_cache_above_lowater(void) -{ - return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > - dbuf_cache_lowater_bytes()); -} - -/* - * Evict the oldest eligible dbuf from the dbuf cache. - */ -static void -dbuf_evict_one(void) -{ - int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( - dbuf_caches[DB_DBUF_CACHE].cache, idx); - - ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); - - dmu_buf_impl_t *db = multilist_sublist_tail(mls); - while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { - db = multilist_sublist_prev(mls, db); - } - - DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, - multilist_sublist_t *, mls); - - if (db != NULL) { - multilist_sublist_remove(mls, db); - multilist_sublist_unlock(mls); - (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, - db->db.db_size, db); - DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); - ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); - db->db_caching_status = DB_NO_CACHE; - dbuf_destroy(db); - DBUF_STAT_BUMP(cache_total_evicts); - } else { - multilist_sublist_unlock(mls); - } -} - -/* - * The dbuf evict thread is responsible 
for aging out dbufs from the - * cache. Once the cache has reached it's maximum size, dbufs are removed - * and destroyed. The eviction thread will continue running until the size - * of the dbuf cache is at or below the maximum size. Once the dbuf is aged - * out of the cache it is destroyed and becomes eligible for arc eviction. - */ -/* ARGSUSED */ -static void -dbuf_evict_thread(void *unused __unused) -{ - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); - - mutex_enter(&dbuf_evict_lock); - while (!dbuf_evict_thread_exit) { - while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_hires(&dbuf_evict_cv, - &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); - CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); -#ifdef __FreeBSD__ - if (dbuf_ksp != NULL) - dbuf_ksp->ks_update(dbuf_ksp, KSTAT_READ); -#endif - } - mutex_exit(&dbuf_evict_lock); - - /* - * Keep evicting as long as we're above the low water mark - * for the cache. We do this without holding the locks to - * minimize lock contention. - */ - while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { - dbuf_evict_one(); - } - - mutex_enter(&dbuf_evict_lock); - } - - dbuf_evict_thread_exit = B_FALSE; - cv_broadcast(&dbuf_evict_cv); - CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ - thread_exit(); -} - -/* - * Wake up the dbuf eviction thread if the dbuf cache is at its max size. - * If the dbuf cache is at its high water mark, then evict a dbuf from the - * dbuf cache using the callers context. - */ -static void -dbuf_evict_notify(uint64_t size) -{ - /* - * We check if we should evict without holding the dbuf_evict_lock, - * because it's OK to occasionally make the wrong decision here, - * and grabbing the lock results in massive lock contention. 
- */ - if (size > dbuf_cache_max_bytes) { - if (size > dbuf_cache_hiwater_bytes()) - dbuf_evict_one(); - cv_signal(&dbuf_evict_cv); - } -} - -static int -dbuf_kstat_update(kstat_t *ksp, int rw) -{ - dbuf_stats_t *ds = ksp->ks_data; - - if (rw == KSTAT_WRITE) { - return (SET_ERROR(EACCES)); - } else { - ds->metadata_cache_size_bytes.value.ui64 = - zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size); - ds->cache_size_bytes.value.ui64 = - zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); - ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); - ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); - ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); - ds->hash_elements.value.ui64 = dbuf_hash_count; - } - - return (0); -} - -void -dbuf_init(void) -{ - uint64_t hsize = 1ULL << 16; - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - /* - * The hash table is big enough to fill all of physical memory - * with an average 4K block size. The table will take up - * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). - */ - while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; - -retry: - h->hash_table_mask = hsize - 1; - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; - } - - dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", - sizeof (dmu_buf_impl_t), - 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); - - dbuf_stats_init(h); - /* - * Setup the parameters for the dbuf caches. We set the sizes of the - * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) - * of the size of the ARC, respectively. If the values are set in - * /etc/system and they're not greater than the size of the ARC, then - * we honor that value. 
- */ - if (dbuf_cache_max_bytes == 0 || - dbuf_cache_max_bytes >= arc_max_bytes()) { - dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; - } - if (dbuf_metadata_cache_max_bytes == 0 || - dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { - dbuf_metadata_cache_max_bytes = - arc_max_bytes() >> dbuf_metadata_cache_shift; - } - - /* - * All entries are queued via taskq_dispatch_ent(), so min/maxalloc - * configuration is not required. - */ - dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); - - for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - dbuf_caches[dcs].cache = - multilist_create(sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_cache_link), - dbuf_cache_multilist_index_func); - zfs_refcount_create(&dbuf_caches[dcs].size); - } - - dbuf_evict_thread_exit = B_FALSE; - mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); - dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, - NULL, 0, &p0, TS_RUN, minclsyspri); - - dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", - KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (dbuf_ksp != NULL) { - for (i = 0; i < DN_MAX_LEVELS; i++) { - snprintf(dbuf_stats.cache_levels[i].name, - KSTAT_STRLEN, "cache_level_%d", i); - dbuf_stats.cache_levels[i].data_type = - KSTAT_DATA_UINT64; - snprintf(dbuf_stats.cache_levels_bytes[i].name, - KSTAT_STRLEN, "cache_level_%d_bytes", i); - dbuf_stats.cache_levels_bytes[i].data_type = - KSTAT_DATA_UINT64; - } - dbuf_ksp->ks_data = &dbuf_stats; - dbuf_ksp->ks_update = dbuf_kstat_update; - kstat_install(dbuf_ksp); - } -} - -void -dbuf_fini(void) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - dbuf_stats_destroy(); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_destroy(&h->hash_mutexes[i]); - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_kmem_cache); - 
taskq_destroy(dbu_evict_taskq); - - mutex_enter(&dbuf_evict_lock); - dbuf_evict_thread_exit = B_TRUE; - while (dbuf_evict_thread_exit) { - cv_signal(&dbuf_evict_cv); - cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); - } - mutex_exit(&dbuf_evict_lock); - - mutex_destroy(&dbuf_evict_lock); - cv_destroy(&dbuf_evict_cv); - - for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - zfs_refcount_destroy(&dbuf_caches[dcs].size); - multilist_destroy(dbuf_caches[dcs].cache); - } - - if (dbuf_ksp != NULL) { - kstat_delete(dbuf_ksp); - dbuf_ksp = NULL; - } -} - -/* - * Other stuff. - */ - -#ifdef ZFS_DEBUG -static void -dbuf_verify(dmu_buf_impl_t *db) -{ - dnode_t *dn; - dbuf_dirty_record_t *dr; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) - return; - - ASSERT(db->db_objset != NULL); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn == NULL) { - ASSERT(db->db_parent == NULL); - ASSERT(db->db_blkptr == NULL); - } else { - ASSERT3U(db->db.db_object, ==, dn->dn_object); - ASSERT3P(db->db_objset, ==, dn->dn_objset); - ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DMU_BONUS_BLKID || - db->db_blkid == DMU_SPILL_BLKID || - !avl_is_empty(&dn->dn_dbufs)); - } - if (db->db_blkid == DMU_BONUS_BLKID) { - ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); - } else if (db->db_blkid == DMU_SPILL_BLKID) { - ASSERT(dn != NULL); - ASSERT0(db->db.db_offset); - } else { - ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); - } - - for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - /* - * We can't assert that db_size matches dn_datablksz because it - * can be momentarily different when another thread is doing - * dnode_set_blksz(). 
- */ - if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { - dr = db->db_data_pending; - /* - * It should only be modified in syncing context, so - * make sure we only have one copy of the data. - */ - ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); - } - - /* verify db->db_blkptr */ - if (db->db_blkptr) { - if (db->db_parent == dn->dn_dbuf) { - /* db is pointed to by the dnode */ - /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) - ASSERT(db->db_parent == NULL); - else - ASSERT(db->db_parent != NULL); - if (db->db_blkid != DMU_SPILL_BLKID) - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - } else { - /* db is pointed to by an indirect block */ - int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; - ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); - ASSERT3U(db->db_parent->db.db_object, ==, - db->db.db_object); - /* - * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer - * grows. safe to do this now? - */ - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + - db->db_blkid % epb)); - } - } - } - if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && - (db->db_buf == NULL || db->db_buf->b_data) && - db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && - db->db_state != DB_FILL && !dn->dn_free_txg) { - /* - * If the blkptr isn't set but they have nonzero data, - * it had better be dirty, otherwise we'll lose that - * data when we evict this buffer. - * - * There is an exception to this rule for indirect blocks; in - * this case, if the indirect block is a hole, we fill in a few - * fields on each of the child blocks (importantly, birth time) - * to prevent hole birth times from being lost when you - * partially fill in a hole. 
- */ - if (db->db_dirtycnt == 0) { - if (db->db_level == 0) { - uint64_t *buf = db->db.db_data; - int i; - - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); - } - } else { - blkptr_t *bps = db->db.db_data; - ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, - db->db.db_size); - /* - * We want to verify that all the blkptrs in the - * indirect block are holes, but we may have - * automatically set up a few fields for them. - * We iterate through each blkptr and verify - * they only have those fields set. - */ - for (int i = 0; - i < db->db.db_size / sizeof (blkptr_t); - i++) { - blkptr_t *bp = &bps[i]; - ASSERT(ZIO_CHECKSUM_IS_ZERO( - &bp->blk_cksum)); - ASSERT( - DVA_IS_EMPTY(&bp->blk_dva[0]) && - DVA_IS_EMPTY(&bp->blk_dva[1]) && - DVA_IS_EMPTY(&bp->blk_dva[2])); - ASSERT0(bp->blk_fill); - ASSERT0(bp->blk_pad[0]); - ASSERT0(bp->blk_pad[1]); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(BP_IS_HOLE(bp)); - ASSERT0(bp->blk_phys_birth); - } - } - } - } - DB_DNODE_EXIT(db); -} -#endif - -static void -dbuf_clear_data(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; -} - -static void -dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(buf != NULL); - - db->db_buf = buf; - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; -} - -/* - * Loan out an arc_buf for read. Return the loaned arc_buf. 
- */ -arc_buf_t * -dbuf_loan_arcbuf(dmu_buf_impl_t *db) -{ - arc_buf_t *abuf; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { - int blksz = db->db.db_size; - spa_t *spa = db->db_objset->os_spa; - - mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(spa, B_FALSE, blksz); - bcopy(db->db.db_data, abuf->b_data, blksz); - } else { - abuf = db->db_buf; - arc_loan_inuse_buf(abuf, db); - db->db_buf = NULL; - dbuf_clear_data(db); - mutex_exit(&db->db_mtx); - } - return (abuf); -} - -/* - * Calculate which level n block references the data at the level 0 offset - * provided. - */ -uint64_t -dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) -{ - if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { - /* - * The level n blkid is equal to the level 0 blkid divided by - * the number of level 0s in a level n block. - * - * The level 0 blkid is offset >> datablkshift = - * offset / 2^datablkshift. - * - * The number of level 0s in a level n is the number of block - * pointers in an indirect block, raised to the power of level. - * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = - * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). 
- * - * Thus, the level n blkid is: offset / - * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) - * = offset / 2^(datablkshift + level * - * (indblkshift - SPA_BLKPTRSHIFT)) - * = offset >> (datablkshift + level * - * (indblkshift - SPA_BLKPTRSHIFT)) - */ - return (offset >> (dn->dn_datablkshift + level * - (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); - } else { - ASSERT3U(offset, <, dn->dn_datablksz); - return (0); - } -} - -static void -dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - - mutex_enter(&db->db_mtx); - ASSERT3U(db->db_state, ==, DB_READ); - /* - * All reads are synchronous, so we must have a hold on the dbuf - */ - ASSERT(zfs_refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - if (buf == NULL) { - /* i/o error */ - ASSERT(zio == NULL || zio->io_error != 0); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - db->db_state = DB_UNCACHED; - } else if (db->db_level == 0 && db->db_freed_in_flight) { - /* freed in flight */ - ASSERT(zio == NULL || zio->io_error == 0); - if (buf == NULL) { - buf = arc_alloc_buf(db->db_objset->os_spa, - db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); - } - arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); - arc_buf_freeze(buf); - db->db_freed_in_flight = FALSE; - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } else { - /* success */ - ASSERT(zio == NULL || zio->io_error == 0); - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } - cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL, B_FALSE); -} - -static void -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) -{ - dnode_t *dn; - zbookmark_phys_t zb; - arc_flags_t aflags = ARC_FLAG_NOWAIT; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); - ASSERT(db->db_buf == NULL); - - if (db->db_blkid == DMU_BONUS_BLKID) { - /* - * The bonus length stored in the dnode may be less than - * the maximum available space in the bonus buffer. - */ - int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); - int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - - ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(max_bonuslen); - arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); - if (bonuslen < max_bonuslen) - bzero(db->db.db_data, max_bonuslen); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - /* - * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() - * processes the delete record and clears the bp while we are waiting - * for the dn_mtx (resulting in a "no" from block_freed). - */ - if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || - (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr)))) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, - db->db.db_size)); - bzero(db->db.db_data, db->db.db_size); - - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { - blkptr_t *bps = db->db.db_data; - for (int i = 0; i < ((1 << - DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); - i++) { - blkptr_t *bp = &bps[i]; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - 1 << dn->dn_indblkshift); - BP_SET_LSIZE(bp, - BP_GET_LEVEL(db->db_blkptr) == 1 ? 
- dn->dn_datablksz : - BP_GET_LSIZE(db->db_blkptr)); - BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); - BP_SET_LEVEL(bp, - BP_GET_LEVEL(db->db_blkptr) - 1); - BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); - } - } - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - DB_DNODE_EXIT(db); - - db->db_state = DB_READ; - mutex_exit(&db->db_mtx); - - if (DBUF_IS_L2CACHEABLE(db)) - aflags |= ARC_FLAG_L2CACHE; - - SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? - db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, - db->db.db_object, db->db_level, db->db_blkid); - - dbuf_add_ref(db, NULL); - - (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, - dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); -} - -/* - * This is our just-in-time copy function. It makes a copy of buffers that - * have been modified in a previous transaction group before we access them in - * the current active group. - * - * This function is used in three places: when we are dirtying a buffer for the - * first time in a txg, when we are freeing a range in a dnode that includes - * this buffer, and when we are accessing a buffer which was received compressed - * and later referenced in a WRITE_BYREF record. - * - * Note that when we are called from dbuf_free_range() we do not put a hold on - * the buffer, we just traverse the active dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) -{ - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) - return; - - /* - * If the last dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there a no active holders) - * just null out the current db_data pointer. - */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dnode_t *dn = DB_DNODE(db); - int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); - arc_space_consume(bonuslen, ARC_SPACE_BONUS); - bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); - } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = arc_buf_size(db->db_buf); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - enum zio_compress compress_type = - arc_get_compression(db->db_buf); - - if (compress_type == ZIO_COMPRESS_OFF) { - dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, - size, arc_buf_lsize(db->db_buf), compress_type); - } - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); - } else { - db->db_buf = NULL; - dbuf_clear_data(db); - } -} - -int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) -{ - int err = 0; - boolean_t prefetch; - dnode_t *dn; - - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. 
- */ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - if (db->db_state == DB_NOFILL) - return (SET_ERROR(EIO)); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && - DBUF_IS_CACHEABLE(db); - - mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - /* - * If the arc buf is compressed, we need to decompress it to - * read the data. This could happen during the "zfs receive" of - * a stream which is compressed and deduplicated. - */ - if (db->db_buf != NULL && - arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { - dbuf_fix_old_data(db, - spa_syncing_txg(dmu_objset_spa(db->db_objset))); - err = arc_decompress(db->db_buf); - dbuf_set_data(db, db->db_buf); - } - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED) { - spa_t *spa = dn->dn_objset->os_spa; - boolean_t need_wait = B_FALSE; - - if (zio == NULL && - db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - need_wait = B_TRUE; - } - dbuf_read_impl(db, zio, flags); - - /* dbuf_read_impl has dropped db_mtx for us */ - - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - if (need_wait) - err = zio_wait(zio); - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. 
Otherwise, a failure - * occurred and the dbuf went to UNCACHED. - */ - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - /* Skip the wait per the caller's request. */ - mutex_enter(&db->db_mtx); - if ((flags & DB_RF_NEVERWAIT) == 0) { - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - } - mutex_exit(&db->db_mtx); - } - - return (err); -} - -static void -dbuf_noread(dmu_buf_impl_t *db) -{ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); - db->db_state = DB_FILL; - } else if (db->db_state == DB_NOFILL) { - dbuf_clear_data(db); - } else { - ASSERT3U(db->db_state, ==, DB_CACHED); - } - mutex_exit(&db->db_mtx); -} - -void -dbuf_unoverride(dbuf_dirty_record_t *dr) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - blkptr_t *bp = &dr->dt.dl.dr_overridden_by; - uint64_t txg = dr->dr_txg; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - /* - * This assert is valid because dmu_sync() expects to be called by - * a zilog's get_data while holding a range lock. This call only - * comes from dbuf_dirty() callers who must also hold a range lock. 
- */ - ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); - ASSERT(db->db_level == 0); - - if (db->db_blkid == DMU_BONUS_BLKID || - dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) - return; - - ASSERT(db->db_data_pending != dr); - - /* free this block */ - if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) - zio_free(db->db_objset->os_spa, txg, bp); - - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - dr->dt.dl.dr_nopwrite = B_FALSE; - - /* - * Release the already-written buffer, so we leave it in - * a consistent dirty state. Note that all callers are - * modifying the buffer, so they will immediately do - * another (redundant) arc_release(). Therefore, leave - * the buf thawed to save the effort of freezing & - * immediately re-thawing it. - */ - arc_release(dr->dt.dl.dr_data, db); -} - -/* - * Evict (if its unreferenced) or clear (if its referenced) any level-0 - * data blocks in the free range, so that any future readers will find - * empty blocks. - */ -void -dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, - dmu_tx_t *tx) -{ - dmu_buf_impl_t db_search; - dmu_buf_impl_t *db, *db_next; - uint64_t txg = tx->tx_txg; - avl_index_t where; - - if (end_blkid > dn->dn_maxblkid && - !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) - end_blkid = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); - - db_search.db_level = 0; - db_search.db_blkid = start_blkid; - db_search.db_state = DB_SEARCH; - - mutex_enter(&dn->dn_dbufs_mtx); - db = avl_find(&dn->dn_dbufs, &db_search, &where); - ASSERT3P(db, ==, NULL); - - db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - - for (; db != NULL; db = db_next) { - db_next = AVL_NEXT(&dn->dn_dbufs, db); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - if (db->db_level != 0 || db->db_blkid > end_blkid) { - break; - } - ASSERT3U(db->db_blkid, >=, start_blkid); - - /* found a level 0 buffer in the range */ - mutex_enter(&db->db_mtx); - if (dbuf_undirty(db, tx)) { 
- /* mutex has been dropped and dbuf destroyed */ - continue; - } - - if (db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL || - db->db_state == DB_EVICTING) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } - if (db->db_state == DB_READ || db->db_state == DB_FILL) { - /* will be handled in dbuf_read_done or dbuf_rele */ - db->db_freed_in_flight = TRUE; - mutex_exit(&db->db_mtx); - continue; - } - if (zfs_refcount_count(&db->db_holds) == 0) { - ASSERT(db->db_buf); - dbuf_destroy(db); - continue; - } - /* The dbuf is referenced */ - - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - if (dr->dr_txg == txg) { - /* - * This buffer is "in-use", re-adjust the file - * size to reflect that this buffer may - * contain new data when we sync. - */ - if (db->db_blkid != DMU_SPILL_BLKID && - db->db_blkid > dn->dn_maxblkid) - dn->dn_maxblkid = db->db_blkid; - dbuf_unoverride(dr); - } else { - /* - * This dbuf is not dirty in the open context. - * Either uncache it (if its not referenced in - * the open context) or reset its contents to - * empty. - */ - dbuf_fix_old_data(db, txg); - } - } - /* clear the contents if its cached */ - if (db->db_state == DB_CACHED) { - ASSERT(db->db.db_data != NULL); - arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); - arc_buf_freeze(db->db_buf); - } - - mutex_exit(&db->db_mtx); - } - mutex_exit(&dn->dn_dbufs_mtx); -} - -void -dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) -{ - arc_buf_t *buf, *obuf; - int osize = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dnode_t *dn; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - /* XXX does *this* func really need the lock? 
*/ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. - */ - /* - * XXX we should be doing a dbuf_read, checking the return - * value and returning that up to our callers - */ - dmu_buf_will_dirty(&db->db, tx); - - /* create the data buffer for the new block */ - buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); - - /* copy old block data to the new block */ - obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); - /* zero the remainder */ - if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); - - mutex_enter(&db->db_mtx); - dbuf_set_data(db, buf); - arc_buf_destroy(obuf, db); - db->db.db_size = size; - - if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - db->db_last_dirty->dt.dl.dr_data = buf; - } - mutex_exit(&db->db_mtx); - - dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); - DB_DNODE_EXIT(db); -} - -void -dbuf_release_bp(dmu_buf_impl_t *db) -{ - objset_t *os = db->db_objset; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); - ASSERT(arc_released(os->os_phys_buf) || - list_link_active(&os->os_dsl_dataset->ds_synced_link)); - ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); - - (void) arc_release(db->db_buf, db); -} - -/* - * We already have a dirty record for this TXG, and we are being - * dirtied again. - */ -static void -dbuf_redirty(dbuf_dirty_record_t *dr) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this buffer has already been written out, - * we now need to reset its state. 
- */ - dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) { - /* Already released on initial dirty, so just thaw. */ - ASSERT(arc_released(db->db_buf)); - arc_buf_thaw(db->db_buf); - } - } -} - -dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn; - objset_t *os; - dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; - int txgoff = tx->tx_txg & TXG_MASK; - - ASSERT(tx->tx_txg != 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - DMU_TX_DIRTY_BUF(tx, db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - /* - * Shouldn't dirty a regular buffer in syncing context. Private - * objects may be dirtied in syncing context, but only if they - * were already pre-dirtied in open context. - */ -#ifdef DEBUG - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - RW_READER, FTAG); - } - ASSERT(!dmu_tx_is_syncing(tx) || - BP_IS_HOLE(dn->dn_objset->os_rootbp) || - DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - dn->dn_objset->os_dsl_dataset == NULL); - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); -#endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - mutex_enter(&db->db_mtx); - /* - * XXX make this true for indirects too? The problem is that - * transactions created with dmu_tx_create_assigned() from - * syncing context don't bother holding ahead. - */ - ASSERT(db->db_level != 0 || - db->db_state == DB_CACHED || db->db_state == DB_FILL || - db->db_state == DB_NOFILL); - - mutex_enter(&dn->dn_mtx); - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. 
- */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - RW_READER, FTAG); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? - DN_DIRTY_SYNC : DN_DIRTY_OPEN); - ASSERT(dn->dn_dirtyctx_firstset == NULL); - dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); - } - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - FTAG); - } - } - - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - - if (db->db_blkid == DMU_SPILL_BLKID) - dn->dn_have_spill = B_TRUE; - - /* - * If this buffer is already dirty, we're done. - */ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || - db->db.db_object == DMU_META_DNODE_OBJECT); - while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) - drp = &dr->dr_next; - if (dr && dr->dr_txg == tx->tx_txg) { - DB_DNODE_EXIT(db); - - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return (dr); - } - - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - ASSERT3U(dn->dn_nlevels, >, db->db_level); - - /* - * We should only be dirtying in syncing context if it's the - * mos or we're initializing the os or it's a special object. - * However, we are allowed to dirty in syncing context provided - * we already dirtied it in open context. Hence we must make - * this assertion only if we're not already dirty. 
- */ - os = dn->dn_objset; - VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); -#ifdef DEBUG - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); -#endif - ASSERT(db->db.db_size != 0); - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - if (db->db_blkid != DMU_BONUS_BLKID) { - dmu_objset_willuse_space(os, db->db.db_size, tx); - } - - /* - * If this buffer is dirty in an old transaction group we need - * to make a copy of it so that the changes we make in this - * transaction group won't leak out when we sync the older txg. - */ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); - list_link_init(&dr->dr_dirty_node); - if (db->db_level == 0) { - void *data_old = db->db_buf; - - if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so - * that we can modify it without impacting - * possible other users of this cached data - * block. Note that indirect blocks and - * private objects are not released until the - * syncing state (since they are only modified - * then). 
- */ - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db_buf; - } - ASSERT(data_old != NULL); - } - dr->dt.dl.dr_data = data_old; - } else { - mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&dr->dt.di.dr_children, - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) - dr->dr_accounted = db->db.db_size; - dr->dr_dbuf = db; - dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; - - /* - * We could have been freed_in_flight between the dbuf_noread - * and dbuf_dirty. We win, as though the dbuf_noread() had - * happened after the free. - */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_free_ranges[txgoff] != NULL) { - range_tree_clear(dn->dn_free_ranges[txgoff], - db->db_blkid, 1); - } - mutex_exit(&dn->dn_mtx); - db->db_freed_in_flight = FALSE; - } - - /* - * This buffer is now part of this txg - */ - dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); - db->db_dirtycnt += 1; - ASSERT3U(db->db_dirtycnt, <=, 3); - - mutex_exit(&db->db_mtx); - - if (db->db_blkid == DMU_BONUS_BLKID || - db->db_blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); - } - - /* - * The dn_struct_rwlock prevents db_blkptr from changing - * due to a write from syncing context completing - * while we are running, so we want to acquire it before - * looking at db_blkptr. - */ - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - /* - * We need to hold the dn_struct_rwlock to make this assertion, - * because it protects dn_phys / dn_next_nlevels from changing. 
- */ - ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || - dn->dn_phys->dn_nlevels > db->db_level || - dn->dn_next_nlevels[txgoff] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - - /* - * If we are overwriting a dedup BP, then unless it is snapshotted, - * when we get to syncing context we will need to decrement its - * refcount in the DDT. Prefetch the relevant DDT block so that - * syncing context won't have to wait for the i/o. - */ - ddt_prefetch(os->os_spa, db->db_blkptr); - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); - ASSERT(dn->dn_maxblkid >= db->db_blkid); - } - - if (db->db_level+1 < dn->dn_nlevels) { - dmu_buf_impl_t *parent = db->db_parent; - dbuf_dirty_record_t *di; - int parent_held = FALSE; - - if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, - db->db_blkid >> epbs, FTAG); - ASSERT(parent != NULL); - parent_held = TRUE; - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx); - if (parent_held) - dbuf_rele(parent, FTAG); - - mutex_enter(&db->db_mtx); - /* - * Since we've dropped the mutex, it's possible that - * dbuf_undirty() might have changed this out from under us. 
- */ - if (db->db_last_dirty == dr || - dn->dn_object == DMU_META_DNODE_OBJECT) { - mutex_enter(&di->dt.di.dr_mtx); - ASSERT3U(di->dr_txg, ==, tx->tx_txg); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&di->dt.di.dr_children, dr); - mutex_exit(&di->dt.di.dr_mtx); - dr->dr_parent = di; - } - mutex_exit(&db->db_mtx); - } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); - ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - } - - dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); -} - -/* - * Undirty a buffer in the transaction group referenced by the given - * transaction. Return whether this evicted the dbuf. - */ -static boolean_t -dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn; - uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr, **drp; - - ASSERT(txg != 0); - - /* - * Due to our use of dn_nlevels below, this can only be called - * in open context, unless we are operating on the MOS. - * From syncing context, dn_nlevels may be different from the - * dn_nlevels used when dbuf was dirtied. - */ - ASSERT(db->db_objset == - dmu_objset_pool(db->db_objset)->dp_meta_objset || - txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT0(db->db_level); - ASSERT(MUTEX_HELD(&db->db_mtx)); - - /* - * If this buffer is not dirty, we're done. 
- */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg <= txg) - break; - if (dr == NULL || dr->dr_txg < txg) - return (B_FALSE); - ASSERT(dr->dr_txg == txg); - ASSERT(dr->dr_dbuf == db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - ASSERT(db->db.db_size != 0); - - dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), - dr->dr_accounted, txg); - - *drp = dr->dr_next; - - /* - * Note that there are three places in dbuf_dirty() - * where this dirty record may be put on a list. - * Make sure to do a list_remove corresponding to - * every one of those list_insert calls. - */ - if (dr->dr_parent) { - mutex_enter(&dr->dr_parent->dt.di.dr_mtx); - list_remove(&dr->dr_parent->dt.di.dr_children, dr); - mutex_exit(&dr->dr_parent->dt.di.dr_mtx); - } else if (db->db_blkid == DMU_SPILL_BLKID || - db->db_level + 1 == dn->dn_nlevels) { - ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); - mutex_exit(&dn->dn_mtx); - } - DB_DNODE_EXIT(db); - - if (db->db_state != DB_NOFILL) { - dbuf_unoverride(dr); - - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); - } - - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - - if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); - dbuf_destroy(db); - return (B_TRUE); - } - - return (B_FALSE); -} - -void -dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; - - ASSERT(tx->tx_txg != 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - /* - * Quick check for dirtyness. 
For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). - */ - mutex_enter(&db->db_mtx); - dbuf_dirty_record_t *dr; - for (dr = db->db_last_dirty; - dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { - /* - * It's possible that it is already dirty but not cached, - * because there are some calls to dbuf_dirty() that don't - * go through dmu_buf_will_dirty(). - */ - if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { - /* This dbuf is already dirty and cached. */ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; - } - } - mutex_exit(&db->db_mtx); - - DB_DNODE_ENTER(db); - if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - DB_DNODE_EXIT(db); - (void) dbuf_read(db, NULL, rf); - (void) dbuf_dirty(db, tx); -} - -void -dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_state = DB_NOFILL; - - dmu_buf_will_fill(db_fake, tx); -} - -void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(tx->tx_txg != 0); - ASSERT(db->db_level == 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || - dmu_tx_private_ok(tx)); - - dbuf_noread(db); - (void) dbuf_dirty(db, tx); -} - -#pragma weak dmu_buf_fill_done = dbuf_fill_done -/* ARGSUSED */ -void -dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - mutex_enter(&db->db_mtx); - DBUF_VERIFY(db); - - if (db->db_state == DB_FILL) { - if (db->db_level == 0 && db->db_freed_in_flight) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - /* we were freed while filling */ - /* XXX dbuf_undirty? 
*/ - bzero(db->db.db_data, db->db.db_size); - db->db_freed_in_flight = FALSE; - } - db->db_state = DB_CACHED; - cv_broadcast(&db->db_changed); - } - mutex_exit(&db->db_mtx); -} - -void -dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, - bp_embedded_type_t etype, enum zio_compress comp, - int uncompressed_size, int compressed_size, int byteorder, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - struct dirty_leaf *dl; - dmu_object_type_t type; - - if (etype == BP_EMBEDDED_TYPE_DATA) { - ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), - SPA_FEATURE_EMBEDDED_DATA)); - } - - DB_DNODE_ENTER(db); - type = DB_DNODE(db)->dn_type; - DB_DNODE_EXIT(db); - - ASSERT0(db->db_level); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - dmu_buf_will_not_fill(dbuf, tx); - - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - dl = &db->db_last_dirty->dt.dl; - encode_embedded_bp_compressed(&dl->dr_overridden_by, - data, comp, uncompressed_size, compressed_size); - BPE_SET_ETYPE(&dl->dr_overridden_by, etype); - BP_SET_TYPE(&dl->dr_overridden_by, type); - BP_SET_LEVEL(&dl->dr_overridden_by, 0); - BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); - - dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; -} - -/* - * Directly assign a provided arc buf to a given dbuf if it's not referenced - * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
- */ -void -dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) -{ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_level == 0); - ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); - ASSERT(buf != NULL); - ASSERT(arc_buf_lsize(buf) == db->db.db_size); - ASSERT(tx->tx_txg != 0); - - arc_return_buf(buf, db); - ASSERT(arc_released(buf)); - - mutex_enter(&db->db_mtx); - - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); - - if (db->db_state == DB_CACHED && - zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); - arc_buf_destroy(buf, db); - xuio_stat_wbuf_copied(); - return; - } - - xuio_stat_wbuf_nocopy(); - if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(db->db_buf != NULL); - if (dr != NULL && dr->dr_txg == tx->tx_txg) { - ASSERT(dr->dt.dl.dr_data == db->db_buf); - if (!arc_released(db->db_buf)) { - ASSERT(dr->dt.dl.dr_override_state == - DR_OVERRIDDEN); - arc_release(db->db_buf, db); - } - dr->dt.dl.dr_data = buf; - arc_buf_destroy(db->db_buf, db); - } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { - arc_release(db->db_buf, db); - arc_buf_destroy(db->db_buf, db); - } - db->db_buf = NULL; - } - ASSERT(db->db_buf == NULL); - dbuf_set_data(db, buf); - db->db_state = DB_FILL; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); -} - -void -dbuf_destroy(dmu_buf_impl_t *db) -{ - dnode_t *dn; - dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - - if (db->db_buf != NULL) { - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - } - - if (db->db_blkid == DMU_BONUS_BLKID) { 
- int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - if (db->db.db_data != NULL) { - zio_buf_free(db->db.db_data, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - db->db_state = DB_UNCACHED; - } - } - - dbuf_clear_data(db); - - if (multilist_link_active(&db->db_cache_link)) { - ASSERT(db->db_caching_status == DB_DBUF_CACHE || - db->db_caching_status == DB_DBUF_METADATA_CACHE); - - multilist_remove(dbuf_caches[db->db_caching_status].cache, db); - (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); - - if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMPDOWN(metadata_cache_count); - } else { - DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); - } - db->db_caching_status = DB_NO_CACHE; - } - - ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - ASSERT(db->db_data_pending == NULL); - - db->db_state = DB_EVICTING; - db->db_blkptr = NULL; - - /* - * Now that db_state is DB_EVICTING, nobody else can find this via - * the hash table. We can now drop db_mtx, which allows us to - * acquire the dn_dbufs_mtx. - */ - mutex_exit(&db->db_mtx); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID) { - boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); - if (needlock) - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - membar_producer(); - DB_DNODE_EXIT(db); - if (needlock) - mutex_exit(&dn->dn_dbufs_mtx); - /* - * Decrementing the dbuf count means that the hold corresponding - * to the removed dbuf is no longer discounted in dnode_move(), - * so the dnode cannot be moved until after we release the hold. - * The membar_producer() ensures visibility of the decremented - * value in dnode_move(), since DB_DNODE_EXIT doesn't actually - * release any lock. 
- */ - mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, db, B_TRUE); - db->db_dnode_handle = NULL; - - dbuf_hash_remove(db); - } else { - DB_DNODE_EXIT(db); - } - - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - - db->db_parent = NULL; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); - ASSERT(!multilist_link_active(&db->db_cache_link)); - - kmem_cache_free(dbuf_kmem_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - - /* - * If this dbuf is referenced from an indirect dbuf, - * decrement the ref count on the indirect dbuf. - */ - if (parent && parent != dndb) { - mutex_enter(&parent->db_mtx); - dbuf_rele_and_unlock(parent, db, B_TRUE); - } -} - -/* - * Note: While bpp will always be updated if the function returns success, - * parentp will not be updated if the dnode does not have dn_dbuf filled in; - * this happens when the dnode is the meta-dnode, or a userused or groupused - * object. - */ -__attribute__((always_inline)) -static inline int -dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) -{ - *parentp = NULL; - *bpp = NULL; - - ASSERT(blkid != DMU_BONUS_BLKID); - - if (blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_have_spill && - (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) - *bpp = DN_SPILL_BLKPTR(dn->dn_phys); - else - *bpp = NULL; - dbuf_add_ref(dn->dn_dbuf, NULL); - *parentp = dn->dn_dbuf; - mutex_exit(&dn->dn_mtx); - return (0); - } - - int nlevels = - (dn->dn_phys->dn_nlevels == 0) ? 
1 : dn->dn_phys->dn_nlevels; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT3U(level * epbs, <, 64); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - /* - * This assertion shouldn't trip as long as the max indirect block size - * is less than 1M. The reason for this is that up to that point, - * the number of levels required to address an entire object with blocks - * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In - * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 - * (i.e. we can address the entire object), objects will all use at most - * N-1 levels and the assertion won't overflow. However, once epbs is - * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be - * enough to address an entire object, so objects will have 5 levels, - * but then this assertion will overflow. - * - * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we - * need to redo this logic to handle overflows. - */ - ASSERT(level >= nlevels || - ((nlevels - level - 1) * epbs) + - highbit64(dn->dn_phys->dn_nblkptr) <= 64); - if (level >= nlevels || - blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << - ((nlevels - level - 1) * epbs)) || - (fail_sparse && - blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { - /* the buffer has no parent yet */ - return (SET_ERROR(ENOENT)); - } else if (level < nlevels-1) { - /* this block is referenced from an indirect block */ - int err; - if (dh == NULL) { - err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, FALSE, NULL, parentp); - } else { - __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, - blkid >> epbs, fail_sparse, FALSE, NULL, - parentp, dh->dh_depth + 1); - err = __dbuf_hold_impl(dh + 1); - } - if (err) - return (err); - err = dbuf_read(*parentp, NULL, - (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); - if (err) { - dbuf_rele(*parentp, NULL); - *parentp = NULL; - return (err); - } - *bpp = ((blkptr_t *)(*parentp)->db.db_data) + - (blkid & ((1ULL << 
epbs) - 1)); - if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) - ASSERT(BP_IS_HOLE(*bpp)); - return (0); - } else { - /* the block is referenced from the dnode */ - ASSERT3U(level, ==, nlevels-1); - ASSERT(dn->dn_phys->dn_nblkptr == 0 || - blkid < dn->dn_phys->dn_nblkptr); - if (dn->dn_dbuf) { - dbuf_add_ref(dn->dn_dbuf, NULL); - *parentp = dn->dn_dbuf; - } - *bpp = &dn->dn_phys->dn_blkptr[blkid]; - return (0); - } -} - -static dmu_buf_impl_t * -dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - dmu_buf_impl_t *parent, blkptr_t *blkptr) -{ - objset_t *os = dn->dn_objset; - dmu_buf_impl_t *db, *odb; - - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT(dn->dn_type != DMU_OT_NONE); - - db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); - - db->db_objset = os; - db->db.db_object = dn->dn_object; - db->db_level = level; - db->db_blkid = blkid; - db->db_last_dirty = NULL; - db->db_dirtycnt = 0; - db->db_dnode_handle = dn->dn_handle; - db->db_parent = parent; - db->db_blkptr = blkptr; - - db->db_user = NULL; - db->db_user_immediate_evict = FALSE; - db->db_freed_in_flight = FALSE; - db->db_pending_evict = FALSE; - - if (blkid == DMU_BONUS_BLKID) { - ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr-1) * sizeof (blkptr_t); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - db->db.db_offset = DMU_BONUS_BLKID; - db->db_state = DB_UNCACHED; - db->db_caching_status = DB_NO_CACHE; - /* the bonus dbuf is not placed in the hash table */ - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - return (db); - } else if (blkid == DMU_SPILL_BLKID) { - db->db.db_size = (blkptr != NULL) ? - BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; - db->db.db_offset = 0; - } else { - int blocksize = - db->db_level ? 
1 << dn->dn_indblkshift : dn->dn_datablksz; - db->db.db_size = blocksize; - db->db.db_offset = db->db_blkid * blocksize; - } - - /* - * Hold the dn_dbufs_mtx while we get the new dbuf - * in the hash table *and* added to the dbufs list. - * This prevents a possible deadlock with someone - * trying to look up this dbuf before its added to the - * dn_dbufs list. - */ - mutex_enter(&dn->dn_dbufs_mtx); - db->db_state = DB_EVICTING; - if ((odb = dbuf_hash_insert(db)) != NULL) { - /* someone else inserted it first */ - kmem_cache_free(dbuf_kmem_cache, db); - mutex_exit(&dn->dn_dbufs_mtx); - DBUF_STAT_BUMP(hash_insert_race); - return (odb); - } - avl_add(&dn->dn_dbufs, db); - - db->db_state = DB_UNCACHED; - db->db_caching_status = DB_NO_CACHE; - mutex_exit(&dn->dn_dbufs_mtx); - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - - if (parent && parent != dn->dn_dbuf) - dbuf_add_ref(parent, db); - - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - zfs_refcount_count(&dn->dn_holds) > 0); - (void) zfs_refcount_add(&dn->dn_holds, db); - - dprintf_dbuf(db, "db=%p\n", db); - - return (db); -} - -typedef struct dbuf_prefetch_arg { - spa_t *dpa_spa; /* The spa to issue the prefetch in. */ - zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ - int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ - int dpa_curlevel; /* The current level that we're reading */ - dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ - zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ - zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ - arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ -} dbuf_prefetch_arg_t; - -/* - * Actually issue the prefetch read for the block given. 
- */ -static void -dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) -{ - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return; - - arc_flags_t aflags = - dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - - ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); - ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); - ASSERT(dpa->dpa_zio != NULL); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, - dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &dpa->dpa_zb); -} - -/* - * Called when an indirect block above our prefetch target is read in. This - * will either read in the next indirect block down the tree or issue the actual - * prefetch if the next block down is our target. - */ -static void -dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, - const blkptr_t *iobp, arc_buf_t *abuf, void *private) -{ - dbuf_prefetch_arg_t *dpa = private; - - ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); - ASSERT3S(dpa->dpa_curlevel, >, 0); - - if (abuf == NULL) { - ASSERT(zio == NULL || zio->io_error != 0); - kmem_free(dpa, sizeof (*dpa)); - return; - } - ASSERT(zio == NULL || zio->io_error == 0); - - /* - * The dpa_dnode is only valid if we are called with a NULL - * zio. This indicates that the arc_read() returned without - * first calling zio_read() to issue a physical read. Once - * a physical read is made the dpa_dnode must be invalidated - * as the locks guarding it may have been dropped. If the - * dpa_dnode is still valid, then we want to add it to the dbuf - * cache. To do so, we must hold the dbuf associated with the block - * we just prefetched, read its contents so that we associate it - * with an arc_buf_t, and then release it. 
- */ - if (zio != NULL) { - ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - if (zio->io_flags & ZIO_FLAG_RAW) { - ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); - } else { - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); - } - ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); - - dpa->dpa_dnode = NULL; - } else if (dpa->dpa_dnode != NULL) { - uint64_t curblkid = dpa->dpa_zb.zb_blkid >> - (dpa->dpa_epbs * (dpa->dpa_curlevel - - dpa->dpa_zb.zb_level)); - dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, - dpa->dpa_curlevel, curblkid, FTAG); - (void) dbuf_read(db, NULL, - DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); - dbuf_rele(db, FTAG); - } - - if (abuf == NULL) { - kmem_free(dpa, sizeof(*dpa)); - return; - } - - dpa->dpa_curlevel--; - - uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> - (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); - blkptr_t *bp = ((blkptr_t *)abuf->b_data) + - P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp)) { - kmem_free(dpa, sizeof (*dpa)); - } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { - ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); - dbuf_issue_final_prefetch(dpa, bp); - kmem_free(dpa, sizeof (*dpa)); - } else { - arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; - zbookmark_phys_t zb; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) - iter_aflags |= ARC_FLAG_L2CACHE; - - ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); - - SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, - dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); - - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &iter_aflags, &zb); - } - - arc_buf_destroy(abuf, private); -} - -/* - * Issue prefetch reads for the given block on the given level. If the indirect - * blocks above that block are not in memory, we will read them in - * asynchronously. 
As a result, this call never blocks waiting for a read to - * complete. - */ -void -dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, - arc_flags_t aflags) -{ - blkptr_t bp; - int epbs, nlevels, curlevel; - uint64_t curblkid; - - ASSERT(blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - - if (blkid > dn->dn_maxblkid) - return; - - if (dnode_block_freed(dn, blkid)) - return; - - /* - * This dnode hasn't been written to disk yet, so there's nothing to - * prefetch. - */ - nlevels = dn->dn_phys->dn_nlevels; - if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) - return; - - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) - return; - - dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, - level, blkid); - if (db != NULL) { - mutex_exit(&db->db_mtx); - /* - * This dbuf already exists. It is either CACHED, or - * (we assume) about to be read or filled. - */ - return; - } - - /* - * Find the closest ancestor (indirect block) of the target block - * that is present in the cache. In this indirect block, we will - * find the bp that is at curlevel, curblkid. - */ - curlevel = level; - curblkid = blkid; - while (curlevel < nlevels - 1) { - int parent_level = curlevel + 1; - uint64_t parent_blkid = curblkid >> epbs; - dmu_buf_impl_t *db; - - if (dbuf_hold_impl(dn, parent_level, parent_blkid, - FALSE, TRUE, FTAG, &db) == 0) { - blkptr_t *bpp = db->db_buf->b_data; - bp = bpp[P2PHASE(curblkid, 1 << epbs)]; - dbuf_rele(db, FTAG); - break; - } - - curlevel = parent_level; - curblkid = parent_blkid; - } - - if (curlevel == nlevels - 1) { - /* No cached indirect blocks found. 
*/ - ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); - bp = dn->dn_phys->dn_blkptr[curblkid]; - } - if (BP_IS_HOLE(&bp)) - return; - - ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); - - zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, - ZIO_FLAG_CANFAIL); - - dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, level, blkid); - dpa->dpa_curlevel = curlevel; - dpa->dpa_prio = prio; - dpa->dpa_aflags = aflags; - dpa->dpa_spa = dn->dn_objset->os_spa; - dpa->dpa_dnode = dn; - dpa->dpa_epbs = epbs; - dpa->dpa_zio = pio; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) - dpa->dpa_aflags |= ARC_FLAG_L2CACHE; - - /* - * If we have the indirect just above us, no need to do the asynchronous - * prefetch chain; we'll just run the last step ourselves. If we're at - * a higher level, though, we want to issue the prefetches for all the - * indirect blocks asynchronously, so we can go on with whatever we were - * doing. - */ - if (curlevel == level) { - ASSERT3U(curblkid, ==, blkid); - dbuf_issue_final_prefetch(dpa, &bp); - kmem_free(dpa, sizeof (*dpa)); - } else { - arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; - zbookmark_phys_t zb; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) - iter_aflags |= ARC_FLAG_L2CACHE; - - SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, curlevel, curblkid); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - &bp, dbuf_prefetch_indirect_done, dpa, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &iter_aflags, &zb); - } - /* - * We use pio here instead of dpa_zio since it's possible that - * dpa may have already been freed. 
- */ - zio_nowait(pio); -} - -#define DBUF_HOLD_IMPL_MAX_DEPTH 20 - -/* - * Helper function for __dbuf_hold_impl() to copy a buffer. Handles - * the case of encrypted, compressed and uncompressed buffers by - * allocating the new buffer, respectively, with arc_alloc_raw_buf(), - * arc_alloc_compressed_buf() or arc_alloc_buf().* - * - * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl(). - */ -noinline static void -dbuf_hold_copy(struct dbuf_hold_impl_data *dh) -{ - dnode_t *dn = dh->dh_dn; - dmu_buf_impl_t *db = dh->dh_db; - dbuf_dirty_record_t *dr = dh->dh_dr; - arc_buf_t *data = dr->dt.dl.dr_data; - - enum zio_compress compress_type = arc_get_compression(data); - - if (compress_type != ZIO_COMPRESS_OFF) { - dbuf_set_data(db, arc_alloc_compressed_buf( - dn->dn_objset->os_spa, db, arc_buf_size(data), - arc_buf_lsize(data), compress_type)); - } else { - dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, - DBUF_GET_BUFC_TYPE(db), db->db.db_size)); - } - - bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); -} - -/* - * Returns with db_holds incremented, and db_mtx not held. - * Note: dn_struct_rwlock must be held. 
- */ -static int -__dbuf_hold_impl(struct dbuf_hold_impl_data *dh) -{ - ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); - dh->dh_parent = NULL; - - ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); - ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); - - *(dh->dh_dbp) = NULL; - - /* dbuf_find() returns with db_mtx held */ - dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, - dh->dh_level, dh->dh_blkid); - - if (dh->dh_db == NULL) { - dh->dh_bp = NULL; - - if (dh->dh_fail_uncached) - return (SET_ERROR(ENOENT)); - - ASSERT3P(dh->dh_parent, ==, NULL); - dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); - if (dh->dh_fail_sparse) { - if (dh->dh_err == 0 && - dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) - dh->dh_err = SET_ERROR(ENOENT); - if (dh->dh_err) { - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - return (dh->dh_err); - } - } - if (dh->dh_err && dh->dh_err != ENOENT) - return (dh->dh_err); - dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_parent, dh->dh_bp); - } - - if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { - mutex_exit(&dh->dh_db->db_mtx); - return (SET_ERROR(ENOENT)); - } - - if (dh->dh_db->db_buf != NULL) { - arc_buf_access(dh->dh_db->db_buf); - ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); - } - - ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); - - /* - * If this buffer is currently syncing out, and we are are - * still referencing it from db_data, we need to make a copy - * of it in case we decide we want to dirty it again in this txg. 
- */ - if (dh->dh_db->db_level == 0 && - dh->dh_db->db_blkid != DMU_BONUS_BLKID && - dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && - dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { - dh->dh_dr = dh->dh_db->db_data_pending; - if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) - dbuf_hold_copy(dh); - } - - if (multilist_link_active(&dh->dh_db->db_cache_link)) { - ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds)); - ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || - dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); - - multilist_remove( - dbuf_caches[dh->dh_db->db_caching_status].cache, - dh->dh_db); - (void) zfs_refcount_remove_many( - &dbuf_caches[dh->dh_db->db_caching_status].size, - dh->dh_db->db.db_size, dh->dh_db); - - if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMPDOWN(metadata_cache_count); - } else { - DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], - dh->dh_db->db.db_size); - } - dh->dh_db->db_caching_status = DB_NO_CACHE; - } - (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag); - DBUF_VERIFY(dh->dh_db); - mutex_exit(&dh->dh_db->db_mtx); - - /* NOTE: we can't rele the parent until after we drop the db_mtx */ - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - - ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); - ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); - ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); - *(dh->dh_dbp) = dh->dh_db; - - return (0); -} - -/* - * The following code preserves the recursive function dbuf_hold_impl() - * but moves the local variables AND function arguments to the heap to - * minimize the stack frame size. Enough space is initially allocated - * on the stack for 20 levels of recursion. 
- */ -int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) -{ - struct dbuf_hold_impl_data *dh; - int error; - - dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); - __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, - fail_uncached, tag, dbp, 0); - - error = __dbuf_hold_impl(dh); - - kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH); - - return (error); -} - -static void -__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp, int depth) -{ - dh->dh_dn = dn; - dh->dh_level = level; - dh->dh_blkid = blkid; - - dh->dh_fail_sparse = fail_sparse; - dh->dh_fail_uncached = fail_uncached; - - dh->dh_tag = tag; - dh->dh_dbp = dbp; - - dh->dh_db = NULL; - dh->dh_parent = NULL; - dh->dh_bp = NULL; - dh->dh_err = 0; - dh->dh_dr = NULL; - - dh->dh_depth = depth; -} - -dmu_buf_impl_t * -dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) -{ - return (dbuf_hold_level(dn, 0, blkid, tag)); -} - -dmu_buf_impl_t * -dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) -{ - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); - return (err ? 
NULL : db); -} - -void -dbuf_create_bonus(dnode_t *dn) -{ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); -} - -int -dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - - if (db->db_blkid != DMU_SPILL_BLKID) - return (SET_ERROR(ENOTSUP)); - if (blksz == 0) - blksz = SPA_MINBLOCKSIZE; - ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); - blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dbuf_new_size(db, blksz, tx); - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - - return (0); -} - -void -dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) -{ - dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); -} - -#pragma weak dmu_buf_add_ref = dbuf_add_ref -void -dbuf_add_ref(dmu_buf_impl_t *db, void *tag) -{ - int64_t holds = zfs_refcount_add(&db->db_holds, tag); - ASSERT3S(holds, >, 1); -} - -#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref -boolean_t -dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, - void *tag) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dmu_buf_impl_t *found_db; - boolean_t result = B_FALSE; - - if (db->db_blkid == DMU_BONUS_BLKID) - found_db = dbuf_find_bonus(os, obj); - else - found_db = dbuf_find(os, obj, 0, blkid); - - if (found_db != NULL) { - if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { - (void) zfs_refcount_add(&db->db_holds, tag); - result = B_TRUE; - } - mutex_exit(&db->db_mtx); - } - return (result); -} - -/* - * If you call dbuf_rele() you had better not be referencing the dnode handle - * unless you have some other direct or indirect hold on the dnode. (An indirect - * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 
- * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the - * dnode's parent dbuf evicting its dnode handles. - */ -void -dbuf_rele(dmu_buf_impl_t *db, void *tag) -{ - mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag, B_FALSE); -} - -void -dmu_buf_rele(dmu_buf_t *db, void *tag) -{ - dbuf_rele((dmu_buf_impl_t *)db, tag); -} - -/* - * dbuf_rele() for an already-locked dbuf. This is necessary to allow - * db_dirtycnt and db_holds to be updated atomically. The 'evicting' - * argument should be set if we are already in the dbuf-evicting code - * path, in which case we don't want to recursively evict. This allows us to - * avoid deeply nested stacks that would have a call flow similar to this: - * - * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() - * ^ | - * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ - * - */ -void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) -{ - int64_t holds; - uint64_t size; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - DBUF_VERIFY(db); - - /* - * Remove the reference to the dbuf before removing its hold on the - * dnode so we can guarantee in dnode_move() that a referenced bonus - * buffer has a corresponding dnode hold. - */ - holds = zfs_refcount_remove(&db->db_holds, tag); - ASSERT(holds >= 0); - - /* - * We can't freeze indirects if there is a possibility that they - * may be modified in the current syncing context. - */ - if (db->db_buf != NULL && - holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { - arc_buf_freeze(db->db_buf); - } - - if (holds == db->db_dirtycnt && - db->db_level == 0 && db->db_user_immediate_evict) - dbuf_evict_user(db); - - if (holds == 0) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dnode_t *dn; - boolean_t evict_dbuf = db->db_pending_evict; - - /* - * If the dnode moves here, we cannot cross this - * barrier until the move completes. 
- */ - DB_DNODE_ENTER(db); - - dn = DB_DNODE(db); - atomic_dec_32(&dn->dn_dbufs_count); - - /* - * Decrementing the dbuf count means that the bonus - * buffer's dnode hold is no longer discounted in - * dnode_move(). The dnode cannot move until after - * the dnode_rele() below. - */ - DB_DNODE_EXIT(db); - - /* - * Do not reference db after its lock is dropped. - * Another thread may evict it. - */ - mutex_exit(&db->db_mtx); - - if (evict_dbuf) - dnode_evict_bonus(dn); - - dnode_rele(dn, db); - } else if (db->db_buf == NULL) { - /* - * This is a special case: we never associated this - * dbuf with any data allocated from the ARC. - */ - ASSERT(db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL); - dbuf_destroy(db); - } else if (arc_released(db->db_buf)) { - /* - * This dbuf has anonymous data associated with it. - */ - dbuf_destroy(db); - } else { - boolean_t do_arc_evict = B_FALSE; - blkptr_t bp; - spa_t *spa = dmu_objset_spa(db->db_objset); - - if (!DBUF_IS_CACHEABLE(db) && - db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - do_arc_evict = B_TRUE; - bp = *db->db_blkptr; - } - - if (!DBUF_IS_CACHEABLE(db) || - db->db_pending_evict) { - dbuf_destroy(db); - } else if (!multilist_link_active(&db->db_cache_link)) { - ASSERT3U(db->db_caching_status, ==, - DB_NO_CACHE); - - dbuf_cached_state_t dcs = - dbuf_include_in_metadata_cache(db) ? 
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; - db->db_caching_status = dcs; - - multilist_insert(dbuf_caches[dcs].cache, db); - size = zfs_refcount_add_many( - &dbuf_caches[dcs].size, db->db.db_size, db); - - if (dcs == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMP(metadata_cache_count); - DBUF_STAT_MAX( - metadata_cache_size_bytes_max, - size); - } else { - DBUF_STAT_BUMP( - cache_levels[db->db_level]); - DBUF_STAT_BUMP(cache_count); - DBUF_STAT_INCR( - cache_levels_bytes[db->db_level], - db->db.db_size); - DBUF_STAT_MAX(cache_size_bytes_max, - size); - } - mutex_exit(&db->db_mtx); - - if (dcs == DB_DBUF_CACHE && !evicting) - dbuf_evict_notify(size); - } - - if (do_arc_evict) - arc_freed(spa, &bp); - } - } else { - mutex_exit(&db->db_mtx); - } - -} - -#pragma weak dmu_buf_refcount = dbuf_refcount -uint64_t -dbuf_refcount(dmu_buf_impl_t *db) -{ - return (zfs_refcount_count(&db->db_holds)); -} - -void * -dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, - dmu_buf_user_t *new_user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - mutex_enter(&db->db_mtx); - dbuf_verify_user(db, DBVU_NOT_EVICTING); - if (db->db_user == old_user) - db->db_user = new_user; - else - old_user = db->db_user; - dbuf_verify_user(db, DBVU_NOT_EVICTING); - mutex_exit(&db->db_mtx); - - return (old_user); -} - -void * -dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - return (dmu_buf_replace_user(db_fake, NULL, user)); -} - -void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_user_immediate_evict = TRUE; - return (dmu_buf_set_user(db_fake, user)); -} - -void * -dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - return (dmu_buf_replace_user(db_fake, user, NULL)); -} - -void * -dmu_buf_get_user(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - dbuf_verify_user(db, DBVU_NOT_EVICTING); - return (db->db_user); -} - -void 
-dmu_buf_user_evict_wait() -{ - taskq_wait(dbu_evict_taskq); -} - -blkptr_t * -dmu_buf_get_blkptr(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - return (dbi->db_blkptr); -} - -objset_t * -dmu_buf_get_objset(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - return (dbi->db_objset); -} - -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - -static void -dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) -{ - /* ASSERT(dmu_tx_is_syncing(tx) */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_blkptr != NULL) - return; - - if (db->db_blkid == DMU_SPILL_BLKID) { - db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); - BP_ZERO(db->db_blkptr); - return; - } - if (db->db_level == dn->dn_phys->dn_nlevels-1) { - /* - * This buffer was allocated at a time when there was - * no available blkptrs from the dnode, or it was - * inappropriate to hook it in (i.e., nlevels mis-match). 
- */ - ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); - ASSERT(db->db_parent == NULL); - db->db_parent = dn->dn_dbuf; - db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; - DBUF_VERIFY(db); - } else { - dmu_buf_impl_t *parent = db->db_parent; - int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT(dn->dn_phys->dn_nlevels > 1); - if (parent == NULL) { - mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - parent = dbuf_hold_level(dn, db->db_level + 1, - db->db_blkid >> epbs, db); - rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - db->db_parent = parent; - } - db->db_blkptr = (blkptr_t *)parent->db.db_data + - (db->db_blkid & ((1ULL << epbs) - 1)); - DBUF_VERIFY(db); - } -} - -/* - * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it - * is critical the we not allow the compiler to inline this function in to - * dbuf_sync_list() thereby drastically bloating the stack usage. - */ -noinline static void -dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - zio_t *zio; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - - ASSERT(db->db_level > 0); - DBUF_VERIFY(db); - - /* Read the block if it hasn't been read yet. */ - if (db->db_buf == NULL) { - mutex_exit(&db->db_mtx); - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - mutex_enter(&db->db_mtx); - } - ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT(db->db_buf != NULL); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - /* Indirect block size must match what the dnode thinks it is. 
*/ - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); - dbuf_check_blkptr(dn, db); - DB_DNODE_EXIT(db); - - /* Provide the pending dirty record to child dbufs */ - db->db_data_pending = dr; - - mutex_exit(&db->db_mtx); - - dbuf_write(dr, db->db_buf, tx); - - zio = dr->dr_zio; - mutex_enter(&dr->dt.di.dr_mtx); - dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - mutex_exit(&dr->dt.di.dr_mtx); - zio_nowait(zio); -} - -/* - * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is - * critical the we not allow the compiler to inline this function in to - * dbuf_sync_list() thereby drastically bloating the stack usage. - */ -noinline static void -dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - arc_buf_t **datap = &dr->dt.dl.dr_data; - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - objset_t *os; - uint64_t txg = tx->tx_txg; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - /* - * To be synced, we must be dirtied. But we - * might have been freed after the dirty. - */ - if (db->db_state == DB_UNCACHED) { - /* This buffer has been freed since it was dirtied */ - ASSERT(db->db.db_data == NULL); - } else if (db->db_state == DB_FILL) { - /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != dr->dt.dl.dr_data); - } else { - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); - } - DBUF_VERIFY(db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (db->db_blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - /* - * In the previous transaction group, the bonus buffer - * was entirely used to store the attributes for the - * dnode which overrode the dn_spill field. However, - * when adding more attributes to the file a spill - * block was required to hold the extra attributes. 
- * - * Make sure to clear the garbage left in the dn_spill - * field from the previous attributes in the bonus - * buffer. Otherwise, after writing out the spill - * block to the new allocated dva, it will free - * the old block pointed to by the invalid dn_spill. - */ - db->db_blkptr = NULL; - } - dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; - mutex_exit(&dn->dn_mtx); - } - - /* - * If this is a bonus buffer, simply copy the bonus data into the - * dnode. It will be written out when the dnode is synced (and it - * will be synced, since it must have been dirty for dbuf_sync to - * be called). - */ - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - - ASSERT(*datap != NULL); - ASSERT0(db->db_level); - ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, - DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - bcopy(*datap, DN_BONUS(dn->dn_phys), - DN_MAX_BONUS_LEN(dn->dn_phys)); - DB_DNODE_EXIT(db); - - if (*datap != db->db.db_data) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - zio_buf_free(*datap, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - } - db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT(dr->dr_next == NULL); - ASSERT(dr->dr_dbuf == db); - *drp = dr->dr_next; - if (dr->dr_dbuf->db_level != 0) { - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); - return; - } - - os = dn->dn_objset; - - /* - * This function may have dropped the db_mtx lock allowing a dmu_sync - * operation to sneak in. As a result, we need to ensure that we - * don't check the dr_override_state until we have returned from - * dbuf_check_blkptr. 
- */ - dbuf_check_blkptr(dn, db); - - /* - * If this buffer is in the middle of an immediate write, - * wait for the synchronous IO to complete. - */ - while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); - } - - if (db->db_state != DB_NOFILL && - dn->dn_object != DMU_META_DNODE_OBJECT && - zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). - */ - int psize = arc_buf_size(*datap); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - enum zio_compress compress_type = arc_get_compression(*datap); - - if (compress_type == ZIO_COMPRESS_OFF) { - *datap = arc_alloc_buf(os->os_spa, db, type, psize); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - int lsize = arc_buf_lsize(*datap); - *datap = arc_alloc_compressed_buf(os->os_spa, db, - psize, lsize, compress_type); - } - bcopy(db->db.db_data, (*datap)->b_data, psize); - } - db->db_data_pending = dr; - - mutex_exit(&db->db_mtx); - - dbuf_write(dr, *datap, tx); - - ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) { - list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - DB_DNODE_EXIT(db); - } else { - /* - * Although zio_nowait() does not "wait for an IO", it does - * initiate the IO. If this is an empty write it seems plausible - * that the IO could actually be completed before the nowait - * returns. We need to DB_DNODE_EXIT() first in case - * zio_nowait() invalidates the dbuf. 
- */ - DB_DNODE_EXIT(db); - zio_nowait(dr->dr_zio); - } -} - -void -dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) -{ - dbuf_dirty_record_t *dr; - - while (dr = list_head(list)) { - if (dr->dr_zio != NULL) { - /* - * If we find an already initialized zio then we - * are processing the meta-dnode, and we have finished. - * The dbufs for all dnodes are put back on the list - * during processing, so that we can zio_wait() - * these IOs after initiating all child IOs. - */ - ASSERT3U(dr->dr_dbuf->db.db_object, ==, - DMU_META_DNODE_OBJECT); - break; - } - if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - VERIFY3U(dr->dr_dbuf->db_level, ==, level); - } - list_remove(list, dr); - if (dr->dr_dbuf->db_level > 0) - dbuf_sync_indirect(dr, tx); - else - dbuf_sync_leaf(dr, tx); - } -} - -/* ARGSUSED */ -static void -dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - dnode_t *dn; - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - spa_t *spa = zio->io_spa; - int64_t delta; - uint64_t fill = 0; - int i; - - ASSERT3P(db->db_blkptr, !=, NULL); - ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); - dnode_diduse_space(dn, delta - zio->io_prev_space_delta); - zio->io_prev_space_delta = delta; - - if (bp->blk_birth != 0) { - ASSERT((db->db_blkid != DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_type) || - (db->db_blkid == DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_bonustype) || - BP_IS_EMBEDDED(bp)); - ASSERT(BP_GET_LEVEL(bp) == db->db_level); - } - - mutex_enter(&db->db_mtx); - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(bp)) && - db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - } -#endif - - if (db->db_level == 0) { - mutex_enter(&dn->dn_mtx); - if 
(db->db_blkid > dn->dn_phys->dn_maxblkid && - db->db_blkid != DMU_SPILL_BLKID) - dn->dn_phys->dn_maxblkid = db->db_blkid; - mutex_exit(&dn->dn_mtx); - - if (dn->dn_type == DMU_OT_DNODE) { - i = 0; - while (i < db->db.db_size) { - dnode_phys_t *dnp = - (void *)(((char *)db->db.db_data) + i); - - i += DNODE_MIN_SIZE; - if (dnp->dn_type != DMU_OT_NONE) { - fill++; - i += dnp->dn_extra_slots * - DNODE_MIN_SIZE; - } - } - } else { - if (BP_IS_HOLE(bp)) { - fill = 0; - } else { - fill = 1; - } - } - } else { - blkptr_t *ibp = db->db.db_data; - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { - if (BP_IS_HOLE(ibp)) - continue; - fill += BP_GET_FILL(ibp); - } - } - DB_DNODE_EXIT(db); - - if (!BP_IS_EMBEDDED(bp)) - bp->blk_fill = fill; - - mutex_exit(&db->db_mtx); - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - *db->db_blkptr = *bp; - rw_exit(&dn->dn_struct_rwlock); -} - -/* ARGSUSED */ -/* - * This function gets called just prior to running through the compression - * stage of the zio pipeline. If we're an indirect block comprised of only - * holes, then we want this indirect to be compressed away to a hole. In - * order to do that we must zero out any information about the holes that - * this indirect points to prior to before we try to compress it. - */ -static void -dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - dnode_t *dn; - blkptr_t *bp; - unsigned int epbs, i; - - ASSERT3U(db->db_level, >, 0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(epbs, <, 31); - - /* Determine if all our children are holes */ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { - if (!BP_IS_HOLE(bp)) - break; - } - - /* - * If all the children are holes, then zero them all out so that - * we may get compressed away. - */ - if (i == 1 << epbs) { - /* - * We only found holes. 
Grab the rwlock to prevent - * anybody from reading the blocks we're about to - * zero out. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); - } - DB_DNODE_EXIT(db); -} - -/* - * The SPA will call this callback several times for each zio - once - * for every physical child i/o (zio->io_phys_children times). This - * allows the DMU to monitor the progress of each logical i/o. For example, - * there may be 2 copies of an indirect block, or many fragments of a RAID-Z - * block. There may be a long delay before all copies/fragments are completed, - * so this callback allows us to retire dirty space gradually, as the physical - * i/os complete. - */ -/* ARGSUSED */ -static void -dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -{ - dmu_buf_impl_t *db = arg; - objset_t *os = db->db_objset; - dsl_pool_t *dp = dmu_objset_pool(os); - dbuf_dirty_record_t *dr; - int delta = 0; - - dr = db->db_data_pending; - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dsl_pool_sync()'s call to - * dsl_pool_undirty_space(). - */ - delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - -/* ARGSUSED */ -static void -dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - blkptr_t *bp_orig = &zio->io_bp_orig; - blkptr_t *bp = db->db_blkptr; - objset_t *os = db->db_objset; - dmu_tx_t *tx = os->os_synctx; - dbuf_dirty_record_t **drp, *dr; - - ASSERT0(zio->io_error); - ASSERT(db->db_blkptr == bp); - - /* - * For nopwrites and rewrites we ensure that the bp matches our - * original and bypass all the accounting. 
- */ - if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { - ASSERT(BP_EQUAL(bp, bp_orig)); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); - dsl_dataset_block_born(ds, bp, tx); - } - - mutex_enter(&db->db_mtx); - - DBUF_VERIFY(db); - - drp = &db->db_last_dirty; - while ((dr = *drp) != db->db_data_pending) - drp = &dr->dr_next; - ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_dbuf == db); - ASSERT(dr->dr_next == NULL); - *drp = dr->dr_next; - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && - db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - DB_DNODE_EXIT(db); - } -#endif - - if (db->db_level == 0) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); - } - } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); - if (!BP_IS_HOLE(db->db_blkptr)) { - int epbs = - dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(db->db_blkid, <=, - dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - db->db.db_size); - } - DB_DNODE_EXIT(db); - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - cv_broadcast(&db->db_changed); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); -} - -static void -dbuf_write_nofill_ready(zio_t *zio) -{ - dbuf_write_ready(zio, NULL, zio->io_private); -} - -static void 
-dbuf_write_nofill_done(zio_t *zio) -{ - dbuf_write_done(zio, NULL, zio->io_private); -} - -static void -dbuf_write_override_ready(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dmu_buf_impl_t *db = dr->dr_dbuf; - - dbuf_write_ready(zio, NULL, db); -} - -static void -dbuf_write_override_done(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dmu_buf_impl_t *db = dr->dr_dbuf; - blkptr_t *obp = &dr->dt.dl.dr_overridden_by; - - mutex_enter(&db->db_mtx); - if (!BP_EQUAL(zio->io_bp, obp)) { - if (!BP_IS_HOLE(obp)) - dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); - arc_release(dr->dt.dl.dr_data, db); - } - mutex_exit(&db->db_mtx); - dbuf_write_done(zio, NULL, db); - - if (zio->io_abd != NULL) - abd_put(zio->io_abd); -} - -typedef struct dbuf_remap_impl_callback_arg { - objset_t *drica_os; - uint64_t drica_blk_birth; - dmu_tx_t *drica_tx; -} dbuf_remap_impl_callback_arg_t; - -static void -dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, - void *arg) -{ - dbuf_remap_impl_callback_arg_t *drica = arg; - objset_t *os = drica->drica_os; - spa_t *spa = dmu_objset_spa(os); - dmu_tx_t *tx = drica->drica_tx; - - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - if (os == spa_meta_objset(spa)) { - spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); - } else { - dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, - size, drica->drica_blk_birth, tx); - } -} - -static void -dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) -{ - blkptr_t bp_copy = *bp; - spa_t *spa = dmu_objset_spa(dn->dn_objset); - dbuf_remap_impl_callback_arg_t drica; - - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = bp->blk_birth; - drica.drica_tx = tx; - if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, - &drica)) { - /* - * The struct_rwlock prevents dbuf_read_impl() from - * dereferencing the BP while we are changing it. 
To - * avoid lock contention, only grab it when we are actually - * changing the BP. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - *bp = bp_copy; - rw_exit(&dn->dn_struct_rwlock); - } -} - -/* - * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting - * to remap a copy of every bp in the dbuf. - */ -boolean_t -dbuf_can_remap(const dmu_buf_impl_t *db) -{ - spa_t *spa = dmu_objset_spa(db->db_objset); - blkptr_t *bp = db->db.db_data; - boolean_t ret = B_FALSE; - - ASSERT3U(db->db_level, >, 0); - ASSERT3S(db->db_state, ==, DB_CACHED); - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - blkptr_t bp_copy = bp[i]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - -boolean_t -dnode_needs_remap(const dnode_t *dn) -{ - spa_t *spa = dmu_objset_spa(dn->dn_objset); - boolean_t ret = B_FALSE; - - if (dn->dn_phys->dn_nlevels == 0) { - return (B_FALSE); - } - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { - blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - -/* - * Remap any existing BP's to concrete vdevs, if possible. 
- */ -static void -dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(db->db_objset); - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) - return; - - if (db->db_level > 0) { - blkptr_t *bp = db->db.db_data; - for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - dbuf_remap_impl(dn, &bp[i], tx); - } - } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { - dnode_phys_t *dnp = db->db.db_data; - ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, - DMU_OT_DNODE); - for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; - i += dnp[i].dn_extra_slots + 1) { - for (int j = 0; j < dnp[i].dn_nblkptr; j++) { - dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); - } - } - } -} - - -/* Issue I/O to commit a dirty buffer to disk. */ -static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - objset_t *os; - dmu_buf_impl_t *parent = db->db_parent; - uint64_t txg = tx->tx_txg; - zbookmark_phys_t zb; - zio_prop_t zp; - zio_t *zio; - int wp_flag = 0; - - ASSERT(dmu_tx_is_syncing(tx)); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - os = dn->dn_objset; - - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } - } - - if (parent != dn->dn_dbuf) { - /* Our parent is an indirect block. */ - /* We have a dirty parent that has been scheduled for write. */ - ASSERT(parent && parent->db_data_pending); - /* Our parent's buffer is one level closer to the dnode. 
*/ - ASSERT(db->db_level == parent->db_level-1); - /* - * We're about to modify our parent's db_data by modifying - * our block pointer, so the parent must be released. - */ - ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; - } else { - /* Our parent is the dnode itself. */ - ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && - db->db_blkid != DMU_SPILL_BLKID) || - (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); - if (db->db_blkid != DMU_SPILL_BLKID) - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; - } - - ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); - - SET_BOOKMARK(&zb, os->os_dsl_dataset ? - os->os_dsl_dataset->ds_object : DMU_META_OBJSET, - db->db.db_object, db->db_level, db->db_blkid); - - if (db->db_blkid == DMU_SPILL_BLKID) - wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; - - dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); - DB_DNODE_EXIT(db); - - /* - * We copy the blkptr now (rather than when we instantiate the dirty - * record), because its value can change between open context and - * syncing context. We do not need to hold dn_struct_rwlock to read - * db_blkptr because we are in syncing context. - */ - dr->dr_bp_copy = *db->db_blkptr; - - if (db->db_level == 0 && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * The BP for this block has been provided by open context - * (by dmu_sync() or dmu_buf_write_embedded()). - */ - abd_t *contents = (data != NULL) ? 
- abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; - - dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, - contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, - dbuf_write_override_done, - dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - mutex_enter(&db->db_mtx); - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); - mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || - zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); - dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, NULL, - dbuf_write_nofill_done, db, - ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); - } else { - ASSERT(arc_released(data)); - - /* - * For indirect blocks, we want to setup the children - * ready callback so that we can properly handle an indirect - * block that only contains holes. - */ - arc_write_done_func_t *children_ready_cb = NULL; - if (db->db_level != 0) - children_ready_cb = dbuf_write_children_ready; - - dr->dr_zio = arc_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - &zp, dbuf_write_ready, children_ready_cb, - dbuf_write_physdone, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c deleted file mode 100644 index 0a86830f71ad..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#include -#include -#include - -/* - * Calculate the index of the arc header for the state, disabled by default. - */ -int zfs_dbuf_state_index = 0; - -/* - * ========================================================================== - * Dbuf Hash Read Routines - * ========================================================================== - */ -typedef struct dbuf_stats_t { - kmutex_t lock; - kstat_t *kstat; - dbuf_hash_table_t *hash; - int idx; -} dbuf_stats_t; - -static dbuf_stats_t dbuf_stats_hash_table; - -static int -dbuf_stats_hash_table_headers(char *buf, size_t size) -{ - size = snprintf(buf, size - 1, - "%-88s | %-124s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " - "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " - "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " - "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", - "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", - "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", - "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", - "dtype", "btype", "data_bs", "meta_bs", "bsize", - "lvls", "dholds", "blocks", "dsize"); - buf[size] = '\0'; - - return (0); -} - -int -__dbuf_stats_hash_table_data(char *buf, size_t size, 
dmu_buf_impl_t *db) -{ - arc_buf_info_t abi = { 0 }; - dmu_object_info_t doi = { 0 }; - dnode_t *dn = DB_DNODE(db); - - if (db->db_buf) - arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); - - if (dn) - __dmu_object_info_from_dnode(dn, &doi); - - size = snprintf(buf, size - 1, - "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " - "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " - "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " - "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", - /* dmu_buf_impl_t */ - spa_name(dn->dn_objset->os_spa), - (u_longlong_t)dmu_objset_id(db->db_objset), - (longlong_t)db->db.db_object, - (longlong_t)db->db_level, - (longlong_t)db->db_blkid, - (u_longlong_t)db->db.db_offset, - (u_longlong_t)db->db.db_size, - !!dbuf_is_metadata(db), - db->db_state, - (ulong_t)zfs_refcount_count(&db->db_holds), - /* arc_buf_info_t */ - abi.abi_state_type, - abi.abi_state_contents, - (longlong_t)abi.abi_state_index, - abi.abi_flags, - (ulong_t)abi.abi_bufcnt, - (u_longlong_t)abi.abi_size, - (u_longlong_t)abi.abi_access, - (ulong_t)abi.abi_mru_hits, - (ulong_t)abi.abi_mru_ghost_hits, - (ulong_t)abi.abi_mfu_hits, - (ulong_t)abi.abi_mfu_ghost_hits, - (ulong_t)abi.abi_l2arc_hits, - (u_longlong_t)abi.abi_l2arc_dattr, - (u_longlong_t)abi.abi_l2arc_asize, - abi.abi_l2arc_compress, - (ulong_t)abi.abi_holds, - /* dmu_object_info_t */ - doi.doi_type, - doi.doi_bonus_type, - (ulong_t)doi.doi_data_block_size, - (ulong_t)doi.doi_metadata_block_size, - (u_longlong_t)doi.doi_bonus_size, - (ulong_t)doi.doi_indirection, - (ulong_t)zfs_refcount_count(&dn->dn_holds), - (u_longlong_t)doi.doi_fill_count, - (u_longlong_t)doi.doi_max_offset); - buf[size] = '\0'; - - return (size); -} - -static int -dbuf_stats_hash_table_data(char *buf, size_t size, void *data) -{ - dbuf_stats_t *dsh = (dbuf_stats_t *)data; - dbuf_hash_table_t *h = dsh->hash; - dmu_buf_impl_t *db; - int length, error = 0; - - ASSERT3S(dsh->idx, >=, 0); - ASSERT3S(dsh->idx, 
<=, h->hash_table_mask); - memset(buf, 0, size); - - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); - for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { - /* - * Returning ENOMEM will cause the data and header functions - * to be called with a larger scratch buffers. - */ - if (size < 512) { - error = ENOMEM; - break; - } - - mutex_enter(&db->db_mtx); - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); - - length = __dbuf_stats_hash_table_data(buf, size, db); - buf += length; - size -= length; - - mutex_exit(&db->db_mtx); - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); - } - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); - - return (error); -} - -static void * -dbuf_stats_hash_table_addr(kstat_t *ksp, off_t n) -{ - dbuf_stats_t *dsh = ksp->ks_private; - - ASSERT(MUTEX_HELD(&dsh->lock)); - - if (n <= dsh->hash->hash_table_mask) { - dsh->idx = n; - return (dsh); - } - - return (NULL); -} - -#ifndef __FreeBSD__ -/* - * XXX The FreeBSD SPL is missing support for KSTAT_TYPE_RAW - * we can enable this as soon as that's implemented. See the - * lindebugfs module for similar callback semantics. 
- */ -static void -dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -{ - dbuf_stats_t *dsh = &dbuf_stats_hash_table; - kstat_t *ksp; - - mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); - dsh->hash = hash; - - ksp = kstat_create("zfs", 0, "dbufs", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - dsh->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &dsh->lock; - ksp->ks_ndata = UINT32_MAX; - ksp->ks_private = dsh; - kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, - dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); - kstat_install(ksp); - } -} - -static void -dbuf_stats_hash_table_destroy(void) -{ - dbuf_stats_t *dsh = &dbuf_stats_hash_table; - kstat_t *ksp; - - ksp = dsh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_destroy(&dsh->lock); -} -#else -static void -dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -{ -} - -static void -dbuf_stats_hash_table_destroy(void) -{ -} -#endif - -void -dbuf_stats_init(dbuf_hash_table_t *hash) -{ - dbuf_stats_hash_table_init(hash); -} - -void -dbuf_stats_destroy(void) -{ - dbuf_stats_hash_table_destroy(); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c deleted file mode 100644 index 964aa6c054f5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c +++ /dev/null @@ -1,1189 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Enable/disable prefetching of dedup-ed blocks which are going to be freed. - */ -int zfs_dedup_prefetch = 1; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS DEDUP"); -SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch, - 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); - -static const ddt_ops_t *ddt_ops[DDT_TYPES] = { - &ddt_zap_ops, -}; - -static const char *ddt_class_name[DDT_CLASSES] = { - "ditto", - "duplicate", - "unique", -}; - -static void -ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - spa_t *spa = ddt->ddt_spa; - objset_t *os = ddt->ddt_os; - uint64_t *objectp = &ddt->ddt_object[type][class]; - boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - ASSERT(*objectp == 0); - VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); - ASSERT(*objectp != 0); - - VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx) == 0); - - VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); -} - -static void -ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - spa_t *spa = 
ddt->ddt_spa; - objset_t *os = ddt->ddt_os; - uint64_t *objectp = &ddt->ddt_object[type][class]; - uint64_t count; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - ASSERT(*objectp != 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); - ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); - VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); - VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); - VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); - bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); - - *objectp = 0; -} - -static int -ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; - dmu_object_info_t doi; - uint64_t count; - char name[DDT_NAMELEN]; - int error; - - ddt_object_name(ddt, type, class, name); - - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); - - if (error != 0) - return (error); - - VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class])); - - /* - * Seed the cached statistics. 
- */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - - error = ddt_object_count(ddt, type, class, &count); - if (error) - return error; - - ddo->ddo_count = count; - ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; - ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; - - return (0); -} - -static void -ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; - dmu_object_info_t doi; - uint64_t count; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); - - /* - * Cache DDT statistics; this is the only time they'll change. - */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0); - - ddo->ddo_count = count; - ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; - ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; -} - -static int -ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) -{ - if (!ddt_object_exists(ddt, type, class)) - return (SET_ERROR(ENOENT)); - - return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, - ddt->ddt_object[type][class], dde)); -} - -static void -ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) -{ - if (!ddt_object_exists(ddt, type, class)) - return; - - ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, - ddt->ddt_object[type][class], dde); -} - -int -ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); -} - -static int -ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum 
ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); -} - -int -ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - uint64_t *walk, ddt_entry_t *dde) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], dde, walk)); -} - -int -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, - ddt->ddt_object[type][class], count)); -} - -int -ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_object_info_t *doi) -{ - if (!ddt_object_exists(ddt, type, class)) - return (SET_ERROR(ENOENT)); - - return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], - doi)); -} - -boolean_t -ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - return (!!ddt->ddt_object[type][class]); -} - -void -ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - char *name) -{ - (void) sprintf(name, DMU_POOL_DDT, - zio_checksum_table[ddt->ddt_checksum].ci_name, - ddt_ops[type]->ddt_op_name, ddt_class_name[class]); -} - -void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) -{ - ASSERT(txg != 0); - - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); -} - -void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) -{ - BP_ZERO(bp); - - if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); - - bp->blk_cksum = ddk->ddk_cksum; - bp->blk_fill = 1; - - BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); - BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); - BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); - BP_SET_CHECKSUM(bp, checksum); - 
BP_SET_TYPE(bp, DMU_OT_DEDUP); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); -} - -void -ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) -{ - ddk->ddk_cksum = bp->blk_cksum; - ddk->ddk_prop = 0; - - DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); -} - -void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) -{ - ASSERT(ddp->ddp_phys_birth == 0); - - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); -} - -void -ddt_phys_clear(ddt_phys_t *ddp) -{ - bzero(ddp, sizeof (*ddp)); -} - -void -ddt_phys_addref(ddt_phys_t *ddp) -{ - ddp->ddp_refcnt++; -} - -void -ddt_phys_decref(ddt_phys_t *ddp) -{ - if (ddp) { - ASSERT((int64_t)ddp->ddp_refcnt > 0); - ddp->ddp_refcnt--; - } -} - -void -ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) -{ - blkptr_t blk; - - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - ddt_phys_clear(ddp); - zio_free(ddt->ddt_spa, txg, &blk); -} - -ddt_phys_t * -ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) -{ - ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) - return (ddp); - } - return (NULL); -} - -uint64_t -ddt_phys_total_refcnt(const ddt_entry_t *dde) -{ - uint64_t refcnt = 0; - - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys[p].ddp_refcnt; - - return (refcnt); -} - -static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) -{ - spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); - - bzero(dds, sizeof (*dds)); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - uint64_t dsize = 0; - 
uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) - continue; - - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); - - dds->dds_blocks += 1; - dds->dds_lsize += lsize; - dds->dds_psize += psize; - dds->dds_dsize += dsize; - - dds->dds_ref_blocks += refcnt; - dds->dds_ref_lsize += lsize * refcnt; - dds->dds_ref_psize += psize * refcnt; - dds->dds_ref_dsize += dsize * refcnt; - } -} - -void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) -{ - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); - - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ - - while (d < d_end) - *d++ += (*s++ ^ neg) - neg; -} - -static void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) -{ - ddt_stat_t dds; - ddt_histogram_t *ddh; - int bucket; - - ddt_stat_generate(ddt, dde, &dds); - - bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT(bucket >= 0); - - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; - - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); -} - -void -ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) -{ - for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); -} - -void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) -{ - bzero(dds, sizeof (*dds)); - - for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); -} - -boolean_t -ddt_histogram_empty(const ddt_histogram_t *ddh) -{ - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); - - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); - - return (B_TRUE); -} - -void -ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) -{ - /* Sum the statistics we cached in ddt_object_sync(). 
*/ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_object_t *ddo = - &ddt->ddt_object_stats[type][class]; - ddo_total->ddo_count += ddo->ddo_count; - ddo_total->ddo_dspace += ddo->ddo_dspace; - ddo_total->ddo_mspace += ddo->ddo_mspace; - } - } - } - - /* ... and compute the averages. */ - if (ddo_total->ddo_count != 0) { - ddo_total->ddo_dspace /= ddo_total->ddo_count; - ddo_total->ddo_mspace /= ddo_total->ddo_count; - } -} - -void -ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) -{ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_histogram_add(ddh, - &ddt->ddt_histogram_cache[type][class]); - } - } - } -} - -void -ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) -{ - ddt_histogram_t *ddh_total; - - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); - ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); - kmem_free(ddh_total, sizeof (ddt_histogram_t)); -} - -uint64_t -ddt_get_dedup_dspace(spa_t *spa) -{ - ddt_stat_t dds_total = { 0 }; - - ddt_get_dedup_stats(spa, &dds_total); - return (dds_total.dds_ref_dsize - dds_total.dds_dsize); -} - -uint64_t -ddt_get_pool_dedup_ratio(spa_t *spa) -{ - ddt_stat_t dds_total = { 0 }; - - ddt_get_dedup_stats(spa, &dds_total); - if (dds_total.dds_dsize == 0) - return (100); - - return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); -} - -int -ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) -{ - spa_t *spa = ddt->ddt_spa; - uint64_t total_refcnt = 0; - uint64_t ditto = spa->spa_dedup_ditto; - int total_copies = 0; - int desired_copies = 0; - - for (int p = DDT_PHYS_SINGLE; p <= 
DDT_PHYS_TRIPLE; p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - zio_t *zio = dde->dde_lead_zio[p]; - uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ - if (zio != NULL) - refcnt += zio->io_parent_count; /* pending refs */ - if (ddp == ddp_willref) - refcnt++; /* caller's ref */ - if (refcnt != 0) { - total_refcnt += refcnt; - total_copies += p; - } - } - - if (ditto == 0 || ditto > UINT32_MAX) - ditto = UINT32_MAX; - - if (total_refcnt >= 1) - desired_copies++; - if (total_refcnt >= ditto) - desired_copies++; - if (total_refcnt >= ditto * ditto) - desired_copies++; - - return (MAX(desired_copies, total_copies) - total_copies); -} - -int -ddt_ditto_copies_present(ddt_entry_t *dde) -{ - ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; - dva_t *dva = ddp->ddp_dva; - int copies = 0 - DVA_GET_GANG(dva); - - for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) - if (DVA_IS_VALID(dva)) - copies++; - - ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); - - return (copies); -} - -size_t -ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) -{ - uchar_t *version = dst++; - int cpfunc = ZIO_COMPRESS_ZLE; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - size_t c_len; - - ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ - - c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); - - if (c_len == s_len) { - cpfunc = ZIO_COMPRESS_OFF; - bcopy(src, dst, s_len); - } - - *version = cpfunc; - /* CONSTCOND */ - if (ZFS_HOST_BYTEORDER) - *version |= DDT_COMPRESS_BYTEORDER_MASK; - - return (c_len + 1); -} - -void -ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) -{ - uchar_t version = *src++; - int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - - if (ci->ci_decompress != NULL) - (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - else - bcopy(src, dst, d_len); - - if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != - (ZFS_HOST_BYTEORDER != 
0)) - byteswap_uint64_array(dst, d_len); -} - -ddt_t * -ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) -{ - return (spa->spa_ddt[c]); -} - -ddt_t * -ddt_select(spa_t *spa, const blkptr_t *bp) -{ - return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); -} - -void -ddt_enter(ddt_t *ddt) -{ - mutex_enter(&ddt->ddt_lock); -} - -void -ddt_exit(ddt_t *ddt) -{ - mutex_exit(&ddt->ddt_lock); -} - -static ddt_entry_t * -ddt_alloc(const ddt_key_t *ddk) -{ - ddt_entry_t *dde; - - dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); - cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); - - dde->dde_key = *ddk; - - return (dde); -} - -static void -ddt_free(ddt_entry_t *dde) -{ - ASSERT(!dde->dde_loading); - - for (int p = 0; p < DDT_PHYS_TYPES; p++) - ASSERT(dde->dde_lead_zio[p] == NULL); - - if (dde->dde_repair_abd != NULL) - abd_free(dde->dde_repair_abd); - - cv_destroy(&dde->dde_cv); - kmem_free(dde, sizeof (*dde)); -} - -void -ddt_remove(ddt_t *ddt, ddt_entry_t *dde) -{ - ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - - avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); -} - -ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) -{ - ddt_entry_t *dde, dde_search; - enum ddt_type type; - enum ddt_class class; - avl_index_t where; - int error; - - ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - - ddt_key_fill(&dde_search.dde_key, bp); - - dde = avl_find(&ddt->ddt_tree, &dde_search, &where); - if (dde == NULL) { - if (!add) - return (NULL); - dde = ddt_alloc(&dde_search.dde_key); - avl_insert(&ddt->ddt_tree, dde, where); - } - - while (dde->dde_loading) - cv_wait(&dde->dde_cv, &ddt->ddt_lock); - - if (dde->dde_loaded) - return (dde); - - dde->dde_loading = B_TRUE; - - ddt_exit(ddt); - - error = ENOENT; - - for (type = 0; type < DDT_TYPES; type++) { - for (class = 0; class < DDT_CLASSES; class++) { - error = ddt_object_lookup(ddt, type, class, dde); - if (error != ENOENT) { - ASSERT0(error); - break; - } - } - if (error != ENOENT) - break; - } - - ddt_enter(ddt); - - 
ASSERT(dde->dde_loaded == B_FALSE); - ASSERT(dde->dde_loading == B_TRUE); - - dde->dde_type = type; /* will be DDT_TYPES if no entry found */ - dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ - dde->dde_loaded = B_TRUE; - dde->dde_loading = B_FALSE; - - if (error == 0) - ddt_stat_update(ddt, dde, -1ULL); - - cv_broadcast(&dde->dde_cv); - - return (dde); -} - -void -ddt_prefetch(spa_t *spa, const blkptr_t *bp) -{ - ddt_t *ddt; - ddt_entry_t dde; - - if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) - return; - - /* - * We only remove the DDT once all tables are empty and only - * prefetch dedup blocks when there are entries in the DDT. - * Thus no locking is required as the DDT can't disappear on us. - */ - ddt = ddt_select(spa, bp); - ddt_key_fill(&dde.dde_key, bp); - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - ddt_object_prefetch(ddt, type, class, &dde); - } - } -} - -/* - * Opaque struct used for ddt_key comparison - */ -#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) - -typedef struct ddt_key_cmp { - uint16_t u16[DDT_KEY_CMP_LEN]; -} ddt_key_cmp_t; - -int -ddt_entry_compare(const void *x1, const void *x2) -{ - const ddt_entry_t *dde1 = x1; - const ddt_entry_t *dde2 = x2; - const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key; - const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key; - int32_t cmp = 0; - - for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { - cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i]; - if (likely(cmp)) - break; - } - - return (AVL_ISIGN(cmp)); -} - -static ddt_t * -ddt_table_alloc(spa_t *spa, enum zio_checksum c) -{ - ddt_t *ddt; - - ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); - - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&ddt->ddt_tree, ddt_entry_compare, - sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); - avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, - sizeof 
(ddt_entry_t), offsetof(ddt_entry_t, dde_node)); - ddt->ddt_checksum = c; - ddt->ddt_spa = spa; - ddt->ddt_os = spa->spa_meta_objset; - - return (ddt); -} - -static void -ddt_table_free(ddt_t *ddt) -{ - ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); - ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); - avl_destroy(&ddt->ddt_tree); - avl_destroy(&ddt->ddt_repair_tree); - mutex_destroy(&ddt->ddt_lock); - kmem_free(ddt, sizeof (*ddt)); -} - -void -ddt_create(spa_t *spa) -{ - spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; - - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - spa->spa_ddt[c] = ddt_table_alloc(spa, c); -} - -int -ddt_load(spa_t *spa) -{ - int error; - - ddt_create(spa); - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, - &spa->spa_ddt_stat_object); - - if (error) - return (error == ENOENT ? 0 : error); - - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - error = ddt_object_load(ddt, type, class); - if (error != 0 && error != ENOENT) - return (error); - } - } - - /* - * Seed the cached histograms. 
- */ - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, - sizeof (ddt->ddt_histogram)); - } - - return (0); -} - -void -ddt_unload(spa_t *spa) -{ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - if (spa->spa_ddt[c]) { - ddt_table_free(spa->spa_ddt[c]); - spa->spa_ddt[c] = NULL; - } - } -} - -boolean_t -ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) -{ - ddt_t *ddt; - ddt_entry_t dde; - - if (!BP_GET_DEDUP(bp)) - return (B_FALSE); - - if (max_class == DDT_CLASS_UNIQUE) - return (B_TRUE); - - ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; - - ddt_key_fill(&dde.dde_key, bp); - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) - for (enum ddt_class class = 0; class <= max_class; class++) - if (ddt_object_lookup(ddt, type, class, &dde) == 0) - return (B_TRUE); - - return (B_FALSE); -} - -ddt_entry_t * -ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) -{ - ddt_key_t ddk; - ddt_entry_t *dde; - - ddt_key_fill(&ddk, bp); - - dde = ddt_alloc(&ddk); - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - /* - * We can only do repair if there are multiple copies - * of the block. For anything in the UNIQUE class, - * there's definitely only one copy, so don't even try. 
- */ - if (class != DDT_CLASS_UNIQUE && - ddt_object_lookup(ddt, type, class, dde) == 0) - return (dde); - } - } - - bzero(dde->dde_phys, sizeof (dde->dde_phys)); - - return (dde); -} - -void -ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) -{ - avl_index_t where; - - ddt_enter(ddt); - - if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && - avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) - avl_insert(&ddt->ddt_repair_tree, dde, where); - else - ddt_free(dde); - - ddt_exit(ddt); -} - -static void -ddt_repair_entry_done(zio_t *zio) -{ - ddt_entry_t *rdde = zio->io_private; - - ddt_free(rdde); -} - -static void -ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) -{ - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *rddp = rdde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - ddt_key_t *rddk = &rdde->dde_key; - zio_t *zio; - blkptr_t blk; - - zio = zio_null(rio, rio->io_spa, NULL, - ddt_repair_entry_done, rdde, rio->io_flags); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth != rddp->ddp_phys_birth || - bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) - continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, - ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); - } - - zio_nowait(zio); -} - -static void -ddt_repair_table(ddt_t *ddt, zio_t *rio) -{ - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde, *rdde_next, *rdde; - avl_tree_t *t = &ddt->ddt_repair_tree; - blkptr_t blk; - - if (spa_sync_pass(spa) > 1) - return; - - ddt_enter(ddt); - for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { - rdde_next = AVL_NEXT(t, rdde); - avl_remove(&ddt->ddt_repair_tree, rdde); - ddt_exit(ddt); - ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); - dde = ddt_repair_start(ddt, &blk); - ddt_repair_entry(ddt, dde, rdde, rio); - 
ddt_repair_done(ddt, dde); - ddt_enter(ddt); - } - ddt_exit(ddt); -} - -static void -ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) -{ - dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - enum ddt_type otype = dde->dde_type; - enum ddt_type ntype = DDT_TYPE_CURRENT; - enum ddt_class oclass = dde->dde_class; - enum ddt_class nclass; - uint64_t total_refcnt = 0; - - ASSERT(dde->dde_loaded); - ASSERT(!dde->dde_loading); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - ASSERT(dde->dde_lead_zio[p] == NULL); - ASSERT((int64_t)ddp->ddp_refcnt >= 0); - if (ddp->ddp_phys_birth == 0) { - ASSERT(ddp->ddp_refcnt == 0); - continue; - } - if (p == DDT_PHYS_DITTO) { - if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - continue; - } - if (ddp->ddp_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - total_refcnt += ddp->ddp_refcnt; - } - - if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) - nclass = DDT_CLASS_DITTO; - else if (total_refcnt > 1) - nclass = DDT_CLASS_DUPLICATE; - else - nclass = DDT_CLASS_UNIQUE; - - if (otype != DDT_TYPES && - (otype != ntype || oclass != nclass || total_refcnt == 0)) { - VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); - ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); - } - - if (total_refcnt != 0) { - dde->dde_type = ntype; - dde->dde_class = nclass; - ddt_stat_update(ddt, dde, 0); - if (!ddt_object_exists(ddt, ntype, nclass)) - ddt_object_create(ddt, ntype, nclass, tx); - VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); - - /* - * If the class changes, the order that we scan this bp - * changes. If it decreases, we could miss it, so - * scan it right now. (This covers both class changing - * while we are doing ddt_walk(), and when we are - * traversing.) 
- */ - if (nclass < oclass) { - dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, dde, tx); - } - } -} - -static void -ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) -{ - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde; - void *cookie = NULL; - - if (avl_numnodes(&ddt->ddt_tree) == 0) - return; - - ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); - - if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, tx); - } - - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { - ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(dde); - } - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - uint64_t add, count = 0; - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - if (ddt_object_exists(ddt, type, class)) { - ddt_object_sync(ddt, type, class, tx); - VERIFY(ddt_object_count(ddt, type, class, - &add) == 0); - count += add; - } - } - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - if (count == 0 && ddt_object_exists(ddt, type, class)) - ddt_object_destroy(ddt, type, class, tx); - } - } - - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, - sizeof (ddt->ddt_histogram)); -} - -void -ddt_sync(spa_t *spa, uint64_t txg) -{ - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - dmu_tx_t *tx; - zio_t *rio; - - ASSERT(spa_syncing_txg(spa) == txg); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); - - /* - * This function may cause an immediate scan of ddt blocks (see - * the comment above dsl_scan_ddt() for details). We set the - * scan's root zio here so that we can wait for any scan IOs in - * addition to the regular ddt IOs. 
- */ - ASSERT3P(scn->scn_zio_root, ==, NULL); - scn->scn_zio_root = rio; - - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - if (ddt == NULL) - continue; - ddt_sync_table(ddt, tx, txg); - ddt_repair_table(ddt, rio); - } - - (void) zio_wait(rio); - scn->scn_zio_root = NULL; - - dmu_tx_commit(tx); -} - -int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) -{ - do { - do { - do { - ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; - int error = ENOENT; - if (ddt_object_exists(ddt, ddb->ddb_type, - ddb->ddb_class)) { - error = ddt_object_walk(ddt, - ddb->ddb_type, ddb->ddb_class, - &ddb->ddb_cursor, dde); - } - dde->dde_type = ddb->ddb_type; - dde->dde_class = ddb->ddb_class; - if (error == 0) - return (0); - if (error != ENOENT) - return (error); - ddb->ddb_cursor = 0; - } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); - ddb->ddb_checksum = 0; - } while (++ddb->ddb_type < DDT_TYPES); - ddb->ddb_type = 0; - } while (++ddb->ddb_class < DDT_CLASSES); - - return (SET_ERROR(ENOENT)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c deleted file mode 100644 index b2202fb91531..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -int ddt_zap_leaf_blockshift = 12; -int ddt_zap_indirect_blockshift = 12; - -static int -ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) -{ - zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; - - if (prehash) - flags |= ZAP_FLAG_PRE_HASHED_KEY; - - *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, - DMU_OT_NONE, 0, tx); - - return (*objectp == 0 ? ENOTSUP : 0); -} - -static int -ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - return (zap_destroy(os, object, tx)); -} - -static int -ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) -{ - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t one, csize; - int error; - - error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, &one, &csize); - if (error) - return (error); - - ASSERT(one == 1); - ASSERT(csize <= sizeof (cbuf)); - - error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, 1, csize, cbuf); - if (error) - return (error); - - ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); - - return (0); -} - -static void -ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) -{ - (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS); -} - -static int -ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) -{ - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t csize; - - csize = ddt_compress(dde->dde_phys, cbuf, 
- sizeof (dde->dde_phys), sizeof (cbuf)); - - return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, 1, csize, cbuf, tx)); -} - -static int -ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) -{ - return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, tx)); -} - -static int -ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) -{ - zap_cursor_t zc; - zap_attribute_t za; - int error; - - if (*walk == 0) { - /* - * We don't want to prefetch the entire ZAP object, because - * it can be enormous. Also the primary use of DDT iteration - * is for scrubbing, in which case we will be issuing many - * scrub i/os for each ZAP block that we read in, so - * reading the ZAP is unlikely to be the bottleneck. - */ - zap_cursor_init_noprefetch(&zc, os, object); - } else { - zap_cursor_init_serialized(&zc, os, object, *walk); - } - if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t csize = za.za_num_integers; - ASSERT(za.za_integer_length == 1); - error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, - DDT_KEY_WORDS, 1, csize, cbuf); - ASSERT(error == 0); - if (error == 0) { - ddt_decompress(cbuf, dde->dde_phys, csize, - sizeof (dde->dde_phys)); - dde->dde_key = *(ddt_key_t *)za.za_name; - } - zap_cursor_advance(&zc); - *walk = zap_cursor_serialize(&zc); - } - zap_cursor_fini(&zc); - return (error); -} - -static int -ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) -{ - - return (zap_count(os, object, count)); -} - -const ddt_ops_t ddt_zap_ops = { - "zap", - ddt_zap_create, - ddt_zap_destroy, - ddt_zap_lookup, - ddt_zap_prefetch, - ddt_zap_update, - ddt_zap_remove, - ddt_zap_walk, - ddt_zap_count, -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c deleted file mode 100644 index 59e551e75d43..000000000000 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ /dev/null @@ -1,2748 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2019 Datto Inc. - */ -/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ -/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ -/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#include -#endif - -/* - * Enable/disable nopwrite feature. - */ -int zfs_nopwrite_enabled = 1; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, - &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); - -/* - * Tunable to control percentage of dirtied L1 blocks from frees allowed into - * one TXG. After this threshold is crossed, additional dirty blocks from frees - * will wait until the next TXG. 
- * A value of zero will disable this throttle. - */ -uint32_t zfs_per_txg_dirty_frees_percent = 5; -SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, - &zfs_per_txg_dirty_frees_percent, 0, - "Percentage of dirtied indirect blocks from frees allowed in one txg"); - -/* - * This can be used for testing, to ensure that certain actions happen - * while in the middle of a remap (which might otherwise complete too - * quickly). - */ -int zfs_object_remap_one_indirect_delay_ticks = 0; - -/* - * Limit the amount we can prefetch with one call to this amount. This - * helps to limit the amount of memory that can be used by prefetching. - * Larger objects should be prefetched a bit at a time. - */ -uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; - -const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, - { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, - { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, - { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, - { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, - { 
DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, - { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, - { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } -}; - -const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { - { byteswap_uint8_array, "uint8" }, - { byteswap_uint16_array, "uint16" }, - { byteswap_uint32_array, "uint32" }, - { byteswap_uint64_array, "uint64" }, - { zap_byteswap, "zap" }, - { 
dnode_buf_byteswap, "dnode" }, - { dmu_objset_byteswap, "objset" }, - { zfs_znode_byteswap, "znode" }, - { zfs_oldacl_byteswap, "oldacl" }, - { zfs_acl_byteswap, "acl" } -}; - -int -dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp) -{ - uint64_t blkid; - dmu_buf_impl_t *db; - - blkid = dbuf_whichblock(dn, 0, offset); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold(dn, blkid, tag); - rw_exit(&dn->dn_struct_rwlock); - - if (db == NULL) { - *dbp = NULL; - return (SET_ERROR(EIO)); - } - - *dbp = &db->db; - return (0); -} -int -dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp) -{ - dnode_t *dn; - uint64_t blkid; - dmu_buf_impl_t *db; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - blkid = dbuf_whichblock(dn, 0, offset); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold(dn, blkid, tag); - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - - if (db == NULL) { - *dbp = NULL; - return (SET_ERROR(EIO)); - } - - *dbp = &db->db; - return (err); -} - -int -dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) -{ - int err; - int db_flags = DB_RF_CANFAIL; - - if (flags & DMU_READ_NO_PREFETCH) - db_flags |= DB_RF_NOPREFETCH; - - err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); - if (err == 0) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); - err = dbuf_read(db, NULL, db_flags); - if (err != 0) { - dbuf_rele(db, tag); - *dbp = NULL; - } - } - - return (err); -} - -int -dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) -{ - int err; - int db_flags = DB_RF_CANFAIL; - - if (flags & DMU_READ_NO_PREFETCH) - db_flags |= DB_RF_NOPREFETCH; - - err = dmu_buf_hold_noread(os, object, offset, tag, dbp); - if (err == 0) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); - err = dbuf_read(db, NULL, db_flags); - if (err != 0) { - 
dbuf_rele(db, tag); - *dbp = NULL; - } - } - - return (err); -} - -int -dmu_bonus_max(void) -{ - return (DN_OLD_MAX_BONUSLEN); -} - -int -dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int error; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (dn->dn_bonus != db) { - error = SET_ERROR(EINVAL); - } else if (newsize < 0 || newsize > db_fake->db_size) { - error = SET_ERROR(EINVAL); - } else { - dnode_setbonuslen(dn, newsize, tx); - error = 0; - } - - DB_DNODE_EXIT(db); - return (error); -} - -int -dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int error; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (!DMU_OT_IS_VALID(type)) { - error = SET_ERROR(EINVAL); - } else if (dn->dn_bonus != db) { - error = SET_ERROR(EINVAL); - } else { - dnode_setbonus_type(dn, type, tx); - error = 0; - } - - DB_DNODE_EXIT(db); - return (error); -} - -dmu_object_type_t -dmu_get_bonustype(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - dmu_object_type_t type; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - type = dn->dn_bonustype; - DB_DNODE_EXIT(db); - - return (type); -} - -int -dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - dnode_t *dn; - int error; - - error = dnode_hold(os, object, FTAG, &dn); - dbuf_rm_spill(dn, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dnode_rm_spill(dn, tx); - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - return (error); -} - -/* - * returns ENOENT, EIO, or 0. 
- */ -int -dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) -{ - dnode_t *dn; - dmu_buf_impl_t *db; - int error; - - error = dnode_hold(os, object, FTAG, &dn); - if (error) - return (error); - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_bonus == NULL) { - rw_exit(&dn->dn_struct_rwlock); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dbuf_create_bonus(dn); - } - db = dn->dn_bonus; - - /* as long as the bonus buf is held, the dnode will be held */ - if (zfs_refcount_add(&db->db_holds, tag) == 1) { - VERIFY(dnode_add_ref(dn, db)); - atomic_inc_32(&dn->dn_dbufs_count); - } - - /* - * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's - * hold and incrementing the dbuf count to ensure that dnode_move() sees - * a dnode hold for every dbuf. - */ - rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); - - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); - - *dbp = &db->db; - return (0); -} - -/* - * returns ENOENT, EIO, or 0. - * - * This interface will allocate a blank spill dbuf when a spill blk - * doesn't already exist on the dnode. - * - * if you only want to find an already existing spill db, then - * dmu_spill_hold_existing() should be used. 
- */ -int -dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) -{ - dmu_buf_impl_t *db = NULL; - int err; - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - - ASSERT(db != NULL); - err = dbuf_read(db, NULL, flags); - if (err == 0) - *dbp = &db->db; - else - dbuf_rele(db, tag); - return (err); -} - -int -dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { - err = SET_ERROR(EINVAL); - } else { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - if (!dn->dn_have_spill) { - err = SET_ERROR(ENOENT); - } else { - err = dmu_spill_hold_by_dnode(dn, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); - } - - rw_exit(&dn->dn_struct_rwlock); - } - - DB_DNODE_EXIT(db); - return (err); -} - -int -dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Note: longer-term, we should modify all of the dmu_buf_*() interfaces - * to take a held dnode rather than -- the lookup is wasteful, - * and can induce severe lock contention when writing to several files - * whose dnodes are in the same block. 
- */ -int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) -{ - dmu_buf_t **dbp; - uint64_t blkid, nblks, i; - uint32_t dbuf_flags; - int err; - zio_t *zio; - - ASSERT(length <= DMU_MAX_ACCESS); - - /* - * Note: We directly notify the prefetch code of this read, so that - * we can tell it about the multi-block read. dbuf_read() only knows - * about the one block it is accessing. - */ - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | - DB_RF_NOPREFETCH; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; - } else { - if (offset + length > dn->dn_datablksz) { - zfs_panic_recover("zfs: accessing past end of object " - "%llx/%llx (size=%u access=%llu+%llu)", - (longlong_t)dn->dn_objset-> - os_dsl_dataset->ds_object, - (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)offset, (longlong_t)length); - rw_exit(&dn->dn_struct_rwlock); - return (SET_ERROR(EIO)); - } - nblks = 1; - } - dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - -#if defined(_KERNEL) && defined(RACCT) - if (racct_enable && !read) { - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_WRITEBPS, length); - racct_add_force(curproc, RACCT_WRITEIOPS, nblks); - PROC_UNLOCK(curproc); - } -#endif - - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - blkid = dbuf_whichblock(dn, 0, offset); - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); - if (db == NULL) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dbp, nblks, tag); - zio_nowait(zio); - return (SET_ERROR(EIO)); - } - - /* initiate async i/o */ - if (read) - (void) dbuf_read(db, zio, dbuf_flags); -#ifdef _KERNEL - else - curthread->td_ru.ru_oublock++; -#endif - dbp[i] = &db->db; - } 
- - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { - dmu_zfetch(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn)); - } - rw_exit(&dn->dn_struct_rwlock); - - /* wait for async i/o */ - err = zio_wait(zio); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - - /* wait for other io to complete */ - if (read) { - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - } - } - - *numbufsp = nblks; - *dbpp = dbp; - return (0); -} - -static int -dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - - dnode_rele(dn, FTAG); - - return (err); -} - -int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, boolean_t read, void *tag, int *numbufsp, - dmu_buf_t ***dbpp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - DB_DNODE_EXIT(db); - - return (err); -} - -void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) -{ - int i; - dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; - - if (numbufs == 0) - return; - - for (i = 0; i < numbufs; i++) { - if (dbp[i]) - dbuf_rele(dbp[i], tag); - } - - kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); -} - -/* - * Issue 
prefetch i/os for the given blocks. If level is greater than 0, the - * indirect blocks prefeteched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. - * - * Note that if the indirect blocks above the blocks being prefetched are not in - * cache, they will be asychronously read in. - */ -void -dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, - uint64_t len, zio_priority_t pri) -{ - dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); - - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); - return; - } - - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. 
- */ - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; - } else { - nblks = (offset < dn->dn_datablksz); - } - - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } - - rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); -} - -/* - * Get the next "chunk" of file data to free. We traverse the file from - * the end so that the file gets shorter over time (if we crashes in the - * middle, this will leave us in a better state). We find allocated file - * data by simply searching the allocated level 1 indirects. - * - * On input, *start should be the first offset that does not need to be - * freed (e.g. "offset + length"). On return, *start will be the first - * offset that should be freed and l1blks is set to the number of level 1 - * indirect blocks found within the chunk. - */ -static int -get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) -{ - uint64_t blks; - uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); - /* bytes of data covered by a level-1 indirect block */ - uint64_t iblkrange = - dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); - - ASSERT3U(minimum, <=, *start); - - /* - * Check if we can free the entire range assuming that all of the - * L1 blocks in this range have data. If we can, we use this - * worst case value as an estimate so we can avoid having to look - * at the object's actual data. 
- */ - uint64_t total_l1blks = - (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / - iblkrange; - if (total_l1blks <= maxblks) { - *l1blks = total_l1blks; - *start = minimum; - return (0); - } - ASSERT(ISP2(iblkrange)); - - for (blks = 0; *start > minimum && blks < maxblks; blks++) { - int err; - - /* - * dnode_next_offset(BACKWARDS) will find an allocated L1 - * indirect block at or before the input offset. We must - * decrement *start so that it is at the end of the region - * to search. - */ - (*start)--; - - err = dnode_next_offset(dn, - DNODE_FIND_BACKWARDS, start, 2, 1, 0); - - /* if there are no indirect blocks before start, we are done */ - if (err == ESRCH) { - *start = minimum; - break; - } else if (err != 0) { - *l1blks = blks; - return (err); - } - - /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); - } - if (*start < minimum) - *start = minimum; - *l1blks = blks; - - return (0); -} - -/* - * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, - * otherwise return false. 
- * Used below in dmu_free_long_range_impl() to enable abort when unmounting - */ -/*ARGSUSED*/ -static boolean_t -dmu_objset_zfs_unmounting(objset_t *os) -{ -#ifdef _KERNEL - if (dmu_objset_type(os) == DMU_OST_ZFS) - return (zfs_get_vfs_flag_unmounted(os)); -#endif - return (B_FALSE); -} - -static int -dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, - uint64_t length) -{ - uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; - int err; - uint64_t dirty_frees_threshold; - dsl_pool_t *dp = dmu_objset_pool(os); - - if (offset >= object_size) - return (0); - - if (zfs_per_txg_dirty_frees_percent <= 100) - dirty_frees_threshold = - zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; - else - dirty_frees_threshold = zfs_dirty_data_max / 20; - - if (length == DMU_OBJECT_END || offset + length > object_size) - length = object_size - offset; - - while (length != 0) { - uint64_t chunk_end, chunk_begin, chunk_len; - uint64_t l1blks; - dmu_tx_t *tx; - - if (dmu_objset_zfs_unmounting(dn->dn_objset)) - return (SET_ERROR(EINTR)); - - chunk_end = chunk_begin = offset + length; - - /* move chunk_begin backwards to the beginning of this chunk */ - err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); - if (err) - return (err); - ASSERT3U(chunk_begin, >=, offset); - ASSERT3U(chunk_begin, <=, chunk_end); - - chunk_len = chunk_end - chunk_begin; - - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); - - /* - * Mark this transaction as typically resulting in a net - * reduction in space used. 
- */ - dmu_tx_mark_netfree(tx); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - uint64_t txg = dmu_tx_get_txg(tx); - - mutex_enter(&dp->dp_lock); - uint64_t long_free_dirty = - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; - mutex_exit(&dp->dp_lock); - - /* - * To avoid filling up a TXG with just frees, wait for - * the next TXG to open before freeing more chunks if - * we have reached the threshold of frees. - */ - if (dirty_frees_threshold != 0 && - long_free_dirty >= dirty_frees_threshold) { - dmu_tx_commit(tx); - txg_wait_open(dp, 0); - continue; - } - - /* - * In order to prevent unnecessary write throttling, for each - * TXG, we track the cumulative size of L1 blocks being dirtied - * in dnode_free_range() below. We compare this number to a - * tunable threshold, past which we prevent new L1 dirty freeing - * blocks from being added into the open TXG. See - * dmu_free_long_range_impl() for details. The threshold - * prevents write throttle activation due to dirty freeing L1 - * blocks taking up a large percentage of zfs_dirty_data_max. - */ - mutex_enter(&dp->dp_lock); - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += - l1blks << dn->dn_indblkshift; - mutex_exit(&dp->dp_lock); - DTRACE_PROBE3(free__long__range, - uint64_t, long_free_dirty, uint64_t, chunk_len, - uint64_t, txg); - dnode_free_range(dn, chunk_begin, chunk_len, tx); - dmu_tx_commit(tx); - - length -= chunk_len; - } - return (0); -} - -int -dmu_free_long_range(objset_t *os, uint64_t object, - uint64_t offset, uint64_t length) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) - return (err); - err = dmu_free_long_range_impl(os, dn, offset, length); - - /* - * It is important to zero out the maxblkid when freeing the entire - * file, so that (a) subsequent calls to dmu_free_long_range_impl() - * will take the fast path, and (b) dnode_reallocate() can verify - * that the entire file has been freed. 
- */ - if (err == 0 && offset == 0 && length == DMU_OBJECT_END) - dn->dn_maxblkid = 0; - - dnode_rele(dn, FTAG); - return (err); -} - -int -dmu_free_long_object(objset_t *os, uint64_t object) -{ - dmu_tx_t *tx; - int err; - - err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); - if (err != 0) - return (err); - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - dmu_tx_mark_netfree(tx); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - err = dmu_object_free(os, object, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - - return (err); -} - -int -dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, dmu_tx_t *tx) -{ - dnode_t *dn; - int err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - ASSERT(offset < UINT64_MAX); - ASSERT(size == -1ULL || size <= UINT64_MAX - offset); - dnode_free_range(dn, offset, size, tx); - dnode_rele(dn, FTAG); - return (0); -} - -static int -dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) -{ - dmu_buf_t **dbp; - int numbufs, err = 0; - - /* - * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to - * handle that here as well. - */ - if (dn->dn_maxblkid == 0) { - int newsz = offset > dn->dn_datablksz ? 0 : - MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); - size = newsz; - } - - while (size > 0) { - uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int i; - - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. 
- */ - err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp, flags); - if (err) - break; - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - bcopy((char *)db->db_data + bufoff, buf, tocpy); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - } - return (err); -} - -int -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) - return (err); - - err = dmu_read_impl(dn, offset, size, buf, flags); - dnode_rele(dn, FTAG); - return (err); -} - -int -dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, - uint32_t flags) -{ - return (dmu_read_impl(dn, offset, size, buf, flags)); -} - -static void -dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - int i; - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - bcopy(buf, (char *)db->db_data + bufoff, tocpy); - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } -} - -void -dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - - if (size == 0) - return; - - VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - dmu_write_impl(dbp, numbufs, offset, size, buf, tx); - 
dmu_buf_rele_array(dbp, numbufs, FTAG); -} - -void -dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - - if (size == 0) - return; - - VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); - dmu_write_impl(dbp, numbufs, offset, size, buf, tx); - dmu_buf_rele_array(dbp, numbufs, FTAG); -} - -static int -dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, - uint64_t last_removal_txg, uint64_t offset) -{ - uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); - int err = 0; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); - ASSERT3P(dbuf, !=, NULL); - - /* - * If the block hasn't been written yet, this default will ensure - * we don't try to remap it. - */ - uint64_t birth = UINT64_MAX; - ASSERT3U(last_removal_txg, !=, UINT64_MAX); - if (dbuf->db_blkptr != NULL) - birth = dbuf->db_blkptr->blk_birth; - rw_exit(&dn->dn_struct_rwlock); - - /* - * If this L1 was already written after the last removal, then we've - * already tried to remap it. - */ - if (birth <= last_removal_txg && - dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && - dbuf_can_remap(dbuf)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - (void) dbuf_dirty(dbuf, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - } - - dbuf_rele(dbuf, FTAG); - - delay(zfs_object_remap_one_indirect_delay_ticks); - - return (err); -} - -/* - * Remap all blockpointers in the object, if possible, so that they reference - * only concrete vdevs. - * - * To do this, iterate over the L0 blockpointers and remap any that reference - * an indirect vdev. 
Note that we only examine L0 blockpointers; since we - * cannot guarantee that we can remap all blockpointer anyways (due to split - * blocks), we do not want to make the code unnecessarily complicated to - * catch the unlikely case that there is an L1 block on an indirect vdev that - * contains no indirect blockpointers. - */ -int -dmu_object_remap_indirects(objset_t *os, uint64_t object, - uint64_t last_removal_txg) -{ - uint64_t offset, l1span; - int err; - dnode_t *dn; - - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) { - return (err); - } - - if (dn->dn_nlevels <= 1) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - } - - /* - * If the dnode has no indirect blocks, we cannot dirty them. - * We still want to remap the blkptr(s) in the dnode if - * appropriate, so mark it as dirty. - */ - if (err == 0 && dnode_needs_remap(dn)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, dn->dn_object); - if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) { - dnode_setdirty(dn, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - } - - dnode_rele(dn, FTAG); - return (err); - } - - offset = 0; - l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + - dn->dn_datablkshift); - /* - * Find the next L1 indirect that is not a hole. 
- */ - while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - break; - } - if ((err = dmu_object_remap_one_indirect(os, dn, - last_removal_txg, offset)) != 0) { - break; - } - offset += l1span; - } - - dnode_rele(dn, FTAG); - return (err); -} - -void -dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; - - if (size == 0) - return; - - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - - for (i = 0; i < numbufs; i++) { - dmu_buf_t *db = dbp[i]; - - dmu_buf_will_not_fill(db, tx); - } - dmu_buf_rele_array(dbp, numbufs, FTAG); -} - -void -dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, - void *data, uint8_t etype, uint8_t comp, int uncompressed_size, - int compressed_size, int byteorder, dmu_tx_t *tx) -{ - dmu_buf_t *db; - - ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); - ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - VERIFY0(dmu_buf_hold_noread(os, object, offset, - FTAG, &db)); - - dmu_buf_write_embedded(db, - data, (bp_embedded_type_t)etype, (enum zio_compress)comp, - uncompressed_size, compressed_size, byteorder, tx); - - dmu_buf_rele(db, FTAG); -} - -/* - * DMU support for xuio - */ -kstat_t *xuio_ksp = NULL; - -int -dmu_xuio_init(xuio_t *xuio, int nblk) -{ - dmu_xuio_t *priv; - uio_t *uio = &xuio->xu_uio; - - uio->uio_iovcnt = nblk; - uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); - - priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); - priv->cnt = nblk; - priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); - priv->iovp = uio->uio_iov; - XUIO_XUZC_PRIV(xuio) = priv; - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); - - return (0); -} - -void -dmu_xuio_fini(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int nblk = priv->cnt; - 
- kmem_free(priv->iovp, nblk * sizeof (iovec_t)); - kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); - kmem_free(priv, sizeof (dmu_xuio_t)); - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); -} - -/* - * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } - * and increase priv->next by 1. - */ -int -dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) -{ - struct iovec *iov; - uio_t *uio = &xuio->xu_uio; - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int i = priv->next++; - - ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_lsize(abuf)); - iov = uio->uio_iov + i; - iov->iov_base = (char *)abuf->b_data + off; - iov->iov_len = n; - priv->bufs[i] = abuf; - return (0); -} - -int -dmu_xuio_cnt(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - return (priv->cnt); -} - -arc_buf_t * -dmu_xuio_arcbuf(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - return (priv->bufs[i]); -} - -void -dmu_xuio_clear(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - priv->bufs[i] = NULL; -} - -static void -xuio_stat_init(void) -{ - xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (xuio_ksp != NULL) { - xuio_ksp->ks_data = &xuio_stats; - kstat_install(xuio_ksp); - } -} - -static void -xuio_stat_fini(void) -{ - if (xuio_ksp != NULL) { - kstat_delete(xuio_ksp); - xuio_ksp = NULL; - } -} - -void -xuio_stat_wbuf_copied(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_copied); -} - -void -xuio_stat_wbuf_nocopy(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); -} - -#ifdef _KERNEL -int -dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) -{ - dmu_buf_t **dbp; - int numbufs, i, err; - xuio_t *xuio = NULL; - - /* - * NB: we could do this block-at-a-time, but it's nice - * to 
be reading in parallel. - */ - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - TRUE, FTAG, &numbufs, &dbp, 0); - if (err) - return (err); - -#ifdef UIO_XUIO - if (uio->uio_extflg == UIO_XUIO) - xuio = (xuio_t *)uio; -#endif - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) { - uio->uio_resid -= tocpy; - uio->uio_loffset += tocpy; - } - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else { -#ifdef illumos - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); -#else - err = vn_io_fault_uiomove((char *)db->db_data + bufoff, - tocpy, uio); -#endif - } - if (err) - break; - - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - - return (err); -} - -/* - * Read 'size' bytes into the uio buffer. - * From object zdb->db_object. - * Starting at offset uio->uio_loffset. - * - * If the caller already has a dbuf in the target object - * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), - * because we don't have to find the dnode_t for the object. - */ -int -dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_read_uio_dnode(dn, uio, size); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Read 'size' bytes into the uio buffer. - * From the specified object - * Starting at offset uio->uio_loffset. 
- */ -int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) -{ - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_read_uio_dnode(dn, uio, size); - - dnode_rele(dn, FTAG); - - return (err); -} - -int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - int err = 0; - int i; - - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - -#ifdef illumos - /* - * XXX uiomove could block forever (eg. nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't - * block. - */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); -#else - err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, - uio); -#endif - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - if (err) - break; - - size -= tocpy; - } - - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} - -/* - * Write 'size' bytes from the uio buffer. - * To object zdb->db_object. - * Starting at offset uio->uio_loffset. - * - * If the caller already has a dbuf in the target object - * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), - * because we don't have to find the dnode_t for the object. 
- */ -int -dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Write 'size' bytes from the uio buffer. - * To the specified object. - * Starting at offset uio->uio_loffset. - */ -int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_write_uio_dnode(dn, uio, size, tx); - - dnode_rele(dn, FTAG); - - return (err); -} - -#ifdef illumos -int -dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - page_t *pp, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; - int err; - - if (size == 0) - return (0); - - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy, copied, thiscpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - caddr_t va; - - ASSERT(size > 0); - ASSERT3U(db->db_size, >=, PAGESIZE); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - for (copied = 0; copied < tocpy; copied += PAGESIZE) { - ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); - thiscpy = MIN(PAGESIZE, tocpy - copied); - va = zfs_map_page(pp, S_READ); - bcopy(va, (char *)db->db_data + bufoff, thiscpy); - zfs_unmap_page(pp, va); - pp = pp->p_next; - bufoff += PAGESIZE; - } - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - return 
(err); -} - -#else /* !illumos */ - -int -dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - vm_page_t *ma, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - struct sf_buf *sf; - int numbufs, i; - int err; - - if (size == 0) - return (0); - - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy, copied, thiscpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - caddr_t va; - - ASSERT(size > 0); - ASSERT3U(db->db_size, >=, PAGESIZE); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - for (copied = 0; copied < tocpy; copied += PAGESIZE) { - ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); - thiscpy = MIN(PAGESIZE, tocpy - copied); - va = zfs_map_page(*ma, &sf); - bcopy(va, (char *)db->db_data + bufoff, thiscpy); - zfs_unmap_page(sf); - ma += 1; - bufoff += PAGESIZE; - } - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} - -int -dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, - int *rbehind, int *rahead, int last_size) -{ - struct sf_buf *sf; - vm_object_t vmobj; - vm_page_t m; - dmu_buf_t **dbp; - dmu_buf_t *db; - caddr_t va; - int numbufs, i; - int bufoff, pgoff, tocpy; - int mi, di; - int err; - - ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); - ASSERT(last_size <= PAGE_SIZE); - - err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), - IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); - if (err != 0) - return (err); - -#ifdef DEBUG - IMPLY(last_size < PAGE_SIZE, *rahead == 0); - if (dbp[0]->db_offset != 0 || numbufs > 1) { - for (i = 0; i < numbufs; i++) { - 
ASSERT(ISP2(dbp[i]->db_size)); - ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0); - ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); - } - } -#endif - - vmobj = ma[0]->object; - - db = dbp[0]; - for (i = 0; i < *rbehind; i++) { - m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i, - VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | - VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); - if (m == NULL) - break; - if (!vm_page_none_valid(m)) { - ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); - vm_page_sunbusy(m); - break; - } - ASSERT(m->dirty == 0); - ASSERT(!pmap_page_is_write_mapped(m)); - - ASSERT(db->db_size > PAGE_SIZE); - bufoff = IDX_TO_OFF(m->pindex) % db->db_size; - va = zfs_map_page(m, &sf); - bcopy((char *)db->db_data + bufoff, va, PAGESIZE); - zfs_unmap_page(sf); - vm_page_valid(m); - if ((m->busy_lock & VPB_BIT_WAITERS) != 0) - vm_page_activate(m); - else - vm_page_deactivate(m); - vm_page_sunbusy(m); - } - *rbehind = i; - - bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; - pgoff = 0; - for (mi = 0, di = 0; mi < count && di < numbufs; ) { - if (pgoff == 0) { - m = ma[mi]; - if (m != bogus_page) { - vm_page_assert_xbusied(m); - ASSERT(vm_page_none_valid(m)); - ASSERT(m->dirty == 0); - ASSERT(!pmap_page_is_mapped(m)); - va = zfs_map_page(m, &sf); - } - } - if (bufoff == 0) - db = dbp[di]; - - if (m != bogus_page) { - ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, - db->db_offset + bufoff); - } - - /* - * We do not need to clamp the copy size by the file - * size as the last block is zero-filled beyond the - * end of file anyway. 
- */ - tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); - if (m != bogus_page) - bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); - - pgoff += tocpy; - ASSERT(pgoff <= PAGESIZE); - if (pgoff == PAGESIZE) { - if (m != bogus_page) { - zfs_unmap_page(sf); - vm_page_valid(m); - } - ASSERT(mi < count); - mi++; - pgoff = 0; - } - - bufoff += tocpy; - ASSERT(bufoff <= db->db_size); - if (bufoff == db->db_size) { - ASSERT(di < numbufs); - di++; - bufoff = 0; - } - } - -#ifdef DEBUG - /* - * Three possibilities: - * - last requested page ends at a buffer boundary and , thus, - * all pages and buffers have been iterated; - * - all requested pages are filled, but the last buffer - * has not been exhausted; - * the read-ahead is possible only in this case; - * - all buffers have been read, but the last page has not been - * fully filled; - * this is only possible if the file has only a single buffer - * with a size that is not a multiple of the page size. - */ - if (mi == count) { - ASSERT(di >= numbufs - 1); - IMPLY(*rahead != 0, di == numbufs - 1); - IMPLY(*rahead != 0, bufoff != 0); - ASSERT(pgoff == 0); - } - if (di == numbufs) { - ASSERT(mi >= count - 1); - ASSERT(*rahead == 0); - IMPLY(pgoff == 0, mi == count); - if (pgoff != 0) { - ASSERT(mi == count - 1); - ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); - } - } -#endif - if (pgoff != 0) { - ASSERT(m != bogus_page); - bzero(va + pgoff, PAGESIZE - pgoff); - zfs_unmap_page(sf); - vm_page_valid(m); - } - - for (i = 0; i < *rahead; i++) { - m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i, - VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | - VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); - if (m == NULL) - break; - if (!vm_page_none_valid(m)) { - ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); - vm_page_sunbusy(m); - break; - } - ASSERT(m->dirty == 0); - ASSERT(!pmap_page_is_write_mapped(m)); - - ASSERT(db->db_size > PAGE_SIZE); - bufoff = IDX_TO_OFF(m->pindex) % db->db_size; - tocpy = MIN(db->db_size - bufoff, PAGESIZE); - va = 
zfs_map_page(m, &sf); - bcopy((char *)db->db_data + bufoff, va, tocpy); - if (tocpy < PAGESIZE) { - ASSERT(i == *rahead - 1); - ASSERT((db->db_size & PAGE_MASK) != 0); - bzero(va + tocpy, PAGESIZE - tocpy); - } - zfs_unmap_page(sf); - vm_page_valid(m); - if ((m->busy_lock & VPB_BIT_WAITERS) != 0) - vm_page_activate(m); - else - vm_page_deactivate(m); - vm_page_sunbusy(m); - } - *rahead = i; - - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (0); -} -#endif /* illumos */ -#endif /* _KERNEL */ - -/* - * Allocate a loaned anonymous arc buffer. - */ -arc_buf_t * -dmu_request_arcbuf(dmu_buf_t *handle, int size) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - - return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); -} - -/* - * Free a loaned arc buffer. - */ -void -dmu_return_arcbuf(arc_buf_t *buf) -{ - arc_return_buf(buf, FTAG); - arc_buf_destroy(buf, FTAG); -} - -/* - * When possible directly assign passed loaned arc buffer to a dbuf. - * If this is not possible copy the contents of passed arc buf via - * dmu_write(). - */ -void -dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - uint32_t blksz = (uint32_t)arc_buf_lsize(buf); - uint64_t blkid; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, 0, offset); - VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); - rw_exit(&dn->dn_struct_rwlock); - - /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. 
- */ - if (offset == db->db.db_offset && blksz == db->db.db_size) { -#ifdef _KERNEL - curthread->td_ru.ru_oublock++; -#ifdef RACCT - if (racct_enable) { - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_WRITEBPS, blksz); - racct_add_force(curproc, RACCT_WRITEIOPS, 1); - PROC_UNLOCK(curproc); - } -#endif /* RACCT */ -#endif /* _KERNEL */ - dbuf_assign_arcbuf(db, buf, tx); - dbuf_rele(db, FTAG); - } else { - objset_t *os; - uint64_t object; - - /* compressed bufs must always be assignable to their dbuf */ - ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); - ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - - os = dn->dn_objset; - object = dn->dn_object; - - dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx); - dmu_return_arcbuf(buf); - XUIOSTAT_BUMP(xuiostat_wbuf_copied); - } -} - -void -dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - - DB_DNODE_ENTER(dbuf); - dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); - DB_DNODE_EXIT(dbuf); -} - -typedef struct { - dbuf_dirty_record_t *dsa_dr; - dmu_sync_cb_t *dsa_done; - zgd_t *dsa_zgd; - dmu_tx_t *dsa_tx; -} dmu_sync_arg_t; - -/* ARGSUSED */ -static void -dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) -{ - dmu_sync_arg_t *dsa = varg; - dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - blkptr_t *bp = zio->io_bp; - - if (zio->io_error == 0) { - if (BP_IS_HOLE(bp)) { - /* - * A block of zeros may compress to a hole, but the - * block size still needs to be known for replay. 
- */ - BP_SET_LSIZE(bp, db->db_size); - } else if (!BP_IS_EMBEDDED(bp)) { - ASSERT(BP_GET_LEVEL(bp) == 0); - bp->blk_fill = 1; - } - } -} - -static void -dmu_sync_late_arrival_ready(zio_t *zio) -{ - dmu_sync_ready(zio, NULL, zio->io_private); -} - -/* ARGSUSED */ -static void -dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) -{ - dmu_sync_arg_t *dsa = varg; - dbuf_dirty_record_t *dr = dsa->dsa_dr; - dmu_buf_impl_t *db = dr->dr_dbuf; - zgd_t *zgd = dsa->dsa_zgd; - - /* - * Record the vdev(s) backing this blkptr so they can be flushed after - * the writes for the lwb have completed. - */ - if (zio->io_error == 0) { - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - } - - mutex_enter(&db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - if (zio->io_error == 0) { - dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); - if (dr->dt.dl.dr_nopwrite) { - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - uint8_t chksum = BP_GET_CHECKSUM(bp_orig); - - ASSERT(BP_EQUAL(bp, bp_orig)); - VERIFY(BP_EQUAL(bp, db->db_blkptr)); - ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); - ASSERT(zio_checksum_table[chksum].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE); - } - dr->dt.dl.dr_overridden_by = *zio->io_bp; - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; - dr->dt.dl.dr_copies = zio->io_prop.zp_copies; - - /* - * Old style holes are filled with all zeros, whereas - * new-style holes maintain their lsize, type, level, - * and birth time (see zio_write_compress). While we - * need to reset the BP_SET_LSIZE() call that happened - * in dmu_sync_ready for old style holes, we do *not* - * want to wipe out the information contained in new - * style holes. Thus, only zero out the block pointer if - * it's an old style hole. 
- */ - if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && - dr->dt.dl.dr_overridden_by.blk_birth == 0) - BP_ZERO(&dr->dt.dl.dr_overridden_by); - } else { - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - } - cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - - kmem_free(dsa, sizeof (*dsa)); -} - -static void -dmu_sync_late_arrival_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - dmu_sync_arg_t *dsa = zio->io_private; - blkptr_t *bp_orig = &zio->io_bp_orig; - zgd_t *zgd = dsa->dsa_zgd; - - if (zio->io_error == 0) { - /* - * Record the vdev(s) backing this blkptr so they can be - * flushed after the writes for the lwb have completed. - */ - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - - if (!BP_IS_HOLE(bp)) { - ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(zio->io_bp->blk_birth == zio->io_txg); - ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); - zio_free(zio->io_spa, zio->io_txg, zio->io_bp); - } - } - - dmu_tx_commit(dsa->dsa_tx); - - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - - abd_put(zio->io_abd); - kmem_free(dsa, sizeof (*dsa)); -} - -static int -dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - zio_prop_t *zp, zbookmark_phys_t *zb) -{ - dmu_sync_arg_t *dsa; - dmu_tx_t *tx; - - tx = dmu_tx_create(os); - dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { - dmu_tx_abort(tx); - /* Make zl_get_data do txg_waited_synced() */ - return (SET_ERROR(EIO)); - } - - /* - * In order to prevent the zgd's lwb from being free'd prior to - * dmu_sync_late_arrival_done() being called, we have to ensure - * the lwb's "max txg" takes this tx's txg into account. 
- */ - zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); - - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - dsa->dsa_dr = NULL; - dsa->dsa_done = done; - dsa->dsa_zgd = zgd; - dsa->dsa_tx = tx; - - /* - * Since we are currently syncing this txg, it's nontrivial to - * determine what BP to nopwrite against, so we disable nopwrite. - * - * When syncing, the db_blkptr is initially the BP of the previous - * txg. We can not nopwrite against it because it will be changed - * (this is similar to the non-late-arrival case where the dbuf is - * dirty in a future txg). - * - * Then dbuf_write_ready() sets bp_blkptr to the location we will write. - * We can not nopwrite against it because although the BP will not - * (typically) be changed, the data has not yet been persisted to this - * location. - * - * Finally, when dbuf_write_done() is called, it is theoretically - * possible to always nopwrite, because the data that was written in - * this txg is the same data that we are trying to write. However we - * would need to check that this dbuf is not dirty in any future - * txg's (as we do in the normal dmu_sync() path). For simplicity, we - * don't nopwrite in this case. - */ - zp->zp_nopwrite = B_FALSE; - - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, - abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), - zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); - - return (0); -} - -/* - * Intent log support: sync the block associated with db to disk. - * N.B. and XXX: the caller is responsible for making sure that the - * data isn't changing while dmu_sync() is writing it. - * - * Return values: - * - * EEXIST: this txg has already been synced, so there's nothing to do. - * The caller should not log the write. - * - * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 
- * The caller should not log the write. - * - * EALREADY: this block is already in the process of being synced. - * The caller should track its progress (somehow). - * - * EIO: could not do the I/O. - * The caller should do a txg_wait_synced(). - * - * 0: the I/O has been initiated. - * The caller should log this blkptr in the done callback. - * It is possible that the I/O will fail, in which case - * the error will be reported to the done callback and - * propagated to pio from zio_done(). - */ -int -dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; - objset_t *os = db->db_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - dbuf_dirty_record_t *dr; - dmu_sync_arg_t *dsa; - zbookmark_phys_t zb; - zio_prop_t zp; - dnode_t *dn; - - ASSERT(pio != NULL); - ASSERT(txg != 0); - - SET_BOOKMARK(&zb, ds->ds_object, - db->db.db_object, db->db_level, db->db_blkid); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); - DB_DNODE_EXIT(db); - - /* - * If we're frozen (running ziltest), we always need to generate a bp. - */ - if (txg > spa_freeze_txg(os->os_spa)) - return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); - - /* - * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() - * and us. If we determine that this txg is not yet syncing, - * but it begins to sync a moment later, that's OK because the - * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. - */ - mutex_enter(&db->db_mtx); - - if (txg <= spa_last_synced_txg(os->os_spa)) { - /* - * This txg has already synced. There's nothing to do. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(EEXIST)); - } - - if (txg <= spa_syncing_txg(os->os_spa)) { - /* - * This txg is currently syncing, so we can't mess with - * the dirty record anymore; just write a new log block. 
- */ - mutex_exit(&db->db_mtx); - return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); - } - - dr = db->db_last_dirty; - while (dr && dr->dr_txg != txg) - dr = dr->dr_next; - - if (dr == NULL) { - /* - * There's no dr for this dbuf, so it must have been freed. - * There's no need to log writes to freed blocks, so we're done. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(ENOENT)); - } - - ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); - - if (db->db_blkptr != NULL) { - /* - * We need to fill in zgd_bp with the current blkptr so that - * the nopwrite code can check if we're writing the same - * data that's already on disk. We can only nopwrite if we - * are sure that after making the copy, db_blkptr will not - * change until our i/o completes. We ensure this by - * holding the db_mtx, and only allowing nopwrite if the - * block is not already dirty (see below). This is verified - * by dmu_sync_done(), which VERIFYs that the db_blkptr has - * not changed. - */ - *zgd->zgd_bp = *db->db_blkptr; - } - - /* - * Assume the on-disk data is X, the current syncing data (in - * txg - 1) is Y, and the current in-memory data is Z (currently - * in dmu_sync). - * - * We usually want to perform a nopwrite if X and Z are the - * same. However, if Y is different (i.e. the BP is going to - * change before this write takes effect), then a nopwrite will - * be incorrect - we would override with X, which could have - * been freed when Y was written. - * - * (Note that this is not a concern when we are nop-writing from - * syncing context, because X and Y must be identical, because - * all previous txgs have been synced.) - * - * Therefore, we disable nopwrite if the current BP could change - * before this TXG. There are two ways it could change: by - * being dirty (dr_next is non-NULL), or by being freed - * (dnode_block_freed()). This behavior is verified by - * zio_done(), which VERIFYs that the override BP is identical - * to the on-disk BP. 
- */ - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) - zp.zp_nopwrite = B_FALSE; - DB_DNODE_EXIT(db); - - ASSERT(dr->dr_txg == txg); - if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || - dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * We have already issued a sync write for this buffer, - * or this buffer has already been synced. It could not - * have been dirtied since, or we would have cleared the state. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(EALREADY)); - } - - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; - mutex_exit(&db->db_mtx); - - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - dsa->dsa_dr = dr; - dsa->dsa_done = done; - dsa->dsa_zgd = zgd; - dsa->dsa_tx = NULL; - - zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); - - return (0); -} - -int -dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, - dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - err = dnode_set_blksz(dn, size, ibs, tx); - dnode_rele(dn, FTAG); - return (err); -} - -void -dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx) -{ - dnode_t *dn; - - /* - * Send streams include each object's checksum function. This - * check ensures that the receiving system can understand the - * checksum function transmitted. 
- */ - ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); - - VERIFY0(dnode_hold(os, object, FTAG, &dn)); - ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); - dn->dn_checksum = checksum; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); -} - -void -dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx) -{ - dnode_t *dn; - - /* - * Send streams include each object's compression function. This - * check ensures that the receiving system can understand the - * compression function transmitted. - */ - ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); - - VERIFY0(dnode_hold(os, object, FTAG, &dn)); - dn->dn_compress = compress; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); -} - -int zfs_mdcomp_disable = 0; -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - -/* - * When the "redundant_metadata" property is set to "most", only indirect - * blocks of this level and higher will have an additional ditto block. - */ -int zfs_redundant_metadata_most_ditto_level = 2; - -void -dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) -{ - dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || - (wp & WP_SPILL)); - enum zio_checksum checksum = os->os_checksum; - enum zio_compress compress = os->os_compress; - enum zio_checksum dedup_checksum = os->os_dedup_checksum; - boolean_t dedup = B_FALSE; - boolean_t nopwrite = B_FALSE; - boolean_t dedup_verify = os->os_dedup_verify; - int copies = os->os_copies; - - /* - * We maintain different write policies for each of the following - * types of data: - * 1. metadata - * 2. preallocated blocks (i.e. level-0 blocks of a dump device) - * 3. 
all other level 0 blocks - */ - if (ismd) { - if (zfs_mdcomp_disable) { - compress = ZIO_COMPRESS_EMPTY; - } else { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - compress = zio_compress_select(os->os_spa, - ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); - } - - /* - * Metadata always gets checksummed. If the data - * checksum is multi-bit correctable, and it's not a - * ZBT-style checksum, then it's suitable for metadata - * as well. Otherwise, the metadata checksum defaults - * to fletcher4. - */ - if (!(zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_METADATA) || - (zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_EMBEDDED)) - checksum = ZIO_CHECKSUM_FLETCHER_4; - - if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || - (os->os_redundant_metadata == - ZFS_REDUNDANT_METADATA_MOST && - (level >= zfs_redundant_metadata_most_ditto_level || - DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) - copies++; - } else if (wp & WP_NOFILL) { - ASSERT(level == 0); - - /* - * If we're writing preallocated blocks, we aren't actually - * writing them so don't set any policy properties. These - * blocks are currently only used by an external subsystem - * outside of zfs (i.e. dump) and not written by the zio - * pipeline. - */ - compress = ZIO_COMPRESS_OFF; - checksum = ZIO_CHECKSUM_NOPARITY; - } else { - compress = zio_compress_select(os->os_spa, dn->dn_compress, - compress); - - checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? - zio_checksum_select(dn->dn_checksum, checksum) : - dedup_checksum; - - /* - * Determine dedup setting. If we are in dmu_sync(), - * we won't actually dedup now because that's all - * done in syncing context; but we do want to use the - * dedup checkum. If the checksum is not strong - * enough to ensure unique signatures, force - * dedup_verify. - */ - if (dedup_checksum != ZIO_CHECKSUM_OFF) { - dedup = (wp & WP_DMU_SYNC) ? 
B_FALSE : B_TRUE; - if (!(zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP)) - dedup_verify = B_TRUE; - } - - /* - * Enable nopwrite if we have secure enough checksum - * algorithm (see comment in zio_nop_write) and - * compression is enabled. We don't enable nopwrite if - * dedup is enabled as the two features are mutually - * exclusive. - */ - nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE) && - compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); - } - - zp->zp_checksum = checksum; - zp->zp_compress = compress; - ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); - - zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; - zp->zp_level = level; - zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); - zp->zp_dedup = dedup; - zp->zp_dedup_verify = dedup && dedup_verify; - zp->zp_nopwrite = nopwrite; - zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? - os->os_zpl_special_smallblock : 0; -} - -int -dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) -{ - dnode_t *dn; - int err; - - /* - * Sync any current changes before - * we go trundling through the block pointers. - */ - err = dmu_object_wait_synced(os, object); - if (err) { - return (err); - } - - err = dnode_hold(os, object, FTAG, &dn); - if (err) { - return (err); - } - - err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); - dnode_rele(dn, FTAG); - - return (err); -} - -/* - * Given the ZFS object, if it contains any dirty nodes - * this function flushes all dirty blocks to disk. This - * ensures the DMU object info is updated. A more efficient - * future version might just find the TXG with the maximum - * ID and wait for that to be synced. 
- */ -int -dmu_object_wait_synced(objset_t *os, uint64_t object) -{ - dnode_t *dn; - int error, i; - - error = dnode_hold(os, object, FTAG, &dn); - if (error) { - return (error); - } - - for (i = 0; i < TXG_SIZE; i++) { - if (list_link_active(&dn->dn_dirty_link[i])) { - break; - } - } - dnode_rele(dn, FTAG); - if (i != TXG_SIZE) { - txg_wait_synced(dmu_objset_pool(os), 0); - } - - return (0); -} - -void -__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -{ - dnode_phys_t *dnp = dn->dn_phys; - - doi->doi_data_block_size = dn->dn_datablksz; - doi->doi_metadata_block_size = dn->dn_indblkshift ? - 1ULL << dn->dn_indblkshift : 0; - doi->doi_type = dn->dn_type; - doi->doi_bonus_type = dn->dn_bonustype; - doi->doi_bonus_size = dn->dn_bonuslen; - doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; - doi->doi_indirection = dn->dn_nlevels; - doi->doi_checksum = dn->dn_checksum; - doi->doi_compress = dn->dn_compress; - doi->doi_nblkptr = dn->dn_nblkptr; - doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; - doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; - doi->doi_fill_count = 0; - for (int i = 0; i < dnp->dn_nblkptr; i++) - doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); -} - -void -dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -{ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - mutex_enter(&dn->dn_mtx); - - __dmu_object_info_from_dnode(dn, doi); - - mutex_exit(&dn->dn_mtx); - rw_exit(&dn->dn_struct_rwlock); -} - -/* - * Get information on a DMU object. - * If doi is NULL, just indicates whether the object exists. - */ -int -dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) -{ - dnode_t *dn; - int err = dnode_hold(os, object, FTAG, &dn); - - if (err) - return (err); - - if (doi != NULL) - dmu_object_info_from_dnode(dn, doi); - - dnode_rele(dn, FTAG); - return (0); -} - -/* - * As above, but faster; can be used when you have a held dbuf in hand. 
- */ -void -dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - DB_DNODE_ENTER(db); - dmu_object_info_from_dnode(DB_DNODE(db), doi); - DB_DNODE_EXIT(db); -} - -/* - * Faster still when you only care about the size. - * This is specifically optimized for zfs_getattr(). - */ -void -dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, - u_longlong_t *nblk512) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - *blksize = dn->dn_datablksz; - /* add in number of slots used for the dnode itself */ - *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> - SPA_MINBLOCKSHIFT) + dn->dn_num_slots; - DB_DNODE_EXIT(db); -} - -void -dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - *dnsize = dn->dn_num_slots << DNODE_SHIFT; - DB_DNODE_EXIT(db); -} - -void -byteswap_uint64_array(void *vbuf, size_t size) -{ - uint64_t *buf = vbuf; - size_t count = size >> 3; - int i; - - ASSERT((size & 7) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_64(buf[i]); -} - -void -byteswap_uint32_array(void *vbuf, size_t size) -{ - uint32_t *buf = vbuf; - size_t count = size >> 2; - int i; - - ASSERT((size & 3) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_32(buf[i]); -} - -void -byteswap_uint16_array(void *vbuf, size_t size) -{ - uint16_t *buf = vbuf; - size_t count = size >> 1; - int i; - - ASSERT((size & 1) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_16(buf[i]); -} - -/* ARGSUSED */ -void -byteswap_uint8_array(void *vbuf, size_t size) -{ -} - -void -dmu_init(void) -{ - abd_init(); - zfs_dbgmsg_init(); - sa_cache_init(); - xuio_stat_init(); - dmu_objset_init(); - dnode_init(); - zfetch_init(); - zio_compress_init(); - l2arc_init(); - arc_init(); - dbuf_init(); -} - -void -dmu_fini(void) -{ - 
arc_fini(); /* arc depends on l2arc, so arc must go first */ - l2arc_fini(); - zfetch_fini(); - zio_compress_fini(); - dbuf_fini(); - dnode_fini(); - dmu_objset_fini(); - xuio_stat_fini(); - sa_cache_fini(); - zfs_dbgmsg_fini(); - abd_fini(); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c deleted file mode 100644 index e7bfdaa90e97..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct diffarg { - struct file *da_fp; /* file to which we are reporting */ - offset_t *da_offp; - int da_err; /* error that stopped diff search */ - dmu_diff_record_t da_ddr; - kthread_t *da_td; -}; - -static int -write_bytes(struct diffarg *da) -{ - struct uio auio; - struct iovec aiov; - - aiov.iov_base = (caddr_t)&da->da_ddr; - aiov.iov_len = sizeof (da->da_ddr); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = aiov.iov_len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_offset = (off_t)-1; - auio.uio_td = da->da_td; -#ifdef _KERNEL - if (da->da_fp->f_type == DTYPE_VNODE) - bwillwrite(); - return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td)); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - return (EOPNOTSUPP); -#endif -} - -static int -write_record(struct diffarg *da) -{ - - if (da->da_ddr.ddr_type == DDR_NONE) { - da->da_err = 0; - return (0); - } - - da->da_err = write_bytes(da); - *da->da_offp += sizeof (da->da_ddr); - return (da->da_err); -} - -static int -report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) -{ - ASSERT(first <= last); - if (da->da_ddr.ddr_type != DDR_FREE || - first != da->da_ddr.ddr_last + 1) { - if (write_record(da) != 0) - return (da->da_err); - da->da_ddr.ddr_type = DDR_FREE; - da->da_ddr.ddr_first = first; - da->da_ddr.ddr_last = last; - return (0); - } - da->da_ddr.ddr_last = last; - return (0); -} - -static int -report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) -{ - ASSERT(dnp != NULL); - if (dnp->dn_type == DMU_OT_NONE) - return (report_free_dnode_range(da, object, object)); - - if (da->da_ddr.ddr_type != DDR_INUSE || - object != da->da_ddr.ddr_last + 1) { - if (write_record(da) != 0) - return (da->da_err); - da->da_ddr.ddr_type = DDR_INUSE; - 
da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; - return (0); - } - da->da_ddr.ddr_last = object; - return (0); -} - -#define DBP_SPAN(dnp, level) \ - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) - -/* ARGSUSED */ -static int -diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct diffarg *da = arg; - int err = 0; - - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); - - if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) - return (0); - - if (BP_IS_HOLE(bp)) { - uint64_t span = DBP_SPAN(dnp, zb->zb_level); - uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - - err = report_free_dnode_range(da, dnobj, - dnobj + (span >> DNODE_SHIFT) - 1); - if (err) - return (err); - } else if (zb->zb_level == 0) { - dnode_phys_t *blk; - arc_buf_t *abuf; - arc_flags_t aflags = ARC_FLAG_WAIT; - int blksz = BP_GET_LSIZE(bp); - int i; - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - blk = abuf->b_data; - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = (zb->zb_blkid << - (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = report_dnode(da, dnobj, blk+i); - if (err) - break; - } - arc_buf_destroy(abuf, &abuf); - if (err) - return (err); - /* Don't care about the data blocks */ - return (TRAVERSE_VISIT_NO_CHILDREN); - } - return (0); -} - -int -dmu_diff(const char *tosnap_name, const char *fromsnap_name, -#ifdef illumos - struct vnode *vp, offset_t *offp) -#else - struct file *fp, offset_t *offp) -#endif -{ - struct diffarg da; - dsl_dataset_t *fromsnap; - dsl_dataset_t *tosnap; - dsl_pool_t *dp; - int error; - uint64_t fromtxg; - - if (strchr(tosnap_name, '@') == NULL || - strchr(fromsnap_name, '@') == NULL) - return (SET_ERROR(EINVAL)); - - error = dsl_pool_hold(tosnap_name, FTAG, &dp); - 
if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); - if (error != 0) { - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { - dsl_dataset_rele(fromsnap, FTAG); - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (SET_ERROR(EXDEV)); - } - - fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg; - dsl_dataset_rele(fromsnap, FTAG); - - dsl_dataset_long_hold(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - - da.da_fp = fp; - da.da_offp = offp; - da.da_ddr.ddr_type = DDR_NONE; - da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; - da.da_err = 0; - da.da_td = curthread; - - error = traverse_dataset(tosnap, fromtxg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); - - if (error != 0) { - da.da_err = error; - } else { - /* we set the da.da_err we return as side-effect */ - (void) write_record(&da); - } - - dsl_dataset_long_rele(tosnap, FTAG); - dsl_dataset_rele(tosnap, FTAG); - - return (da.da_err); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c deleted file mode 100644 index b40ccf4a7839..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - * Copyright 2014 HybridCluster. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Each of the concurrent object allocators will grab - * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to - * grab 128 slots, which is 4 blocks worth. This was experimentally - * determined to be the lowest value that eliminates the measurable effect - * of lock contention from this code path. - */ -int dmu_object_alloc_chunk_shift = 7; - -static uint64_t -dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, - int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - uint64_t object; - uint64_t L1_dnode_count = DNODES_PER_BLOCK << - (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); - dnode_t *dn = NULL; - int dn_slots = dnodesize >> DNODE_SHIFT; - boolean_t restarted = B_FALSE; - uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID % - os->os_obj_next_percpu_len]; - int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; - int error; - - if (dn_slots == 0) { - dn_slots = DNODE_MIN_SLOTS; - } else { - ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); - ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); - } - - /* - * The "chunk" of dnodes that is assigned to a CPU-specific - * allocator needs to be at least one block's worth, to avoid - * lock contention on the dbuf. 
It can be at most one L1 block's - * worth, so that the "rescan after polishing off a L1's worth" - * logic below will be sure to kick in. - */ - if (dnodes_per_chunk < DNODES_PER_BLOCK) - dnodes_per_chunk = DNODES_PER_BLOCK; - if (dnodes_per_chunk > L1_dnode_count) - dnodes_per_chunk = L1_dnode_count; - -#ifdef __FreeBSD__ - object = atomic_load_64(cpuobj); -#else - object = *cpuobj; -#endif - - for (;;) { - /* - * If we finished a chunk of dnodes, get a new one from - * the global allocator. - */ - if ((P2PHASE(object, dnodes_per_chunk) == 0) || - (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < - dn_slots)) { - DNODE_STAT_BUMP(dnode_alloc_next_chunk); - mutex_enter(&os->os_obj_lock); - ASSERT0(P2PHASE(os->os_obj_next_chunk, - dnodes_per_chunk)); - object = os->os_obj_next_chunk; - - /* - * Each time we polish off a L1 bp worth of dnodes - * (2^12 objects), move to another L1 bp that's - * still reasonably sparse (at most 1/4 full). Look - * from the beginning at most once per txg. If we - * still can't allocate from that L1 block, search - * for an empty L0 block, which will quickly skip - * to the end of the metadnode if the no nearby L0 - * blocks are empty. This fallback avoids a - * pathology where full dnode blocks containing - * large dnodes appear sparse because they have a - * low blk_fill, leading to many failed allocation - * attempts. In the long term a better mechanism to - * search for sparse metadnode regions, such as - * spacemaps, could be implemented. - * - * os_scan_dnodes is set during txg sync if enough - * objects have been freed since the previous - * rescan to justify backfilling again. - * - * Note that dmu_traverse depends on the behavior - * that we use multiple blocks of the dnode object - * before going back to reuse objects. Any change - * to this algorithm should preserve that property - * or find another solution to the issues described - * in traverse_visitbp. 
- */ - if (P2PHASE(object, L1_dnode_count) == 0) { - uint64_t offset; - uint64_t blkfill; - int minlvl; - if (os->os_rescan_dnodes) { - offset = 0; - os->os_rescan_dnodes = B_FALSE; - } else { - offset = object << DNODE_SHIFT; - } - blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; - minlvl = restarted ? 1 : 2; - restarted = B_TRUE; - error = dnode_next_offset(DMU_META_DNODE(os), - DNODE_FIND_HOLE, &offset, minlvl, - blkfill, 0); - if (error == 0) { - object = offset >> DNODE_SHIFT; - } - } - /* - * Note: if "restarted", we may find a L0 that - * is not suitably aligned. - */ - os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + - dnodes_per_chunk; - (void) atomic_swap_64(cpuobj, object); - mutex_exit(&os->os_obj_lock); - } - - /* - * The value of (*cpuobj) before adding dn_slots is the object - * ID assigned to us. The value afterwards is the object ID - * assigned to whoever wants to do an allocation next. - */ - object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; - - /* - * XXX We should check for an i/o error here and return - * up to our caller. Actually we should pre-read it in - * dmu_tx_assign(), but there is currently no mechanism - * to do so. - */ - error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, - dn_slots, FTAG, &dn); - if (error == 0) { - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - /* - * Another thread could have allocated it; check - * again now that we have the struct lock. - */ - if (dn->dn_type == DMU_OT_NONE) { - dnode_allocate(dn, ot, blocksize, 0, - bonustype, bonuslen, dn_slots, tx); - rw_exit(&dn->dn_struct_rwlock); - dmu_tx_add_new_object(tx, dn); - dnode_rele(dn, FTAG); - return (object); - } - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - DNODE_STAT_BUMP(dnode_alloc_race); - } - - /* - * Skip to next known valid starting point on error. This - * is the start of the next block of dnodes. 
- */ - if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { - object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); - DNODE_STAT_BUMP(dnode_alloc_next_block); - } - (void) atomic_swap_64(cpuobj, object); - } -} - -uint64_t -dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, - bonuslen, 0, tx)); -} - -uint64_t -dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, - int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - dmu_tx_t *tx) -{ - return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, - bonustype, bonuslen, 0, tx)); -} - -uint64_t -dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, - bonuslen, dnodesize, tx)); -} - -int -dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, - bonuslen, 0, tx)); -} - -int -dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - dnode_t *dn; - int dn_slots = dnodesize >> DNODE_SHIFT; - int err; - - if (dn_slots == 0) - dn_slots = DNODE_MIN_SLOTS; - ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); - ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); - - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) - return (SET_ERROR(EBADF)); - - err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, - FTAG, &dn); - if (err) - return (err); - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); - dmu_tx_add_new_object(tx, dn); - - dnode_rele(dn, FTAG); - - return (0); -} - -int 
-dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, - bonuslen, DNODE_MIN_SIZE, tx)); -} - -int -dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dmu_tx_t *tx) -{ - dnode_t *dn; - int dn_slots = dnodesize >> DNODE_SHIFT; - int err; - - if (dn_slots == 0) - dn_slots = DNODE_MIN_SLOTS; - - if (object == DMU_META_DNODE_OBJECT) - return (SET_ERROR(EBADF)); - - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, - FTAG, &dn); - if (err) - return (err); - - dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); - - dnode_rele(dn, FTAG); - return (err); -} - - -int -dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, - FTAG, &dn); - if (err) - return (err); - - ASSERT(dn->dn_type != DMU_OT_NONE); - /* - * If we don't create this free range, we'll leak indirect blocks when - * we get to freeing the dnode in syncing context. - */ - dnode_free_range(dn, 0, DMU_OBJECT_END, tx); - dnode_free(dn, tx); - dnode_rele(dn, FTAG); - - return (0); -} - -/* - * Return (in *objectp) the next object which is allocated (or a hole) - * after *object, taking into account only objects that may have been modified - * after the specified txg. 
- */ -int -dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) -{ - uint64_t offset; - uint64_t start_obj; - struct dsl_dataset *ds = os->os_dsl_dataset; - int error; - - if (*objectp == 0) { - start_obj = 1; - } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { - uint64_t i = *objectp + 1; - uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); - dmu_object_info_t doi; - - /* - * Scan through the remaining meta dnode block. The contents - * of each slot in the block are known so it can be quickly - * checked. If the block is exhausted without a match then - * hand off to dnode_next_offset() for further scanning. - */ - while (i <= last_obj) { - error = dmu_object_info(os, i, &doi); - if (error == ENOENT) { - if (hole) { - *objectp = i; - return (0); - } else { - i++; - } - } else if (error == EEXIST) { - i++; - } else if (error == 0) { - if (hole) { - i += doi.doi_dnodesize >> DNODE_SHIFT; - } else { - *objectp = i; - return (0); - } - } else { - return (error); - } - } - - start_obj = i; - } else { - start_obj = *objectp + 1; - } - - offset = start_obj << DNODE_SHIFT; - - error = dnode_next_offset(DMU_META_DNODE(os), - (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); - - *objectp = offset >> DNODE_SHIFT; - - return (error); -} - -/* - * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the - * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. - * - * Only for use from syncing context, on MOS objects. 
- */ -void -dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, - dmu_tx_t *tx) -{ - dnode_t *dn; - - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY0(dnode_hold(mos, object, FTAG, &dn)); - if (dn->dn_type == DMU_OTN_ZAP_METADATA) { - dnode_rele(dn, FTAG); - return; - } - ASSERT3U(dn->dn_type, ==, old_type); - ASSERT0(dn->dn_maxblkid); - - /* - * We must initialize the ZAP data before changing the type, - * so that concurrent calls to *_is_zapified() can determine if - * the object has been completely zapified by checking the type. - */ - mzap_create_impl(mos, object, 0, 0, tx); - - dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = - DMU_OTN_ZAP_METADATA; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); - - spa_feature_incr(dmu_objset_spa(mos), - SPA_FEATURE_EXTENSIBLE_DATASET, tx); -} - -void -dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) -{ - dnode_t *dn; - dmu_object_type_t t; - - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY0(dnode_hold(mos, object, FTAG, &dn)); - t = dn->dn_type; - dnode_rele(dn, FTAG); - - if (t == DMU_OTN_ZAP_METADATA) { - spa_feature_decr(dmu_objset_spa(mos), - SPA_FEATURE_EXTENSIBLE_DATASET, tx); - } - VERIFY0(dmu_object_free(mos, object, tx)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c deleted file mode 100644 index 1b691d412293..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ /dev/null @@ -1,2484 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - * Copyright (c) 2018, loli10K . All rights reserved. - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_namecheck.h" - -/* - * Needed to close a window in dnode_move() that allows the objset to be freed - * before it can be safely accessed. - */ -krwlock_t os_lock; - -/* - * Tunable to overwrite the maximum number of threads for the parallization - * of dmu_objset_find_dp, needed to speed up the import of pools with many - * datasets. - * Default is 4 times the number of leaf vdevs. - */ -int dmu_find_threads = 0; - -/* - * Backfill lower metadnode objects after this many have been freed. - * Backfilling negatively impacts object creation rates, so only do it - * if there are enough holes to fill. 
- */ -int dmu_rescan_dnode_threshold = 131072; - -static void dmu_objset_find_dp_cb(void *arg); - -void -dmu_objset_init(void) -{ - rw_init(&os_lock, NULL, RW_DEFAULT, NULL); -} - -void -dmu_objset_fini(void) -{ - rw_destroy(&os_lock); -} - -spa_t * -dmu_objset_spa(objset_t *os) -{ - return (os->os_spa); -} - -zilog_t * -dmu_objset_zil(objset_t *os) -{ - return (os->os_zil); -} - -dsl_pool_t * -dmu_objset_pool(objset_t *os) -{ - dsl_dataset_t *ds; - - if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) - return (ds->ds_dir->dd_pool); - else - return (spa_get_dsl(os->os_spa)); -} - -dsl_dataset_t * -dmu_objset_ds(objset_t *os) -{ - return (os->os_dsl_dataset); -} - -dmu_objset_type_t -dmu_objset_type(objset_t *os) -{ - return (os->os_phys->os_type); -} - -void -dmu_objset_name(objset_t *os, char *buf) -{ - dsl_dataset_name(os->os_dsl_dataset, buf); -} - -uint64_t -dmu_objset_id(objset_t *os) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - - return (ds ? ds->ds_object : 0); -} - -uint64_t -dmu_objset_dnodesize(objset_t *os) -{ - return (os->os_dnodesize); -} - -zfs_sync_type_t -dmu_objset_syncprop(objset_t *os) -{ - return (os->os_sync); -} - -zfs_logbias_op_t -dmu_objset_logbias(objset_t *os) -{ - return (os->os_logbias); -} - -static void -checksum_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance should have been done by now. - */ - ASSERT(newval != ZIO_CHECKSUM_INHERIT); - - os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); -} - -static void -compression_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval != ZIO_COMPRESS_INHERIT); - - os->os_compress = zio_compress_select(os->os_spa, newval, - ZIO_COMPRESS_ON); -} - -static void -copies_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. 
- */ - ASSERT(newval > 0); - ASSERT(newval <= spa_max_replication(os->os_spa)); - - os->os_copies = newval; -} - -static void -dedup_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - spa_t *spa = os->os_spa; - enum zio_checksum checksum; - - /* - * Inheritance should have been done by now. - */ - ASSERT(newval != ZIO_CHECKSUM_INHERIT); - - checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); - - os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; - os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); -} - -static void -primary_cache_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || - newval == ZFS_CACHE_METADATA); - - os->os_primary_cache = newval; -} - -static void -secondary_cache_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || - newval == ZFS_CACHE_METADATA); - - os->os_secondary_cache = newval; -} - -static void -sync_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || - newval == ZFS_SYNC_DISABLED); - - os->os_sync = newval; - if (os->os_zil) - zil_set_sync(os->os_zil, newval); -} - -static void -redundant_metadata_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. 
- */ - ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || - newval == ZFS_REDUNDANT_METADATA_MOST); - - os->os_redundant_metadata = newval; -} - -static void -dnodesize_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - switch (newval) { - case ZFS_DNSIZE_LEGACY: - os->os_dnodesize = DNODE_MIN_SIZE; - break; - case ZFS_DNSIZE_AUTO: - /* - * Choose a dnode size that will work well for most - * workloads if the user specified "auto". Future code - * improvements could dynamically select a dnode size - * based on observed workload patterns. - */ - os->os_dnodesize = DNODE_MIN_SIZE * 2; - break; - case ZFS_DNSIZE_1K: - case ZFS_DNSIZE_2K: - case ZFS_DNSIZE_4K: - case ZFS_DNSIZE_8K: - case ZFS_DNSIZE_16K: - os->os_dnodesize = newval; - break; - } -} - -static void -smallblk_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE); - ASSERT(ISP2(newval)); - - os->os_zpl_special_smallblock = newval; -} - -static void -logbias_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - ASSERT(newval == ZFS_LOGBIAS_LATENCY || - newval == ZFS_LOGBIAS_THROUGHPUT); - os->os_logbias = newval; - if (os->os_zil) - zil_set_logbias(os->os_zil, newval); -} - -static void -recordsize_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - os->os_recordsize = newval; -} - -void -dmu_objset_byteswap(void *buf, size_t size) -{ - objset_phys_t *osp = buf; - - ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); - dnode_byteswap(&osp->os_meta_dnode); - byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); - osp->os_type = BSWAP_64(osp->os_type); - osp->os_flags = BSWAP_64(osp->os_flags); - if (size == sizeof (objset_phys_t)) { - dnode_byteswap(&osp->os_userused_dnode); - dnode_byteswap(&osp->os_groupused_dnode); - } -} - -/* - * The hash is a CRC-based hash of the objset_t pointer and the object 
number. - */ -static uint64_t -dnode_hash(const objset_t *os, uint64_t obj) -{ - uintptr_t osv = (uintptr_t)os; - uint64_t crc = -1ULL; - - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - /* - * The low 6 bits of the pointer don't have much entropy, because - * the objset_t is larger than 2^6 bytes long. - */ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; - - crc ^= (osv>>14) ^ (obj>>24); - - return (crc); -} - -unsigned int -dnode_multilist_index_func(multilist_t *ml, void *obj) -{ - dnode_t *dn = obj; - return (dnode_hash(dn->dn_objset, dn->dn_object) % - multilist_get_num_sublists(ml)); -} - -/* - * Instantiates the objset_t in-memory structure corresponding to the - * objset_phys_t that's pointed to by the specified blkptr_t. - */ -int -dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - objset_t **osp) -{ - objset_t *os; - int i, err; - - ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); - -#if 0 - /* - * The $ORIGIN dataset (if it exists) doesn't have an associated - * objset, so there's no reason to open it. The $ORIGIN dataset - * will not exist on pools older than SPA_VERSION_ORIGIN. - */ - if (ds != NULL && spa_get_dsl(spa) != NULL && - spa_get_dsl(spa)->dp_origin_snap != NULL) { - ASSERT3P(ds->ds_dir, !=, - spa_get_dsl(spa)->dp_origin_snap->ds_dir); - } -#endif - - os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); - os->os_dsl_dataset = ds; - os->os_spa = spa; - os->os_rootbp = bp; - if (!BP_IS_HOLE(os->os_rootbp)) { - arc_flags_t aflags = ARC_FLAG_WAIT; - zbookmark_phys_t zb; - SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - - if (DMU_OS_IS_L2CACHEABLE(os)) - aflags |= ARC_FLAG_L2CACHE; - - dprintf_bp(os->os_rootbp, "reading %s", ""); - err = arc_read(NULL, spa, os->os_rootbp, - arc_getbuf_func, &os->os_phys_buf, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); - if (err != 0) { - kmem_free(os, sizeof (objset_t)); - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } - - /* Increase the blocksize if we are permitted. */ - if (spa_version(spa) >= SPA_VERSION_USERSPACE && - arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, - ARC_BUFC_METADATA, sizeof (objset_phys_t)); - bzero(buf->b_data, sizeof (objset_phys_t)); - bcopy(os->os_phys_buf->b_data, buf->b_data, - arc_buf_size(os->os_phys_buf)); - arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); - os->os_phys_buf = buf; - } - - os->os_phys = os->os_phys_buf->b_data; - os->os_flags = os->os_phys->os_flags; - } else { - int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? - sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, - ARC_BUFC_METADATA, size); - os->os_phys = os->os_phys_buf->b_data; - bzero(os->os_phys, size); - } - - /* - * Note: the changed_cb will be called once before the register - * func returns, thus changing the checksum/compression from the - * default (fletcher2/off). Snapshots don't need to know about - * checksum/compression/copies. - */ - if (ds != NULL) { - boolean_t needlock = B_FALSE; - - /* - * Note: it's valid to open the objset if the dataset is - * long-held, in which case the pool_config lock will not - * be held. 
- */ - if (!dsl_pool_config_held(dmu_objset_pool(os))) { - needlock = B_TRUE; - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - } - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), - primary_cache_changed_cb, os); - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), - secondary_cache_changed_cb, os); - } - if (!ds->ds_is_snapshot) { - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - compression_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_COPIES), - copies_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_DEDUP), - dedup_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_LOGBIAS), - logbias_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SYNC), - sync_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name( - ZFS_PROP_REDUNDANT_METADATA), - redundant_metadata_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), - recordsize_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_DNODESIZE), - dnodesize_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name( - ZFS_PROP_SPECIAL_SMALL_BLOCKS), - smallblk_changed_cb, os); - } - } - if (needlock) - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (err != 0) { - arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); - kmem_free(os, sizeof (objset_t)); - return (err); - } - } else { - /* It's the meta-objset. 
*/ - os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - os->os_compress = ZIO_COMPRESS_ON; - os->os_copies = spa_max_replication(spa); - os->os_dedup_checksum = ZIO_CHECKSUM_OFF; - os->os_dedup_verify = B_FALSE; - os->os_logbias = ZFS_LOGBIAS_LATENCY; - os->os_sync = ZFS_SYNC_STANDARD; - os->os_primary_cache = ZFS_CACHE_ALL; - os->os_secondary_cache = ZFS_CACHE_ALL; - os->os_dnodesize = DNODE_MIN_SIZE; - } - /* - * These properties will be filled in by the logic in zfs_get_zplprop() - * when they are queried for the first time. - */ - os->os_version = OBJSET_PROP_UNINITIALIZED; - os->os_normalization = OBJSET_PROP_UNINITIALIZED; - os->os_utf8only = OBJSET_PROP_UNINITIALIZED; - os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; - - if (ds == NULL || !ds->ds_is_snapshot) - os->os_zil_header = os->os_phys->os_zil_header; - os->os_zil = zil_alloc(os, &os->os_zil_header); - - for (i = 0; i < TXG_SIZE; i++) { - os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[i]), - dnode_multilist_index_func); - } - list_create(&os->os_dnodes, sizeof (dnode_t), - offsetof(dnode_t, dn_link)); - list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - - mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - os->os_obj_next_percpu_len = boot_ncpus; - os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * - sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); - - dnode_special_open(os, &os->os_phys->os_meta_dnode, - DMU_META_DNODE_OBJECT, &os->os_meta_dnode); - if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - dnode_special_open(os, &os->os_phys->os_userused_dnode, - DMU_USERUSED_OBJECT, &os->os_userused_dnode); - dnode_special_open(os, &os->os_phys->os_groupused_dnode, - DMU_GROUPUSED_OBJECT, 
&os->os_groupused_dnode); - } - - *osp = os; - return (0); -} - -int -dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) -{ - int err = 0; - - /* - * We shouldn't be doing anything with dsl_dataset_t's unless the - * pool_config lock is held, or the dataset is long-held. - */ - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || - dsl_dataset_long_held(ds)); - - mutex_enter(&ds->ds_opening_lock); - if (ds->ds_objset == NULL) { - objset_t *os; - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, dsl_dataset_get_blkptr(ds), &os); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - if (err == 0) { - mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_objset == NULL); - ds->ds_objset = os; - mutex_exit(&ds->ds_lock); - } - } - *osp = ds->ds_objset; - mutex_exit(&ds->ds_opening_lock); - return (err); -} - -/* - * Holds the pool while the objset is held. Therefore only one objset - * can be held at a time. - */ -int -dmu_objset_hold(const char *name, void *tag, objset_t **osp) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(name, tag, &dp); - if (err != 0) - return (err); - err = dsl_dataset_hold(dp, name, tag, &ds); - if (err != 0) { - dsl_pool_rele(dp, tag); - return (err); - } - - err = dmu_objset_from_ds(ds, osp); - if (err != 0) { - dsl_dataset_rele(ds, tag); - dsl_pool_rele(dp, tag); - } - - return (err); -} - -static int -dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) -{ - int err; - - err = dmu_objset_from_ds(ds, osp); - if (err != 0) { - dsl_dataset_disown(ds, tag); - } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { - dsl_dataset_disown(ds, tag); - return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { - dsl_dataset_disown(ds, tag); - return (SET_ERROR(EROFS)); - } - return (err); -} - -/* - * dsl_pool must not be held when this is called. 
- * Upon successful return, there will be a longhold on the dataset, - * and the dsl_pool will not be held. - */ -int -dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(name, FTAG, &dp); - if (err != 0) - return (err); - err = dsl_dataset_own(dp, name, tag, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - err = dmu_objset_own_impl(ds, type, readonly, tag, osp); - dsl_pool_rele(dp, FTAG); - - return (err); -} - -int -dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) -{ - dsl_dataset_t *ds; - int err; - - err = dsl_dataset_own_obj(dp, obj, tag, &ds); - if (err != 0) - return (err); - - return (dmu_objset_own_impl(ds, type, readonly, tag, osp)); -} - -void -dmu_objset_rele(objset_t *os, void *tag) -{ - dsl_pool_t *dp = dmu_objset_pool(os); - dsl_dataset_rele(os->os_dsl_dataset, tag); - dsl_pool_rele(dp, tag); -} - -/* - * When we are called, os MUST refer to an objset associated with a dataset - * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner - * == tag. We will then release and reacquire ownership of the dataset while - * holding the pool config_rwlock to avoid intervening namespace or ownership - * changes may occur. - * - * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to - * release the hold on its dataset and acquire a new one on the dataset of the - * same name so that it can be partially torn down and reconstructed. 
- */ -void -dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, - void *tag) -{ - dsl_pool_t *dp; - char name[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY3P(ds, !=, NULL); - VERIFY3P(ds->ds_owner, ==, tag); - VERIFY(dsl_dataset_long_held(ds)); - - dsl_dataset_name(ds, name); - dp = ds->ds_dir->dd_pool; - dsl_pool_config_enter(dp, FTAG); - dsl_dataset_disown(ds, tag); - VERIFY0(dsl_dataset_own(dp, name, tag, newds)); - dsl_pool_config_exit(dp, FTAG); -} - -void -dmu_objset_disown(objset_t *os, void *tag) -{ - dsl_dataset_disown(os->os_dsl_dataset, tag); -} - -void -dmu_objset_evict_dbufs(objset_t *os) -{ - dnode_t dn_marker; - dnode_t *dn; - - mutex_enter(&os->os_lock); - dn = list_head(&os->os_dnodes); - while (dn != NULL) { - /* - * Skip dnodes without holds. We have to do this dance - * because dnode_add_ref() only works if there is already a - * hold. If the dnode has no holds, then it has no dbufs. - */ - if (dnode_add_ref(dn, FTAG)) { - list_insert_after(&os->os_dnodes, dn, &dn_marker); - mutex_exit(&os->os_lock); - - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - - mutex_enter(&os->os_lock); - dn = list_next(&os->os_dnodes, &dn_marker); - list_remove(&os->os_dnodes, &dn_marker); - } else { - dn = list_next(&os->os_dnodes, dn); - } - } - mutex_exit(&os->os_lock); - - if (DMU_USERUSED_DNODE(os) != NULL) { - dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); - dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); - } - dnode_evict_dbufs(DMU_META_DNODE(os)); -} - -/* - * Objset eviction processing is split into into two pieces. - * The first marks the objset as evicting, evicts any dbufs that - * have a refcount of zero, and then queues up the objset for the - * second phase of eviction. Once os->os_dnodes has been cleared by - * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. - * The second phase closes the special dnodes, dequeues the objset from - * the list of those undergoing eviction, and finally frees the objset. 
- * - * NOTE: Due to asynchronous eviction processing (invocation of - * dnode_buf_pageout()), it is possible for the meta dnode for the - * objset to have no holds even though os->os_dnodes is not empty. - */ -void -dmu_objset_evict(objset_t *os) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - - for (int t = 0; t < TXG_SIZE; t++) - ASSERT(!dmu_objset_is_dirty(os, t)); - - if (ds) - dsl_prop_unregister_all(ds, os); - - if (os->os_sa) - sa_tear_down(os); - - dmu_objset_evict_dbufs(os); - - mutex_enter(&os->os_lock); - spa_evicting_os_register(os->os_spa, os); - if (list_is_empty(&os->os_dnodes)) { - mutex_exit(&os->os_lock); - dmu_objset_evict_done(os); - } else { - mutex_exit(&os->os_lock); - } -} - -void -dmu_objset_evict_done(objset_t *os) -{ - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - - dnode_special_close(&os->os_meta_dnode); - if (DMU_USERUSED_DNODE(os)) { - dnode_special_close(&os->os_userused_dnode); - dnode_special_close(&os->os_groupused_dnode); - } - zil_free(os->os_zil); - - arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); - - /* - * This is a barrier to prevent the objset from going away in - * dnode_move() until we can safely ensure that the objset is still in - * use. We consider the objset valid before the barrier and invalid - * after the barrier. 
- */ - rw_enter(&os_lock, RW_READER); - rw_exit(&os_lock); - - kmem_free(os->os_obj_next_percpu, - os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); - - mutex_destroy(&os->os_lock); - mutex_destroy(&os->os_userused_lock); - mutex_destroy(&os->os_obj_lock); - mutex_destroy(&os->os_user_ptr_lock); - for (int i = 0; i < TXG_SIZE; i++) { - multilist_destroy(os->os_dirty_dnodes[i]); - } - spa_evicting_os_deregister(os->os_spa, os); - kmem_free(os, sizeof (objset_t)); -} - -timestruc_t -dmu_objset_snap_cmtime(objset_t *os) -{ - return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); -} - -/* called from dsl for meta-objset */ -objset_t * -dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - dmu_objset_type_t type, dmu_tx_t *tx) -{ - objset_t *os; - dnode_t *mdn; - - ASSERT(dmu_tx_is_syncing(tx)); - - if (ds != NULL) - VERIFY0(dmu_objset_from_ds(ds, &os)); - else - VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); - - mdn = DMU_META_DNODE(os); - - dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT, - DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx); - - /* - * We don't want to have to increase the meta-dnode's nlevels - * later, because then we could do it in quescing context while - * we are also accessing it in open context. - * - * This precaution is not necessary for the MOS (ds == NULL), - * because the MOS is only updated in syncing context. - * This is most fortunate: the MOS is the only objset that - * needs to be synced multiple times as spa_sync() iterates - * to convergence, so minimizing its dn_nlevels matters. - */ - if (ds != NULL) { - int levels = 1; - - /* - * Determine the number of levels necessary for the meta-dnode - * to contain DN_MAX_OBJECT dnodes. Note that in order to - * ensure that we do not overflow 64 bits, there has to be - * a nlevels that gives us a number of blocks > DN_MAX_OBJECT - * but < 2^64. 
Therefore, - * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be - * less than (64 - log2(DN_MAX_OBJECT)) (16). - */ - while ((uint64_t)mdn->dn_nblkptr << - (mdn->dn_datablkshift - DNODE_SHIFT + - (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < - DN_MAX_OBJECT) - levels++; - - mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = - mdn->dn_nlevels = levels; - } - - ASSERT(type != DMU_OST_NONE); - ASSERT(type != DMU_OST_ANY); - ASSERT(type < DMU_OST_NUMTYPES); - os->os_phys->os_type = type; - if (dmu_objset_userused_enabled(os)) { - os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; - os->os_flags = os->os_phys->os_flags; - } - - dsl_dataset_dirty(ds, tx); - - return (os); -} - -typedef struct dmu_objset_create_arg { - const char *doca_name; - cred_t *doca_cred; - void (*doca_userfunc)(objset_t *os, void *arg, - cred_t *cr, dmu_tx_t *tx); - void *doca_userarg; - dmu_objset_type_t doca_type; - uint64_t doca_flags; -} dmu_objset_create_arg_t; - -/*ARGSUSED*/ -static int -dmu_objset_create_check(void *arg, dmu_tx_t *tx) -{ - dmu_objset_create_arg_t *doca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *pdd; - dsl_dataset_t *parentds; - objset_t *parentos; - const char *tail; - int error; - - if (strchr(doca->doca_name, '@') != NULL) - return (SET_ERROR(EINVAL)); - - if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - if (dataset_nestcheck(doca->doca_name) != 0) - return (SET_ERROR(ENAMETOOLONG)); - - error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); - if (error != 0) - return (error); - if (tail == NULL) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EEXIST)); - } - error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, - doca->doca_cred); - if (error != 0) { - dsl_dir_rele(pdd, FTAG); - return (error); - } - - /* can't create below anything but filesystems (eg. 
no ZVOLs) */ - error = dsl_dataset_hold_obj(pdd->dd_pool, - dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds); - if (error != 0) { - dsl_dir_rele(pdd, FTAG); - return (error); - } - error = dmu_objset_from_ds(parentds, &parentos); - if (error != 0) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(pdd, FTAG); - return (error); - } - if (dmu_objset_type(parentos) != DMU_OST_ZFS) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - } - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(pdd, FTAG); - - return (error); -} - -static void -dmu_objset_create_sync(void *arg, dmu_tx_t *tx) -{ - dmu_objset_create_arg_t *doca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *pdd; - const char *tail; - dsl_dataset_t *ds; - uint64_t obj; - blkptr_t *bp; - objset_t *os; - - VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); - - obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, - doca->doca_cred, tx); - - VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - bp = dsl_dataset_get_blkptr(ds); - os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, - ds, bp, doca->doca_type, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - if (doca->doca_userfunc != NULL) { - doca->doca_userfunc(os, doca->doca_userarg, - doca->doca_cred, tx); - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, doca->doca_name); -#endif - spa_history_log_internal_ds(ds, "create", tx, ""); - dsl_dataset_rele(ds, FTAG); - dsl_dir_rele(pdd, FTAG); -} - -int -dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) -{ - dmu_objset_create_arg_t doca; - - doca.doca_name = name; - doca.doca_cred = CRED(); - doca.doca_flags = flags; - doca.doca_userfunc = func; - doca.doca_userarg = arg; - doca.doca_type = type; - - return (dsl_sync_task(name, - 
dmu_objset_create_check, dmu_objset_create_sync, &doca, - 5, ZFS_SPACE_CHECK_NORMAL)); -} - -typedef struct dmu_objset_clone_arg { - const char *doca_clone; - const char *doca_origin; - cred_t *doca_cred; -} dmu_objset_clone_arg_t; - -/*ARGSUSED*/ -static int -dmu_objset_clone_check(void *arg, dmu_tx_t *tx) -{ - dmu_objset_clone_arg_t *doca = arg; - dsl_dir_t *pdd; - const char *tail; - int error; - dsl_dataset_t *origin; - dsl_pool_t *dp = dmu_tx_pool(tx); - - if (strchr(doca->doca_clone, '@') != NULL) - return (SET_ERROR(EINVAL)); - - if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); - if (error != 0) - return (error); - if (tail == NULL) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EEXIST)); - } - - error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, - doca->doca_cred); - if (error != 0) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EDQUOT)); - } - dsl_dir_rele(pdd, FTAG); - - error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); - if (error != 0) - return (error); - - /* You can only clone snapshots, not the head datasets. 
*/ - if (!origin->ds_is_snapshot) { - dsl_dataset_rele(origin, FTAG); - return (SET_ERROR(EINVAL)); - } - dsl_dataset_rele(origin, FTAG); - - return (0); -} - -static void -dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) -{ - dmu_objset_clone_arg_t *doca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *pdd; - const char *tail; - dsl_dataset_t *origin, *ds; - uint64_t obj; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); - VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); - - obj = dsl_dataset_create_sync(pdd, tail, origin, 0, - doca->doca_cred, tx); - - VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); - dsl_dataset_name(origin, namebuf); -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, doca->doca_clone); -#endif - spa_history_log_internal_ds(ds, "clone", tx, - "origin=%s (%llu)", namebuf, origin->ds_object); - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(origin, FTAG); - dsl_dir_rele(pdd, FTAG); -} - -int -dmu_objset_clone(const char *clone, const char *origin) -{ - dmu_objset_clone_arg_t doca; - - doca.doca_clone = clone; - doca.doca_origin = origin; - doca.doca_cred = CRED(); - - return (dsl_sync_task(clone, - dmu_objset_clone_check, dmu_objset_clone_sync, &doca, - 5, ZFS_SPACE_CHECK_NORMAL)); -} - -static int -dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg) -{ - int error = 0; - uint64_t object = 0; - while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - error = dmu_object_remap_indirects(os, object, - last_removed_txg); - /* - * If the ZPL removed the object before we managed to dnode_hold - * it, we would get an ENOENT. If the ZPL declares its intent - * to remove the object (dnode_free) before we manage to - * dnode_hold it, we would get an EEXIST. In either case, we - * want to continue remapping the other objects in the objset; - * in all other cases, we want to break early. 
- */ - if (error != 0 && error != ENOENT && error != EEXIST) { - break; - } - } - if (error == ESRCH) { - error = 0; - } - return (error); -} - -int -dmu_objset_remap_indirects(const char *fsname) -{ - int error = 0; - objset_t *os = NULL; - uint64_t last_removed_txg; - uint64_t remap_start_txg; - dsl_dir_t *dd; - - error = dmu_objset_hold(fsname, FTAG, &os); - if (error != 0) { - return (error); - } - dd = dmu_objset_ds(os)->ds_dir; - - if (!spa_feature_is_enabled(dmu_objset_spa(os), - SPA_FEATURE_OBSOLETE_COUNTS)) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * If there has not been a removal, we're done. - */ - last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os)); - if (last_removed_txg == -1ULL) { - dmu_objset_rele(os, FTAG); - return (0); - } - - /* - * If we have remapped since the last removal, we're done. - */ - if (dsl_dir_is_zapified(dd)) { - uint64_t last_remap_txg; - if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)), - dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (last_remap_txg), 1, &last_remap_txg) == 0 && - last_remap_txg > last_removed_txg) { - dmu_objset_rele(os, FTAG); - return (0); - } - } - - dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); - dsl_pool_rele(dmu_objset_pool(os), FTAG); - - remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os)); - error = dmu_objset_remap_indirects_impl(os, last_removed_txg); - if (error == 0) { - /* - * We update the last_remap_txg to be the start txg so that - * we can guarantee that every block older than last_remap_txg - * that can be remapped has been remapped. 
- */ - error = dsl_dir_update_last_remap_txg(dd, remap_start_txg); - } - - dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); - dsl_dataset_rele(dmu_objset_ds(os), FTAG); - - return (error); -} - -int -dmu_objset_snapshot_one(const char *fsname, const char *snapname) -{ - int err; - char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); - nvlist_t *snaps = fnvlist_alloc(); - - fnvlist_add_boolean(snaps, longsnap); - strfree(longsnap); - err = dsl_dataset_snapshot(snaps, NULL, NULL); - fnvlist_free(snaps); - return (err); -} - -static void -dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) -{ - dnode_t *dn; - - while ((dn = multilist_sublist_head(list)) != NULL) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - ASSERT(dn->dn_dbuf->db_data_pending); - /* - * Initialize dn_zio outside dnode_sync() because the - * meta-dnode needs to set it ouside dnode_sync(). - */ - dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; - ASSERT(dn->dn_zio); - - ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); - multilist_sublist_remove(list, dn); - - /* - * If we are not doing useraccounting (os_synced_dnodes == NULL) - * we are done with this dnode for this txg. Unset dn_dirty_txg - * if later txgs aren't dirtying it so that future holders do - * not get a stale value. Otherwise, we will do this in - * userquota_updates_task() when processing has completely - * finished for this txg. 
- */ - multilist_t *newlist = dn->dn_objset->os_synced_dnodes; - if (newlist != NULL) { - (void) dnode_add_ref(dn, newlist); - multilist_insert(newlist, dn); - } else { - mutex_enter(&dn->dn_mtx); - if (dn->dn_dirty_txg == tx->tx_txg) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - } - - dnode_sync(dn, tx); - } -} - -/* ARGSUSED */ -static void -dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - blkptr_t *bp = zio->io_bp; - objset_t *os = arg; - dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); - ASSERT0(BP_GET_LEVEL(bp)); - - /* - * Update rootbp fill count: it should be the number of objects - * allocated in the object set (not counting the "special" - * objects that are stored in the objset_phys_t -- the meta - * dnode and user/group accounting objects). - */ - bp->blk_fill = 0; - for (int i = 0; i < dnp->dn_nblkptr; i++) - bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); - if (os->os_dsl_dataset != NULL) - rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); - *os->os_rootbp = *bp; - if (os->os_dsl_dataset != NULL) - rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); -} - -/* ARGSUSED */ -static void -dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - objset_t *os = arg; - - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(BP_EQUAL(bp, bp_orig)); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); - dsl_dataset_block_born(ds, bp, tx); - } - kmem_free(bp, sizeof (*bp)); -} - -typedef struct sync_dnodes_arg { - multilist_t *sda_list; - int sda_sublist_idx; - multilist_t *sda_newlist; - dmu_tx_t *sda_tx; -} sync_dnodes_arg_t; - -static void -sync_dnodes_task(void *arg) -{ - sync_dnodes_arg_t *sda = arg; - - multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, 
sda->sda_sublist_idx); - - dmu_objset_sync_dnodes(ms, sda->sda_tx); - - multilist_sublist_unlock(ms); - - kmem_free(sda, sizeof (*sda)); -} - - -/* called from dsl */ -void -dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) -{ - int txgoff; - zbookmark_phys_t zb; - zio_prop_t zp; - zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; - int num_sublists; - multilist_t *ml; - blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); - *blkptr_copy = *os->os_rootbp; - - dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); - - ASSERT(dmu_tx_is_syncing(tx)); - /* XXX the write_done callback should really give us the tx... */ - os->os_synctx = tx; - - if (os->os_dsl_dataset == NULL) { - /* - * This is the MOS. If we have upgraded, - * spa_max_replication() could change, so reset - * os_copies here. - */ - os->os_copies = spa_max_replication(os->os_spa); - } - - /* - * Create the root block IO - */ - SET_BOOKMARK(&zb, os->os_dsl_dataset ? - os->os_dsl_dataset->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - arc_release(os->os_phys_buf, &os->os_phys_buf); - - dmu_write_policy(os, NULL, 0, 0, &zp); - - zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, - os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - - /* - * Sync special dnodes - the parent IO for the sync is the root block - */ - DMU_META_DNODE(os)->dn_zio = zio; - dnode_sync(DMU_META_DNODE(os), tx); - - os->os_phys->os_flags = os->os_flags; - - if (DMU_USERUSED_DNODE(os) && - DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { - DMU_USERUSED_DNODE(os)->dn_zio = zio; - dnode_sync(DMU_USERUSED_DNODE(os), tx); - DMU_GROUPUSED_DNODE(os)->dn_zio = zio; - dnode_sync(DMU_GROUPUSED_DNODE(os), tx); - } - - txgoff = tx->tx_txg & TXG_MASK; - - if (dmu_objset_userused_enabled(os)) { - /* - * We must create the list here because it uses the - * 
dn_dirty_link[] of this txg. But it may already - * exist because we call dsl_dataset_sync() twice per txg. - */ - if (os->os_synced_dnodes == NULL) { - os->os_synced_dnodes = - multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[txgoff]), - dnode_multilist_index_func); - } else { - ASSERT3U(os->os_synced_dnodes->ml_offset, ==, - offsetof(dnode_t, dn_dirty_link[txgoff])); - } - } - - ml = os->os_dirty_dnodes[txgoff]; - num_sublists = multilist_get_num_sublists(ml); - for (int i = 0; i < num_sublists; i++) { - if (multilist_sublist_is_empty_idx(ml, i)) - continue; - sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = ml; - sda->sda_sublist_idx = i; - sda->sda_tx = tx; - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_task, sda, 0); - /* callback frees sda */ - } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); - if (dr->dr_zio) - zio_nowait(dr->dr_zio); - } - - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; - } - - /* - * Free intent log blocks up to this tx. 
- */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); -} - -boolean_t -dmu_objset_is_dirty(objset_t *os, uint64_t txg) -{ - return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); -} - -static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; - -void -dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) -{ - used_cbs[ost] = cb; -} - -boolean_t -dmu_objset_userused_enabled(objset_t *os) -{ - return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] != NULL && - DMU_USERUSED_DNODE(os) != NULL); -} - -typedef struct userquota_node { - uint64_t uqn_id; - int64_t uqn_delta; - avl_node_t uqn_node; -} userquota_node_t; - -typedef struct userquota_cache { - avl_tree_t uqc_user_deltas; - avl_tree_t uqc_group_deltas; -} userquota_cache_t; - -static int -userquota_compare(const void *l, const void *r) -{ - const userquota_node_t *luqn = l; - const userquota_node_t *ruqn = r; - - if (luqn->uqn_id < ruqn->uqn_id) - return (-1); - if (luqn->uqn_id > ruqn->uqn_id) - return (1); - return (0); -} - -static void -do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) -{ - void *cookie; - userquota_node_t *uqn; - - ASSERT(dmu_tx_is_syncing(tx)); - - cookie = NULL; - while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, - &cookie)) != NULL) { - /* - * os_userused_lock protects against concurrent calls to - * zap_increment_int(). It's needed because zap_increment_int() - * is not thread-safe (i.e. not atomic). 
- */ - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); - kmem_free(uqn, sizeof (*uqn)); - } - avl_destroy(&cache->uqc_user_deltas); - - cookie = NULL; - while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, - &cookie)) != NULL) { - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); - kmem_free(uqn, sizeof (*uqn)); - } - avl_destroy(&cache->uqc_group_deltas); -} - -static void -userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta) -{ - userquota_node_t search = { .uqn_id = id }; - avl_index_t idx; - - userquota_node_t *uqn = avl_find(avl, &search, &idx); - if (uqn == NULL) { - uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); - uqn->uqn_id = id; - avl_insert(avl, uqn, idx); - } - uqn->uqn_delta += delta; -} - -static void -do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags, - uint64_t user, uint64_t group, boolean_t subtract) -{ - if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { - int64_t delta = DNODE_MIN_SIZE + used; - if (subtract) - delta = -delta; - - userquota_update_cache(&cache->uqc_user_deltas, user, delta); - userquota_update_cache(&cache->uqc_group_deltas, group, delta); - } -} - -typedef struct userquota_updates_arg { - objset_t *uua_os; - int uua_sublist_idx; - dmu_tx_t *uua_tx; -} userquota_updates_arg_t; - -static void -userquota_updates_task(void *arg) -{ - userquota_updates_arg_t *uua = arg; - objset_t *os = uua->uua_os; - dmu_tx_t *tx = uua->uua_tx; - dnode_t *dn; - userquota_cache_t cache = { 0 }; - - multilist_sublist_t *list = - multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); - - ASSERT(multilist_sublist_head(list) == NULL || - dmu_objset_userused_enabled(os)); - avl_create(&cache.uqc_user_deltas, userquota_compare, - sizeof (userquota_node_t), 
offsetof(userquota_node_t, uqn_node)); - avl_create(&cache.uqc_group_deltas, userquota_compare, - sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); - - while ((dn = multilist_sublist_head(list)) != NULL) { - int flags; - ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); - ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || - dn->dn_phys->dn_flags & - DNODE_FLAG_USERUSED_ACCOUNTED); - - flags = dn->dn_id_flags; - ASSERT(flags); - if (flags & DN_ID_OLD_EXIST) { - do_userquota_update(&cache, - dn->dn_oldused, dn->dn_oldflags, - dn->dn_olduid, dn->dn_oldgid, B_TRUE); - } - if (flags & DN_ID_NEW_EXIST) { - do_userquota_update(&cache, - DN_USED_BYTES(dn->dn_phys), - dn->dn_phys->dn_flags, dn->dn_newuid, - dn->dn_newgid, B_FALSE); - } - - mutex_enter(&dn->dn_mtx); - dn->dn_oldused = 0; - dn->dn_oldflags = 0; - if (dn->dn_id_flags & DN_ID_NEW_EXIST) { - dn->dn_olduid = dn->dn_newuid; - dn->dn_oldgid = dn->dn_newgid; - dn->dn_id_flags |= DN_ID_OLD_EXIST; - if (dn->dn_bonuslen == 0) - dn->dn_id_flags |= DN_ID_CHKED_SPILL; - else - dn->dn_id_flags |= DN_ID_CHKED_BONUS; - } - dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); - if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - - multilist_sublist_remove(list, dn); - dnode_rele(dn, os->os_synced_dnodes); - } - do_userquota_cacheflush(os, &cache, tx); - multilist_sublist_unlock(list); - kmem_free(uua, sizeof (*uua)); -} - -void -dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) -{ - int num_sublists; - - if (!dmu_objset_userused_enabled(os)) - return; - - /* Allocate the user/groupused objects if necessary. 
*/ - if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { - VERIFY0(zap_create_claim(os, - DMU_USERUSED_OBJECT, - DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); - VERIFY0(zap_create_claim(os, - DMU_GROUPUSED_OBJECT, - DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); - } - - num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); - for (int i = 0; i < num_sublists; i++) { - if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) - continue; - userquota_updates_arg_t *uua = - kmem_alloc(sizeof (*uua), KM_SLEEP); - uua->uua_os = os; - uua->uua_sublist_idx = i; - uua->uua_tx = tx; - /* note: caller does taskq_wait() */ - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - userquota_updates_task, uua, 0); - /* callback frees uua */ - } -} - -/* - * Returns a pointer to data to find uid/gid from - * - * If a dirty record for transaction group that is syncing can't - * be found then NULL is returned. In the NULL case it is assumed - * the uid/gid aren't changing. - */ -static void * -dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dbuf_dirty_record_t *dr, **drp; - void *data; - - if (db->db_dirtycnt == 0) - return (db->db.db_data); /* Nothing is changing */ - - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg == tx->tx_txg) - break; - - if (dr == NULL) { - data = NULL; - } else { - dnode_t *dn; - - DB_DNODE_ENTER(dr->dr_dbuf); - dn = DB_DNODE(dr->dr_dbuf); - - if (dn->dn_bonuslen == 0 && - dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) - data = dr->dt.dl.dr_data->b_data; - else - data = dr->dt.dl.dr_data; - - DB_DNODE_EXIT(dr->dr_dbuf); - } - - return (data); -} - -void -dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) -{ - objset_t *os = dn->dn_objset; - void *data = NULL; - dmu_buf_impl_t *db = NULL; - uint64_t *user = NULL; - uint64_t *group = NULL; - int flags = dn->dn_id_flags; - int error; - boolean_t have_spill = B_FALSE; - - if 
(!dmu_objset_userused_enabled(dn->dn_objset)) - return; - - if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| - DN_ID_CHKED_SPILL))) - return; - - if (before && dn->dn_bonuslen != 0) - data = DN_BONUS(dn->dn_phys); - else if (!before && dn->dn_bonuslen != 0) { - if (dn->dn_bonus) { - db = dn->dn_bonus; - mutex_enter(&db->db_mtx); - data = dmu_objset_userquota_find_data(db, tx); - } else { - data = DN_BONUS(dn->dn_phys); - } - } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { - int rf = 0; - - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - error = dmu_spill_hold_by_dnode(dn, - rf | DB_RF_MUST_SUCCEED, - FTAG, (dmu_buf_t **)&db); - ASSERT(error == 0); - mutex_enter(&db->db_mtx); - data = (before) ? db->db.db_data : - dmu_objset_userquota_find_data(db, tx); - have_spill = B_TRUE; - } else { - mutex_enter(&dn->dn_mtx); - dn->dn_id_flags |= DN_ID_CHKED_BONUS; - mutex_exit(&dn->dn_mtx); - return; - } - - if (before) { - ASSERT(data); - user = &dn->dn_olduid; - group = &dn->dn_oldgid; - } else if (data) { - user = &dn->dn_newuid; - group = &dn->dn_newgid; - } - - /* - * Must always call the callback in case the object - * type has changed and that type isn't an object type to track - */ - error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, - user, group); - - /* - * Preserve existing uid/gid when the callback can't determine - * what the new uid/gid are and the callback returned EEXIST. - * The EEXIST error tells us to just use the existing uid/gid. - * If we don't know what the old values are then just assign - * them to 0, since that is a new file being created. 
- */ - if (!before && data == NULL && error == EEXIST) { - if (flags & DN_ID_OLD_EXIST) { - dn->dn_newuid = dn->dn_olduid; - dn->dn_newgid = dn->dn_oldgid; - } else { - dn->dn_newuid = 0; - dn->dn_newgid = 0; - } - error = 0; - } - - if (db) - mutex_exit(&db->db_mtx); - - mutex_enter(&dn->dn_mtx); - if (error == 0 && before) - dn->dn_id_flags |= DN_ID_OLD_EXIST; - if (error == 0 && !before) - dn->dn_id_flags |= DN_ID_NEW_EXIST; - - if (have_spill) { - dn->dn_id_flags |= DN_ID_CHKED_SPILL; - } else { - dn->dn_id_flags |= DN_ID_CHKED_BONUS; - } - mutex_exit(&dn->dn_mtx); - if (have_spill) - dmu_buf_rele((dmu_buf_t *)db, FTAG); -} - -boolean_t -dmu_objset_userspace_present(objset_t *os) -{ - return (os->os_phys->os_flags & - OBJSET_FLAG_USERACCOUNTING_COMPLETE); -} - -int -dmu_objset_userspace_upgrade(objset_t *os) -{ - uint64_t obj; - int err = 0; - - if (dmu_objset_userspace_present(os)) - return (0); - if (!dmu_objset_userused_enabled(os)) - return (SET_ERROR(ENOTSUP)); - if (dmu_objset_is_snapshot(os)) - return (SET_ERROR(EINVAL)); - - /* - * We simply need to mark every object dirty, so that it will be - * synced out and now accounted. If this is called - * concurrently, or if we already did some work before crashing, - * that's fine, since we track each object's accounted state - * independently. 
- */ - - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; - dmu_buf_t *db; - int objerr; - - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); - - objerr = dmu_bonus_hold(os, obj, FTAG, &db); - if (objerr != 0) - continue; - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, obj); - objerr = dmu_tx_assign(tx, TXG_WAIT); - if (objerr != 0) { - dmu_tx_abort(tx); - continue; - } - dmu_buf_will_dirty(db, tx); - dmu_buf_rele(db, FTAG); - dmu_tx_commit(tx); - } - - os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; - txg_wait_synced(dmu_objset_pool(os), 0); - return (0); -} - -void -dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp) -{ - dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, - usedobjsp, availobjsp); -} - -uint64_t -dmu_objset_fsid_guid(objset_t *os) -{ - return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); -} - -void -dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) -{ - stat->dds_type = os->os_phys->os_type; - if (os->os_dsl_dataset) - dsl_dataset_fast_stat(os->os_dsl_dataset, stat); -} - -void -dmu_objset_stats(objset_t *os, nvlist_t *nv) -{ - ASSERT(os->os_dsl_dataset || - os->os_phys->os_type == DMU_OST_META); - - if (os->os_dsl_dataset != NULL) - dsl_dataset_stats(os->os_dsl_dataset, nv); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, - os->os_phys->os_type); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, - dmu_objset_userspace_present(os)); -} - -int -dmu_objset_is_snapshot(objset_t *os) -{ - if (os->os_dsl_dataset != NULL) - return (os->os_dsl_dataset->ds_is_snapshot); - else - return (B_FALSE); -} - -int -dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, - boolean_t *conflict) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - uint64_t ignored; - - if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) - return (SET_ERROR(ENOENT)); - - return 
(zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, - MT_NORMALIZE, real, maxlen, conflict)); -} - -int -dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - zap_cursor_t cursor; - zap_attribute_t attr; - - ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); - - if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) - return (SET_ERROR(ENOENT)); - - zap_cursor_init_serialized(&cursor, - ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); - - if (zap_cursor_retrieve(&cursor, &attr) != 0) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENOENT)); - } - - if (strlen(attr.za_name) + 1 > namelen) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENAMETOOLONG)); - } - - (void) strcpy(name, attr.za_name); - if (idp) - *idp = attr.za_first_integer; - if (case_conflict) - *case_conflict = attr.za_normalization_conflict; - zap_cursor_advance(&cursor); - *offp = zap_cursor_serialize(&cursor); - zap_cursor_fini(&cursor); - - return (0); -} - -int -dmu_dir_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp) -{ - dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; - zap_cursor_t cursor; - zap_attribute_t attr; - - /* there is no next dir on a snapshot! 
*/ - if (os->os_dsl_dataset->ds_object != - dsl_dir_phys(dd)->dd_head_dataset_obj) - return (SET_ERROR(ENOENT)); - - zap_cursor_init_serialized(&cursor, - dd->dd_pool->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); - - if (zap_cursor_retrieve(&cursor, &attr) != 0) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENOENT)); - } - - if (strlen(attr.za_name) + 1 > namelen) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENAMETOOLONG)); - } - - (void) strcpy(name, attr.za_name); - if (idp) - *idp = attr.za_first_integer; - zap_cursor_advance(&cursor); - *offp = zap_cursor_serialize(&cursor); - zap_cursor_fini(&cursor); - - return (0); -} - -typedef struct dmu_objset_find_ctx { - taskq_t *dc_tq; - dsl_pool_t *dc_dp; - uint64_t dc_ddobj; - char *dc_ddname; /* last component of ddobj's name */ - int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); - void *dc_arg; - int dc_flags; - kmutex_t *dc_error_lock; - int *dc_error; -} dmu_objset_find_ctx_t; - -static void -dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) -{ - dsl_pool_t *dp = dcp->dc_dp; - dsl_dir_t *dd; - dsl_dataset_t *ds; - zap_cursor_t zc; - zap_attribute_t *attr; - uint64_t thisobj; - int err = 0; - - /* don't process if there already was an error */ - if (*dcp->dc_error != 0) - goto out; - - /* - * Note: passing the name (dc_ddname) here is optional, but it - * improves performance because we don't need to call - * zap_value_search() to determine the name. - */ - err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd); - if (err != 0) - goto out; - - /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ - if (dd->dd_myname[0] == '$') { - dsl_dir_rele(dd, FTAG); - goto out; - } - - thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; - attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - - /* - * Iterate over all children. 
- */ - if (dcp->dc_flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - dmu_objset_find_ctx_t *child_dcp = - kmem_alloc(sizeof (*child_dcp), KM_SLEEP); - *child_dcp = *dcp; - child_dcp->dc_ddobj = attr->za_first_integer; - child_dcp->dc_ddname = spa_strdup(attr->za_name); - if (dcp->dc_tq != NULL) - (void) taskq_dispatch(dcp->dc_tq, - dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); - else - dmu_objset_find_dp_impl(child_dcp); - } - zap_cursor_fini(&zc); - } - - /* - * Iterate over all snapshots. - */ - if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { - dsl_dataset_t *ds; - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - - if (err == 0) { - uint64_t snapobj; - - snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - dsl_dataset_rele(ds, FTAG); - - for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - err = dsl_dataset_hold_obj(dp, - attr->za_first_integer, FTAG, &ds); - if (err != 0) - break; - err = dcp->dc_func(dp, ds, dcp->dc_arg); - dsl_dataset_rele(ds, FTAG); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - } - } - - kmem_free(attr, sizeof (zap_attribute_t)); - - if (err != 0) { - dsl_dir_rele(dd, FTAG); - goto out; - } - - /* - * Apply to self. - */ - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - - /* - * Note: we hold the dir while calling dsl_dataset_hold_obj() so - * that the dir will remain cached, and we won't have to re-instantiate - * it (which could be expensive due to finding its name via - * zap_value_search()). 
- */ - dsl_dir_rele(dd, FTAG); - if (err != 0) - goto out; - err = dcp->dc_func(dp, ds, dcp->dc_arg); - dsl_dataset_rele(ds, FTAG); - -out: - if (err != 0) { - mutex_enter(dcp->dc_error_lock); - /* only keep first error */ - if (*dcp->dc_error == 0) - *dcp->dc_error = err; - mutex_exit(dcp->dc_error_lock); - } - - if (dcp->dc_ddname != NULL) - spa_strfree(dcp->dc_ddname); - kmem_free(dcp, sizeof (*dcp)); -} - -static void -dmu_objset_find_dp_cb(void *arg) -{ - dmu_objset_find_ctx_t *dcp = arg; - dsl_pool_t *dp = dcp->dc_dp; - - /* - * We need to get a pool_config_lock here, as there are several - * asssert(pool_config_held) down the stack. Getting a lock via - * dsl_pool_config_enter is risky, as it might be stalled by a - * pending writer. This would deadlock, as the write lock can - * only be granted when our parent thread gives up the lock. - * The _prio interface gives us priority over a pending writer. - */ - dsl_pool_config_enter_prio(dp, FTAG); - - dmu_objset_find_dp_impl(dcp); - - dsl_pool_config_exit(dp, FTAG); -} - -/* - * Find objsets under and including ddobj, call func(ds) on each. - * The order for the enumeration is completely undefined. - * func is called with dsl_pool_config held. 
- */ -int -dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, - int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) -{ - int error = 0; - taskq_t *tq = NULL; - int ntasks; - dmu_objset_find_ctx_t *dcp; - kmutex_t err_lock; - - mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); - dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); - dcp->dc_tq = NULL; - dcp->dc_dp = dp; - dcp->dc_ddobj = ddobj; - dcp->dc_ddname = NULL; - dcp->dc_func = func; - dcp->dc_arg = arg; - dcp->dc_flags = flags; - dcp->dc_error_lock = &err_lock; - dcp->dc_error = &error; - - if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { - /* - * In case a write lock is held we can't make use of - * parallelism, as down the stack of the worker threads - * the lock is asserted via dsl_pool_config_held. - * In case of a read lock this is solved by getting a read - * lock in each worker thread, which isn't possible in case - * of a writer lock. So we fall back to the synchronous path - * here. - * In the future it might be possible to get some magic into - * dsl_pool_config_held in a way that it returns true for - * the worker threads so that a single lock held from this - * thread suffices. For now, stay single threaded. - */ - dmu_objset_find_dp_impl(dcp); - mutex_destroy(&err_lock); - - return (error); - } - - ntasks = dmu_find_threads; - if (ntasks == 0) - ntasks = vdev_count_leaves(dp->dp_spa) * 4; - tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks, - INT_MAX, 0); - if (tq == NULL) { - kmem_free(dcp, sizeof (*dcp)); - mutex_destroy(&err_lock); - - return (SET_ERROR(ENOMEM)); - } - dcp->dc_tq = tq; - - /* dcp will be freed by task */ - (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); - - /* - * PORTING: this code relies on the property of taskq_wait to wait - * until no more tasks are queued and no more tasks are active. 
As - * we always queue new tasks from within other tasks, task_wait - * reliably waits for the full recursion to finish, even though we - * enqueue new tasks after taskq_wait has been called. - * On platforms other than illumos, taskq_wait may not have this - * property. - */ - taskq_wait(tq); - taskq_destroy(tq); - mutex_destroy(&err_lock); - - return (error); -} - -/* - * Find all objsets under name, and for each, call 'func(child_name, arg)'. - * The dp_config_rwlock must not be held when this is called, and it - * will not be held when the callback is called. - * Therefore this function should only be used when the pool is not changing - * (e.g. in syncing context), or the callback can deal with the possible races. - */ -static int -dmu_objset_find_impl(spa_t *spa, const char *name, - int func(const char *, void *), void *arg, int flags) -{ - dsl_dir_t *dd; - dsl_pool_t *dp = spa_get_dsl(spa); - dsl_dataset_t *ds; - zap_cursor_t zc; - zap_attribute_t *attr; - char *child; - uint64_t thisobj; - int err; - - dsl_pool_config_enter(dp, FTAG); - - err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); - if (err != 0) { - dsl_pool_config_exit(dp, FTAG); - return (err); - } - - /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ - if (dd->dd_myname[0] == '$') { - dsl_dir_rele(dd, FTAG); - dsl_pool_config_exit(dp, FTAG); - return (0); - } - - thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; - attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - - /* - * Iterate over all children. 
- */ - if (flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - child = kmem_asprintf("%s/%s", name, attr->za_name); - dsl_pool_config_exit(dp, FTAG); - err = dmu_objset_find_impl(spa, child, - func, arg, flags); - dsl_pool_config_enter(dp, FTAG); - strfree(child); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - - if (err != 0) { - dsl_dir_rele(dd, FTAG); - dsl_pool_config_exit(dp, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - return (err); - } - } - - /* - * Iterate over all snapshots. - */ - if (flags & DS_FIND_SNAPSHOTS) { - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - - if (err == 0) { - uint64_t snapobj; - - snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - dsl_dataset_rele(ds, FTAG); - - for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - child = kmem_asprintf("%s@%s", - name, attr->za_name); - dsl_pool_config_exit(dp, FTAG); - err = func(child, arg); - dsl_pool_config_enter(dp, FTAG); - strfree(child); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - } - } - - dsl_dir_rele(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - dsl_pool_config_exit(dp, FTAG); - - if (err != 0) - return (err); - - /* Apply to self. */ - return (func(name, arg)); -} - -/* - * See comment above dmu_objset_find_impl(). 
- */ -int -dmu_objset_find(char *name, int func(const char *, void *), void *arg, - int flags) -{ - spa_t *spa; - int error; - - error = spa_open(name, &spa, FTAG); - if (error != 0) - return (error); - error = dmu_objset_find_impl(spa, name, func, arg, flags); - spa_close(spa, FTAG); - return (error); -} - -void -dmu_objset_set_user(objset_t *os, void *user_ptr) -{ - ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); - os->os_user_ptr = user_ptr; -} - -void * -dmu_objset_get_user(objset_t *os) -{ - ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); - return (os->os_user_ptr); -} - -/* - * Determine name of filesystem, given name of snapshot. - * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes - */ -int -dmu_fsname(const char *snapname, char *buf) -{ - char *atp = strchr(snapname, '@'); - if (atp == NULL) - return (SET_ERROR(EINVAL)); - if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strlcpy(buf, snapname, atp - snapname + 1); - return (0); -} - -/* - * Call when we think we're going to write/free space in open context to track - * the amount of dirty data in the open txg, which is also the amount - * of memory that can not be evicted until this txg syncs. - */ -void -dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - int64_t aspace = spa_get_worst_case_asize(os->os_spa, space); - - if (ds != NULL) { - dsl_dir_willuse_space(ds->ds_dir, aspace, tx); - dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c deleted file mode 100644 index f4dcc4bcb976..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ /dev/null @@ -1,3550 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2012, Martin Matuska . All rights reserved. - * Copyright 2014 HybridCluster. All rights reserved. - * Copyright 2016 RackTop Systems. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2018, loli10K . All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef __FreeBSD__ -#include -#endif - -#ifdef __FreeBSD__ -#undef dump_write -#define dump_write dmu_dump_write -#endif - -/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ -int zfs_send_corrupt_data = B_FALSE; -int zfs_send_queue_length = 16 * 1024 * 1024; -int zfs_recv_queue_length = 16 * 1024 * 1024; -/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ -int zfs_send_set_freerecords_bit = B_TRUE; - -#ifdef _KERNEL -TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit); -#endif - -static char *dmu_recv_tag = "dmu_recv_tag"; -const char *recv_clone_name = "%recv"; - -/* - * Use this to override the recordsize calculation for fast zfs send estimates. - */ -uint64_t zfs_override_estimate_recordsize = 0; - -#define BP_SPAN(datablkszsec, indblkshift, level) \ - (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (indblkshift - SPA_BLKPTRSHIFT))) - -static void byteswap_record(dmu_replay_record_t *drr); - -struct send_thread_arg { - bqueue_t q; - dsl_dataset_t *ds; /* Dataset to traverse */ - uint64_t fromtxg; /* Traverse from this txg */ - int flags; /* flags to pass to traverse_dataset */ - int error_code; - boolean_t cancel; - zbookmark_phys_t resume; -}; - -struct send_block_record { - boolean_t eos_marker; /* Marks the end of the stream */ - blkptr_t bp; - zbookmark_phys_t zb; - uint8_t indblkshift; - uint16_t datablkszsec; - bqueue_node_t ln; -}; - -static int -dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) -{ - dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); - struct uio auio; - struct iovec aiov; - - /* - * The code does not rely on this (len being a multiple of 8). 
We keep - * this assertion because of the corresponding assertion in - * receive_read(). Keeping this assertion ensures that we do not - * inadvertently break backwards compatibility (causing the assertion - * in receive_read() to trigger on old software). - * - * Removing the assertions could be rolled into a new feature that uses - * data that isn't 8-byte aligned; if the assertions were removed, a - * feature flag would have to be added. - */ - - ASSERT0(len % 8); - - aiov.iov_base = buf; - aiov.iov_len = len; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_offset = (off_t)-1; - auio.uio_td = dsp->dsa_td; -#ifdef _KERNEL - if (dsp->dsa_fp->f_type == DTYPE_VNODE) - bwillwrite(); - dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, - dsp->dsa_td); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - dsp->dsa_err = EOPNOTSUPP; -#endif - mutex_enter(&ds->ds_sendstream_lock); - *dsp->dsa_off += len; - mutex_exit(&ds->ds_sendstream_lock); - - return (dsp->dsa_err); -} - -/* - * For all record types except BEGIN, fill in the checksum (overlaid in - * drr_u.drr_checksum.drr_checksum). The checksum verifies everything - * up to the start of the checksum itself. - */ -static int -dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) -{ - ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - (void) fletcher_4_incremental_native(dsp->dsa_drr, - offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &dsp->dsa_zc); - if (dsp->dsa_drr->drr_type == DRR_BEGIN) { - dsp->dsa_sent_begin = B_TRUE; - } else { - ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. 
- drr_checksum.drr_checksum)); - dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; - } - if (dsp->dsa_drr->drr_type == DRR_END) { - dsp->dsa_sent_end = B_TRUE; - } - (void) fletcher_4_incremental_native(&dsp->dsa_drr-> - drr_u.drr_checksum.drr_checksum, - sizeof (zio_cksum_t), &dsp->dsa_zc); - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) - return (SET_ERROR(EINTR)); - if (payload_len != 0) { - (void) fletcher_4_incremental_native(payload, payload_len, - &dsp->dsa_zc); - if (dump_bytes(dsp, payload, payload_len) != 0) - return (SET_ERROR(EINTR)); - } - return (0); -} - -/* - * Fill in the drr_free struct, or perform aggregation if the previous record is - * also a free record, and the two are adjacent. - * - * Note that we send free records even for a full send, because we want to be - * able to receive a full send as a clone, which requires a list of all the free - * and freeobject records that were generated on the source. - */ -static int -dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - uint64_t length) -{ - struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); - - /* - * When we receive a free record, dbuf_free_range() assumes - * that the receiving system doesn't have any dbufs in the range - * being freed. This is always true because there is a one-record - * constraint: we only send one WRITE record for any given - * object,offset. We know that the one-record constraint is - * true because we always send data in increasing order by - * object,offset. - * - * If the increasing-order constraint ever changes, we should find - * another way to assert that the one-record constraint is still - * satisfied. 
- */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); - - if (length != -1ULL && offset + length < offset) - length = -1ULL; - - /* - * If there is a pending op, but it's not PENDING_FREE, push it out, - * since free block aggregation can only be done for blocks of the - * same type (i.e., DRR_FREE records can only be aggregated with - * other DRR_FREE records. DRR_FREEOBJECTS records can only be - * aggregated with other DRR_FREEOBJECTS records. - */ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - - if (dsp->dsa_pending_op == PENDING_FREE) { - /* - * There should never be a PENDING_FREE if length is -1 - * (because dump_dnode is the only place where this - * function is called with a -1, and only after flushing - * any pending record). - */ - ASSERT(length != -1ULL); - /* - * Check to see whether this free block can be aggregated - * with pending one. - */ - if (drrf->drr_object == object && drrf->drr_offset + - drrf->drr_length == offset) { - drrf->drr_length += length; - return (0); - } else { - /* not a continuation. 
Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - } - /* create a FREE record and make it pending */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREE; - drrf->drr_object = object; - drrf->drr_offset = offset; - drrf->drr_length = length; - drrf->drr_toguid = dsp->dsa_toguid; - if (length == -1ULL) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - } else { - dsp->dsa_pending_op = PENDING_FREE; - } - - return (0); -} - -static int -dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, - uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, - void *data) -{ - uint64_t payload_size; - struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); - - /* - * We send data in increasing object, offset order. - * See comment in dump_free() for details. - */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); - dsp->dsa_last_data_object = object; - dsp->dsa_last_data_offset = offset + lsize - 1; - - /* - * If there is any kind of pending aggregation (currently either - * a grouping of free objects or free blocks), push it out to - * the stream, since aggregation can't be done across operations - * of different types. 
- */ - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - /* write a WRITE record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE; - drrw->drr_object = object; - drrw->drr_type = type; - drrw->drr_offset = offset; - drrw->drr_toguid = dsp->dsa_toguid; - drrw->drr_logical_size = lsize; - - /* only set the compression fields if the buf is compressed */ - if (lsize != psize) { - ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); - ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); - ASSERT3S(psize, >, 0); - ASSERT3S(lsize, >=, psize); - - drrw->drr_compressiontype = BP_GET_COMPRESS(bp); - drrw->drr_compressed_size = psize; - payload_size = drrw->drr_compressed_size; - } else { - payload_size = drrw->drr_logical_size; - } - - if (bp == NULL || BP_IS_EMBEDDED(bp)) { - /* - * There's no pre-computed checksum for partial-block - * writes or embedded BP's, so (like - * fletcher4-checkummed blocks) userland will have to - * compute a dedup-capable checksum itself. 
- */ - drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; - } else { - drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); - if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & - ZCHECKSUM_FLAG_DEDUP) - drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; - DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); - drrw->drr_key.ddk_cksum = bp->blk_cksum; - } - - if (dump_record(dsp, data, payload_size) != 0) - return (SET_ERROR(EINTR)); - return (0); -} - -static int -dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - int blksz, const blkptr_t *bp) -{ - char buf[BPE_PAYLOAD_SIZE]; - struct drr_write_embedded *drrw = - &(dsp->dsa_drr->drr_u.drr_write_embedded); - - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (EINTR); - dsp->dsa_pending_op = PENDING_NONE; - } - - ASSERT(BP_IS_EMBEDDED(bp)); - - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; - drrw->drr_object = object; - drrw->drr_offset = offset; - drrw->drr_length = blksz; - drrw->drr_toguid = dsp->dsa_toguid; - drrw->drr_compression = BP_GET_COMPRESS(bp); - drrw->drr_etype = BPE_GET_ETYPE(bp); - drrw->drr_lsize = BPE_GET_LSIZE(bp); - drrw->drr_psize = BPE_GET_PSIZE(bp); - - decode_embedded_bp_compressed(bp, buf); - - if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) - return (EINTR); - return (0); -} - -static int -dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) -{ - struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); - - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - - /* write a SPILL record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_SPILL; - drrs->drr_object = object; - drrs->drr_length = blksz; - drrs->drr_toguid = 
dsp->dsa_toguid; - - if (dump_record(dsp, data, blksz) != 0) - return (SET_ERROR(EINTR)); - return (0); -} - -static int -dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) -{ - struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); - - /* - * If there is a pending op, but it's not PENDING_FREEOBJECTS, - * push it out, since free block aggregation can only be done for - * blocks of the same type (i.e., DRR_FREE records can only be - * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records - * can only be aggregated with other DRR_FREEOBJECTS records. - */ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREEOBJECTS) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { - /* - * See whether this free object array can be aggregated - * with pending one - */ - if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { - drrfo->drr_numobjs += numobjs; - return (0); - } else { - /* can't be aggregated. Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - } - - /* write a FREEOBJECTS record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; - drrfo->drr_firstobj = firstobj; - drrfo->drr_numobjs = numobjs; - drrfo->drr_toguid = dsp->dsa_toguid; - - dsp->dsa_pending_op = PENDING_FREEOBJECTS; - - return (0); -} - -static int -dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) -{ - struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); - - if (object < dsp->dsa_resume_object) { - /* - * Note: when resuming, we will visit all the dnodes in - * the block of dnodes that we are resuming from. In - * this case it's unnecessary to send the dnodes prior to - * the one we are resuming from. 
We should be at most one - * block's worth of dnodes behind the resume point. - */ - ASSERT3U(dsp->dsa_resume_object - object, <, - 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); - return (0); - } - - if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(dsp, object, 1)); - - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - - /* write an OBJECT record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_OBJECT; - drro->drr_object = object; - drro->drr_type = dnp->dn_type; - drro->drr_bonustype = dnp->dn_bonustype; - drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - drro->drr_bonuslen = dnp->dn_bonuslen; - drro->drr_dn_slots = dnp->dn_extra_slots + 1; - drro->drr_checksumtype = dnp->dn_checksum; - drro->drr_compress = dnp->dn_compress; - drro->drr_toguid = dsp->dsa_toguid; - - if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) - drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; - - if (dump_record(dsp, DN_BONUS(dnp), - P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { - return (SET_ERROR(EINTR)); - } - - /* Free anything past the end of the file. */ - if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) - return (SET_ERROR(EINTR)); - if (dsp->dsa_err != 0) - return (SET_ERROR(EINTR)); - return (0); -} - -static boolean_t -backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) -{ - if (!BP_IS_EMBEDDED(bp)) - return (B_FALSE); - - /* - * Compression function must be legacy, or explicitly enabled. - */ - if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && - !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) - return (B_FALSE); - - /* - * Embed type must be explicitly enabled. 
- */ - switch (BPE_GET_ETYPE(bp)) { - case BP_EMBEDDED_TYPE_DATA: - if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) - return (B_TRUE); - break; - default: - return (B_FALSE); - } - return (B_FALSE); -} - -/* - * This is the callback function to traverse_dataset that acts as the worker - * thread for dmu_send_impl. - */ -/*ARGSUSED*/ -static int -send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) -{ - struct send_thread_arg *sta = arg; - struct send_block_record *record; - uint64_t record_size; - int err = 0; - - ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || - zb->zb_object >= sta->resume.zb_object); - - if (sta->cancel) - return (SET_ERROR(EINTR)); - - if (bp == NULL) { - ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); - return (0); - } else if (zb->zb_level < 0) { - return (0); - } - - record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); - record->eos_marker = B_FALSE; - record->bp = *bp; - record->zb = *zb; - record->indblkshift = dnp->dn_indblkshift; - record->datablkszsec = dnp->dn_datablkszsec; - record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - bqueue_enqueue(&sta->q, record, record_size); - - return (err); -} - -/* - * This function kicks off the traverse_dataset. It also handles setting the - * error code of the thread in case something goes wrong, and pushes the End of - * Stream record when the traverse_dataset call has finished. If there is no - * dataset to traverse, the thread immediately pushes End of Stream marker. 
- */ -static void -send_traverse_thread(void *arg) -{ - struct send_thread_arg *st_arg = arg; - int err; - struct send_block_record *data; - - if (st_arg->ds != NULL) { - err = traverse_dataset_resume(st_arg->ds, - st_arg->fromtxg, &st_arg->resume, - st_arg->flags, send_cb, st_arg); - - if (err != EINTR) - st_arg->error_code = err; - } - data = kmem_zalloc(sizeof (*data), KM_SLEEP); - data->eos_marker = B_TRUE; - bqueue_enqueue(&st_arg->q, data, 1); - thread_exit(); -} - -/* - * This function actually handles figuring out what kind of record needs to be - * dumped, reading the data (which has hopefully been prefetched), and calling - * the appropriate helper function. - */ -static int -do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) -{ - dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); - const blkptr_t *bp = &data->bp; - const zbookmark_phys_t *zb = &data->zb; - uint8_t indblkshift = data->indblkshift; - uint16_t dblkszsec = data->datablkszsec; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; - int err = 0; - - ASSERT3U(zb->zb_level, >=, 0); - - ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || - zb->zb_object >= dsa->dsa_resume_object); - - if (zb->zb_object != DMU_META_DNODE_OBJECT && - DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { - return (0); - } else if (BP_IS_HOLE(bp) && - zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); - } else if (BP_IS_HOLE(bp)) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t offset = zb->zb_blkid * span; - err = dump_free(dsa, zb->zb_object, offset, span); - } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { - return (0); - } else if (type == DMU_OT_DNODE) { - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - - ASSERT0(zb->zb_level); - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - dnode_phys_t *blk = abuf->b_data; - uint64_t dnobj = zb->zb_blkid * epb; - for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { - err = dump_dnode(dsa, dnobj + i, blk + i); - if (err != 0) - break; - } - arc_buf_destroy(abuf, &abuf); - } else if (type == DMU_OT_SA) { - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - int blksz = BP_GET_LSIZE(bp); - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - arc_buf_destroy(abuf, &abuf); - } else if (backup_do_embed(dsa, bp)) { - /* it's an embedded level-0 block of a regular object */ - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; - ASSERT0(zb->zb_level); - err = dump_write_embedded(dsa, zb->zb_object, - zb->zb_blkid * blksz, blksz, bp); - } else { - /* 
it's a level-0 block of a regular object */ - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; - uint64_t offset; - - /* - * If we have large blocks stored on disk but the send flags - * don't allow us to send large blocks, we split the data from - * the arc buf into chunks. - */ - boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && - !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); - /* - * We should only request compressed data from the ARC if all - * the following are true: - * - stream compression was requested - * - we aren't splitting large blocks into smaller chunks - * - the data won't need to be byteswapped before sending - * - this isn't an embedded block - * - this isn't metadata (if receiving on a different endian - * system it can be byteswapped more easily) - */ - boolean_t request_compressed = - (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && - !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && - !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); - - ASSERT0(zb->zb_level); - ASSERT(zb->zb_object > dsa->dsa_resume_object || - (zb->zb_object == dsa->dsa_resume_object && - zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); - - ASSERT0(zb->zb_level); - ASSERT(zb->zb_object > dsa->dsa_resume_object || - (zb->zb_object == dsa->dsa_resume_object && - zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); - - ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); - - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - if (request_compressed) - zioflags |= ZIO_FLAG_RAW; - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { - if (zfs_send_corrupt_data) { - /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, - blksz); - uint64_t *ptr; - for (ptr = abuf->b_data; - (char *)ptr < (char *)abuf->b_data + blksz; - ptr++) - *ptr = 0x2f5baddb10cULL; - } else { - return (SET_ERROR(EIO)); - } - } - - 
offset = zb->zb_blkid * blksz; - - if (split_large_blocks) { - ASSERT3U(arc_get_compression(abuf), ==, - ZIO_COMPRESS_OFF); - char *buf = abuf->b_data; - while (blksz > 0 && err == 0) { - int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); - err = dump_write(dsa, type, zb->zb_object, - offset, n, n, NULL, buf); - offset += n; - buf += n; - blksz -= n; - } - } else { - err = dump_write(dsa, type, zb->zb_object, offset, - blksz, arc_buf_size(abuf), bp, abuf->b_data); - } - arc_buf_destroy(abuf, &abuf); - } - - ASSERT(err == 0 || err == EINTR); - return (err); -} - -/* - * Pop the new data off the queue, and free the old data. - */ -static struct send_block_record * -get_next_record(bqueue_t *bq, struct send_block_record *data) -{ - struct send_block_record *tmp = bqueue_dequeue(bq); - kmem_free(data, sizeof (*data)); - return (tmp); -} - -/* - * Actually do the bulk of the work in a zfs send. - * - * Note: Releases dp using the specified tag. - */ -static int -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - int outfd, uint64_t resumeobj, uint64_t resumeoff, -#ifdef illumos - vnode_t *vp, offset_t *off) -#else - struct file *fp, offset_t *off) -#endif -{ - objset_t *os; - dmu_replay_record_t *drr; - dmu_sendarg_t *dsp; - int err; - uint64_t fromtxg = 0; - uint64_t featureflags = 0; - struct send_thread_arg to_arg = { 0 }; - - err = dmu_objset_from_ds(to_ds, &os); - if (err != 0) { - dsl_pool_rele(dp, tag); - return (err); - } - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, - DMU_SUBSTREAM); - -#ifdef _KERNEL - if (dmu_objset_type(os) == DMU_OST_ZFS) { - uint64_t version; - if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - 
dsl_pool_rele(dp, tag); - return (SET_ERROR(EINVAL)); - } - if (version >= ZPL_VERSION_SA) { - featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; - } - } -#endif - - if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) - featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; - if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) - featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; - if (embedok && - spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { - featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - featureflags |= DMU_BACKUP_FEATURE_LZ4; - } - if (compressok) { - featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; - } - if ((featureflags & - (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != - 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { - featureflags |= DMU_BACKUP_FEATURE_LZ4; - } - - if (resumeobj != 0 || resumeoff != 0) { - featureflags |= DMU_BACKUP_FEATURE_RESUMING; - } - - DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, - featureflags); - - drr->drr_u.drr_begin.drr_creation_time = - dsl_dataset_phys(to_ds)->ds_creation_time; - drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); - if (is_clone) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; - if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (zfs_send_set_freerecords_bit) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; - - if (ancestor_zb != NULL) { - drr->drr_u.drr_begin.drr_fromguid = - ancestor_zb->zbm_guid; - fromtxg = ancestor_zb->zbm_creation_txg; - } - dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); - if (!to_ds->ds_is_snapshot) { - (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", - sizeof (drr->drr_u.drr_begin.drr_toname)); - } - - dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); - - 
dsp->dsa_drr = drr; - dsp->dsa_outfd = outfd; - dsp->dsa_proc = curproc; - dsp->dsa_td = curthread; - dsp->dsa_fp = fp; - dsp->dsa_os = os; - dsp->dsa_off = off; - dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; - dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_featureflags = featureflags; - dsp->dsa_resume_object = resumeobj; - dsp->dsa_resume_offset = resumeoff; - - mutex_enter(&to_ds->ds_sendstream_lock); - list_insert_head(&to_ds->ds_sendstreams, dsp); - mutex_exit(&to_ds->ds_sendstream_lock); - - dsl_dataset_long_hold(to_ds, FTAG); - dsl_pool_rele(dp, tag); - - void *payload = NULL; - size_t payload_len = 0; - if (resumeobj != 0 || resumeoff != 0) { - dmu_object_info_t to_doi; - err = dmu_object_info(os, resumeobj, &to_doi); - if (err != 0) - goto out; - SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, - resumeoff / to_doi.doi_data_block_size); - - nvlist_t *nvl = fnvlist_alloc(); - fnvlist_add_uint64(nvl, "resume_object", resumeobj); - fnvlist_add_uint64(nvl, "resume_offset", resumeoff); - payload = fnvlist_pack(nvl, &payload_len); - drr->drr_payloadlen = payload_len; - fnvlist_free(nvl); - } - - err = dump_record(dsp, payload, payload_len); - fnvlist_pack_free(payload, payload_len); - if (err != 0) { - err = dsp->dsa_err; - goto out; - } - - err = bqueue_init(&to_arg.q, zfs_send_queue_length, - offsetof(struct send_block_record, ln)); - to_arg.error_code = 0; - to_arg.cancel = B_FALSE; - to_arg.ds = to_ds; - to_arg.fromtxg = fromtxg; - to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; - (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0, - TS_RUN, minclsyspri); - - struct send_block_record *to_data; - to_data = bqueue_dequeue(&to_arg.q); - - while (!to_data->eos_marker && err == 0) { - err = do_dump(dsp, to_data); - to_data = get_next_record(&to_arg.q, to_data); - if (issig(JUSTLOOKING) && issig(FORREAL)) - err = EINTR; - } - - if (err != 0) { - to_arg.cancel = B_TRUE; - while (!to_data->eos_marker) { - to_data = 
get_next_record(&to_arg.q, to_data); - } - } - kmem_free(to_data, sizeof (*to_data)); - - bqueue_destroy(&to_arg.q); - - if (err == 0 && to_arg.error_code != 0) - err = to_arg.error_code; - - if (err != 0) - goto out; - - if (dsp->dsa_pending_op != PENDING_NONE) - if (dump_record(dsp, NULL, 0) != 0) - err = SET_ERROR(EINTR); - - if (err != 0) { - if (err == EINTR && dsp->dsa_err != 0) - err = dsp->dsa_err; - goto out; - } - - bzero(drr, sizeof (dmu_replay_record_t)); - drr->drr_type = DRR_END; - drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; - drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - - if (dump_record(dsp, NULL, 0) != 0) - err = dsp->dsa_err; - -out: - mutex_enter(&to_ds->ds_sendstream_lock); - list_remove(&to_ds->ds_sendstreams, dsp); - mutex_exit(&to_ds->ds_sendstream_lock); - - VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); - - kmem_free(drr, sizeof (dmu_replay_record_t)); - kmem_free(dsp, sizeof (dmu_sendarg_t)); - - dsl_dataset_long_rele(to_ds, FTAG); - - return (err); -} - -int -dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, -#ifdef illumos - int outfd, vnode_t *vp, offset_t *off) -#else - int outfd, struct file *fp, offset_t *off) -#endif -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - dsl_dataset_t *fromds = NULL; - int err; - - err = dsl_pool_hold(pool, FTAG, &dp); - if (err != 0) - return (err); - - err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (fromsnap != 0) { - zfs_bookmark_phys_t zb; - boolean_t is_clone; - - err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); - if (err != 0) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); - } - if (!dsl_dataset_is_before(ds, fromds, 0)) - err = SET_ERROR(EXDEV); - zb.zbm_creation_time = - dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; - 
zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; - is_clone = (fromds->ds_dir != ds->ds_dir); - dsl_dataset_rele(fromds, FTAG); - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); - } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); - } - dsl_dataset_rele(ds, FTAG); - return (err); -} - -int -dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, -#ifdef illumos - vnode_t *vp, offset_t *off) -#else - struct file *fp, offset_t *off) -#endif -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - boolean_t owned = B_FALSE; - - if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) - return (SET_ERROR(EINVAL)); - - err = dsl_pool_hold(tosnap, FTAG, &dp); - if (err != 0) - return (err); - - if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { - /* - * We are sending a filesystem or volume. Ensure - * that it doesn't change by owning the dataset. - */ - err = dsl_dataset_own(dp, tosnap, FTAG, &ds); - owned = B_TRUE; - } else { - err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); - } - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (fromsnap != NULL) { - zfs_bookmark_phys_t zb; - boolean_t is_clone = B_FALSE; - int fsnamelen = strchr(tosnap, '@') - tosnap; - - /* - * If the fromsnap is in a different filesystem, then - * mark the send stream as a clone. 
- */ - if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || - (fromsnap[fsnamelen] != '@' && - fromsnap[fsnamelen] != '#')) { - is_clone = B_TRUE; - } - - if (strchr(fromsnap, '@')) { - dsl_dataset_t *fromds; - err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); - if (err == 0) { - if (!dsl_dataset_is_before(ds, fromds, 0)) - err = SET_ERROR(EXDEV); - zb.zbm_creation_time = - dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = - dsl_dataset_phys(fromds)->ds_creation_txg; - zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; - is_clone = (ds->ds_dir != fromds->ds_dir); - dsl_dataset_rele(fromds, FTAG); - } - } else { - err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); - } - if (err != 0) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); - } - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, - outfd, resumeobj, resumeoff, fp, off); - } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, - outfd, resumeobj, resumeoff, fp, off); - } - if (owned) - dsl_dataset_disown(ds, FTAG); - else - dsl_dataset_rele(ds, FTAG); - return (err); -} - -static int -dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, - uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) -{ - int err = 0; - uint64_t size; - /* - * Assume that space (both on-disk and in-stream) is dominated by - * data. We will adjust for indirect blocks and the copies property, - * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). - */ - uint64_t recordsize; - uint64_t record_count; - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - /* Assume all (uncompressed) blocks are recordsize. 
*/ - if (zfs_override_estimate_recordsize != 0) { - recordsize = zfs_override_estimate_recordsize; - } else if (os->os_phys->os_type == DMU_OST_ZVOL) { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); - } else { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); - } - if (err != 0) - return (err); - record_count = uncompressed / recordsize; - - /* - * If we're estimating a send size for a compressed stream, use the - * compressed data size to estimate the stream size. Otherwise, use the - * uncompressed data size. - */ - size = stream_compressed ? compressed : uncompressed; - - /* - * Subtract out approximate space used by indirect blocks. - * Assume most space is used by data blocks (non-indirect, non-dnode). - * Assume no ditto blocks or internal fragmentation. - * - * Therefore, space used by indirect blocks is sizeof(blkptr_t) per - * block. - */ - size -= record_count * sizeof (blkptr_t); - - /* Add in the space for the record associated with each block. */ - size += record_count * sizeof (dmu_replay_record_t); - - *sizep = size; - - return (0); -} - -int -dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, - boolean_t stream_compressed, uint64_t *sizep) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - int err; - uint64_t uncomp, comp; - - ASSERT(dsl_pool_config_held(dp)); - - /* tosnap must be a snapshot */ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* fromsnap, if provided, must be a snapshot */ - if (fromds != NULL && !fromds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* - * fromsnap must be an earlier snapshot from the same fs as tosnap, - * or the origin's fs. - */ - if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) - return (SET_ERROR(EXDEV)); - - /* Get compressed and uncompressed size estimates of changed data. 
*/ - if (fromds == NULL) { - uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; - comp = dsl_dataset_phys(ds)->ds_compressed_bytes; - } else { - uint64_t used; - err = dsl_dataset_space_written(fromds, ds, - &used, &comp, &uncomp); - if (err != 0) - return (err); - } - - err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, - stream_compressed, sizep); - /* - * Add the size of the BEGIN and END records to the estimate. - */ - *sizep += 2 * sizeof (dmu_replay_record_t); - return (err); -} - -struct calculate_send_arg { - uint64_t uncompressed; - uint64_t compressed; -}; - -/* - * Simple callback used to traverse the blocks of a snapshot and sum their - * uncompressed and compressed sizes. - */ -/* ARGSUSED */ -static int -dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct calculate_send_arg *space = arg; - if (bp != NULL && !BP_IS_HOLE(bp)) { - space->uncompressed += BP_GET_UCSIZE(bp); - space->compressed += BP_GET_PSIZE(bp); - } - return (0); -} - -/* - * Given a desination snapshot and a TXG, calculate the approximate size of a - * send stream sent from that TXG. from_txg may be zero, indicating that the - * whole snapshot will be sent. 
- */ -int -dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, - boolean_t stream_compressed, uint64_t *sizep) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - int err; - struct calculate_send_arg size = { 0 }; - - ASSERT(dsl_pool_config_held(dp)); - - /* tosnap must be a snapshot */ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* verify that from_txg is before the provided snapshot was taken */ - if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { - return (SET_ERROR(EXDEV)); - } - - /* - * traverse the blocks of the snapshot with birth times after - * from_txg, summing their uncompressed size - */ - err = traverse_dataset(ds, from_txg, TRAVERSE_POST, - dmu_calculate_send_traversal, &size); - if (err) - return (err); - - err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, - size.compressed, stream_compressed, sizep); - return (err); -} - -typedef struct dmu_recv_begin_arg { - const char *drba_origin; - dmu_recv_cookie_t *drba_cookie; - cred_t *drba_cred; - uint64_t drba_snapobj; -} dmu_recv_begin_arg_t; - -static int -recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, - uint64_t fromguid) -{ - uint64_t val; - uint64_t children; - int error; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* Temporary clone name must not exist. */ - error = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, - 8, 1, &val); - if (error != ENOENT) - return (error == 0 ? SET_ERROR(EBUSY) : error); - - /* Resume state must not be set. */ - if (dsl_dataset_has_resume_receive_state(ds)) - return (SET_ERROR(EBUSY)); - - /* New snapshot name must not exist. */ - error = zap_lookup(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, - drba->drba_cookie->drc_tosnap, 8, 1, &val); - if (error != ENOENT) - return (error == 0 ? 
SET_ERROR(EEXIST) : error); - - /* must not have children if receiving a ZVOL */ - error = zap_count(dp->dp_meta_objset, - dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); - if (error != 0) - return (error); - if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS && - children > 0) - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - - /* - * Check snapshot limit before receiving. We'll recheck again at the - * end, but might as well abort before receiving if we're already over - * the limit. - * - * Note that we do not check the file system limit with - * dsl_dir_fscount_check because the temporary %clones don't count - * against that limit. - */ - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, - NULL, drba->drba_cred); - if (error != 0) - return (error); - - if (fromguid != 0) { - dsl_dataset_t *snap; - uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - - /* Find snapshot in this dir that matches fromguid. */ - while (obj != 0) { - error = dsl_dataset_hold_obj(dp, obj, FTAG, - &snap); - if (error != 0) - return (SET_ERROR(ENODEV)); - if (snap->ds_dir != ds->ds_dir) { - dsl_dataset_rele(snap, FTAG); - return (SET_ERROR(ENODEV)); - } - if (dsl_dataset_phys(snap)->ds_guid == fromguid) - break; - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); - } - if (obj == 0) - return (SET_ERROR(ENODEV)); - - if (drba->drba_cookie->drc_force) { - drba->drba_snapobj = obj; - } else { - /* - * If we are not forcing, there must be no - * changes since fromsnap. - */ - if (dsl_dataset_modified_since_snap(ds, snap)) { - dsl_dataset_rele(snap, FTAG); - return (SET_ERROR(ETXTBSY)); - } - drba->drba_snapobj = ds->ds_prev->ds_object; - } - - dsl_dataset_rele(snap, FTAG); - } else { - /* if full, then must be forced */ - if (!drba->drba_cookie->drc_force) - return (SET_ERROR(EEXIST)); - /* start from $ORIGIN@$ORIGIN, if supported */ - drba->drba_snapobj = dp->dp_origin_snap != NULL ? 
- dp->dp_origin_snap->ds_object : 0; - } - - return (0); - -} - -static int -dmu_recv_begin_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - uint64_t fromguid = drrb->drr_fromguid; - int flags = drrb->drr_flags; - int error; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds; - const char *tofs = drba->drba_cookie->drc_tofs; - - /* already checked */ - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); - - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM || - drrb->drr_type >= DMU_OST_NUMTYPES || - ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) - return (SET_ERROR(EINVAL)); - - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return (SET_ERROR(ENOTSUP)); - - if (drba->drba_cookie->drc_resumable && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. Same with WRITE_EMBEDDED records that use LZ4 compression. - */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. 
- */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); - - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); - if (error == 0) { - /* target fs already exists; recv into temp clone */ - - /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE || drba->drba_origin) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = recv_begin_check_existing_impl(drba, ds, fromguid); - dsl_dataset_rele(ds, FTAG); - } else if (error == ENOENT) { - /* target fs does not exist; must be a full backup or clone */ - char buf[ZFS_MAX_DATASET_NAME_LEN]; - objset_t *os; - - /* - * If it's a non-clone incremental, we are missing the - * target fs, so fail the recv. - */ - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || - drba->drba_origin)) - return (SET_ERROR(ENOENT)); - - /* - * If we're receiving a full send as a clone, and it doesn't - * contain all the necessary free records and freeobject - * records, reject it. - */ - if (fromguid == 0 && drba->drba_origin && - !(flags & DRR_FLAG_FREERECORDS)) - return (SET_ERROR(EINVAL)); - - /* Open the parent of tofs */ - ASSERT3U(strlen(tofs), <, sizeof (buf)); - (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); - error = dsl_dataset_hold(dp, buf, FTAG, &ds); - if (error != 0) - return (error); - - /* - * Check filesystem and snapshot limits before receiving. We'll - * recheck snapshot limits again at the end (we create the - * filesystems and increment those counts during begin_sync). 
- */ - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* can't recv below anything but filesystems (eg. no ZVOLs) */ - error = dmu_objset_from_ds(ds, &os); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - } - - if (drba->drba_origin != NULL) { - dsl_dataset_t *origin; - error = dsl_dataset_hold(dp, drba->drba_origin, - FTAG, &origin); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - if (!origin->ds_is_snapshot) { - dsl_dataset_rele(origin, FTAG); - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - if (dsl_dataset_phys(origin)->ds_guid != fromguid && - fromguid != 0) { - dsl_dataset_rele(origin, FTAG); - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENODEV)); - } - dsl_dataset_rele(origin, FTAG); - } - - dsl_dataset_rele(ds, FTAG); - error = 0; - } - return (error); -} - -static void -dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - const char *tofs = drba->drba_cookie->drc_tofs; - dsl_dataset_t *ds, *newds; - uint64_t dsobj; - int error; - uint64_t crflags = 0; - - if (drrb->drr_flags & DRR_FLAG_CI_DATA) - crflags |= DS_FLAG_CI_DATASET; - - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); - if (error == 0) { - /* create temporary clone */ - dsl_dataset_t *snap = NULL; - if (drba->drba_snapobj != 0) { - VERIFY0(dsl_dataset_hold_obj(dp, - drba->drba_snapobj, FTAG, &snap)); - } - dsobj = dsl_dataset_create_sync(ds->ds_dir, 
recv_clone_name, - snap, crflags, drba->drba_cred, tx); - if (drba->drba_snapobj != 0) - dsl_dataset_rele(snap, FTAG); - dsl_dataset_rele(ds, FTAG); - } else { - dsl_dir_t *dd; - const char *tail; - dsl_dataset_t *origin = NULL; - - VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); - - if (drba->drba_origin != NULL) { - VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, - FTAG, &origin)); - } - - /* Create new dataset. */ - dsobj = dsl_dataset_create_sync(dd, - strrchr(tofs, '/') + 1, - origin, crflags, drba->drba_cred, tx); - if (origin != NULL) - dsl_dataset_rele(origin, FTAG); - dsl_dir_rele(dd, FTAG); - drba->drba_cookie->drc_newfs = B_TRUE; - } - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); - - if (drba->drba_cookie->drc_resumable) { - dsl_dataset_zapify(newds, tx); - if (drrb->drr_fromguid != 0) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, - 8, 1, &drrb->drr_fromguid, tx)); - } - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, - 8, 1, &drrb->drr_toguid, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, - 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); - uint64_t one = 1; - uint64_t zero = 0; - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, - 8, 1, &one, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, - 8, 1, &zero, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, - 8, 1, &zero, tx)); - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_LARGE_BLOCKS) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, - 8, 1, &one, tx)); - } - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_EMBED_DATA) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, - 8, 1, &one, tx)); - } - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_COMPRESSED) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, - 8, 1, &one, tx)); - } - } - - dmu_buf_will_dirty(newds->ds_dbuf, tx); - dsl_dataset_phys(newds)->ds_flags |= 
DS_FLAG_INCONSISTENT; - - /* - * If we actually created a non-clone, we need to create the - * objset in our new dataset. - */ - rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); - if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { - (void) dmu_objset_create_impl(dp->dp_spa, - newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); - } - rrw_exit(&newds->ds_bp_rwlock, FTAG); - - drba->drba_cookie->drc_ds = newds; - - spa_history_log_internal_ds(newds, "receive", tx, ""); -} - -static int -dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - int error; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds; - const char *tofs = drba->drba_cookie->drc_tofs; - - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - - /* already checked */ - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); - - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM || - drrb->drr_type >= DMU_OST_NUMTYPES) - return (SET_ERROR(EINVAL)); - - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 
- */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. - */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); - - (void) snprintf(recvname, sizeof (recvname), "%s/%s", - tofs, recv_clone_name); - - if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { - /* %recv does not exist; continue in tofs */ - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); - if (error != 0) - return (error); - } - - /* check that ds is marked inconsistent */ - if (!DS_IS_INCONSISTENT(ds)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* check that there is resuming data, and that the toguid matches */ - if (!dsl_dataset_is_zapified(ds)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - uint64_t val; - error = zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); - if (error != 0 || drrb->drr_toguid != val) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * Check if the receive is still running. If so, it will be owned. - * Note that nothing else can own the dataset (e.g. after the receive - * fails) because it will be marked inconsistent. 
- */ - if (dsl_dataset_has_owner(ds)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EBUSY)); - } - - /* There should not be any snapshots of this fs yet. */ - if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * Note: resume point will be checked when we process the first WRITE - * record. - */ - - /* check that the origin matches */ - val = 0; - (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); - if (drrb->drr_fromguid != val) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - const char *tofs = drba->drba_cookie->drc_tofs; - dsl_dataset_t *ds; - uint64_t dsobj; - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - - (void) snprintf(recvname, sizeof (recvname), "%s/%s", - tofs, recv_clone_name); - - if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { - /* %recv does not exist; continue in tofs */ - VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); - drba->drba_cookie->drc_newfs = B_TRUE; - } - - /* clear the inconsistent flag so that we can own it */ - ASSERT(DS_IS_INCONSISTENT(ds)); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - drba->drba_cookie->drc_ds = ds; - - spa_history_log_internal_ds(ds, "resume receive", tx, ""); -} - -/* - * NB: callers *MUST* call dmu_recv_stream() if 
dmu_recv_begin() - * succeeds; otherwise we will leak the holds on the datasets. - */ -int -dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) -{ - dmu_recv_begin_arg_t drba = { 0 }; - - bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drr_begin = drr_begin; - drc->drc_drrb = &drr_begin->drr_u.drr_begin; - drc->drc_tosnap = tosnap; - drc->drc_tofs = tofs; - drc->drc_force = force; - drc->drc_resumable = resumable; - drc->drc_cred = CRED(); - drc->drc_clone = (origin != NULL); - - if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { - drc->drc_byteswap = B_TRUE; - (void) fletcher_4_incremental_byteswap(drr_begin, - sizeof (dmu_replay_record_t), &drc->drc_cksum); - byteswap_record(drr_begin); - } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { - (void) fletcher_4_incremental_native(drr_begin, - sizeof (dmu_replay_record_t), &drc->drc_cksum); - } else { - return (SET_ERROR(EINVAL)); - } - - drba.drba_origin = origin; - drba.drba_cookie = drc; - drba.drba_cred = CRED(); - - if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_RESUMING) { - return (dsl_sync_task(tofs, - dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); - } else { - return (dsl_sync_task(tofs, - dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); - } -} - -struct receive_record_arg { - dmu_replay_record_t header; - void *payload; /* Pointer to a buffer containing the payload */ - /* - * If the record is a write, pointer to the arc_buf_t containing the - * payload. 
- */ - arc_buf_t *write_buf; - int payload_size; - uint64_t bytes_read; /* bytes read from stream when record created */ - boolean_t eos_marker; /* Marks the end of the stream */ - bqueue_node_t node; -}; - -struct receive_writer_arg { - objset_t *os; - boolean_t byteswap; - bqueue_t q; - - /* - * These three args are used to signal to the main thread that we're - * done. - */ - kmutex_t mutex; - kcondvar_t cv; - boolean_t done; - - int err; - /* A map from guid to dataset to help handle dedup'd streams. */ - avl_tree_t *guid_to_ds_map; - boolean_t resumable; - uint64_t last_object; - uint64_t last_offset; - uint64_t max_object; /* highest object ID referenced in stream */ - uint64_t bytes_read; /* bytes read when current record created */ -}; - -struct objlist { - list_t list; /* List of struct receive_objnode. */ - /* - * Last object looked up. Used to assert that objects are being looked - * up in ascending order. - */ - uint64_t last_lookup; -}; - -struct receive_objnode { - list_node_t node; - uint64_t object; -}; - -struct receive_arg { - objset_t *os; - kthread_t *td; - struct file *fp; - uint64_t voff; /* The current offset in the stream */ - uint64_t bytes_read; - /* - * A record that has had its payload read in, but hasn't yet been handed - * off to the worker thread. - */ - struct receive_record_arg *rrd; - /* A record that has had its header read in, but not its payload. */ - struct receive_record_arg *next_rrd; - zio_cksum_t cksum; - zio_cksum_t prev_cksum; - int err; - boolean_t byteswap; - /* Sorted list of objects not to issue prefetches for. 
*/ - struct objlist ignore_objlist; -}; - -typedef struct guid_map_entry { - uint64_t guid; - dsl_dataset_t *gme_ds; - avl_node_t avlnode; -} guid_map_entry_t; - -static int -guid_compare(const void *arg1, const void *arg2) -{ - const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; - const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - - return (AVL_CMP(gmep1->guid, gmep2->guid)); -} - -static void -free_guid_map_onexit(void *arg) -{ - avl_tree_t *ca = arg; - void *cookie = NULL; - guid_map_entry_t *gmep; - - while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { - dsl_dataset_long_rele(gmep->gme_ds, gmep); - dsl_dataset_rele(gmep->gme_ds, gmep); - kmem_free(gmep, sizeof (guid_map_entry_t)); - } - avl_destroy(ca); - kmem_free(ca, sizeof (avl_tree_t)); -} - -static int -restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) -{ - struct uio auio; - struct iovec aiov; - int error; - - aiov.iov_base = buf; - aiov.iov_len = len; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_offset = off; - auio.uio_td = ra->td; -#ifdef _KERNEL - error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - error = EOPNOTSUPP; -#endif - *resid = auio.uio_resid; - return (error); -} - -static int -receive_read(struct receive_arg *ra, int len, void *buf) -{ - int done = 0; - - /* - * The code doesn't rely on this (lengths being multiples of 8). See - * comment in dump_bytes. - */ - ASSERT0(len % 8); - - while (done < len) { - ssize_t resid; - - ra->err = restore_bytes(ra, buf + done, - len - done, ra->voff, &resid); - - if (resid == len - done) { - /* - * Note: ECKSUM indicates that the receive - * was interrupted and can potentially be resumed. 
- */ - ra->err = SET_ERROR(ECKSUM); - } - ra->voff += len - done - resid; - done = len - resid; - if (ra->err != 0) - return (ra->err); - } - - ra->bytes_read += len; - - ASSERT3U(done, ==, len); - return (0); -} - -noinline static void -byteswap_record(dmu_replay_record_t *drr) -{ -#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) -#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) - drr->drr_type = BSWAP_32(drr->drr_type); - drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); - - switch (drr->drr_type) { - case DRR_BEGIN: - DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_versioninfo); - DO64(drr_begin.drr_creation_time); - DO32(drr_begin.drr_type); - DO32(drr_begin.drr_flags); - DO64(drr_begin.drr_toguid); - DO64(drr_begin.drr_fromguid); - break; - case DRR_OBJECT: - DO64(drr_object.drr_object); - DO32(drr_object.drr_type); - DO32(drr_object.drr_bonustype); - DO32(drr_object.drr_blksz); - DO32(drr_object.drr_bonuslen); - DO64(drr_object.drr_toguid); - break; - case DRR_FREEOBJECTS: - DO64(drr_freeobjects.drr_firstobj); - DO64(drr_freeobjects.drr_numobjs); - DO64(drr_freeobjects.drr_toguid); - break; - case DRR_WRITE: - DO64(drr_write.drr_object); - DO32(drr_write.drr_type); - DO64(drr_write.drr_offset); - DO64(drr_write.drr_logical_size); - DO64(drr_write.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); - DO64(drr_write.drr_key.ddk_prop); - DO64(drr_write.drr_compressed_size); - break; - case DRR_WRITE_BYREF: - DO64(drr_write_byref.drr_object); - DO64(drr_write_byref.drr_offset); - DO64(drr_write_byref.drr_length); - DO64(drr_write_byref.drr_toguid); - DO64(drr_write_byref.drr_refguid); - DO64(drr_write_byref.drr_refobject); - DO64(drr_write_byref.drr_refoffset); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 
- drr_key.ddk_cksum); - DO64(drr_write_byref.drr_key.ddk_prop); - break; - case DRR_WRITE_EMBEDDED: - DO64(drr_write_embedded.drr_object); - DO64(drr_write_embedded.drr_offset); - DO64(drr_write_embedded.drr_length); - DO64(drr_write_embedded.drr_toguid); - DO32(drr_write_embedded.drr_lsize); - DO32(drr_write_embedded.drr_psize); - break; - case DRR_FREE: - DO64(drr_free.drr_object); - DO64(drr_free.drr_offset); - DO64(drr_free.drr_length); - DO64(drr_free.drr_toguid); - break; - case DRR_SPILL: - DO64(drr_spill.drr_object); - DO64(drr_spill.drr_length); - DO64(drr_spill.drr_toguid); - break; - case DRR_END: - DO64(drr_end.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); - break; - } - - if (drr->drr_type != DRR_BEGIN) { - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); - } - -#undef DO64 -#undef DO32 -} - -static inline uint8_t -deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) -{ - if (bonus_type == DMU_OT_SA) { - return (1); - } else { - return (1 + - ((DN_OLD_MAX_BONUSLEN - - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); - } -} - -static void -save_resume_state(struct receive_writer_arg *rwa, - uint64_t object, uint64_t offset, dmu_tx_t *tx) -{ - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - - if (!rwa->resumable) - return; - - /* - * We use ds_resume_bytes[] != 0 to indicate that we need to - * update this on disk, so it must not be 0. - */ - ASSERT(rwa->bytes_read != 0); - - /* - * We only resume from write records, which have a valid - * (non-meta-dnode) object number. - */ - ASSERT(object != 0); - - /* - * For resuming to work correctly, we must receive records in order, - * sorted by object,offset. This is checked by the callers, but - * assert it here for good measure. 
- */ - ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); - ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || - offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); - ASSERT3U(rwa->bytes_read, >=, - rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); - - rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; - rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; - rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; -} - -noinline static int -receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, - void *data) -{ - dmu_object_info_t doi; - dmu_tx_t *tx; - uint64_t object; - int err; - uint8_t dn_slots = drro->drr_dn_slots != 0 ? - drro->drr_dn_slots : DNODE_MIN_SLOTS; - - if (drro->drr_type == DMU_OT_NONE || - !DMU_OT_IS_VALID(drro->drr_type) || - !DMU_OT_IS_VALID(drro->drr_bonustype) || - drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || - drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || - P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || - drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || - drro->drr_bonuslen > - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || - dn_slots > - (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { - return (SET_ERROR(EINVAL)); - } - - err = dmu_object_info(rwa->os, drro->drr_object, &doi); - - if (err != 0 && err != ENOENT && err != EEXIST) - return (SET_ERROR(EINVAL)); - - if (drro->drr_object > rwa->max_object) - rwa->max_object = drro->drr_object; - - /* - * If we are losing blkptrs or changing the block size this must - * be a new file instance. We must clear out the previous file - * contents before we can change this type of metadata in the dnode. 
- */ - if (err == 0) { - int nblkptr; - - object = drro->drr_object; - - nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); - - if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - 0, DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - } - } else if (err == EEXIST) { - /* - * The object requested is currently an interior slot of a - * multi-slot dnode. This will be resolved when the next txg - * is synced out, since the send stream will have told us - * to free this slot when we freed the associated dnode - * earlier in the stream. - */ - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = drro->drr_object; - } else { - /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; - } - - /* - * If this is a multi-slot dnode there is a chance that this - * object will expand into a slot that is already used by - * another object from the previous snapshot. We must free - * these objects before we attempt to allocate the new dnode. 
- */ - if (dn_slots > 1) { - boolean_t need_sync = B_FALSE; - - for (uint64_t slot = drro->drr_object + 1; - slot < drro->drr_object + dn_slots; - slot++) { - dmu_object_info_t slot_doi; - - err = dmu_object_info(rwa->os, slot, &slot_doi); - if (err == ENOENT || err == EEXIST) - continue; - else if (err != 0) - return (err); - - err = dmu_free_long_object(rwa->os, slot); - - if (err != 0) - return (err); - - need_sync = B_TRUE; - } - - if (need_sync) - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - } - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_bonus(tx, object); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (object == DMU_NEW_OBJECT) { - /* currently free, want to be allocated */ - err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, - dn_slots << DNODE_SHIFT, tx); - } else if (drro->drr_type != doi.doi_type || - drro->drr_blksz != doi.doi_data_block_size || - drro->drr_bonustype != doi.doi_bonus_type || - drro->drr_bonuslen != doi.doi_bonus_size || - drro->drr_dn_slots != (doi.doi_dnodesize >> DNODE_SHIFT)) { - /* currently allocated, but with different properties */ - err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, - drro->drr_dn_slots << DNODE_SHIFT, tx); - } - if (err != 0) { - dmu_tx_commit(tx); - return (SET_ERROR(EINVAL)); - } - - dmu_object_set_checksum(rwa->os, drro->drr_object, - drro->drr_checksumtype, tx); - dmu_object_set_compress(rwa->os, drro->drr_object, - drro->drr_compress, tx); - - if (data != NULL) { - dmu_buf_t *db; - - VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - - ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, drro->drr_bonuslen); - if (rwa->byteswap) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drro->drr_bonustype); - 
dmu_ot_byteswap[byteswap].ob_func(db->db_data, - drro->drr_bonuslen); - } - dmu_buf_rele(db, FTAG); - } - dmu_tx_commit(tx); - - return (0); -} - -/* ARGSUSED */ -noinline static int -receive_freeobjects(struct receive_writer_arg *rwa, - struct drr_freeobjects *drrfo) -{ - uint64_t obj; - int next_err = 0; - - if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) - return (SET_ERROR(EINVAL)); - - for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; - next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { - dmu_object_info_t doi; - int err; - - err = dmu_object_info(rwa->os, obj, NULL); - if (err == ENOENT) - continue; - else if (err != 0) - return (err); - - err = dmu_free_long_object(rwa->os, obj); - if (err != 0) - return (err); - - if (obj > rwa->max_object) - rwa->max_object = obj; - } - if (next_err != ESRCH) - return (next_err); - return (0); -} - -noinline static int -receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, - arc_buf_t *abuf) -{ - dmu_tx_t *tx; - int err; - - if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || - !DMU_OT_IS_VALID(drrw->drr_type)) - return (SET_ERROR(EINVAL)); - - /* - * For resuming to work, records must be in increasing order - * by (object, offset). 
- */ - if (drrw->drr_object < rwa->last_object || - (drrw->drr_object == rwa->last_object && - drrw->drr_offset < rwa->last_offset)) { - return (SET_ERROR(EINVAL)); - } - rwa->last_object = drrw->drr_object; - rwa->last_offset = drrw->drr_offset; - - if (rwa->last_object > rwa->max_object) - rwa->max_object = rwa->last_object; - - if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_logical_size); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - if (rwa->byteswap) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - - /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ - dmu_buf_t *bonus; - if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) - return (SET_ERROR(EINVAL)); - dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); - - /* - * Note: If the receive fails, we want the resume stream to start - * with the same record that we last successfully received (as opposed - * to the next record), so that we can verify that we are - * resuming from the correct location. - */ - save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); - dmu_tx_commit(tx); - dmu_buf_rele(bonus, FTAG); - - return (0); -} - -/* - * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed - * streams to refer to a copy of the data that is already on the - * system because it came in earlier in the stream. This function - * finds the earlier copy of the data, and uses that copy instead of - * data from the stream to fulfill this write. 
- */ -static int -receive_write_byref(struct receive_writer_arg *rwa, - struct drr_write_byref *drrwbr) -{ - dmu_tx_t *tx; - int err; - guid_map_entry_t gmesrch; - guid_map_entry_t *gmep; - avl_index_t where; - objset_t *ref_os = NULL; - dmu_buf_t *dbp; - - if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) - return (SET_ERROR(EINVAL)); - - /* - * If the GUID of the referenced dataset is different from the - * GUID of the target dataset, find the referenced dataset. - */ - if (drrwbr->drr_toguid != drrwbr->drr_refguid) { - gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, - &where)) == NULL) { - return (SET_ERROR(EINVAL)); - } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) - return (SET_ERROR(EINVAL)); - } else { - ref_os = rwa->os; - } - - if (drrwbr->drr_object > rwa->max_object) - rwa->max_object = drrwbr->drr_object; - - err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, - drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); - if (err != 0) - return (err); - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_write(tx, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - dmu_write(rwa->os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); - dmu_buf_rele(dbp, FTAG); - - /* See comment in restore_write. 
*/ - save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); - dmu_tx_commit(tx); - return (0); -} - -static int -receive_write_embedded(struct receive_writer_arg *rwa, - struct drr_write_embedded *drrwe, void *data) -{ - dmu_tx_t *tx; - int err; - - if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) - return (EINVAL); - - if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) - return (EINVAL); - - if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) - return (EINVAL); - if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) - return (EINVAL); - - if (drrwe->drr_object > rwa->max_object) - rwa->max_object = drrwe->drr_object; - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_write(tx, drrwe->drr_object, - drrwe->drr_offset, drrwe->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - dmu_write_embedded(rwa->os, drrwe->drr_object, - drrwe->drr_offset, data, drrwe->drr_etype, - drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, - rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); - - /* See comment in restore_write. 
*/ - save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); - dmu_tx_commit(tx); - return (0); -} - -static int -receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - void *data) -{ - dmu_tx_t *tx; - dmu_buf_t *db, *db_spill; - int err; - - if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) - return (SET_ERROR(EINVAL)); - - if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - if (drrs->drr_object > rwa->max_object) - rwa->max_object = drrs->drr_object; - - VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); - if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { - dmu_buf_rele(db, FTAG); - return (err); - } - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_spill(tx, db->db_object); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_buf_rele(db, FTAG); - dmu_buf_rele(db_spill, FTAG); - dmu_tx_abort(tx); - return (err); - } - dmu_buf_will_dirty(db_spill, tx); - - if (db_spill->db_size < drrs->drr_length) - VERIFY(0 == dbuf_spill_set_blksz(db_spill, - drrs->drr_length, tx)); - bcopy(data, db_spill->db_data, drrs->drr_length); - - dmu_buf_rele(db, FTAG); - dmu_buf_rele(db_spill, FTAG); - - dmu_tx_commit(tx); - return (0); -} - -/* ARGSUSED */ -noinline static int -receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) -{ - int err; - - if (drrf->drr_length != -1ULL && - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) - return (SET_ERROR(EINVAL)); - - if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - if (drrf->drr_object > rwa->max_object) - rwa->max_object = drrf->drr_object; - - err = dmu_free_long_range(rwa->os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length); - - return (err); -} - -/* used to destroy the drc_ds on error */ -static void -dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) -{ - if (drc->drc_resumable) { - /* wait for our resume 
state to be written to disk */ - txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - } else { - char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(drc->drc_ds, name); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - (void) dsl_destroy_head(name); - } -} - -static void -receive_cksum(struct receive_arg *ra, int len, void *buf) -{ - if (ra->byteswap) { - (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); - } else { - (void) fletcher_4_incremental_native(buf, len, &ra->cksum); - } -} - -/* - * Read the payload into a buffer of size len, and update the current record's - * payload field. - * Allocate ra->next_rrd and read the next record's header into - * ra->next_rrd->header. - * Verify checksum of payload and next record. - */ -static int -receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) -{ - int err; - - if (len != 0) { - ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - err = receive_read(ra, len, buf); - if (err != 0) - return (err); - receive_cksum(ra, len, buf); - - /* note: rrd is NULL when reading the begin record's payload */ - if (ra->rrd != NULL) { - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - ra->rrd->bytes_read = ra->bytes_read; - } - } - - ra->prev_cksum = ra->cksum; - - ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); - err = receive_read(ra, sizeof (ra->next_rrd->header), - &ra->next_rrd->header); - ra->next_rrd->bytes_read = ra->bytes_read; - if (err != 0) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (err); - } - if (ra->next_rrd->header.drr_type == DRR_BEGIN) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (SET_ERROR(EINVAL)); - } - - /* - * Note: checksum is of everything up to but not including the - * checksum itself. 
- */ - ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - receive_cksum(ra, - offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &ra->next_rrd->header); - - zio_cksum_t cksum_orig = - ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; - zio_cksum_t *cksump = - &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; - - if (ra->byteswap) - byteswap_record(&ra->next_rrd->header); - - if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (SET_ERROR(ECKSUM)); - } - - receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); - - return (0); -} - -static void -objlist_create(struct objlist *list) -{ - list_create(&list->list, sizeof (struct receive_objnode), - offsetof(struct receive_objnode, node)); - list->last_lookup = 0; -} - -static void -objlist_destroy(struct objlist *list) -{ - for (struct receive_objnode *n = list_remove_head(&list->list); - n != NULL; n = list_remove_head(&list->list)) { - kmem_free(n, sizeof (*n)); - } - list_destroy(&list->list); -} - -/* - * This function looks through the objlist to see if the specified object number - * is contained in the objlist. In the process, it will remove all object - * numbers in the list that are smaller than the specified object number. Thus, - * any lookup of an object number smaller than a previously looked up object - * number will always return false; therefore, all lookups should be done in - * ascending order. 
- */ -static boolean_t -objlist_exists(struct objlist *list, uint64_t object) -{ - struct receive_objnode *node = list_head(&list->list); - ASSERT3U(object, >=, list->last_lookup); - list->last_lookup = object; - while (node != NULL && node->object < object) { - VERIFY3P(node, ==, list_remove_head(&list->list)); - kmem_free(node, sizeof (*node)); - node = list_head(&list->list); - } - return (node != NULL && node->object == object); -} - -/* - * The objlist is a list of object numbers stored in ascending order. However, - * the insertion of new object numbers does not seek out the correct location to - * store a new object number; instead, it appends it to the list for simplicity. - * Thus, any users must take care to only insert new object numbers in ascending - * order. - */ -static void -objlist_insert(struct objlist *list, uint64_t object) -{ - struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); - node->object = object; -#ifdef ZFS_DEBUG - struct receive_objnode *last_object = list_tail(&list->list); - uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); - ASSERT3U(node->object, >, last_objnum); -#endif - list_insert_tail(&list->list, node); -} - -/* - * Issue the prefetch reads for any necessary indirect blocks. - * - * We use the object ignore list to tell us whether or not to issue prefetches - * for a given object. We do this for both correctness (in case the blocksize - * of an object has changed) and performance (if the object doesn't exist, don't - * needlessly try to issue prefetches). We also trim the list as we go through - * the stream to prevent it from growing to an unbounded size. - * - * The object numbers within will always be in sorted order, and any write - * records we see will also be in sorted order, but they're not sorted with - * respect to each other (i.e. we can get several object records before - * receiving each object's write records). 
As a result, once we've reached a - * given object number, we can safely remove any reference to lower object - * numbers in the ignore list. In practice, we receive up to 32 object records - * before receiving write records, so the list can have up to 32 nodes in it. - */ -/* ARGSUSED */ -static void -receive_read_prefetch(struct receive_arg *ra, - uint64_t object, uint64_t offset, uint64_t length) -{ - if (!objlist_exists(&ra->ignore_objlist, object)) { - dmu_prefetch(ra->os, object, 1, offset, length, - ZIO_PRIORITY_SYNC_READ); - } -} - -/* - * Read records off the stream, issuing any necessary prefetches. - */ -static int -receive_read_record(struct receive_arg *ra) -{ - int err; - - switch (ra->rrd->header.drr_type) { - case DRR_OBJECT: - { - struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; - uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); - dmu_object_info_t doi; - err = receive_read_payload_and_next_header(ra, size, buf); - if (err != 0) { - kmem_free(buf, size); - return (err); - } - err = dmu_object_info(ra->os, drro->drr_object, &doi); - /* - * See receive_read_prefetch for an explanation why we're - * storing this object in the ignore_obj_list. 
- */ - if (err == ENOENT || - (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { - objlist_insert(&ra->ignore_objlist, drro->drr_object); - err = 0; - } - return (err); - } - case DRR_FREEOBJECTS: - { - err = receive_read_payload_and_next_header(ra, 0, NULL); - return (err); - } - case DRR_WRITE: - { - struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; - arc_buf_t *abuf; - boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - if (DRR_WRITE_COMPRESSED(drrw)) { - ASSERT3U(drrw->drr_compressed_size, >, 0); - ASSERT3U(drrw->drr_logical_size, >=, - drrw->drr_compressed_size); - ASSERT(!is_meta); - abuf = arc_loan_compressed_buf( - dmu_objset_spa(ra->os), - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else { - abuf = arc_loan_buf(dmu_objset_spa(ra->os), - is_meta, drrw->drr_logical_size); - } - - err = receive_read_payload_and_next_header(ra, - DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); - if (err != 0) { - dmu_return_arcbuf(abuf); - return (err); - } - ra->rrd->write_buf = abuf; - receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, - drrw->drr_logical_size); - return (err); - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwb = - &ra->rrd->header.drr_u.drr_write_byref; - err = receive_read_payload_and_next_header(ra, 0, NULL); - receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, - drrwb->drr_length); - return (err); - } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe = - &ra->rrd->header.drr_u.drr_write_embedded; - uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); - - err = receive_read_payload_and_next_header(ra, size, buf); - if (err != 0) { - kmem_free(buf, size); - return (err); - } - - receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, - drrwe->drr_length); - return (err); - } - case DRR_FREE: - { - /* - * It might be beneficial to prefetch indirect blocks here, but - * we don't really have the data 
to decide for sure. - */ - err = receive_read_payload_and_next_header(ra, 0, NULL); - return (err); - } - case DRR_END: - { - struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; - if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) - return (SET_ERROR(ECKSUM)); - return (0); - } - case DRR_SPILL: - { - struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; - void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); - err = receive_read_payload_and_next_header(ra, drrs->drr_length, - buf); - if (err != 0) - kmem_free(buf, drrs->drr_length); - return (err); - } - default: - return (SET_ERROR(EINVAL)); - } -} - -/* - * Commit the records to the pool. - */ -static int -receive_process_record(struct receive_writer_arg *rwa, - struct receive_record_arg *rrd) -{ - int err; - - /* Processing in order, therefore bytes_read should be increasing. */ - ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); - rwa->bytes_read = rrd->bytes_read; - - switch (rrd->header.drr_type) { - case DRR_OBJECT: - { - struct drr_object *drro = &rrd->header.drr_u.drr_object; - err = receive_object(rwa, drro, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - return (err); - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects *drrfo = - &rrd->header.drr_u.drr_freeobjects; - return (receive_freeobjects(rwa, drrfo)); - } - case DRR_WRITE: - { - struct drr_write *drrw = &rrd->header.drr_u.drr_write; - err = receive_write(rwa, drrw, rrd->write_buf); - /* if receive_write() is successful, it consumes the arc_buf */ - if (err != 0) - dmu_return_arcbuf(rrd->write_buf); - rrd->write_buf = NULL; - rrd->payload = NULL; - return (err); - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwbr = - &rrd->header.drr_u.drr_write_byref; - return (receive_write_byref(rwa, drrwbr)); - } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe = - &rrd->header.drr_u.drr_write_embedded; - err = receive_write_embedded(rwa, drrwe, rrd->payload); - 
kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - return (err); - } - case DRR_FREE: - { - struct drr_free *drrf = &rrd->header.drr_u.drr_free; - return (receive_free(rwa, drrf)); - } - case DRR_SPILL: - { - struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - err = receive_spill(rwa, drrs, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - return (err); - } - default: - return (SET_ERROR(EINVAL)); - } -} - -/* - * dmu_recv_stream's worker thread; pull records off the queue, and then call - * receive_process_record When we're done, signal the main thread and exit. - */ -static void -receive_writer_thread(void *arg) -{ - struct receive_writer_arg *rwa = arg; - struct receive_record_arg *rrd; - for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; - rrd = bqueue_dequeue(&rwa->q)) { - /* - * If there's an error, the main thread will stop putting things - * on the queue, but we need to clear everything in it before we - * can exit. - */ - if (rwa->err == 0) { - rwa->err = receive_process_record(rwa, rrd); - } else if (rrd->write_buf != NULL) { - dmu_return_arcbuf(rrd->write_buf); - rrd->write_buf = NULL; - rrd->payload = NULL; - } else if (rrd->payload != NULL) { - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - } - kmem_free(rrd, sizeof (*rrd)); - } - kmem_free(rrd, sizeof (*rrd)); - mutex_enter(&rwa->mutex); - rwa->done = B_TRUE; - cv_signal(&rwa->cv); - mutex_exit(&rwa->mutex); - thread_exit(); -} - -static int -resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) -{ - uint64_t val; - objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; - uint64_t dsobj = dmu_objset_id(ra->os); - uint64_t resume_obj, resume_off; - - if (nvlist_lookup_uint64(begin_nvl, - "resume_object", &resume_obj) != 0 || - nvlist_lookup_uint64(begin_nvl, - "resume_offset", &resume_off) != 0) { - return (SET_ERROR(EINVAL)); - } - VERIFY0(zap_lookup(mos, dsobj, - DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); 
- if (resume_obj != val) - return (SET_ERROR(EINVAL)); - VERIFY0(zap_lookup(mos, dsobj, - DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); - if (resume_off != val) - return (SET_ERROR(EINVAL)); - - return (0); -} - -/* - * Read in the stream's records, one by one, and apply them to the pool. There - * are two threads involved; the thread that calls this function will spin up a - * worker thread, read the records off the stream one by one, and issue - * prefetches for any necessary indirect blocks. It will then push the records - * onto an internal blocking queue. The worker thread will pull the records off - * the queue, and actually write the data into the DMU. This way, the worker - * thread doesn't have to wait for reads to complete, since everything it needs - * (the indirect blocks) will be prefetched. - * - * NB: callers *must* call dmu_recv_end() if this succeeds. - */ -int -dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, - int cleanup_fd, uint64_t *action_handlep) -{ - int err = 0; - struct receive_arg ra = { 0 }; - struct receive_writer_arg rwa = { 0 }; - int featureflags; - nvlist_t *begin_nvl = NULL; - - ra.byteswap = drc->drc_byteswap; - ra.cksum = drc->drc_cksum; - ra.td = curthread; - ra.fp = fp; - ra.voff = *voffp; - - if (dsl_dataset_is_zapified(drc->drc_ds)) { - (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, - drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, - sizeof (ra.bytes_read), 1, &ra.bytes_read); - } - - objlist_create(&ra.ignore_objlist); - - /* these were verified in dmu_recv_begin */ - ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, - DMU_SUBSTREAM); - ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); - - /* - * Open the objset we are modifying. 
- */ - VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os)); - - ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); - - featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); - - /* if this stream is dedup'ed, set up the avl tree for guid mapping */ - if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - minor_t minor; - - if (cleanup_fd == -1) { - ra.err = SET_ERROR(EBADF); - goto out; - } - ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (ra.err != 0) { - cleanup_fd = -1; - goto out; - } - - if (*action_handlep == 0) { - rwa.guid_to_ds_map = - kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(rwa.guid_to_ds_map, guid_compare, - sizeof (guid_map_entry_t), - offsetof(guid_map_entry_t, avlnode)); - err = zfs_onexit_add_cb(minor, - free_guid_map_onexit, rwa.guid_to_ds_map, - action_handlep); - if (ra.err != 0) - goto out; - } else { - err = zfs_onexit_cb_data(minor, *action_handlep, - (void **)&rwa.guid_to_ds_map); - if (ra.err != 0) - goto out; - } - - drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; - } - - uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; - void *payload = NULL; - if (payloadlen != 0) - payload = kmem_alloc(payloadlen, KM_SLEEP); - - err = receive_read_payload_and_next_header(&ra, payloadlen, payload); - if (err != 0) { - if (payloadlen != 0) - kmem_free(payload, payloadlen); - goto out; - } - if (payloadlen != 0) { - err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); - kmem_free(payload, payloadlen); - if (err != 0) - goto out; - } - - if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { - err = resume_check(&ra, begin_nvl); - if (err != 0) - goto out; - } - - (void) bqueue_init(&rwa.q, zfs_recv_queue_length, - offsetof(struct receive_record_arg, node)); - cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); - mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); - rwa.os = ra.os; - rwa.byteswap = drc->drc_byteswap; - rwa.resumable = drc->drc_resumable; - - (void) thread_create(NULL, 0, 
receive_writer_thread, &rwa, 0, &p0, - TS_RUN, minclsyspri); - /* - * We're reading rwa.err without locks, which is safe since we are the - * only reader, and the worker thread is the only writer. It's ok if we - * miss a write for an iteration or two of the loop, since the writer - * thread will keep freeing records we send it until we send it an eos - * marker. - * - * We can leave this loop in 3 ways: First, if rwa.err is - * non-zero. In that case, the writer thread will free the rrd we just - * pushed. Second, if we're interrupted; in that case, either it's the - * first loop and ra.rrd was never allocated, or it's later, and ra.rrd - * has been handed off to the writer thread who will free it. Finally, - * if receive_read_record fails or we're at the end of the stream, then - * we free ra.rrd and exit. - */ - while (rwa.err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - break; - } - - ASSERT3P(ra.rrd, ==, NULL); - ra.rrd = ra.next_rrd; - ra.next_rrd = NULL; - /* Allocates and loads header into ra.next_rrd */ - err = receive_read_record(&ra); - - if (ra.rrd->header.drr_type == DRR_END || err != 0) { - kmem_free(ra.rrd, sizeof (*ra.rrd)); - ra.rrd = NULL; - break; - } - - bqueue_enqueue(&rwa.q, ra.rrd, - sizeof (struct receive_record_arg) + ra.rrd->payload_size); - ra.rrd = NULL; - } - if (ra.next_rrd == NULL) - ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); - ra.next_rrd->eos_marker = B_TRUE; - bqueue_enqueue(&rwa.q, ra.next_rrd, 1); - - mutex_enter(&rwa.mutex); - while (!rwa.done) { - cv_wait(&rwa.cv, &rwa.mutex); - } - mutex_exit(&rwa.mutex); - - /* - * If we are receiving a full stream as a clone, all object IDs which - * are greater than the maximum ID referenced in the stream are - * by definition unused and must be freed. Note that it's possible that - * we've resumed this send and the first record we received was the END - * record. 
In that case, max_object would be 0, but we shouldn't start - * freeing all objects from there; instead we should start from the - * resumeobj. - */ - if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { - uint64_t obj; - if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) - obj = 0; - if (rwa.max_object > obj) - obj = rwa.max_object; - obj++; - int free_err = 0; - int next_err = 0; - - while (next_err == 0) { - free_err = dmu_free_long_object(rwa.os, obj); - if (free_err != 0 && free_err != ENOENT) - break; - - next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); - } - - if (err == 0) { - if (free_err != 0 && free_err != ENOENT) - err = free_err; - else if (next_err != ESRCH) - err = next_err; - } - } - - cv_destroy(&rwa.cv); - mutex_destroy(&rwa.mutex); - bqueue_destroy(&rwa.q); - if (err == 0) - err = rwa.err; - -out: - nvlist_free(begin_nvl); - if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) - zfs_onexit_fd_rele(cleanup_fd); - - if (err != 0) { - /* - * Clean up references. If receive is not resumable, - * destroy what we created, so we don't leave it in - * the inconsistent state. - */ - dmu_recv_cleanup_ds(drc); - } - - *voffp = ra.voff; - objlist_destroy(&ra.ignore_objlist); - return (err); -} - -static int -dmu_recv_end_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_cookie_t *drc = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - int error; - - ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); - - if (!drc->drc_newfs) { - dsl_dataset_t *origin_head; - - error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); - if (error != 0) - return (error); - if (drc->drc_force) { - /* - * We will destroy any snapshots in tofs (i.e. before - * origin_head) that are after the origin (which is - * the snap before drc_ds, because drc_ds can not - * have any snaps of its own). 
- */ - uint64_t obj; - - obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - while (obj != - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { - dsl_dataset_t *snap; - error = dsl_dataset_hold_obj(dp, obj, FTAG, - &snap); - if (error != 0) - break; - if (snap->ds_dir != origin_head->ds_dir) - error = SET_ERROR(EINVAL); - if (error == 0) { - error = dsl_destroy_snapshot_check_impl( - snap, B_FALSE); - } - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); - if (error != 0) - break; - } - if (error != 0) { - dsl_dataset_rele(origin_head, FTAG); - return (error); - } - } - error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, - origin_head, drc->drc_force, drc->drc_owner, tx); - if (error != 0) { - dsl_dataset_rele(origin_head, FTAG); - return (error); - } - error = dsl_dataset_snapshot_check_impl(origin_head, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); - dsl_dataset_rele(origin_head, FTAG); - if (error != 0) - return (error); - - error = dsl_destroy_head_check_impl(drc->drc_ds, 1); - } else { - error = dsl_dataset_snapshot_check_impl(drc->drc_ds, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); - } - return (error); -} - -static void -dmu_recv_end_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_cookie_t *drc = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - - spa_history_log_internal_ds(drc->drc_ds, "finish receiving", - tx, "snap=%s", drc->drc_tosnap); - - if (!drc->drc_newfs) { - dsl_dataset_t *origin_head; - - VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, - &origin_head)); - - if (drc->drc_force) { - /* - * Destroy any snapshots of drc_tofs (origin_head) - * after the origin (the snap before drc_ds). 
- */ - uint64_t obj; - - obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - while (obj != - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { - dsl_dataset_t *snap; - VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, - &snap)); - ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_destroy_snapshot_sync_impl(snap, - B_FALSE, tx); - dsl_dataset_rele(snap, FTAG); - } - } - VERIFY3P(drc->drc_ds->ds_prev, ==, - origin_head->ds_prev); - - dsl_dataset_clone_swap_sync_impl(drc->drc_ds, - origin_head, tx); - dsl_dataset_snapshot_sync_impl(origin_head, - drc->drc_tosnap, tx); - - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); - dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = - drc->drc_drrb->drr_creation_time; - dsl_dataset_phys(origin_head->ds_prev)->ds_guid = - drc->drc_drrb->drr_toguid; - dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= - ~DS_FLAG_INCONSISTENT; - - dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - dsl_dataset_phys(origin_head)->ds_flags &= - ~DS_FLAG_INCONSISTENT; - - drc->drc_newsnapobj = - dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - - dsl_dataset_rele(origin_head, FTAG); - dsl_destroy_head_sync_impl(drc->drc_ds, tx); - - if (drc->drc_owner != NULL) - VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); - } else { - dsl_dataset_t *ds = drc->drc_ds; - - dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); - - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - dsl_dataset_phys(ds->ds_prev)->ds_creation_time = - drc->drc_drrb->drr_creation_time; - dsl_dataset_phys(ds->ds_prev)->ds_guid = - drc->drc_drrb->drr_toguid; - dsl_dataset_phys(ds->ds_prev)->ds_flags &= - ~DS_FLAG_INCONSISTENT; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; - if (dsl_dataset_has_resume_receive_state(ds)) { - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, 
- DS_FIELD_RESUME_FROMGUID, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OBJECT, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OFFSET, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_BYTES, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TONAME, tx); - } - drc->drc_newsnapobj = - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, drc->drc_tofs); -#endif - - /* - * Release the hold from dmu_recv_begin. This must be done before - * we return to open context, so that when we free the dataset's dnode, - * we can evict its bonus buffer. - */ - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - drc->drc_ds = NULL; -} - -static int -add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) -{ - dsl_pool_t *dp; - dsl_dataset_t *snapds; - guid_map_entry_t *gmep; - int err; - - ASSERT(guid_map != NULL); - - err = dsl_pool_hold(name, FTAG, &dp); - if (err != 0) - return (err); - gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); - err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); - if (err == 0) { - gmep->guid = dsl_dataset_phys(snapds)->ds_guid; - gmep->gme_ds = snapds; - avl_add(guid_map, gmep); - dsl_dataset_long_hold(snapds, gmep); - } else - kmem_free(gmep, sizeof (*gmep)); - - dsl_pool_rele(dp, FTAG); - return (err); -} - -static int dmu_recv_end_modified_blocks = 3; - -static int -dmu_recv_existing_end(dmu_recv_cookie_t *drc) -{ -#ifdef _KERNEL - /* - * We will be destroying the ds; make sure its origin is unmounted if - * necessary. 
- */ - char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(drc->drc_ds, name); - zfs_destroy_unmount_origin(name); -#endif - - return (dsl_sync_task(drc->drc_tofs, - dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); -} - -static int -dmu_recv_new_end(dmu_recv_cookie_t *drc) -{ - return (dsl_sync_task(drc->drc_tofs, - dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); -} - -int -dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) -{ - int error; - - drc->drc_owner = owner; - - if (drc->drc_newfs) - error = dmu_recv_new_end(drc); - else - error = dmu_recv_existing_end(drc); - - if (error != 0) { - dmu_recv_cleanup_ds(drc); - } else if (drc->drc_guid_to_ds_map != NULL) { - (void) add_ds_to_guidmap(drc->drc_tofs, - drc->drc_guid_to_ds_map, - drc->drc_newsnapobj); - } - return (error); -} - -/* - * Return TRUE if this objset is currently being received into. - */ -boolean_t -dmu_objset_is_receiving(objset_t *os) -{ - return (os->os_dsl_dataset != NULL && - os->os_dsl_dataset->ds_owner == dmu_recv_tag); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c deleted file mode 100644 index 8ed53914ceae..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ /dev/null @@ -1,712 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2015 Chunwei Chen. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ -boolean_t send_holes_without_birth_time = B_TRUE; - -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN, - &send_holes_without_birth_time, 0, "Send holes without birth time"); -#endif - -typedef struct prefetch_data { - kmutex_t pd_mtx; - kcondvar_t pd_cv; - int32_t pd_bytes_fetched; - int pd_flags; - boolean_t pd_cancel; - boolean_t pd_exited; - zbookmark_phys_t pd_resume; -} prefetch_data_t; - -typedef struct traverse_data { - spa_t *td_spa; - uint64_t td_objset; - blkptr_t *td_rootbp; - uint64_t td_min_txg; - zbookmark_phys_t *td_resume; - int td_flags; - prefetch_data_t *td_pfd; - boolean_t td_paused; - uint64_t td_hole_birth_enabled_txg; - blkptr_cb_t *td_func; - void *td_arg; - boolean_t td_realloc_possible; -} traverse_data_t; - -static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object); -static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, - uint64_t objset, uint64_t object); - -static int -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - traverse_data_t *td = arg; - zbookmark_phys_t zb; - - 
if (BP_IS_HOLE(bp)) - return (0); - - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) - return (-1); - - SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); - - return (0); -} - -static int -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - traverse_data_t *td = arg; - - if (lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_phys_t zb; - - if (BP_IS_HOLE(bp)) - return (0); - - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return (0); - - SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, - ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, - td->td_arg); - } - return (0); -} - -static void -traverse_zil(traverse_data_t *td, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed; plus blocks that are already stable in read-only mode. - */ - if (claim_txg == 0 && spa_writeable(td->td_spa)) - return; - - zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, - claim_txg); - zil_free(zilog); -} - -typedef enum resume_skip { - RESUME_SKIP_ALL, - RESUME_SKIP_NONE, - RESUME_SKIP_CHILDREN -} resume_skip_t; - -/* - * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and - * the block indicated by zb does not need to be visited at all. Returns - * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the - * resume point. This indicates that this block should be visited but not its - * children (since they must have been visited in a previous traversal). - * Otherwise returns RESUME_SKIP_NONE. 
- */ -static resume_skip_t -resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, - const zbookmark_phys_t *zb) -{ - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { - /* - * If we already visited this bp & everything below, - * don't bother doing it again. - */ - if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) - return (RESUME_SKIP_ALL); - - /* - * If we found the block we're trying to resume from, zero - * the bookmark out to indicate that we have resumed. - */ - if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { - bzero(td->td_resume, sizeof (*zb)); - if (td->td_flags & TRAVERSE_POST) - return (RESUME_SKIP_CHILDREN); - } - } - return (RESUME_SKIP_NONE); -} - -static void -traverse_prefetch_metadata(traverse_data_t *td, - const blkptr_t *bp, const zbookmark_phys_t *zb) -{ - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - - if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) - return; - /* - * If we are in the process of resuming, don't prefetch, because - * some children will not be needed (and in fact may have already - * been freed). 
- */ - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) - return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) - return; - if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) - return; - - (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -} - -static boolean_t -prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) -{ - ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || - BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) - return (B_FALSE); - return (B_TRUE); -} - -static int -traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - const blkptr_t *bp, const zbookmark_phys_t *zb) -{ - zbookmark_phys_t czb; - int err = 0; - arc_buf_t *buf = NULL; - prefetch_data_t *pd = td->td_pfd; - boolean_t hard = td->td_flags & TRAVERSE_HARD; - - switch (resume_skip_check(td, dnp, zb)) { - case RESUME_SKIP_ALL: - return (0); - case RESUME_SKIP_CHILDREN: - goto post; - case RESUME_SKIP_NONE: - break; - default: - ASSERT(0); - } - - if (bp->blk_birth == 0) { - /* - * Since this block has a birth time of 0 it must be one of - * two things: a hole created before the - * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole - * which has always been a hole in an object. - * - * If a file is written sparsely, then the unwritten parts of - * the file were "always holes" -- that is, they have been - * holes since this object was allocated. However, we (and - * our callers) can not necessarily tell when an object was - * allocated. Therefore, if it's possible that this object - * was freed and then its object number reused, we need to - * visit all the holes with birth==0. - * - * If it isn't possible that the object number was reused, - * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote - * all the blocks we will visit as part of this traversal, - * then this hole must have always existed, so we can skip - * it. 
We visit blocks born after (exclusive) td_min_txg. - * - * Note that the meta-dnode cannot be reallocated. - */ - if (!send_holes_without_birth_time && - (!td->td_realloc_possible || - zb->zb_object == DMU_META_DNODE_OBJECT) && - td->td_hole_birth_enabled_txg <= td->td_min_txg) - return (0); - } else if (bp->blk_birth <= td->td_min_txg) { - return (0); - } - - if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { - uint64_t size = BP_GET_LSIZE(bp); - mutex_enter(&pd->pd_mtx); - ASSERT(pd->pd_bytes_fetched >= 0); - while (pd->pd_bytes_fetched < size && !pd->pd_exited) - cv_wait(&pd->pd_cv, &pd->pd_mtx); - pd->pd_bytes_fetched -= size; - cv_broadcast(&pd->pd_cv); - mutex_exit(&pd->pd_mtx); - } - - if (BP_IS_HOLE(bp)) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - if (err != 0) - goto post; - return (0); - } - - if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, - td->td_arg); - if (err == TRAVERSE_VISIT_NO_CHILDREN) - return (0); - if (err != 0) - goto post; - } - - if (BP_GET_LEVEL(bp) > 0) { - arc_flags_t flags = ARC_FLAG_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - - err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err != 0) - goto post; - cbp = buf->b_data; - - for (i = 0; i < epb; i++) { - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - traverse_prefetch_metadata(td, &cbp[i], &czb); - } - - /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i++) { - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - err = traverse_visitbp(td, dnp, &cbp[i], &czb); - if (err != 0) - break; - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - arc_flags_t flags = ARC_FLAG_WAIT; - int i; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err != 0) - goto post; - dnode_phys_t *child_dnp = buf->b_data; - - for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { - prefetch_dnode_metadata(td, &child_dnp[i], - zb->zb_objset, zb->zb_blkid * epb + i); - } - - /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { - err = traverse_dnode(td, &child_dnp[i], - zb->zb_objset, zb->zb_blkid * epb + i); - if (err != 0) - break; - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - arc_flags_t flags = ARC_FLAG_WAIT; - - err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err != 0) - goto post; - - objset_phys_t *osp = buf->b_data; - prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, - DMU_META_DNODE_OBJECT); - /* - * See the block comment above for the goal of this variable. - * If the maxblkid of the meta-dnode is 0, then we know that - * we've never had more than DNODES_PER_BLOCK objects in the - * dataset, which means we can't have reused any object ids. 
- */ - if (osp->os_meta_dnode.dn_maxblkid == 0) - td->td_realloc_possible = B_FALSE; - - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - prefetch_dnode_metadata(td, &osp->os_groupused_dnode, - zb->zb_objset, DMU_GROUPUSED_OBJECT); - prefetch_dnode_metadata(td, &osp->os_userused_dnode, - zb->zb_objset, DMU_USERUSED_OBJECT); - } - - err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, - DMU_META_DNODE_OBJECT); - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, &osp->os_groupused_dnode, - zb->zb_objset, DMU_GROUPUSED_OBJECT); - } - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, &osp->os_userused_dnode, - zb->zb_objset, DMU_USERUSED_OBJECT); - } - } - - if (buf) - arc_buf_destroy(buf, &buf); - -post: - if (err == 0 && (td->td_flags & TRAVERSE_POST)) - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - - if (hard && (err == EIO || err == ECKSUM)) { - /* - * Ignore this disk error as requested by the HARD flag, - * and continue traversal. - */ - err = 0; - } - - /* - * If we are stopping here, set td_resume. - */ - if (td->td_resume != NULL && err != 0 && !td->td_paused) { - td->td_resume->zb_objset = zb->zb_objset; - td->td_resume->zb_object = zb->zb_object; - td->td_resume->zb_level = 0; - /* - * If we have stopped on an indirect block (e.g. due to - * i/o error), we have not visited anything below it. - * Set the bookmark to the first level-0 block that we need - * to visit. This way, the resuming code does not need to - * deal with resuming from indirect blocks. - * - * Note, if zb_level <= 0, dnp may be NULL, so we don't want - * to dereference it. 
- */ - td->td_resume->zb_blkid = zb->zb_blkid; - if (zb->zb_level > 0) { - td->td_resume->zb_blkid <<= zb->zb_level * - (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); - } - td->td_paused = B_TRUE; - } - - return (err); -} - -static void -prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object) -{ - int j; - zbookmark_phys_t czb; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); - } -} - -static int -traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object) -{ - int j, err = 0; - zbookmark_phys_t czb; - - if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && - object < td->td_resume->zb_object) - return (0); - - if (td->td_flags & TRAVERSE_PRE) { - SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, - ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, - td->td_arg); - if (err == TRAVERSE_VISIT_NO_CHILDREN) - return (0); - if (err != 0) - return (err); - } - - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); - if (err != 0) - break; - } - - if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); - } - - if (err == 0 && (td->td_flags & TRAVERSE_POST)) { - SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, - ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, - td->td_arg); - if (err == TRAVERSE_VISIT_NO_CHILDREN) - return (0); - if (err != 0) - return (err); - } - return (err); -} - -/* ARGSUSED */ -static int 
-traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - prefetch_data_t *pfd = arg; - arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH; - - ASSERT(pfd->pd_bytes_fetched >= 0); - if (bp == NULL) - return (0); - if (pfd->pd_cancel) - return (SET_ERROR(EINTR)); - - if (!prefetch_needed(pfd, bp)) - return (0); - - mutex_enter(&pfd->pd_mtx); - while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) - cv_wait(&pfd->pd_cv, &pfd->pd_mtx); - pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); - cv_broadcast(&pfd->pd_cv); - mutex_exit(&pfd->pd_mtx); - - (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); - - return (0); -} - -static void -traverse_prefetch_thread(void *arg) -{ - traverse_data_t *td_main = arg; - traverse_data_t td = *td_main; - zbookmark_phys_t czb; - - td.td_func = traverse_prefetcher; - td.td_arg = td_main->td_pfd; - td.td_pfd = NULL; - td.td_resume = &td_main->td_pfd->pd_resume; - - SET_BOOKMARK(&czb, td.td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); - - mutex_enter(&td_main->td_pfd->pd_mtx); - td_main->td_pfd->pd_exited = B_TRUE; - cv_broadcast(&td_main->td_pfd->pd_cv); - mutex_exit(&td_main->td_pfd->pd_mtx); -} - -/* - * NB: dataset must not be changing on-disk (eg, is a snapshot or we are - * in syncing context). 
- */ -static int -traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - uint64_t txg_start, zbookmark_phys_t *resume, int flags, - blkptr_cb_t func, void *arg) -{ - traverse_data_t td; - prefetch_data_t pd = { 0 }; - zbookmark_phys_t czb; - int err; - - ASSERT(ds == NULL || objset == ds->ds_object); - ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); - - td.td_spa = spa; - td.td_objset = objset; - td.td_rootbp = rootbp; - td.td_min_txg = txg_start; - td.td_resume = resume; - td.td_func = func; - td.td_arg = arg; - td.td_pfd = &pd; - td.td_flags = flags; - td.td_paused = B_FALSE; - td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE); - - if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { - VERIFY(spa_feature_enabled_txg(spa, - SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg)); - } else { - td.td_hole_birth_enabled_txg = UINT64_MAX; - } - - pd.pd_flags = flags; - if (resume != NULL) - pd.pd_resume = *resume; - mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); - - /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ - if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { - arc_flags_t flags = ARC_FLAG_WAIT; - objset_phys_t *osp; - arc_buf_t *buf; - - err = arc_read(NULL, td.td_spa, rootbp, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); - if (err != 0) - return (err); - - osp = buf->b_data; - traverse_zil(&td, &osp->os_zil_header); - arc_buf_destroy(buf, &buf); - } - - if (!(flags & TRAVERSE_PREFETCH_DATA) || - 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, - &td, TQ_NOQUEUE)) - pd.pd_exited = B_TRUE; - - SET_BOOKMARK(&czb, td.td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = traverse_visitbp(&td, NULL, rootbp, &czb); - - mutex_enter(&pd.pd_mtx); - pd.pd_cancel = B_TRUE; - cv_broadcast(&pd.pd_cv); - while (!pd.pd_exited) - cv_wait(&pd.pd_cv, &pd.pd_mtx); - mutex_exit(&pd.pd_mtx); - - mutex_destroy(&pd.pd_mtx); - cv_destroy(&pd.pd_cv); - - return (err); -} - -/* - * NB: dataset must not be changing on-disk (eg, is a snapshot or we are - * in syncing context). - */ -int -traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, - zbookmark_phys_t *resume, - int flags, blkptr_cb_t func, void *arg) -{ - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, - &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); -} - -int -traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, - int flags, blkptr_cb_t func, void *arg) -{ - return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); -} - -int -traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, - uint64_t txg_start, zbookmark_phys_t *resume, int flags, - blkptr_cb_t func, void *arg) -{ - return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, - blkptr, txg_start, resume, flags, func, arg)); -} - -/* - * NB: pool must not be changing on-disk (eg, from zdb or sync context). 
- */ -int -traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - blkptr_cb_t func, void *arg) -{ - int err; - dsl_pool_t *dp = spa_get_dsl(spa); - objset_t *mos = dp->dp_meta_objset; - boolean_t hard = (flags & TRAVERSE_HARD); - - /* visit the MOS */ - err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), - txg_start, NULL, flags, func, arg); - if (err != 0) - return (err); - - /* visit each dataset */ - for (uint64_t obj = 1; err == 0; - err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { - dmu_object_info_t doi; - - err = dmu_object_info(mos, obj, &doi); - if (err != 0) { - if (hard) - continue; - break; - } - - if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { - dsl_dataset_t *ds; - uint64_t txg = txg_start; - - dsl_pool_config_enter(dp, FTAG); - err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); - dsl_pool_config_exit(dp, FTAG); - if (err != 0) { - if (hard) - continue; - break; - } - if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) - txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - err = traverse_dataset(ds, txg, flags, func, arg); - dsl_dataset_rele(ds, FTAG); - if (err != 0) - break; - } - } - if (err == ESRCH) - err = 0; - return (err); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c deleted file mode 100644 index 00784ab6c4df..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ /dev/null @@ -1,1345 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, - uint64_t arg1, uint64_t arg2); - - -dmu_tx_t * -dmu_tx_create_dd(dsl_dir_t *dd) -{ - dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); - tx->tx_dir = dd; - if (dd != NULL) - tx->tx_pool = dd->dd_pool; - list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), - offsetof(dmu_tx_hold_t, txh_node)); - list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), - offsetof(dmu_tx_callback_t, dcb_node)); - tx->tx_start = gethrtime(); - return (tx); -} - -dmu_tx_t * -dmu_tx_create(objset_t *os) -{ - dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); - tx->tx_objset = os; - return (tx); -} - -dmu_tx_t * -dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) -{ - dmu_tx_t *tx = dmu_tx_create_dd(NULL); - - txg_verify(dp->dp_spa, txg); - tx->tx_pool = dp; - tx->tx_txg = txg; - tx->tx_anyobj = TRUE; - - return (tx); -} - -int -dmu_tx_is_syncing(dmu_tx_t *tx) -{ - return (tx->tx_anyobj); -} - -int -dmu_tx_private_ok(dmu_tx_t *tx) -{ - return (tx->tx_anyobj); -} - -static dmu_tx_hold_t * -dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, - uint64_t arg1, uint64_t arg2) -{ - dmu_tx_hold_t *txh; - 
- if (dn != NULL) { - (void) zfs_refcount_add(&dn->dn_holds, tx); - if (tx->tx_txg != 0) { - mutex_enter(&dn->dn_mtx); - /* - * dn->dn_assigned_txg == tx->tx_txg doesn't pose a - * problem, but there's no way for it to happen (for - * now, at least). - */ - ASSERT(dn->dn_assigned_txg == 0); - dn->dn_assigned_txg = tx->tx_txg; - (void) zfs_refcount_add(&dn->dn_tx_holds, tx); - mutex_exit(&dn->dn_mtx); - } - } - - txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); - txh->txh_tx = tx; - txh->txh_dnode = dn; - zfs_refcount_create(&txh->txh_space_towrite); - zfs_refcount_create(&txh->txh_memory_tohold); - txh->txh_type = type; - txh->txh_arg1 = arg1; - txh->txh_arg2 = arg2; - list_insert_tail(&tx->tx_holds, txh); - - return (txh); -} - -static dmu_tx_hold_t * -dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, - enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) -{ - dnode_t *dn = NULL; - dmu_tx_hold_t *txh; - int err; - - if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) { - tx->tx_err = err; - return (NULL); - } - } - txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); - if (dn != NULL) - dnode_rele(dn, FTAG); - return (txh); -} - -void -dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) -{ - /* - * If we're syncing, they can manipulate any object anyhow, and - * the hold on the dnode_t can cause problems. - */ - if (!dmu_tx_is_syncing(tx)) - (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); -} - -/* - * This function reads specified data from disk. The specified data will - * be needed to perform the transaction -- i.e, it will be read after - * we do dmu_tx_assign(). There are two reasons that we read the data now - * (before dmu_tx_assign()): - * - * 1. Reading it now has potentially better performance. 
The transaction - * has not yet been assigned, so the TXG is not held open, and also the - * caller typically has less locks held when calling dmu_tx_hold_*() than - * after the transaction has been assigned. This reduces the lock (and txg) - * hold times, thus reducing lock contention. - * - * 2. It is easier for callers (primarily the ZPL) to handle i/o errors - * that are detected before they start making changes to the DMU state - * (i.e. now). Once the transaction has been assigned, and some DMU - * state has been changed, it can be difficult to recover from an i/o - * error (e.g. to undo the changes already made in memory at the DMU - * layer). Typically code to do so does not exist in the caller -- it - * assumes that the data has already been cached and thus i/o errors are - * not possible. - * - * It has been observed that the i/o initiated here can be a performance - * problem, and it appears to be optional, because we don't look at the - * data which is read. However, removing this read would only serve to - * move the work elsewhere (after the dmu_tx_assign()), where it may - * have a greater impact on performance (in addition to the impact on - * fault tolerance noted above). 
- */ -static int -dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) -{ - int err; - dmu_buf_impl_t *db; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); - rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (SET_ERROR(EIO)); - err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); - dbuf_rele(db, FTAG); - return (err); -} - -/* ARGSUSED */ -static void -dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - dnode_t *dn = txh->txh_dnode; - int err = 0; - - if (len == 0) - return; - - (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); - - if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) - err = SET_ERROR(EFBIG); - - if (dn == NULL) - return; - - /* - * For i/o error checking, read the blocks that will be needed - * to perform the write: the first and last level-0 blocks (if - * they are not aligned, i.e. if they are partial-block writes), - * and all the level-1 blocks. 
- */ - if (dn->dn_maxblkid == 0) { - if (off < dn->dn_datablksz && - (off > 0 || len < dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - } else { - zio_t *zio = zio_root(dn->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - - /* first level-0 block */ - uint64_t start = off >> dn->dn_datablkshift; - if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { - err = dmu_tx_check_ioerr(zio, dn, 0, start); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - - /* last level-0 block */ - uint64_t end = (off + len - 1) >> dn->dn_datablkshift; - if (end != start && end <= dn->dn_maxblkid && - P2PHASE(off + len, dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(zio, dn, 0, end); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - - /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (uint64_t i = (start >> shft) + 1; - i < end >> shft; i++) { - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - } - - err = zio_wait(zio); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } -} - -static void -dmu_tx_count_dnode(dmu_tx_hold_t *txh) -{ - (void) zfs_refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, - FTAG); -} - -void -dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - ASSERT3U(len, <=, DMU_MAX_ACCESS); - ASSERT(len == 0 || UINT64_MAX - off >= len - 1); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_WRITE, off, len); - if (txh != NULL) { - dmu_tx_count_write(txh, off, len); - dmu_tx_count_dnode(txh); - } -} - -void -dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_WRITE, 0, 0); - if (txh == NULL) - return; - - dnode_t *dn = txh->txh_dnode; - (void) 
zfs_refcount_add_many(&txh->txh_space_towrite, - 1ULL << dn->dn_indblkshift, FTAG); - dmu_tx_count_dnode(txh); -} - -void -dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - ASSERT3U(len, <=, DMU_MAX_ACCESS); - ASSERT(len == 0 || UINT64_MAX - off >= len - 1); - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); - if (txh != NULL) { - dmu_tx_count_write(txh, off, len); - dmu_tx_count_dnode(txh); - } -} - -/* - * This function marks the transaction as being a "net free". The end - * result is that refquotas will be disabled for this transaction, and - * this transaction will be able to use half of the pool space overhead - * (see dsl_pool_adjustedsize()). Therefore this function should only - * be called for transactions that we expect will not cause a net increase - * in the amount of space used (but it's OK if that is occasionally not true). - */ -void -dmu_tx_mark_netfree(dmu_tx_t *tx) -{ - tx->tx_netfree = B_TRUE; -} - -static void -dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - dmu_tx_t *tx; - dnode_t *dn; - int err; - - tx = txh->txh_tx; - ASSERT(tx->tx_txg == 0); - - dn = txh->txh_dnode; - dmu_tx_count_dnode(txh); - - if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) - return; - if (len == DMU_OBJECT_END) - len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; - - - /* - * For i/o error checking, we read the first and last level-0 - * blocks if they are not aligned, and all the level-1 blocks. - * - * Note: dbuf_free_range() assumes that we have not instantiated - * any level-0 dbufs that will be completely freed. Therefore we must - * exercise care to not read or count the first and last blocks - * if they are blocksize-aligned. 
- */ - if (dn->dn_datablkshift == 0) { - if (off != 0 || len < dn->dn_datablksz) - dmu_tx_count_write(txh, 0, dn->dn_datablksz); - } else { - /* first block will be modified if it is not aligned */ - if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) - dmu_tx_count_write(txh, off, 1); - /* last block will be modified if it is not aligned */ - if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) - dmu_tx_count_write(txh, off + len, 1); - } - - /* - * Check level-1 blocks. - */ - if (dn->dn_nlevels > 1) { - int shift = dn->dn_datablkshift + dn->dn_indblkshift - - SPA_BLKPTRSHIFT; - uint64_t start = off >> shift; - uint64_t end = (off + len) >> shift; - - ASSERT(dn->dn_indblkshift != 0); - - /* - * dnode_reallocate() can result in an object with indirect - * blocks having an odd data block size. In this case, - * just check the single block. - */ - if (dn->dn_datablkshift == 0) - start = end = 0; - - zio_t *zio = zio_root(tx->tx_pool->dp_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - for (uint64_t i = start; i <= end; i++) { - uint64_t ibyte = i << shift; - err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); - i = ibyte >> shift; - if (err == ESRCH || i > end) - break; - if (err != 0) { - tx->tx_err = err; - (void) zio_wait(zio); - return; - } - - (void) zfs_refcount_add_many(&txh->txh_memory_tohold, - 1 << dn->dn_indblkshift, FTAG); - - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err != 0) { - tx->tx_err = err; - (void) zio_wait(zio); - return; - } - } - err = zio_wait(zio); - if (err != 0) { - tx->tx_err = err; - return; - } - } -} - -void -dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) -{ - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); -} - -void -dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) -{ - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); - if 
(txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); -} - -static void -dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) -{ - dmu_tx_t *tx = txh->txh_tx; - dnode_t *dn; - int err; - - ASSERT(tx->tx_txg == 0); - - dn = txh->txh_dnode; - - dmu_tx_count_dnode(txh); - - /* - * Modifying a almost-full microzap is around the worst case (128KB) - * - * If it is a fat zap, the worst case would be 7*16KB=112KB: - * - 3 blocks overwritten: target leaf, ptrtbl block, header block - * - 4 new blocks written if adding: - * - 2 blocks for possibly split leaves, - * - 2 grown ptrtbl blocks - */ - (void) zfs_refcount_add_many(&txh->txh_space_towrite, - MZAP_MAX_BLKSZ, FTAG); - - if (dn == NULL) - return; - - ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); - - if (dn->dn_maxblkid == 0 || name == NULL) { - /* - * This is a microzap (only one block), or we don't know - * the name. Check the first block for i/o errors. - */ - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err != 0) { - tx->tx_err = err; - } - } else { - /* - * Access the name so that we'll check for i/o errors to - * the leaf blocks, etc. We ignore ENOENT, as this name - * may not yet exist. 
- */ - err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); - if (err == EIO || err == ECKSUM || err == ENXIO) { - tx->tx_err = err; - } - } -} - -void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_ZAP, add, (uintptr_t)name); - if (txh != NULL) - dmu_tx_hold_zap_impl(txh, name); -} - -void -dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - ASSERT(dn != NULL); - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); - if (txh != NULL) - dmu_tx_hold_zap_impl(txh, name); -} - -void -dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_BONUS, 0, 0); - if (txh) - dmu_tx_count_dnode(txh); -} - -void -dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); - if (txh) - dmu_tx_count_dnode(txh); -} - -void -dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) -{ - dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - DMU_NEW_OBJECT, THT_SPACE, space, 0); - - (void) zfs_refcount_add_many(&txh->txh_space_towrite, space, FTAG); -} - -#ifdef ZFS_DEBUG -void -dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) -{ - boolean_t match_object = B_FALSE; - boolean_t match_offset = B_FALSE; - - DB_DNODE_ENTER(db); - dnode_t *dn = DB_DNODE(db); - ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); - ASSERT3U(dn->dn_object, ==, db->db.db_object); - - if (tx->tx_anyobj) { - DB_DNODE_EXIT(db); - return; - } - - /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) { - DB_DNODE_EXIT(db); - return; - } - - for 
(dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; - txh = list_next(&tx->tx_holds, txh)) { - ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); - if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) - match_object = TRUE; - if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { - int datablkshift = dn->dn_datablkshift ? - dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int shift = datablkshift + epbs * db->db_level; - uint64_t beginblk = shift >= 64 ? 0 : - (txh->txh_arg1 >> shift); - uint64_t endblk = shift >= 64 ? 0 : - ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); - uint64_t blkid = db->db_blkid; - - /* XXX txh_arg2 better not be zero... */ - - dprintf("found txh type %x beginblk=%llx endblk=%llx\n", - txh->txh_type, beginblk, endblk); - - switch (txh->txh_type) { - case THT_WRITE: - if (blkid >= beginblk && blkid <= endblk) - match_offset = TRUE; - /* - * We will let this hold work for the bonus - * or spill buffer so that we don't need to - * hold it when creating a new object. - */ - if (blkid == DMU_BONUS_BLKID || - blkid == DMU_SPILL_BLKID) - match_offset = TRUE; - /* - * They might have to increase nlevels, - * thus dirtying the new TLIBs. Or the - * might have to change the block size, - * thus dirying the new lvl=0 blk=0. - */ - if (blkid == 0) - match_offset = TRUE; - break; - case THT_FREE: - /* - * We will dirty all the level 1 blocks in - * the free range and perhaps the first and - * last level 0 block. 
- */ - if (blkid >= beginblk && (blkid <= endblk || - txh->txh_arg2 == DMU_OBJECT_END)) - match_offset = TRUE; - break; - case THT_SPILL: - if (blkid == DMU_SPILL_BLKID) - match_offset = TRUE; - break; - case THT_BONUS: - if (blkid == DMU_BONUS_BLKID) - match_offset = TRUE; - break; - case THT_ZAP: - match_offset = TRUE; - break; - case THT_NEWOBJECT: - match_object = TRUE; - break; - default: - ASSERT(!"bad txh_type"); - } - } - if (match_object && match_offset) { - DB_DNODE_EXIT(db); - return; - } - } - DB_DNODE_EXIT(db); - panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", - (u_longlong_t)db->db.db_object, db->db_level, - (u_longlong_t)db->db_blkid); -} -#endif - -/* - * If we can't do 10 iops, something is wrong. Let us go ahead - * and hit zfs_dirty_data_max. - */ -hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); -int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ - -/* - * We delay transactions when we've determined that the backend storage - * isn't able to accommodate the rate of incoming writes. - * - * If there is already a transaction waiting, we delay relative to when - * that transaction finishes waiting. This way the calculated min_time - * is independent of the number of threads concurrently executing - * transactions. - * - * If we are the only waiter, wait relative to when the transaction - * started, rather than the current time. This credits the transaction for - * "time already served", e.g. reading indirect blocks. - * - * The minimum time for a transaction to take is calculated as: - * min_time = scale * (dirty - min) / (max - dirty) - * min_time is then capped at zfs_delay_max_ns. - * - * The delay has two degrees of freedom that can be adjusted via tunables. - * The percentage of dirty data at which we start to delay is defined by - * zfs_delay_min_dirty_percent. 
This should typically be at or above - * zfs_vdev_async_write_active_max_dirty_percent so that we only start to - * delay after writing at full speed has failed to keep up with the incoming - * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly - * speaking, this variable determines the amount of delay at the midpoint of - * the curve. - * - * delay - * 10ms +-------------------------------------------------------------*+ - * | *| - * 9ms + *+ - * | *| - * 8ms + *+ - * | * | - * 7ms + * + - * | * | - * 6ms + * + - * | * | - * 5ms + * + - * | * | - * 4ms + * + - * | * | - * 3ms + * + - * | * | - * 2ms + (midpoint) * + - * | | ** | - * 1ms + v *** + - * | zfs_delay_scale ----------> ******** | - * 0 +-------------------------------------*********----------------+ - * 0% <- zfs_dirty_data_max -> 100% - * - * Note that since the delay is added to the outstanding time remaining on the - * most recent transaction, the delay is effectively the inverse of IOPS. - * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve - * was chosen such that small changes in the amount of accumulated dirty data - * in the first 3/4 of the curve yield relatively small differences in the - * amount of delay. - * - * The effects can be easier to understand when the amount of delay is - * represented on a log scale: - * - * delay - * 100ms +-------------------------------------------------------------++ - * + + - * | | - * + *+ - * 10ms + *+ - * + ** + - * | (midpoint) ** | - * + | ** + - * 1ms + v **** + - * + zfs_delay_scale ----------> ***** + - * | **** | - * + **** + - * 100us + ** + - * + * + - * | * | - * + * + - * 10us + * + - * + + - * | | - * + + - * +--------------------------------------------------------------+ - * 0% <- zfs_dirty_data_max -> 100% - * - * Note here that only as the amount of dirty data approaches its limit does - * the delay start to increase rapidly. 
The goal of a properly tuned system - * should be to keep the amount of dirty data out of that range by first - * ensuring that the appropriate limits are set for the I/O scheduler to reach - * optimal throughput on the backend storage, and then by changing the value - * of zfs_delay_scale to increase the steepness of the curve. - */ -static void -dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) -{ - dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = - zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; - - if (dirty <= delay_min_bytes) - return; - - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); - - now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - if (now > tx->tx_start + min_tx_time) - return; - - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - - DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); - - mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); - dp->dp_last_wakeup = wakeup; - mutex_exit(&dp->dp_lock); - -#ifdef _KERNEL -#ifdef illumos - mutex_enter(&curthread->t_delay_lock); - while (cv_timedwait_hires(&curthread->t_delay_cv, - &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, - CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) - continue; - mutex_exit(&curthread->t_delay_lock); -#else - pause_sbt("dmu_tx_delay", nstosbt(wakeup), - nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE); -#endif -#else - hrtime_t delta = wakeup - gethrtime(); - struct timespec ts; - ts.tv_sec = delta / NANOSEC; - ts.tv_nsec = delta % NANOSEC; - (void) nanosleep(&ts, NULL); -#endif -} - -/* - * This routine attempts to assign the 
transaction to a transaction group. - * To do so, we must determine if there is sufficient free space on disk. - * - * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() - * on it), then it is assumed that there is sufficient free space, - * unless there's insufficient slop space in the pool (see the comment - * above spa_slop_shift in spa_misc.c). - * - * If it is not a "netfree" transaction, then if the data already on disk - * is over the allowed usage (e.g. quota), this will fail with EDQUOT or - * ENOSPC. Otherwise, if the current rough estimate of pending changes, - * plus the rough estimate of this transaction's changes, may exceed the - * allowed usage, then this will fail with ERESTART, which will cause the - * caller to wait for the pending changes to be written to disk (by waiting - * for the next TXG to open), and then check the space usage again. - * - * The rough estimate of pending changes is comprised of the sum of: - * - * - this transaction's holds' txh_space_towrite - * - * - dd_tempreserved[], which is the sum of in-flight transactions' - * holds' txh_space_towrite (i.e. those transactions that have called - * dmu_tx_assign() but not yet called dmu_tx_commit()). - * - * - dd_space_towrite[], which is the amount of dirtied dbufs. - * - * Note that all of these values are inflated by spa_get_worst_case_asize(), - * which means that we may get ERESTART well before we are actually in danger - * of running out of space, but this also mitigates any small inaccuracies - * in the rough estimate (e.g. txh_space_towrite doesn't take into account - * indirect blocks, and dd_space_towrite[] doesn't take into account changes - * to the MOS). - * - * Note that due to this algorithm, it is possible to exceed the allowed - * usage by one transaction. Also, as we approach the allowed usage, - * we will allow a very limited amount of changes into each TXG, thus - * decreasing performance. 
- */ -static int -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) -{ - spa_t *spa = tx->tx_pool->dp_spa; - - ASSERT0(tx->tx_txg); - - if (tx->tx_err) - return (tx->tx_err); - - if (spa_suspended(spa)) { - /* - * If the user has indicated a blocking failure mode - * then return ERESTART which will block in dmu_tx_wait(). - * Otherwise, return EIO so that an error can get - * propagated back to the VOP calls. - * - * Note that we always honor the txg_how flag regardless - * of the failuremode setting. - */ - if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && - !(txg_how & TXG_WAIT)) - return (SET_ERROR(EIO)); - - return (SET_ERROR(ERESTART)); - } - - if (!tx->tx_dirty_delayed && - dsl_pool_need_dirty_delay(tx->tx_pool)) { - tx->tx_wait_dirty = B_TRUE; - return (SET_ERROR(ERESTART)); - } - - tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); - tx->tx_needassign_txh = NULL; - - /* - * NB: No error returns are allowed after txg_hold_open, but - * before processing the dnode holds, due to the - * dmu_tx_unassign() logic. 
- */ - - uint64_t towrite = 0; - uint64_t tohold = 0; - for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - if (dn != NULL) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_assigned_txg == tx->tx_txg - 1) { - mutex_exit(&dn->dn_mtx); - tx->tx_needassign_txh = txh; - return (SET_ERROR(ERESTART)); - } - if (dn->dn_assigned_txg == 0) - dn->dn_assigned_txg = tx->tx_txg; - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - (void) zfs_refcount_add(&dn->dn_tx_holds, tx); - mutex_exit(&dn->dn_mtx); - } - towrite += zfs_refcount_count(&txh->txh_space_towrite); - tohold += zfs_refcount_count(&txh->txh_memory_tohold); - } - - /* needed allocation: worst-case estimate of write space */ - uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); - /* calculate memory footprint estimate */ - uint64_t memory = towrite + tohold; - - if (tx->tx_dir != NULL && asize != 0) { - int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, - asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); - if (err != 0) - return (err); - } - - return (0); -} - -static void -dmu_tx_unassign(dmu_tx_t *tx) -{ - if (tx->tx_txg == 0) - return; - - txg_rele_to_quiesce(&tx->tx_txgh); - - /* - * Walk the transaction's hold list, removing the hold on the - * associated dnode, and notifying waiters if the refcount drops to 0. 
- */ - for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); - txh != tx->tx_needassign_txh; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - - if (dn == NULL) - continue; - mutex_enter(&dn->dn_mtx); - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - - if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { - dn->dn_assigned_txg = 0; - cv_broadcast(&dn->dn_notxholds); - } - mutex_exit(&dn->dn_mtx); - } - - txg_rele_to_sync(&tx->tx_txgh); - - tx->tx_lasttried_txg = tx->tx_txg; - tx->tx_txg = 0; -} - -/* - * Assign tx to a transaction group; txg_how is a bitmask: - * - * If TXG_WAIT is set and the currently open txg is full, this function - * will wait until there's a new txg. This should be used when no locks - * are being held. With this bit set, this function will only fail if - * we're truly out of space (or over quota). - * - * If TXG_WAIT is *not* set and we can't assign into the currently open - * txg without blocking, this function will return immediately with - * ERESTART. This should be used whenever locks are being held. On an - * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), - * and try again. - * - * If TXG_NOTHROTTLE is set, this indicates that this tx should not be - * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for - * details on the throttle). This is used by the VFS operations, after - * they have already called dmu_tx_wait() (though most likely on a - * different tx). - */ -int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) -{ - int err; - - ASSERT(tx->tx_txg == 0); - ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); - ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - - /* If we might wait, we must not hold the config lock. 
*/ - IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); - - if ((txg_how & TXG_NOTHROTTLE)) - tx->tx_dirty_delayed = B_TRUE; - - while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { - dmu_tx_unassign(tx); - - if (err != ERESTART || !(txg_how & TXG_WAIT)) - return (err); - - dmu_tx_wait(tx); - } - - txg_rele_to_quiesce(&tx->tx_txgh); - - return (0); -} - -void -dmu_tx_wait(dmu_tx_t *tx) -{ - spa_t *spa = tx->tx_pool->dp_spa; - dsl_pool_t *dp = tx->tx_pool; - - ASSERT(tx->tx_txg == 0); - ASSERT(!dsl_pool_config_held(tx->tx_pool)); - - if (tx->tx_wait_dirty) { - /* - * dmu_tx_try_assign() has determined that we need to wait - * because we've consumed much or all of the dirty buffer - * space. - */ - mutex_enter(&dp->dp_lock); - while (dp->dp_dirty_total >= zfs_dirty_data_max) - cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - dmu_tx_delay(tx, dirty); - - tx->tx_wait_dirty = B_FALSE; - - /* - * Note: setting tx_dirty_delayed only has effect if the - * caller used TX_WAIT. Otherwise they are going to - * destroy this tx and try again. The common case, - * zfs_write(), uses TX_WAIT. - */ - tx->tx_dirty_delayed = B_TRUE; - } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { - /* - * If the pool is suspended we need to wait until it - * is resumed. Note that it's possible that the pool - * has become active after this thread has tried to - * obtain a tx. If that's the case then tx_lasttried_txg - * would not have been set. - */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); - } else if (tx->tx_needassign_txh) { - /* - * A dnode is assigned to the quiescing txg. Wait for its - * transaction to complete. 
- */ - dnode_t *dn = tx->tx_needassign_txh->txh_dnode; - - mutex_enter(&dn->dn_mtx); - while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) - cv_wait(&dn->dn_notxholds, &dn->dn_mtx); - mutex_exit(&dn->dn_mtx); - tx->tx_needassign_txh = NULL; - } else { - /* - * If we have a lot of dirty data just wait until we sync - * out a TXG at which point we'll hopefully have synced - * a portion of the changes. - */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); - } -} - -static void -dmu_tx_destroy(dmu_tx_t *tx) -{ - dmu_tx_hold_t *txh; - - while ((txh = list_head(&tx->tx_holds)) != NULL) { - dnode_t *dn = txh->txh_dnode; - - list_remove(&tx->tx_holds, txh); - zfs_refcount_destroy_many(&txh->txh_space_towrite, - zfs_refcount_count(&txh->txh_space_towrite)); - zfs_refcount_destroy_many(&txh->txh_memory_tohold, - zfs_refcount_count(&txh->txh_memory_tohold)); - kmem_free(txh, sizeof (dmu_tx_hold_t)); - if (dn != NULL) - dnode_rele(dn, tx); - } - - list_destroy(&tx->tx_callbacks); - list_destroy(&tx->tx_holds); - kmem_free(tx, sizeof (dmu_tx_t)); -} - -void -dmu_tx_commit(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg != 0); - - /* - * Go through the transaction's hold list and remove holds on - * associated dnodes, notifying waiters if no holds remain. 
- */ - for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - - if (dn == NULL) - continue; - - mutex_enter(&dn->dn_mtx); - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - - if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { - dn->dn_assigned_txg = 0; - cv_broadcast(&dn->dn_notxholds); - } - mutex_exit(&dn->dn_mtx); - } - - if (tx->tx_tempreserve_cookie) - dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); - - if (!list_is_empty(&tx->tx_callbacks)) - txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); - - if (tx->tx_anyobj == FALSE) - txg_rele_to_sync(&tx->tx_txgh); - - dmu_tx_destroy(tx); -} - -void -dmu_tx_abort(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg == 0); - - /* - * Call any registered callbacks with an error code. - */ - if (!list_is_empty(&tx->tx_callbacks)) - dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); - - dmu_tx_destroy(tx); -} - -uint64_t -dmu_tx_get_txg(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg != 0); - return (tx->tx_txg); -} - -dsl_pool_t * -dmu_tx_pool(dmu_tx_t *tx) -{ - ASSERT(tx->tx_pool != NULL); - return (tx->tx_pool); -} - -void -dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) -{ - dmu_tx_callback_t *dcb; - - dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); - - dcb->dcb_func = func; - dcb->dcb_data = data; - - list_insert_tail(&tx->tx_callbacks, dcb); -} - -/* - * Call all the commit callbacks on a list, with a given error code. - */ -void -dmu_tx_do_callbacks(list_t *cb_list, int error) -{ - dmu_tx_callback_t *dcb; - - while ((dcb = list_head(cb_list)) != NULL) { - list_remove(cb_list, dcb); - dcb->dcb_func(dcb->dcb_data, error); - kmem_free(dcb, sizeof (dmu_tx_callback_t)); - } -} - -/* - * Interface to hold a bunch of attributes. - * used for creating new files. 
- * attrsize is the total size of all attributes - * to be added during object creation - * - * For updating/adding a single attribute dmu_tx_hold_sa() should be used. - */ - -/* - * hold necessary attribute name for attribute registration. - * should be a very rare case where this is needed. If it does - * happen it would only happen on the first write to the file system. - */ -static void -dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) -{ - if (!sa->sa_need_attr_registration) - return; - - for (int i = 0; i != sa->sa_num_attrs; i++) { - if (!sa->sa_attr_table[i].sa_registered) { - if (sa->sa_reg_attr_obj) - dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, - B_TRUE, sa->sa_attr_table[i].sa_name); - else - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, - B_TRUE, sa->sa_attr_table[i].sa_name); - } - } -} - -void -dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, - THT_SPILL, 0, 0); - if (txh != NULL) - (void) zfs_refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); -} - -void -dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) -{ - sa_os_t *sa = tx->tx_objset->os_sa; - - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - - if (tx->tx_objset->os_sa->sa_master_obj == 0) - return; - - if (tx->tx_objset->os_sa->sa_layout_attr_obj) { - dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - } else { - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - } - - dmu_tx_sa_registration_hold(sa, tx); - - if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) - return; - - (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, - THT_SPILL, 0, 0); -} - -/* - * Hold SA attribute - * - * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) - * - * variable_size is the total size of all 
variable sized attributes - * passed to this function. It is not the total size of all - * variable size attributes that *may* exist on this object. - */ -void -dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) -{ - uint64_t object; - sa_os_t *sa = tx->tx_objset->os_sa; - - ASSERT(hdl != NULL); - - object = sa_handle_object(hdl); - - dmu_tx_hold_bonus(tx, object); - - if (tx->tx_objset->os_sa->sa_master_obj == 0) - return; - - if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || - tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - } - - dmu_tx_sa_registration_hold(sa, tx); - - if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) - dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - - if (sa->sa_force_spill || may_grow || hdl->sa_spill) { - ASSERT(tx->tx_txg == 0); - dmu_tx_hold_spill(tx, object); - } else { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_have_spill) { - ASSERT(tx->tx_txg == 0); - dmu_tx_hold_spill(tx, object); - } - DB_DNODE_EXIT(db); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c deleted file mode 100644 index 229032530e86..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * This tunable disables predictive prefetch. Note that it leaves "prescient" - * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, - * prescient prefetch never issues i/os that end up not being needed, - * so it can't hurt performance. 
- */ -boolean_t zfs_prefetch_disable = B_FALSE; - -/* max # of streams per zfetch */ -uint32_t zfetch_max_streams = 8; -/* min time before stream reclaim */ -uint32_t zfetch_min_sec_reap = 2; -/* max bytes to prefetch per stream (default 8MB) */ -uint32_t zfetch_max_distance = 8 * 1024 * 1024; -/* max bytes to prefetch indirects for per stream (default 64MB) */ -uint32_t zfetch_max_idistance = 64 * 1024 * 1024; -/* max number of bytes in an array_read in which we allow prefetching (1MB) */ -uint64_t zfetch_array_rd_sz = 1024 * 1024; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, - &zfs_prefetch_disable, 0, "Disable prefetch"); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS ZFETCH"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN, - &zfetch_max_streams, 0, "Max # of streams per zfetch"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN, - &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, - &zfetch_max_distance, 0, "Max bytes to prefetch per stream"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, - &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream"); -SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN, - &zfetch_array_rd_sz, 0, - "Number of bytes in a array_read at which we stop prefetching"); - -typedef struct zfetch_stats { - kstat_named_t zfetchstat_hits; - kstat_named_t zfetchstat_misses; - kstat_named_t zfetchstat_max_streams; -} zfetch_stats_t; - -static zfetch_stats_t zfetch_stats = { - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 }, - { "max_streams", KSTAT_DATA_UINT64 }, -}; - -#define ZFETCHSTAT_BUMP(stat) \ - atomic_inc_64(&zfetch_stats.stat.value.ui64); - -kstat_t *zfetch_ksp; - -void -zfetch_init(void) -{ - zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", - KSTAT_TYPE_NAMED, 
sizeof (zfetch_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (zfetch_ksp != NULL) { - zfetch_ksp->ks_data = &zfetch_stats; - kstat_install(zfetch_ksp); - } -} - -void -zfetch_fini(void) -{ - if (zfetch_ksp != NULL) { - kstat_delete(zfetch_ksp); - zfetch_ksp = NULL; - } -} - -/* - * This takes a pointer to a zfetch structure and a dnode. It performs the - * necessary setup for the zfetch structure, grokking data from the - * associated dnode. - */ -void -dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) -{ - if (zf == NULL) - return; - - zf->zf_dnode = dno; - - list_create(&zf->zf_stream, sizeof (zstream_t), - offsetof(zstream_t, zs_node)); - - rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); -} - -static void -dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) -{ - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zs_lock); - kmem_free(zs, sizeof (*zs)); -} - -/* - * Clean-up state associated with a zfetch structure (e.g. destroy the - * streams). This doesn't free the zfetch_t itself, that's left to the caller. - */ -void -dmu_zfetch_fini(zfetch_t *zf) -{ - zstream_t *zs; - - ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); - - rw_enter(&zf->zf_rwlock, RW_WRITER); - while ((zs = list_head(&zf->zf_stream)) != NULL) - dmu_zfetch_stream_remove(zf, zs); - rw_exit(&zf->zf_rwlock); - list_destroy(&zf->zf_stream); - rw_destroy(&zf->zf_rwlock); - - zf->zf_dnode = NULL; -} - -/* - * If there aren't too many streams already, create a new stream. - * The "blkid" argument is the next block that we expect this stream to access. - * While we're here, clean up old streams (which haven't been - * accessed for at least zfetch_min_sec_reap seconds). - */ -static void -dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) -{ - zstream_t *zs_next; - int numstreams = 0; - - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - - /* - * Clean up old streams. 
- */ - for (zstream_t *zs = list_head(&zf->zf_stream); - zs != NULL; zs = zs_next) { - zs_next = list_next(&zf->zf_stream, zs); - if (((gethrtime() - zs->zs_atime) / NANOSEC) > - zfetch_min_sec_reap) - dmu_zfetch_stream_remove(zf, zs); - else - numstreams++; - } - - /* - * The maximum number of streams is normally zfetch_max_streams, - * but for small files we lower it such that it's at least possible - * for all the streams to be non-overlapping. - * - * If we are already at the maximum number of streams for this file, - * even after removing old streams, then don't create this stream. - */ - uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, - zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / - zfetch_max_distance)); - if (numstreams >= max_streams) { - ZFETCHSTAT_BUMP(zfetchstat_max_streams); - return; - } - - zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); - zs->zs_blkid = blkid; - zs->zs_pf_blkid = blkid; - zs->zs_ipf_blkid = blkid; - zs->zs_atime = gethrtime(); - mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); - - list_insert_head(&zf->zf_stream, zs); -} - -/* - * This is the predictive prefetch entry point. It associates dnode access - * specified with blkid and nblks arguments with prefetch stream, predicts - * further accesses based on that stats and initiates speculative prefetch. - * fetch_data argument specifies whether actual data blocks should be fetched: - * FALSE -- prefetch only indirect blocks for predicted data blocks; - * TRUE -- prefetch predicted data blocks plus following indirect blocks. 
- */ -void -dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) -{ - zstream_t *zs; - int64_t pf_start, ipf_start, ipf_istart, ipf_iend; - int64_t pf_ahead_blks, max_blks; - int epbs, max_dist_blks, pf_nblks, ipf_nblks; - uint64_t end_of_access_blkid = blkid + nblks; - spa_t *spa = zf->zf_dnode->dn_objset->os_spa; - - if (zfs_prefetch_disable) - return; - - /* - * If we haven't yet loaded the indirect vdevs' mappings, we - * can only read from blocks that we carefully ensure are on - * concrete vdevs (or previously-loaded indirect vdevs). So we - * can't allow the predictive prefetcher to attempt reads of other - * blocks (e.g. of the MOS's dnode obejct). - */ - if (!spa_indirect_vdevs_loaded(spa)) - return; - - /* - * As a fast path for small (single-block) files, ignore access - * to the first block. - */ - if (blkid == 0) - return; - - rw_enter(&zf->zf_rwlock, RW_READER); - - /* - * Find matching prefetch stream. Depending on whether the accesses - * are block-aligned, first block of the new access may either follow - * the last block of the previous access, or be equal to it. - */ - for (zs = list_head(&zf->zf_stream); zs != NULL; - zs = list_next(&zf->zf_stream, zs)) { - if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) { - mutex_enter(&zs->zs_lock); - /* - * zs_blkid could have changed before we - * acquired zs_lock; re-check them here. - */ - if (blkid == zs->zs_blkid) { - break; - } else if (blkid + 1 == zs->zs_blkid) { - blkid++; - nblks--; - if (nblks == 0) { - /* Already prefetched this before. */ - mutex_exit(&zs->zs_lock); - rw_exit(&zf->zf_rwlock); - return; - } - break; - } - mutex_exit(&zs->zs_lock); - } - } - - if (zs == NULL) { - /* - * This access is not part of any existing stream. Create - * a new stream for it. 
- */ - ZFETCHSTAT_BUMP(zfetchstat_misses); - if (rw_tryupgrade(&zf->zf_rwlock)) - dmu_zfetch_stream_create(zf, end_of_access_blkid); - rw_exit(&zf->zf_rwlock); - return; - } - - /* - * This access was to a block that we issued a prefetch for on - * behalf of this stream. Issue further prefetches for this stream. - * - * Normally, we start prefetching where we stopped - * prefetching last (zs_pf_blkid). But when we get our first - * hit on this stream, zs_pf_blkid == zs_blkid, we don't - * want to prefetch the block we just accessed. In this case, - * start just after the block we just accessed. - */ - pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); - - /* - * Double our amount of prefetched data, but don't let the - * prefetch get further ahead than zfetch_max_distance. - */ - if (fetch_data) { - max_dist_blks = - zfetch_max_distance >> zf->zf_dnode->dn_datablkshift; - /* - * Previously, we were (zs_pf_blkid - blkid) ahead. We - * want to now be double that, so read that amount again, - * plus the amount we are catching up by (i.e. the amount - * read just now). - */ - pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks; - max_blks = max_dist_blks - (pf_start - end_of_access_blkid); - pf_nblks = MIN(pf_ahead_blks, max_blks); - } else { - pf_nblks = 0; - } - - zs->zs_pf_blkid = pf_start + pf_nblks; - - /* - * Do the same for indirects, starting from where we stopped last, - * or where we will stop reading data blocks (and the indirects - * that point to them). - */ - ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid); - max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift; - /* - * We want to double our distance ahead of the data prefetch - * (or reader, if we are not prefetching data). Previously, we - * were (zs_ipf_blkid - blkid) ahead. To double that, we read - * that amount again, plus the amount we are catching up by - * (i.e. the amount read now + the amount of data prefetched now). 
- */ - pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; - max_blks = max_dist_blks - (ipf_start - end_of_access_blkid); - ipf_nblks = MIN(pf_ahead_blks, max_blks); - zs->zs_ipf_blkid = ipf_start + ipf_nblks; - - epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; - ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; - ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; - - zs->zs_atime = gethrtime(); - zs->zs_blkid = end_of_access_blkid; - mutex_exit(&zs->zs_lock); - rw_exit(&zf->zf_rwlock); - - /* - * dbuf_prefetch() is asynchronous (even when it needs to read - * indirect blocks), but we still prefer to drop our locks before - * calling it to reduce the time we hold them. - */ - - for (int i = 0; i < pf_nblks; i++) { - dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); - } - for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { - dbuf_prefetch(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); - } - ZFETCHSTAT_BUMP(zfetchstat_hits); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c deleted file mode 100644 index 50a7338fb9e8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ /dev/null @@ -1,2418 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 RackTop Systems. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -dnode_stats_t dnode_stats = { - { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, - { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, - { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_allocate", KSTAT_DATA_UINT64 }, - { "dnode_reallocate", KSTAT_DATA_UINT64 }, - { "dnode_buf_evict", KSTAT_DATA_UINT64 }, - { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, - { "dnode_alloc_race", KSTAT_DATA_UINT64 }, - { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, - { "dnode_move_invalid", KSTAT_DATA_UINT64 }, - { "dnode_move_recheck1", KSTAT_DATA_UINT64 }, - { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, - { 
"dnode_move_special", KSTAT_DATA_UINT64 }, - { "dnode_move_handle", KSTAT_DATA_UINT64 }, - { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, - { "dnode_move_active", KSTAT_DATA_UINT64 }, -}; - -static kstat_t *dnode_ksp; -static kmem_cache_t *dnode_cache; - -static dnode_phys_t dnode_phys_zero; - -int zfs_default_bs = SPA_MINBLOCKSHIFT; -int zfs_default_ibs = DN_MAX_INDBLKSHIFT; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, - &zfs_default_bs, 0, "Default dnode block shift"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, - &zfs_default_ibs, 0, "Default dnode indirect block shift"); - -#ifdef illumos -#ifdef _KERNEL -static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); -#endif /* _KERNEL */ -#endif - -static int -dbuf_compare(const void *x1, const void *x2) -{ - const dmu_buf_impl_t *d1 = x1; - const dmu_buf_impl_t *d2 = x2; - - int cmp = AVL_CMP(d1->db_level, d2->db_level); - if (likely(cmp)) - return (cmp); - - cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); - if (likely(cmp)) - return (cmp); - - if (d1->db_state == DB_SEARCH) { - ASSERT3S(d2->db_state, !=, DB_SEARCH); - return (-1); - } else if (d2->db_state == DB_SEARCH) { - ASSERT3S(d1->db_state, !=, DB_SEARCH); - return (1); - } - - return (AVL_PCMP(d1, d2)); -} - -/* ARGSUSED */ -static int -dnode_cons(void *arg, void *unused, int kmflag) -{ - dnode_t *dn = arg; - int i; - - rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); - mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); - - /* - * Every dbuf has a reference, and dropping a tracked reference is - * O(number of references), so don't track dn_holds. 
- */ - zfs_refcount_create_untracked(&dn->dn_holds); - zfs_refcount_create(&dn->dn_tx_holds); - list_link_init(&dn->dn_link); - - bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); - bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); - bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); - bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); - bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); - bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); - bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); - - for (i = 0; i < TXG_SIZE; i++) { - multilist_link_init(&dn->dn_dirty_link[i]); - dn->dn_free_ranges[i] = NULL; - list_create(&dn->dn_dirty_records[i], - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - - dn->dn_allocated_txg = 0; - dn->dn_free_txg = 0; - dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; - dn->dn_bonus = NULL; - dn->dn_have_spill = B_FALSE; - dn->dn_zio = NULL; - dn->dn_oldused = 0; - dn->dn_oldflags = 0; - dn->dn_olduid = 0; - dn->dn_oldgid = 0; - dn->dn_newuid = 0; - dn->dn_newgid = 0; - dn->dn_id_flags = 0; - - dn->dn_dbufs_count = 0; - avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - - dn->dn_moved = 0; - POINTER_INVALIDATE(&dn->dn_objset); - return (0); -} - -/* ARGSUSED */ -static void -dnode_dest(void *arg, void *unused) -{ - int i; - dnode_t *dn = arg; - - rw_destroy(&dn->dn_struct_rwlock); - mutex_destroy(&dn->dn_mtx); - mutex_destroy(&dn->dn_dbufs_mtx); - cv_destroy(&dn->dn_notxholds); - zfs_refcount_destroy(&dn->dn_holds); - zfs_refcount_destroy(&dn->dn_tx_holds); - ASSERT(!list_link_active(&dn->dn_link)); - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); - ASSERT3P(dn->dn_free_ranges[i], ==, NULL); - list_destroy(&dn->dn_dirty_records[i]); - 
ASSERT0(dn->dn_next_nblkptr[i]); - ASSERT0(dn->dn_next_nlevels[i]); - ASSERT0(dn->dn_next_indblkshift[i]); - ASSERT0(dn->dn_next_bonustype[i]); - ASSERT0(dn->dn_rm_spillblk[i]); - ASSERT0(dn->dn_next_bonuslen[i]); - ASSERT0(dn->dn_next_blksz[i]); - } - - ASSERT0(dn->dn_allocated_txg); - ASSERT0(dn->dn_free_txg); - ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_dirtyctx); - ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); - ASSERT3P(dn->dn_bonus, ==, NULL); - ASSERT(!dn->dn_have_spill); - ASSERT3P(dn->dn_zio, ==, NULL); - ASSERT0(dn->dn_oldused); - ASSERT0(dn->dn_oldflags); - ASSERT0(dn->dn_olduid); - ASSERT0(dn->dn_oldgid); - ASSERT0(dn->dn_newuid); - ASSERT0(dn->dn_newgid); - ASSERT0(dn->dn_id_flags); - - ASSERT0(dn->dn_dbufs_count); - avl_destroy(&dn->dn_dbufs); -} - -void -dnode_init(void) -{ - ASSERT(dnode_cache == NULL); - dnode_cache = kmem_cache_create("dnode_t", - sizeof (dnode_t), - 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); -#ifdef _KERNEL - kmem_cache_set_move(dnode_cache, dnode_move); - - dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", - KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (dnode_ksp != NULL) { - dnode_ksp->ks_data = &dnode_stats; - kstat_install(dnode_ksp); - } -#endif /* _KERNEL */ -} - -void -dnode_fini(void) -{ - if (dnode_ksp != NULL) { - kstat_delete(dnode_ksp); - dnode_ksp = NULL; - } - - kmem_cache_destroy(dnode_cache); - dnode_cache = NULL; -} - - -#ifdef ZFS_DEBUG -void -dnode_verify(dnode_t *dn) -{ - int drop_struct_lock = FALSE; - - ASSERT(dn->dn_phys); - ASSERT(dn->dn_objset); - ASSERT(dn->dn_handle->dnh_dnode == dn); - - ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); - - if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) - return; - - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { - int i; - int 
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - ASSERT3U(dn->dn_indblkshift, >=, 0); - ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); - if (dn->dn_datablkshift) { - ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); - ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); - } - ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT(DMU_OT_IS_VALID(dn->dn_type)); - ASSERT3U(dn->dn_nblkptr, >=, 1); - ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); - ASSERT3U(dn->dn_datablksz, ==, - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); - ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); - ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + - dn->dn_bonuslen, <=, max_bonuslen); - for (i = 0; i < TXG_SIZE; i++) { - ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); - } - } - if (dn->dn_phys->dn_type != DMU_OT_NONE) - ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); - if (dn->dn_dbuf != NULL) { - ASSERT3P(dn->dn_phys, ==, - (dnode_phys_t *)dn->dn_dbuf->db.db_data + - (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); -} -#endif - -void -dnode_byteswap(dnode_phys_t *dnp) -{ - uint64_t *buf64 = (void*)&dnp->dn_blkptr; - int i; - - if (dnp->dn_type == DMU_OT_NONE) { - bzero(dnp, sizeof (dnode_phys_t)); - return; - } - - dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); - dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); - dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); - dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); - dnp->dn_used = BSWAP_64(dnp->dn_used); - - /* - * dn_nblkptr is only one byte, so it's OK to read it in either - * byte order. We can't read dn_bouslen. 
- */ - ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); - ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); - for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) - buf64[i] = BSWAP_64(buf64[i]); - - /* - * OK to check dn_bonuslen for zero, because it won't matter if - * we have the wrong byte order. This is necessary because the - * dnode dnode is smaller than a regular dnode. - */ - if (dnp->dn_bonuslen != 0) { - /* - * Note that the bonus length calculated here may be - * longer than the actual bonus buffer. This is because - * we always put the bonus buffer after the last block - * pointer (instead of packing it against the end of the - * dnode buffer). - */ - int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); - int slots = dnp->dn_extra_slots + 1; - size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; - ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(dnp->dn_bonustype); - dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); - } - - /* Swap SPILL block if we have one */ - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) - byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); - -} - -void -dnode_buf_byteswap(void *vbuf, size_t size) -{ - int i = 0; - - ASSERT3U(sizeof (dnode_phys_t), ==, (1<dn_type != DMU_OT_NONE) - i += dnp->dn_extra_slots * DNODE_MIN_SIZE; - } -} - -void -dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) -{ - ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); - - dnode_setdirty(dn, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); - dn->dn_bonuslen = newsize; - if (newsize == 0) - dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; - else - dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; - rw_exit(&dn->dn_struct_rwlock); -} - -void -dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) -{ - 
ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); - dnode_setdirty(dn, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dn->dn_bonustype = newtype; - dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; - rw_exit(&dn->dn_struct_rwlock); -} - -void -dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) -{ - ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - dnode_setdirty(dn, tx); - dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; - dn->dn_have_spill = B_FALSE; -} - -static void -dnode_setdblksz(dnode_t *dn, int size) -{ - ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(size, >=, SPA_MINBLOCKSIZE); - ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, - 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); - dn->dn_datablksz = size; - dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; - dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; -} - -static dnode_t * -dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, - uint64_t object, dnode_handle_t *dnh) -{ - dnode_t *dn; - - dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); -#ifdef _KERNEL - ASSERT(!POINTER_IS_VALID(dn->dn_objset)); -#endif /* _KERNEL */ - dn->dn_moved = 0; - - /* - * Defer setting dn_objset until the dnode is ready to be a candidate - * for the dnode_move() callback. 
- */ - dn->dn_object = object; - dn->dn_dbuf = db; - dn->dn_handle = dnh; - dn->dn_phys = dnp; - - if (dnp->dn_datablkszsec) { - dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); - } else { - dn->dn_datablksz = 0; - dn->dn_datablkszsec = 0; - dn->dn_datablkshift = 0; - } - dn->dn_indblkshift = dnp->dn_indblkshift; - dn->dn_nlevels = dnp->dn_nlevels; - dn->dn_type = dnp->dn_type; - dn->dn_nblkptr = dnp->dn_nblkptr; - dn->dn_checksum = dnp->dn_checksum; - dn->dn_compress = dnp->dn_compress; - dn->dn_bonustype = dnp->dn_bonustype; - dn->dn_bonuslen = dnp->dn_bonuslen; - dn->dn_num_slots = dnp->dn_extra_slots + 1; - dn->dn_maxblkid = dnp->dn_maxblkid; - dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); - dn->dn_id_flags = 0; - - dmu_zfetch_init(&dn->dn_zfetch, dn); - - ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); - ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); - ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); - - mutex_enter(&os->os_lock); - - /* - * Exclude special dnodes from os_dnodes so an empty os_dnodes - * signifies that the special dnodes have no references from - * their children (the entries in os_dnodes). This allows - * dnode_destroy() to easily determine if the last child has - * been removed and then complete eviction of the objset. - */ - if (!DMU_OBJECT_IS_SPECIAL(object)) - list_insert_head(&os->os_dnodes, dn); - membar_producer(); - - /* - * Everything else must be valid before assigning dn_objset - * makes the dnode eligible for dnode_move(). - */ - dn->dn_objset = os; - - dnh->dnh_dnode = dn; - mutex_exit(&os->os_lock); - - arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); - - return (dn); -} - -/* - * Caller must be holding the dnode handle, which is released upon return. 
- */ -static void -dnode_destroy(dnode_t *dn) -{ - objset_t *os = dn->dn_objset; - boolean_t complete_os_eviction = B_FALSE; - - ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); - - mutex_enter(&os->os_lock); - POINTER_INVALIDATE(&dn->dn_objset); - if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - list_remove(&os->os_dnodes, dn); - complete_os_eviction = - list_is_empty(&os->os_dnodes) && - list_link_active(&os->os_evicting_node); - } - mutex_exit(&os->os_lock); - - /* the dnode can no longer move, so we can release the handle */ - if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) - zrl_remove(&dn->dn_handle->dnh_zrlock); - - dn->dn_allocated_txg = 0; - dn->dn_free_txg = 0; - dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - - dn->dn_dirtyctx = 0; - if (dn->dn_dirtyctx_firstset != NULL) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } - if (dn->dn_bonus != NULL) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_destroy(dn->dn_bonus); - dn->dn_bonus = NULL; - } - dn->dn_zio = NULL; - - dn->dn_have_spill = B_FALSE; - dn->dn_oldused = 0; - dn->dn_oldflags = 0; - dn->dn_olduid = 0; - dn->dn_oldgid = 0; - dn->dn_newuid = 0; - dn->dn_newgid = 0; - dn->dn_id_flags = 0; - - dmu_zfetch_fini(&dn->dn_zfetch); - kmem_cache_free(dnode_cache, dn); - arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); - - if (complete_os_eviction) - dmu_objset_evict_done(os); -} - -void -dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) -{ - int i; - - ASSERT3U(dn_slots, >, 0); - ASSERT3U(dn_slots << DNODE_SHIFT, <=, - spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); - ASSERT3U(blocksize, <=, - spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); - if (blocksize == 0) - blocksize = 1 << zfs_default_bs; - else - blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); - - if (ibs == 0) - ibs = zfs_default_ibs; - - ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); - - 
dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64 - " blocksize=%d ibs=%d dn_slots=%d\n", - dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); - DNODE_STAT_BUMP(dnode_allocate); - - ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); - ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); - ASSERT(ot != DMU_OT_NONE); - ASSERT(DMU_OT_IS_VALID(ot)); - ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype == DMU_OT_SA && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); - ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT0(dn->dn_maxblkid); - ASSERT0(dn->dn_allocated_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_assigned_txg); - ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); - ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); - ASSERT(avl_is_empty(&dn->dn_dbufs)); - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT0(dn->dn_next_nblkptr[i]); - ASSERT0(dn->dn_next_nlevels[i]); - ASSERT0(dn->dn_next_indblkshift[i]); - ASSERT0(dn->dn_next_bonuslen[i]); - ASSERT0(dn->dn_next_bonustype[i]); - ASSERT0(dn->dn_rm_spillblk[i]); - ASSERT0(dn->dn_next_blksz[i]); - ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); - ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); - ASSERT3P(dn->dn_free_ranges[i], ==, NULL); - } - - dn->dn_type = ot; - dnode_setdblksz(dn, blocksize); - dn->dn_indblkshift = ibs; - dn->dn_nlevels = 1; - dn->dn_num_slots = dn_slots; - if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ - dn->dn_nblkptr = 1; - else { - dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, - 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> - SPA_BLKPTRSHIFT)); - } - - dn->dn_bonustype = bonustype; - dn->dn_bonuslen = bonuslen; - dn->dn_checksum = ZIO_CHECKSUM_INHERIT; - dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; - - dn->dn_free_txg = 0; - if (dn->dn_dirtyctx_firstset) { - 
kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } - - dn->dn_allocated_txg = tx->tx_txg; - dn->dn_id_flags = 0; - - dnode_setdirty(dn, tx); - dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; - dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; - dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; - dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; -} - -void -dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) -{ - int nblkptr; - - ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); - ASSERT3U(blocksize, <=, - spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); - ASSERT0(blocksize % SPA_MINBLOCKSIZE); - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - ASSERT(tx->tx_txg != 0); - ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0) || - (bonustype == DMU_OT_SA && bonuslen == 0)); - ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); - ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT)); - - dnode_free_interior_slots(dn); - DNODE_STAT_BUMP(dnode_reallocate); - - /* clean up any unreferenced dbufs */ - dnode_evict_dbufs(dn); - - dn->dn_id_flags = 0; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dnode_setdirty(dn, tx); - if (dn->dn_datablksz != blocksize) { - /* change blocksize */ - ASSERT(dn->dn_maxblkid == 0 && - (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || - dnode_block_freed(dn, 0))); - dnode_setdblksz(dn, blocksize); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; - } - if (dn->dn_bonuslen != bonuslen) - dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; - - if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ - nblkptr = 1; - else - nblkptr = MIN(DN_MAX_NBLKPTR, - 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> - SPA_BLKPTRSHIFT)); - if 
(dn->dn_bonustype != bonustype) - dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; - if (dn->dn_nblkptr != nblkptr) - dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; - if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - dbuf_rm_spill(dn, tx); - dnode_rm_spill(dn, tx); - } - rw_exit(&dn->dn_struct_rwlock); - - /* change type */ - dn->dn_type = ot; - - /* change bonus size and type */ - mutex_enter(&dn->dn_mtx); - dn->dn_bonustype = bonustype; - dn->dn_bonuslen = bonuslen; - dn->dn_num_slots = dn_slots; - dn->dn_nblkptr = nblkptr; - dn->dn_checksum = ZIO_CHECKSUM_INHERIT; - dn->dn_compress = ZIO_COMPRESS_INHERIT; - ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - - /* fix up the bonus db_size */ - if (dn->dn_bonus) { - dn->dn_bonus->db.db_size = - DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr - 1) * sizeof (blkptr_t); - ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); - } - - dn->dn_allocated_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); -} - -#ifdef _KERNEL -static void -dnode_move_impl(dnode_t *odn, dnode_t *ndn) -{ - int i; - - ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); - ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); - ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); - ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); - - /* Copy fields. 
*/ - ndn->dn_objset = odn->dn_objset; - ndn->dn_object = odn->dn_object; - ndn->dn_dbuf = odn->dn_dbuf; - ndn->dn_handle = odn->dn_handle; - ndn->dn_phys = odn->dn_phys; - ndn->dn_type = odn->dn_type; - ndn->dn_bonuslen = odn->dn_bonuslen; - ndn->dn_bonustype = odn->dn_bonustype; - ndn->dn_nblkptr = odn->dn_nblkptr; - ndn->dn_checksum = odn->dn_checksum; - ndn->dn_compress = odn->dn_compress; - ndn->dn_nlevels = odn->dn_nlevels; - ndn->dn_indblkshift = odn->dn_indblkshift; - ndn->dn_datablkshift = odn->dn_datablkshift; - ndn->dn_datablkszsec = odn->dn_datablkszsec; - ndn->dn_datablksz = odn->dn_datablksz; - ndn->dn_maxblkid = odn->dn_maxblkid; - ndn->dn_num_slots = odn->dn_num_slots; - bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], - sizeof (odn->dn_next_type)); - bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], - sizeof (odn->dn_next_nblkptr)); - bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], - sizeof (odn->dn_next_nlevels)); - bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], - sizeof (odn->dn_next_indblkshift)); - bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], - sizeof (odn->dn_next_bonustype)); - bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], - sizeof (odn->dn_rm_spillblk)); - bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], - sizeof (odn->dn_next_bonuslen)); - bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], - sizeof (odn->dn_next_blksz)); - for (i = 0; i < TXG_SIZE; i++) { - list_move_tail(&ndn->dn_dirty_records[i], - &odn->dn_dirty_records[i]); - } - bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], - sizeof (odn->dn_free_ranges)); - ndn->dn_allocated_txg = odn->dn_allocated_txg; - ndn->dn_free_txg = odn->dn_free_txg; - ndn->dn_assigned_txg = odn->dn_assigned_txg; - ndn->dn_dirty_txg = odn->dn_dirty_txg; - ndn->dn_dirtyctx = odn->dn_dirtyctx; - ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; - ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0); - 
zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); - ASSERT(avl_is_empty(&ndn->dn_dbufs)); - avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); - ndn->dn_dbufs_count = odn->dn_dbufs_count; - ndn->dn_bonus = odn->dn_bonus; - ndn->dn_have_spill = odn->dn_have_spill; - ndn->dn_zio = odn->dn_zio; - ndn->dn_oldused = odn->dn_oldused; - ndn->dn_oldflags = odn->dn_oldflags; - ndn->dn_olduid = odn->dn_olduid; - ndn->dn_oldgid = odn->dn_oldgid; - ndn->dn_newuid = odn->dn_newuid; - ndn->dn_newgid = odn->dn_newgid; - ndn->dn_id_flags = odn->dn_id_flags; - dmu_zfetch_init(&ndn->dn_zfetch, NULL); - list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); - ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; - - /* - * Update back pointers. Updating the handle fixes the back pointer of - * every descendant dbuf as well as the bonus dbuf. - */ - ASSERT(ndn->dn_handle->dnh_dnode == odn); - ndn->dn_handle->dnh_dnode = ndn; - if (ndn->dn_zfetch.zf_dnode == odn) { - ndn->dn_zfetch.zf_dnode = ndn; - } - - /* - * Invalidate the original dnode by clearing all of its back pointers. - */ - odn->dn_dbuf = NULL; - odn->dn_handle = NULL; - avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - odn->dn_dbufs_count = 0; - odn->dn_bonus = NULL; - odn->dn_zfetch.zf_dnode = NULL; - - /* - * Set the low bit of the objset pointer to ensure that dnode_move() - * recognizes the dnode as invalid in any subsequent callback. - */ - POINTER_INVALIDATE(&odn->dn_objset); - - /* - * Satisfy the destructor. 
- */ - for (i = 0; i < TXG_SIZE; i++) { - list_create(&odn->dn_dirty_records[i], - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - odn->dn_free_ranges[i] = NULL; - odn->dn_next_nlevels[i] = 0; - odn->dn_next_indblkshift[i] = 0; - odn->dn_next_bonustype[i] = 0; - odn->dn_rm_spillblk[i] = 0; - odn->dn_next_bonuslen[i] = 0; - odn->dn_next_blksz[i] = 0; - } - odn->dn_allocated_txg = 0; - odn->dn_free_txg = 0; - odn->dn_assigned_txg = 0; - odn->dn_dirty_txg = 0; - odn->dn_dirtyctx = 0; - odn->dn_dirtyctx_firstset = NULL; - odn->dn_have_spill = B_FALSE; - odn->dn_zio = NULL; - odn->dn_oldused = 0; - odn->dn_oldflags = 0; - odn->dn_olduid = 0; - odn->dn_oldgid = 0; - odn->dn_newuid = 0; - odn->dn_newgid = 0; - odn->dn_id_flags = 0; - - /* - * Mark the dnode. - */ - ndn->dn_moved = 1; - odn->dn_moved = (uint8_t)-1; -} - -#ifdef illumos -/*ARGSUSED*/ -static kmem_cbrc_t -dnode_move(void *buf, void *newbuf, size_t size, void *arg) -{ - dnode_t *odn = buf, *ndn = newbuf; - objset_t *os; - int64_t refcount; - uint32_t dbufs; - - /* - * The dnode is on the objset's list of known dnodes if the objset - * pointer is valid. We set the low bit of the objset pointer when - * freeing the dnode to invalidate it, and the memory patterns written - * by kmem (baddcafe and deadbeef) set at least one of the two low bits. - * A newly created dnode sets the objset pointer last of all to indicate - * that the dnode is known and in a valid state to be moved by this - * function. - */ - os = odn->dn_objset; - if (!POINTER_IS_VALID(os)) { - DNODE_STAT_BUMP(dnode_move_invalid); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * Ensure that the objset does not go away during the move. - */ - rw_enter(&os_lock, RW_WRITER); - if (os != odn->dn_objset) { - rw_exit(&os_lock); - DNODE_STAT_BUMP(dnode_move_recheck1); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * If the dnode is still valid, then so is the objset. 
We know that no - * valid objset can be freed while we hold os_lock, so we can safely - * ensure that the objset remains in use. - */ - mutex_enter(&os->os_lock); - - /* - * Recheck the objset pointer in case the dnode was removed just before - * acquiring the lock. - */ - if (os != odn->dn_objset) { - mutex_exit(&os->os_lock); - rw_exit(&os_lock); - DNODE_STAT_BUMP(dnode_move_recheck2); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * At this point we know that as long as we hold os->os_lock, the dnode - * cannot be freed and fields within the dnode can be safely accessed. - * The objset listing this dnode cannot go away as long as this dnode is - * on its list. - */ - rw_exit(&os_lock); - if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_special); - return (KMEM_CBRC_NO); - } - ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ - - /* - * Lock the dnode handle to prevent the dnode from obtaining any new - * holds. This also prevents the descendant dbufs and the bonus dbuf - * from accessing the dnode, so that we can discount their holds. The - * handle is safe to access because we know that while the dnode cannot - * go away, neither can its handle. Once we hold dnh_zrlock, we can - * safely move any dnode referenced only by dbufs. - */ - if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_handle); - return (KMEM_CBRC_LATER); - } - - /* - * Ensure a consistent view of the dnode's holds and the dnode's dbufs. - * We need to guarantee that there is a hold for every dbuf in order to - * determine whether the dnode is actively referenced. Falsely matching - * a dbuf to an active hold would lead to an unsafe move. It's possible - * that a thread already having an active dnode hold is about to add a - * dbuf, and we can't compare hold and dbuf counts while the add is in - * progress. 
- */ - if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { - zrl_exit(&odn->dn_handle->dnh_zrlock); - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_rwlock); - return (KMEM_CBRC_LATER); - } - - /* - * A dbuf may be removed (evicted) without an active dnode hold. In that - * case, the dbuf count is decremented under the handle lock before the - * dbuf's hold is released. This order ensures that if we count the hold - * after the dbuf is removed but before its hold is released, we will - * treat the unmatched hold as active and exit safely. If we count the - * hold before the dbuf is removed, the hold is discounted, and the - * removal is blocked until the move completes. - */ - refcount = zfs_refcount_count(&odn->dn_holds); - ASSERT(refcount >= 0); - dbufs = DN_DBUFS_COUNT(odn); - - /* We can't have more dbufs than dnode holds. */ - ASSERT3U(dbufs, <=, refcount); - DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, - uint32_t, dbufs); - - if (refcount > dbufs) { - rw_exit(&odn->dn_struct_rwlock); - zrl_exit(&odn->dn_handle->dnh_zrlock); - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_active); - return (KMEM_CBRC_LATER); - } - - rw_exit(&odn->dn_struct_rwlock); - - /* - * At this point we know that anyone with a hold on the dnode is not - * actively referencing it. The dnode is known and in a valid state to - * move. We're holding the locks needed to execute the critical section. - */ - dnode_move_impl(odn, ndn); - - list_link_replace(&odn->dn_link, &ndn->dn_link); - /* If the dnode was safe to move, the refcount cannot have changed. 
*/ - ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds)); - ASSERT(dbufs == DN_DBUFS_COUNT(ndn)); - zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ - mutex_exit(&os->os_lock); - - return (KMEM_CBRC_YES); -} -#endif /* illumos */ -#endif /* _KERNEL */ - -static void -dnode_slots_hold(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - zrl_add(&dnh->dnh_zrlock); - } -} - -static void -dnode_slots_rele(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - if (zrl_is_locked(&dnh->dnh_zrlock)) - zrl_exit(&dnh->dnh_zrlock); - else - zrl_remove(&dnh->dnh_zrlock); - } -} - -static int -dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - if (!zrl_tryenter(&dnh->dnh_zrlock)) { - for (int j = idx; j < i; j++) { - dnh = &children->dnc_children[j]; - zrl_exit(&dnh->dnh_zrlock); - } - - return (0); - } - } - - return (1); -} - -static void -dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - dnh->dnh_dnode = ptr; - } -} - -static boolean_t -dnode_check_slots_free(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - /* - * If all dnode slots are either already free or - * evictable return B_TRUE. 
- */ - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - dnode_t *dn = dnh->dnh_dnode; - - if (dn == DN_SLOT_FREE) { - continue; - } else if (DN_SLOT_IS_PTR(dn)) { - mutex_enter(&dn->dn_mtx); - boolean_t can_free = (dn->dn_type == DMU_OT_NONE && - zfs_refcount_is_zero(&dn->dn_holds) && - !DNODE_IS_DIRTY(dn)); - mutex_exit(&dn->dn_mtx); - - if (!can_free) - return (B_FALSE); - else - continue; - } else { - return (B_FALSE); - } - } - - return (B_TRUE); -} - -static void -dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); - - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); - dnode_destroy(dnh->dnh_dnode); - dnh->dnh_dnode = DN_SLOT_FREE; - } - } -} - -void -dnode_free_interior_slots(dnode_t *dn) -{ - dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); - int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; - int idx = (dn->dn_object & (epb - 1)) + 1; - int slots = dn->dn_num_slots - 1; - - if (slots == 0) - return; - - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - while (!dnode_slots_tryenter(children, idx, slots)) - DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - - dnode_set_slots(children, idx, slots, DN_SLOT_FREE); - dnode_slots_rele(children, idx, slots); -} - -void -dnode_special_close(dnode_handle_t *dnh) -{ - dnode_t *dn = dnh->dnh_dnode; - - /* - * Wait for final references to the dnode to clear. This can - * only happen if the arc is asynchronously evicting state that - * has a hold on this dnode while we are trying to evict this - * dnode. 
- */ - while (zfs_refcount_count(&dn->dn_holds) > 0) - delay(1); - ASSERT(dn->dn_dbuf == NULL || - dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); - zrl_add(&dnh->dnh_zrlock); - dnode_destroy(dn); /* implicit zrl_remove() */ - zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = NULL; -} - -void -dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, - dnode_handle_t *dnh) -{ - dnode_t *dn; - - zrl_init(&dnh->dnh_zrlock); - zrl_tryenter(&dnh->dnh_zrlock); - - dn = dnode_create(os, dnp, NULL, object, dnh); - DNODE_VERIFY(dn); - - zrl_exit(&dnh->dnh_zrlock); -} - -static void -dnode_buf_evict_async(void *dbu) -{ - dnode_children_t *dnc = dbu; - - DNODE_STAT_BUMP(dnode_buf_evict); - - for (int i = 0; i < dnc->dnc_count; i++) { - dnode_handle_t *dnh = &dnc->dnc_children[i]; - dnode_t *dn; - - /* - * The dnode handle lock guards against the dnode moving to - * another valid address, so there is no need here to guard - * against changes to or from NULL. - */ - if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = DN_SLOT_UNINIT; - continue; - } - - zrl_add(&dnh->dnh_zrlock); - dn = dnh->dnh_dnode; - /* - * If there are holds on this dnode, then there should - * be holds on the dnode's containing dbuf as well; thus - * it wouldn't be eligible for eviction and this function - * would not have been called. - */ - ASSERT(zfs_refcount_is_zero(&dn->dn_holds)); - ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); - - dnode_destroy(dn); /* implicit zrl_remove() for first slot */ - zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = DN_SLOT_UNINIT; - } - kmem_free(dnc, sizeof (dnode_children_t) + - dnc->dnc_count * sizeof (dnode_handle_t)); -} - -/* - * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used - * to ensure the hole at the specified object offset is large enough to - * hold the dnode being created. The slots parameter is also used to ensure - * a dnode does not span multiple dnode blocks. 
In both of these cases, if - * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases - * are only possible when using DNODE_MUST_BE_FREE. - * - * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. - * dnode_hold_impl() will check if the requested dnode is already consumed - * as an extra dnode slot by an large dnode, in which case it returns - * ENOENT. - * - * errors: - * EINVAL - invalid object number or flags. - * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) - * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) - * - Refers to a freeing dnode (DNODE_MUST_BE_FREE) - * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) - * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) - * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED) - * EIO - i/o error error when reading the meta dnode dbuf. - * succeeds even for free dnodes. - */ -int -dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, - void *tag, dnode_t **dnp) -{ - int epb, idx, err, i; - int drop_struct_lock = FALSE; - int type; - uint64_t blk; - dnode_t *mdn, *dn; - dmu_buf_impl_t *db; - dnode_children_t *dnc; - dnode_phys_t *dn_block; - dnode_phys_t *dn_block_begin; - dnode_handle_t *dnh; - - ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); - ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); - - /* - * If you are holding the spa config lock as writer, you shouldn't - * be asking the DMU to do *anything* unless it's the root pool - * which may require us to read from the root filesystem while - * holding some (not all) of the locks as writer. 
- */ - ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || - (spa_is_root(os->os_spa) && - spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); - - ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); - - if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { - dn = (object == DMU_USERUSED_OBJECT) ? - DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); - if (dn == NULL) - return (SET_ERROR(ENOENT)); - type = dn->dn_type; - if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) - return (SET_ERROR(ENOENT)); - if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) - return (SET_ERROR(EEXIST)); - DNODE_VERIFY(dn); - (void) zfs_refcount_add(&dn->dn_holds, tag); - *dnp = dn; - return (0); - } - - if (object == 0 || object >= DN_MAX_OBJECT) - return (SET_ERROR(EINVAL)); - - mdn = DMU_META_DNODE(os); - ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); - - DNODE_VERIFY(mdn); - - if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { - rw_enter(&mdn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); - - db = dbuf_hold(mdn, blk, FTAG); - if (drop_struct_lock) - rw_exit(&mdn->dn_struct_rwlock); - if (db == NULL) { - DNODE_STAT_BUMP(dnode_hold_dbuf_hold); - return (SET_ERROR(EIO)); - } - err = dbuf_read(db, NULL, DB_RF_CANFAIL); - if (err) { - DNODE_STAT_BUMP(dnode_hold_dbuf_read); - dbuf_rele(db, FTAG); - return (err); - } - - ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; - - idx = object & (epb - 1); - dn_block = (dnode_phys_t *)db->db.db_data; - - ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - dnc = dmu_buf_get_user(&db->db); - dnh = NULL; - if (dnc == NULL) { - dnode_children_t *winner; - int skip = 0; - - dnc = kmem_zalloc(sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t), KM_SLEEP); - dnc->dnc_count = epb; - dnh = &dnc->dnc_children[0]; - - /* Initialize dnode slot status from dnode_phys_t */ - for (int i = 0; i < epb; i++) { - 
zrl_init(&dnh[i].dnh_zrlock); - - if (skip) { - skip--; - continue; - } - - if (dn_block[i].dn_type != DMU_OT_NONE) { - int interior = dn_block[i].dn_extra_slots; - - dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); - dnode_set_slots(dnc, i + 1, interior, - DN_SLOT_INTERIOR); - skip = interior; - } else { - dnh[i].dnh_dnode = DN_SLOT_FREE; - skip = 0; - } - } - - dmu_buf_init_user(&dnc->dnc_dbu, NULL, - dnode_buf_evict_async, NULL); - winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu); - if (winner != NULL) { - - for (int i = 0; i < epb; i++) - zrl_destroy(&dnh[i].dnh_zrlock); - - kmem_free(dnc, sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t)); - dnc = winner; - } - } - - ASSERT(dnc->dnc_count == epb); - dn = DN_SLOT_UNINIT; - - if (flag & DNODE_MUST_BE_ALLOCATED) { - slots = 1; - - while (dn == DN_SLOT_UNINIT) { - dnode_slots_hold(dnc, idx, slots); - dnh = &dnc->dnc_children[idx]; - - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - dn = dnh->dnh_dnode; - break; - } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { - DNODE_STAT_BUMP(dnode_hold_alloc_interior); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(EEXIST)); - } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { - DNODE_STAT_BUMP(dnode_hold_alloc_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOENT)); - } - - dnode_slots_rele(dnc, idx, slots); - if (!dnode_slots_tryenter(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - continue; - } - - /* - * Someone else won the race and called dnode_create() - * after we checked DN_SLOT_IS_PTR() above but before - * we acquired the lock. 
- */ - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); - dn = dnh->dnh_dnode; - } else { - dn = dnode_create(os, dn_block + idx, db, - object, dnh); - } - } - - mutex_enter(&dn->dn_mtx); - if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { - DNODE_STAT_BUMP(dnode_hold_alloc_type_none); - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOENT)); - } - - DNODE_STAT_BUMP(dnode_hold_alloc_hits); - } else if (flag & DNODE_MUST_BE_FREE) { - - if (idx + slots - 1 >= DNODES_PER_BLOCK) { - DNODE_STAT_BUMP(dnode_hold_free_overflow); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - while (dn == DN_SLOT_UNINIT) { - dnode_slots_hold(dnc, idx, slots); - - if (!dnode_check_slots_free(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - dnode_slots_rele(dnc, idx, slots); - if (!dnode_slots_tryenter(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - continue; - } - - if (!dnode_check_slots_free(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_lock_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - /* - * Allocated but otherwise free dnodes which would - * be in the interior of a multi-slot dnodes need - * to be freed. Single slot dnodes can be safely - * re-purposed as a performance optimization. 
- */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); - - dnh = &dnc->dnc_children[idx]; - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - dn = dnh->dnh_dnode; - } else { - dn = dnode_create(os, dn_block + idx, db, - object, dnh); - } - } - - mutex_enter(&dn->dn_mtx); - if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_refcount); - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(EEXIST)); - } - - dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); - DNODE_STAT_BUMP(dnode_hold_free_hits); - } else { - dbuf_rele(db, FTAG); - return (SET_ERROR(EINVAL)); - } - - if (dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_txg); - type = dn->dn_type; - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? - ENOENT : EEXIST)); - } - - if (zfs_refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dnh); - - mutex_exit(&dn->dn_mtx); - - /* Now we can rely on the hold to prevent the dnode from moving. */ - dnode_slots_rele(dnc, idx, slots); - - DNODE_VERIFY(dn); - ASSERT3P(dn->dn_dbuf, ==, db); - ASSERT3U(dn->dn_object, ==, object); - dbuf_rele(db, FTAG); - - *dnp = dn; - return (0); -} - -/* - * Return held dnode if the object is allocated, NULL if not. - */ -int -dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) -{ - return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, - dnp)); -} - -/* - * Can only add a reference if there is already at least one - * reference on the dnode. Returns FALSE if unable to add a - * new reference. 
- */ -boolean_t -dnode_add_ref(dnode_t *dn, void *tag) -{ - mutex_enter(&dn->dn_mtx); - if (zfs_refcount_is_zero(&dn->dn_holds)) { - mutex_exit(&dn->dn_mtx); - return (FALSE); - } - VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); - mutex_exit(&dn->dn_mtx); - return (TRUE); -} - -void -dnode_rele(dnode_t *dn, void *tag) -{ - mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, tag, B_FALSE); -} - -void -dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) -{ - uint64_t refs; - /* Get while the hold prevents the dnode from moving. */ - dmu_buf_impl_t *db = dn->dn_dbuf; - dnode_handle_t *dnh = dn->dn_handle; - - refs = zfs_refcount_remove(&dn->dn_holds, tag); - mutex_exit(&dn->dn_mtx); - - /* - * It's unsafe to release the last hold on a dnode by dnode_rele() or - * indirectly by dbuf_rele() while relying on the dnode handle to - * prevent the dnode from moving, since releasing the last hold could - * result in the dnode's parent dbuf evicting its dnode handles. For - * that reason anyone calling dnode_rele() or dbuf_rele() without some - * other direct or indirect hold on the dnode must first drop the dnode - * handle. - */ - ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); - - /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ - if (refs == 0 && db != NULL) { - /* - * Another thread could add a hold to the dnode handle in - * dnode_hold_impl() while holding the parent dbuf. Since the - * hold on the parent dbuf prevents the handle from being - * destroyed, the hold on the handle is OK. We can't yet assert - * that the handle has zero references, but that will be - * asserted anyway when the handle gets destroyed. 
- */ - mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, dnh, evicting); - } -} - -void -dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) -{ - objset_t *os = dn->dn_objset; - uint64_t txg = tx->tx_txg; - - if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - dsl_dataset_dirty(os->os_dsl_dataset, tx); - return; - } - - DNODE_VERIFY(dn); - -#ifdef ZFS_DEBUG - mutex_enter(&dn->dn_mtx); - ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); - ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); - mutex_exit(&dn->dn_mtx); -#endif - - /* - * Determine old uid/gid when necessary - */ - dmu_objset_userquota_get_ids(dn, B_TRUE, tx); - - multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; - multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); - - /* - * If we are already marked dirty, we're done. - */ - if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { - multilist_sublist_unlock(mls); - return; - } - - ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) || - !avl_is_empty(&dn->dn_dbufs)); - ASSERT(dn->dn_datablksz != 0); - ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); - ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); - ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); - - dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", - dn->dn_object, txg); - - multilist_sublist_insert_head(mls, dn); - - multilist_sublist_unlock(mls); - - /* - * The dnode maintains a hold on its containing dbuf as - * long as there are holds on it. Each instantiated child - * dbuf maintains a hold on the dnode. When the last child - * drops its hold, the dnode will drop its hold on the - * containing dbuf. We add a "dirty hold" here so that the - * dnode will hang around after we finish processing its - * children. 
- */ - VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); - - (void) dbuf_dirty(dn->dn_dbuf, tx); - - dsl_dataset_dirty(os->os_dsl_dataset, tx); -} - -void -dnode_free(dnode_t *dn, dmu_tx_t *tx) -{ - mutex_enter(&dn->dn_mtx); - if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { - mutex_exit(&dn->dn_mtx); - return; - } - dn->dn_free_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - - dnode_setdirty(dn, tx); -} - -/* - * Try to change the block size for the indicated dnode. This can only - * succeed if there are no blocks allocated or dirty beyond first block - */ -int -dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - int err; - - ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); - if (size == 0) - size = SPA_MINBLOCKSIZE; - else - size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); - - if (ibs == dn->dn_indblkshift) - ibs = 0; - - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) - return (0); - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - - /* Check for any allocated blocks beyond the first */ - if (dn->dn_maxblkid != 0) - goto fail; - - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; - db = AVL_NEXT(&dn->dn_dbufs, db)) { - if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_exit(&dn->dn_dbufs_mtx); - goto fail; - } - } - mutex_exit(&dn->dn_dbufs_mtx); - - if (ibs && dn->dn_nlevels != 1) - goto fail; - - /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) - dbuf_new_size(db, size, tx); - else if (err != ENOENT) - goto fail; - - dnode_setdblksz(dn, size); - dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; - if (ibs) { - dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; - } - /* rele after we have fixed the blocksize in the dnode */ - if (db) - dbuf_rele(db, FTAG); - - rw_exit(&dn->dn_struct_rwlock); - 
return (0); - -fail: - rw_exit(&dn->dn_struct_rwlock); - return (SET_ERROR(ENOTSUP)); -} - -/* read-holding callers must not rely on the lock being continuously held */ -void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) -{ - uint64_t txgoff = tx->tx_txg & TXG_MASK; - int epbs, new_nlevels; - uint64_t sz; - - ASSERT(blkid != DMU_BONUS_BLKID); - - ASSERT(have_read ? - RW_READ_HELD(&dn->dn_struct_rwlock) : - RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * if we have a read-lock, check to see if we need to do any work - * before upgrading to a write-lock. - */ - if (have_read) { - if (blkid <= dn->dn_maxblkid) - return; - - if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { - rw_exit(&dn->dn_struct_rwlock); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - } - } - - if (blkid <= dn->dn_maxblkid) - goto out; - - dn->dn_maxblkid = blkid; - - /* - * Compute the number of levels necessary to support the new maxblkid. - */ - new_nlevels = 1; - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (sz = dn->dn_nblkptr; - sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) - new_nlevels++; - - if (new_nlevels > dn->dn_nlevels) { - int old_nlevels = dn->dn_nlevels; - dmu_buf_impl_t *db; - list_t *list; - dbuf_dirty_record_t *new, *dr, *dr_next; - - dn->dn_nlevels = new_nlevels; - - ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); - dn->dn_next_nlevels[txgoff] = new_nlevels; - - /* dirty the left indirects */ - db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); - ASSERT(db != NULL); - new = dbuf_dirty(db, tx); - dbuf_rele(db, FTAG); - - /* transfer the dirty records to the new indirect */ - mutex_enter(&dn->dn_mtx); - mutex_enter(&new->dt.di.dr_mtx); - list = &dn->dn_dirty_records[txgoff]; - for (dr = list_head(list); dr; dr = dr_next) { - dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - 
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); - list_remove(&dn->dn_dirty_records[txgoff], dr); - list_insert_tail(&new->dt.di.dr_children, dr); - dr->dr_parent = new; - } - } - mutex_exit(&new->dt.di.dr_mtx); - mutex_exit(&dn->dn_mtx); - } - -out: - if (have_read) - rw_downgrade(&dn->dn_struct_rwlock); -} - -static void -dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); - if (db != NULL) { - dmu_buf_will_dirty(&db->db, tx); - dbuf_rele(db, FTAG); - } -} - -/* - * Dirty all the in-core level-1 dbufs in the range specified by start_blkid - * and end_blkid. - */ -static void -dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, - dmu_tx_t *tx) -{ - dmu_buf_impl_t db_search; - dmu_buf_impl_t *db; - avl_index_t where; - - mutex_enter(&dn->dn_dbufs_mtx); - - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; - for (;;) { - - db = avl_find(&dn->dn_dbufs, &db_search, &where); - if (db == NULL) - db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - - if (db == NULL || db->db_level != 1 || - db->db_blkid >= end_blkid) { - break; - } - - /* - * Setup the next blkid we want to search for. - */ - db_search.db_blkid = db->db_blkid + 1; - ASSERT3U(db->db_blkid, >=, start_blkid); - - /* - * If the dbuf transitions to DB_EVICTING while we're trying - * to dirty it, then we will be unable to discover it in - * the dbuf hash table. This will result in a call to - * dbuf_create() which needs to acquire the dn_dbufs_mtx - * lock. To avoid a deadlock, we drop the lock before - * dirtying the level-1 dbuf. - */ - mutex_exit(&dn->dn_dbufs_mtx); - dnode_dirty_l1(dn, db->db_blkid, tx); - mutex_enter(&dn->dn_dbufs_mtx); - } - -#ifdef ZFS_DEBUG - /* - * Walk all the in-core level-1 dbufs and verify they have been dirtied. 
- */ - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; - db = avl_find(&dn->dn_dbufs, &db_search, &where); - if (db == NULL) - db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { - if (db->db_level != 1 || db->db_blkid >= end_blkid) - break; - ASSERT(db->db_dirtycnt > 0); - } -#endif - mutex_exit(&dn->dn_dbufs_mtx); -} - -void -dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - uint64_t blkoff, blkid, nblks; - int blksz, blkshift, head, tail; - int trunc = FALSE; - int epbs; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - blksz = dn->dn_datablksz; - blkshift = dn->dn_datablkshift; - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - if (len == DMU_OBJECT_END) { - len = UINT64_MAX - off; - trunc = TRUE; - } - - /* - * First, block align the region to free: - */ - if (ISP2(blksz)) { - head = P2NPHASE(off, blksz); - blkoff = P2PHASE(off, blksz); - if ((off >> blkshift) > dn->dn_maxblkid) - goto out; - } else { - ASSERT(dn->dn_maxblkid == 0); - if (off == 0 && len >= blksz) { - /* - * Freeing the whole block; fast-track this request. - */ - blkid = 0; - nblks = 1; - if (dn->dn_nlevels > 1) - dnode_dirty_l1(dn, 0, tx); - goto done; - } else if (off >= blksz) { - /* Freeing past end-of-data */ - goto out; - } else { - /* Freeing part of the block. 
*/ - head = blksz - off; - ASSERT3U(head, >, 0); - } - blkoff = off; - } - /* zero out any partial block data at the start of the range */ - if (head) { - ASSERT3U(blkoff + head, ==, blksz); - if (len < head) - head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), - TRUE, FALSE, FTAG, &db) == 0) { - caddr_t data; - - /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - data = db->db.db_data; - bzero(data + blkoff, head); - } - dbuf_rele(db, FTAG); - } - off += head; - len -= head; - } - - /* If the range was less than one block, we're done */ - if (len == 0) - goto out; - - /* If the remaining range is past end of file, we're done */ - if ((off >> blkshift) > dn->dn_maxblkid) - goto out; - - ASSERT(ISP2(blksz)); - if (trunc) - tail = 0; - else - tail = P2PHASE(len, blksz); - - ASSERT0(P2PHASE(off, blksz)); - /* zero out any partial block data at the end of the range */ - if (tail) { - if (len < tail) - tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), - TRUE, FALSE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); - } - len -= tail; - } - - /* If the range did not include a full block, we are done */ - if (len == 0) - goto out; - - ASSERT(IS_P2ALIGNED(off, blksz)); - ASSERT(trunc || IS_P2ALIGNED(len, blksz)); - blkid = off >> blkshift; - nblks = len >> blkshift; - if (trunc) - nblks += 1; - - /* - * Dirty all the indirect blocks in this range. 
Note that only - * the first and last indirect blocks can actually be written - * (if they were partially freed) -- they must be dirtied, even if - * they do not exist on disk yet. The interior blocks will - * be freed by free_children(), so they will not actually be written. - * Even though these interior blocks will not be written, we - * dirty them for two reasons: - * - * - It ensures that the indirect blocks remain in memory until - * syncing context. (They have already been prefetched by - * dmu_tx_hold_free(), so we don't have to worry about reading - * them serially here.) - * - * - The dirty space accounting will put pressure on the txg sync - * mechanism to begin syncing, and to delay transactions if there - * is a large amount of freeing. Even though these indirect - * blocks will not be written, we could need to write the same - * amount of space if we copy the freed BPs into deadlists. - */ - if (dn->dn_nlevels > 1) { - uint64_t first, last; - - first = blkid >> epbs; - dnode_dirty_l1(dn, first, tx); - if (trunc) - last = dn->dn_maxblkid >> epbs; - else - last = (blkid + nblks - 1) >> epbs; - if (last != first) - dnode_dirty_l1(dn, last, tx); - - dnode_dirty_l1range(dn, first, last, tx); - - int shift = dn->dn_datablkshift + dn->dn_indblkshift - - SPA_BLKPTRSHIFT; - for (uint64_t i = first + 1; i < last; i++) { - /* - * Set i to the blockid of the next non-hole - * level-1 indirect block at or after i. Note - * that dnode_next_offset() operates in terms of - * level-0-equivalent bytes. - */ - uint64_t ibyte = i << shift; - int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, - &ibyte, 2, 1, 0); - i = ibyte >> shift; - if (i >= last) - break; - - /* - * Normally we should not see an error, either - * from dnode_next_offset() or dbuf_hold_level() - * (except for ESRCH from dnode_next_offset). 
- * If there is an i/o error, then when we read - * this block in syncing context, it will use - * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according - * to the "failmode" property. dnode_next_offset() - * doesn't have a flag to indicate MUSTSUCCEED. - */ - if (err != 0) - break; - - dnode_dirty_l1(dn, i, tx); - } - } - -done: - /* - * Add this range to the dnode range list. - * We will finish up this free operation in the syncing phase. - */ - mutex_enter(&dn->dn_mtx); - int txgoff = tx->tx_txg & TXG_MASK; - if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); - } - range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); - range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); - dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", - blkid, nblks, tx->tx_txg); - mutex_exit(&dn->dn_mtx); - - dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); - dnode_setdirty(dn, tx); -out: - - rw_exit(&dn->dn_struct_rwlock); -} - -static boolean_t -dnode_spill_freed(dnode_t *dn) -{ - int i; - - mutex_enter(&dn->dn_mtx); - for (i = 0; i < TXG_SIZE; i++) { - if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) - break; - } - mutex_exit(&dn->dn_mtx); - return (i < TXG_SIZE); -} - -/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ -uint64_t -dnode_block_freed(dnode_t *dn, uint64_t blkid) -{ - void *dp = spa_get_dsl(dn->dn_objset->os_spa); - int i; - - if (blkid == DMU_BONUS_BLKID) - return (FALSE); - - /* - * If we're in the process of opening the pool, dp will not be - * set yet, but there shouldn't be anything dirty. 
- */ - if (dp == NULL) - return (FALSE); - - if (dn->dn_free_txg) - return (TRUE); - - if (blkid == DMU_SPILL_BLKID) - return (dnode_spill_freed(dn)); - - mutex_enter(&dn->dn_mtx); - for (i = 0; i < TXG_SIZE; i++) { - if (dn->dn_free_ranges[i] != NULL && - range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) - break; - } - mutex_exit(&dn->dn_mtx); - return (i < TXG_SIZE); -} - -/* call from syncing context when we actually write/free space for this dnode */ -void -dnode_diduse_space(dnode_t *dn, int64_t delta) -{ - uint64_t space; - dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", - dn, dn->dn_phys, - (u_longlong_t)dn->dn_phys->dn_used, - (longlong_t)delta); - - mutex_enter(&dn->dn_mtx); - space = DN_USED_BYTES(dn->dn_phys); - if (delta > 0) { - ASSERT3U(space + delta, >=, space); /* no overflow */ - } else { - ASSERT3U(space, >=, -delta); /* no underflow */ - } - space += delta; - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { - ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); - ASSERT0(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; - } else { - dn->dn_phys->dn_used = space; - dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; - } - mutex_exit(&dn->dn_mtx); -} - -/* - * Scans a block at the indicated "level" looking for a hole or data, - * depending on 'flags'. - * - * If level > 0, then we are scanning an indirect block looking at its - * pointers. If level == 0, then we are looking at a block of dnodes. - * - * If we don't find what we are looking for in the block, we return ESRCH. - * Otherwise, return with *offset pointing to the beginning (if searching - * forwards) or end (if searching backwards) of the range covered by the - * block pointer we matched on (or dnode). 
- * - * The basic search algorithm used below by dnode_next_offset() is to - * use this function to search up the block tree (widen the search) until - * we find something (i.e., we don't return ESRCH) and then search back - * down the tree (narrow the search) until we reach our original search - * level. - */ -static int -dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) -{ - dmu_buf_impl_t *db = NULL; - void *data = NULL; - uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - uint64_t epb = 1ULL << epbs; - uint64_t minfill, maxfill; - boolean_t hole; - int i, inc, error, span; - - dprintf("probing object %llu offset %llx level %d of %u\n", - dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - - hole = ((flags & DNODE_FIND_HOLE) != 0); - inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; - ASSERT(txg == 0 || !hole); - - if (lvl == dn->dn_phys->dn_nlevels) { - error = 0; - epb = dn->dn_phys->dn_nblkptr; - data = dn->dn_phys->dn_blkptr; - } else { - uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); - if (error) { - if (error != ENOENT) - return (error); - if (hole) - return (0); - /* - * This can only happen when we are searching up - * the block tree for data. We don't really need to - * adjust the offset, as we will just end up looking - * at the pointer to this block in its parent, and its - * going to be unallocated, so we will skip over it. - */ - return (SET_ERROR(ESRCH)); - } - error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); - if (error) { - dbuf_rele(db, FTAG); - return (error); - } - data = db->db.db_data; - } - - - if (db != NULL && txg != 0 && (db->db_blkptr == NULL || - db->db_blkptr->blk_birth <= txg || - BP_IS_HOLE(db->db_blkptr))) { - /* - * This can only happen when we are searching up the tree - * and these conditions mean that we need to keep climbing. 
- */ - error = SET_ERROR(ESRCH); - } else if (lvl == 0) { - dnode_phys_t *dnp = data; - - ASSERT(dn->dn_type == DMU_OT_DNODE); - ASSERT(!(flags & DNODE_FIND_BACKWARDS)); - - for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); - i < blkfill; i += dnp[i].dn_extra_slots + 1) { - if ((dnp[i].dn_type == DMU_OT_NONE) == hole) - break; - } - - if (i == blkfill) - error = SET_ERROR(ESRCH); - - *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + - (i << DNODE_SHIFT); - } else { - blkptr_t *bp = data; - uint64_t start = *offset; - span = (lvl - 1) * epbs + dn->dn_datablkshift; - minfill = 0; - maxfill = blkfill << ((lvl - 1) * epbs); - - if (hole) - maxfill--; - else - minfill++; - - *offset = *offset >> span; - for (i = BF64_GET(*offset, 0, epbs); - i >= 0 && i < epb; i += inc) { - if (BP_GET_FILL(&bp[i]) >= minfill && - BP_GET_FILL(&bp[i]) <= maxfill && - (hole || bp[i].blk_birth > txg)) - break; - if (inc > 0 || *offset > 0) - *offset += inc; - } - *offset = *offset << span; - if (inc < 0) { - /* traversing backwards; position offset at the end */ - ASSERT3U(*offset, <=, start); - *offset = MIN(*offset + (1ULL << span) - 1, start); - } else if (*offset < start) { - *offset = start; - } - if (i < 0 || i >= epb) - error = SET_ERROR(ESRCH); - } - - if (db) - dbuf_rele(db, FTAG); - - return (error); -} - -/* - * Find the next hole, data, or sparse region at or after *offset. - * The value 'blkfill' tells us how many items we expect to find - * in an L0 data block; this value is 1 for normal objects, - * DNODES_PER_BLOCK for the meta dnode, and some fraction of - * DNODES_PER_BLOCK when searching for sparse regions thereof. - * - * Examples: - * - * dnode_next_offset(dn, flags, offset, 1, 1, 0); - * Finds the next/previous hole/data in a file. - * Used in dmu_offset_next(). - * - * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); - * Finds the next free/allocated dnode an objset's meta-dnode. - * Only finds objects that have new contents since txg (ie. 
- * bonus buffer changes and content removal are ignored). - * Used in dmu_object_next(). - * - * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); - * Finds the next L2 meta-dnode bp that's at most 1/4 full. - * Used in dmu_object_alloc(). - */ -int -dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - int minlvl, uint64_t blkfill, uint64_t txg) -{ - uint64_t initial_offset = *offset; - int lvl, maxlvl; - int error = 0; - - if (!(flags & DNODE_FIND_HAVELOCK)) - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - if (dn->dn_phys->dn_nlevels == 0) { - error = SET_ERROR(ESRCH); - goto out; - } - - if (dn->dn_datablkshift == 0) { - if (*offset < dn->dn_datablksz) { - if (flags & DNODE_FIND_HOLE) - *offset = dn->dn_datablksz; - } else { - error = SET_ERROR(ESRCH); - } - goto out; - } - - maxlvl = dn->dn_phys->dn_nlevels; - - for (lvl = minlvl; lvl <= maxlvl; lvl++) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); - if (error != ESRCH) - break; - } - - while (error == 0 && --lvl >= minlvl) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); - } - - /* - * There's always a "virtual hole" at the end of the object, even - * if all BP's which physically exist are non-holes. - */ - if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && - minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { - error = 0; - } - - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? 
- initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); -out: - if (!(flags & DNODE_FIND_HAVELOCK)) - rw_exit(&dn->dn_struct_rwlock); - - return (error); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c deleted file mode 100644 index 9283356608aa..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ /dev/null @@ -1,779 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static void -dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - int txgoff = tx->tx_txg & TXG_MASK; - int nblkptr = dn->dn_phys->dn_nblkptr; - int old_toplvl = dn->dn_phys->dn_nlevels - 1; - int new_level = dn->dn_next_nlevels[txgoff]; - int i; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - - /* this dnode can't be paged out because it's dirty */ - ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); - - db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); - ASSERT(db != NULL); - - dn->dn_phys->dn_nlevels = new_level; - dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, - dn->dn_object, dn->dn_phys->dn_nlevels); - - /* transfer dnode's block pointers to new indirect block */ - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); - ASSERT(db->db.db_data); - ASSERT(arc_released(db->db_buf)); - ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, - sizeof (blkptr_t) * nblkptr); - arc_buf_freeze(db->db_buf); - - /* set dbuf's parent pointers to new indirect buf */ - for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); - - if (child == NULL) - continue; -#ifdef DEBUG - DB_DNODE_ENTER(child); - ASSERT3P(DB_DNODE(child), ==, dn); - DB_DNODE_EXIT(child); -#endif /* DEBUG */ - if (child->db_parent && child->db_parent != dn->dn_dbuf) { - ASSERT(child->db_parent->db_level == db->db_level); - ASSERT(child->db_blkptr != - &dn->dn_phys->dn_blkptr[child->db_blkid]); - mutex_exit(&child->db_mtx); - continue; - } - ASSERT(child->db_parent == NULL || - child->db_parent == dn->dn_dbuf); - - child->db_parent = db; - dbuf_add_ref(db, child); - if (db->db.db_data) - child->db_blkptr = (blkptr_t *)db->db.db_data 
+ i; - else - child->db_blkptr = NULL; - dprintf_dbuf_bp(child, child->db_blkptr, - "changed db_blkptr to new indirect %s", ""); - - mutex_exit(&child->db_mtx); - } - - bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); - - dbuf_rele(db, FTAG); - - rw_exit(&dn->dn_struct_rwlock); -} - -static void -free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint64_t bytesfreed = 0; - - dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); - - for (int i = 0; i < num; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - - bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); - ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - - /* - * Save some useful information on the holes being - * punched, including logical size, type, and indirection - * level. Retaining birth time enables detection of when - * holes are punched for reducing the number of free - * records transmitted during a zfs send. - */ - - uint64_t lsize = BP_GET_LSIZE(bp); - dmu_object_type_t type = BP_GET_TYPE(bp); - uint64_t lvl = BP_GET_LEVEL(bp); - - bzero(bp, sizeof (blkptr_t)); - - if (spa_feature_is_active(dn->dn_objset->os_spa, - SPA_FEATURE_HOLE_BIRTH)) { - BP_SET_LSIZE(bp, lsize); - BP_SET_TYPE(bp, type); - BP_SET_LEVEL(bp, lvl); - BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); - } - } - dnode_diduse_space(dn, -bytesfreed); -} - -#ifdef ZFS_DEBUG -static void -free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) -{ - int off, num; - int i, err, epbs; - uint64_t txg = tx->tx_txg; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - off = start - (db->db_blkid * 1<=, 0); - ASSERT3U(num, >=, 0); - ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); - ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); - ASSERT(db->db_blkptr != NULL); - - for (i = off; i < off+num; i++) { - uint64_t *buf; 
- dmu_buf_impl_t *child; - dbuf_dirty_record_t *dr; - int j; - - ASSERT(db->db_level == 1); - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); - rw_exit(&dn->dn_struct_rwlock); - if (err == ENOENT) - continue; - ASSERT(err == 0); - ASSERT(child->db_level == 0); - dr = child->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; - ASSERT(dr == NULL || dr->dr_txg == txg); - - /* data_old better be zeroed */ - if (dr) { - buf = dr->dt.dl.dr_data->b_data; - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } - } - } - - /* - * db_data better be zeroed unless it's dirty in a - * future txg. - */ - mutex_enter(&child->db_mtx); - buf = child->db.db_data; - if (buf != NULL && child->db_state != DB_FILL && - child->db_last_dirty == NULL) { - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } - } - } - mutex_exit(&child->db_mtx); - - dbuf_rele(child, FTAG); - } - DB_DNODE_EXIT(db); -} -#endif - -/* - * We don't usually free the indirect blocks here. If in one txg we have a - * free_range and a write to the same indirect block, it's important that we - * preserve the hole's birth times. Therefore, we don't free any any indirect - * blocks in free_children(). If an indirect block happens to turn into all - * holes, it will be freed by dbuf_write_children_ready, which happens at a - * point in the syncing process where we know for certain the contents of the - * indirect block. - * - * However, if we're freeing a dnode, its space accounting must go to zero - * before we actually try to free the dnode, or we will trip an assertion. In - * addition, we know the case described above cannot occur, because the dnode is - * being freed. 
Therefore, we free the indirect blocks immediately in that - * case. - */ -static void -free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, - boolean_t free_indirects, dmu_tx_t *tx) -{ - dnode_t *dn; - blkptr_t *bp; - dmu_buf_impl_t *subdb; - uint64_t start, end, dbstart, dbend; - unsigned int epbs, shift, i; - - /* - * There is a small possibility that this block will not be cached: - * 1 - if level > 1 and there are no children with level <= 1 - * 2 - if this block was evicted since we read it from - * dmu_tx_hold_free(). - */ - if (db->db_state != DB_CACHED) - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - - /* - * If we modify this indirect block, and we are not freeing the - * dnode (!free_indirects), then this indirect block needs to get - * written to disk by dbuf_write(). If it is dirty, we know it will - * be written (otherwise, we would have incorrect on-disk state - * because the space would be freed but still referenced by the BP - * in this indirect block). Therefore we VERIFY that it is - * dirty. - * - * Our VERIFY covers some cases that do not actually have to be - * dirty, but the open-context code happens to dirty. E.g. if the - * blocks we are freeing are all holes, because in that case, we - * are only freeing part of this indirect block, so it is an - * ancestor of the first or last block to be freed. The first and - * last L1 indirect blocks are always dirtied by dnode_free_range(). 
- */ - VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); - - dbuf_release_bp(db); - bp = db->db.db_data; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(epbs, <, 31); - shift = (db->db_level - 1) * epbs; - dbstart = db->db_blkid << epbs; - start = blkid >> shift; - if (dbstart < start) { - bp += start - dbstart; - } else { - start = dbstart; - } - dbend = ((db->db_blkid + 1) << epbs) - 1; - end = (blkid + nblks - 1) >> shift; - if (dbend <= end) - end = dbend; - - ASSERT3U(start, <=, end); - - if (db->db_level == 1) { - FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); - } else { - for (uint64_t id = start; id <= end; id++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, - id, TRUE, FALSE, FTAG, &subdb)); - rw_exit(&dn->dn_struct_rwlock); - ASSERT3P(bp, ==, subdb->db_blkptr); - - free_children(subdb, blkid, nblks, free_indirects, tx); - dbuf_rele(subdb, FTAG); - } - } - - if (free_indirects) { - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) - ASSERT(BP_IS_HOLE(bp)); - bzero(db->db.db_data, db->db.db_size); - free_blocks(dn, db->db_blkptr, 1, tx); - } - - DB_DNODE_EXIT(db); - arc_buf_freeze(db->db_buf); -} - -/* - * Traverse the indicated range of the provided file - * and "free" all the blocks contained there. 
- */ -static void -dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, - boolean_t free_indirects, dmu_tx_t *tx) -{ - blkptr_t *bp = dn->dn_phys->dn_blkptr; - int dnlevel = dn->dn_phys->dn_nlevels; - boolean_t trunc = B_FALSE; - - if (blkid > dn->dn_phys->dn_maxblkid) - return; - - ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); - if (blkid + nblks > dn->dn_phys->dn_maxblkid) { - nblks = dn->dn_phys->dn_maxblkid - blkid + 1; - trunc = B_TRUE; - } - - /* There are no indirect blocks in the object */ - if (dnlevel == 1) { - if (blkid >= dn->dn_phys->dn_nblkptr) { - /* this range was never made persistent */ - return; - } - ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - free_blocks(dn, bp + blkid, nblks, tx); - } else { - int shift = (dnlevel - 1) * - (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); - int start = blkid >> shift; - int end = (blkid + nblks - 1) >> shift; - dmu_buf_impl_t *db; - - ASSERT(start < dn->dn_phys->dn_nblkptr); - bp += start; - for (int i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, - TRUE, FALSE, FTAG, &db)); - rw_exit(&dn->dn_struct_rwlock); - - free_children(db, blkid, nblks, free_indirects, tx); - dbuf_rele(db, FTAG); - } - } - - if (trunc) { - dn->dn_phys->dn_maxblkid = blkid == 0 ? 
0 : blkid - 1; - - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); - ASSERT(off < dn->dn_phys->dn_maxblkid || - dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); - } -} - -typedef struct dnode_sync_free_range_arg { - dnode_t *dsfra_dnode; - dmu_tx_t *dsfra_tx; - boolean_t dsfra_free_indirects; -} dnode_sync_free_range_arg_t; - -static void -dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) -{ - dnode_sync_free_range_arg_t *dsfra = arg; - dnode_t *dn = dsfra->dsfra_dnode; - - mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, - dsfra->dsfra_free_indirects, dsfra->dsfra_tx); - mutex_enter(&dn->dn_mtx); -} - -/* - * Try to kick all the dnode's dbufs out of the cache... - */ -void -dnode_evict_dbufs(dnode_t *dn) -{ - dmu_buf_impl_t db_marker; - dmu_buf_impl_t *db, *db_next; - - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - -#ifdef DEBUG - DB_DNODE_ENTER(db); - ASSERT3P(DB_DNODE(db), ==, dn); - DB_DNODE_EXIT(db); -#endif /* DEBUG */ - - mutex_enter(&db->db_mtx); - if (db->db_state != DB_EVICTING && - zfs_refcount_is_zero(&db->db_holds)) { - db_marker.db_level = db->db_level; - db_marker.db_blkid = db->db_blkid; - db_marker.db_state = DB_SEARCH; - avl_insert_here(&dn->dn_dbufs, &db_marker, db, - AVL_BEFORE); - - /* - * We need to use the "marker" dbuf rather than - * simply getting the next dbuf, because - * dbuf_destroy() may actually remove multiple dbufs. - * It can call itself recursively on the parent dbuf, - * which may also be removed from dn_dbufs. 
The code - * flow would look like: - * - * dbuf_destroy(): - * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): - * if (!cacheable || pending_evict) - * dbuf_destroy() - */ - dbuf_destroy(db); - - db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); - avl_remove(&dn->dn_dbufs, &db_marker); - } else { - db->db_pending_evict = TRUE; - mutex_exit(&db->db_mtx); - db_next = AVL_NEXT(&dn->dn_dbufs, db); - } - } - mutex_exit(&dn->dn_dbufs_mtx); - - dnode_evict_bonus(dn); -} - -void -dnode_evict_bonus(dnode_t *dn) -{ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus != NULL) { - if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_destroy(dn->dn_bonus); - dn->dn_bonus = NULL; - } else { - dn->dn_bonus->db_pending_evict = TRUE; - } - } - rw_exit(&dn->dn_struct_rwlock); -} - -static void -dnode_undirty_dbufs(list_t *list) -{ - dbuf_dirty_record_t *dr; - - while (dr = list_head(list)) { - dmu_buf_impl_t *db = dr->dr_dbuf; - uint64_t txg = dr->dr_txg; - - if (db->db_level != 0) - dnode_undirty_dbufs(&dr->dt.di.dr_children); - - mutex_enter(&db->db_mtx); - /* XXX - use dbuf_undirty()? */ - list_remove(list, dr); - ASSERT(db->db_last_dirty == dr); - db->db_last_dirty = NULL; - db->db_dirtycnt -= 1; - if (db->db_level == 0) { - ASSERT(db->db_blkid == DMU_BONUS_BLKID || - dr->dt.dl.dr_data == db->db_buf); - dbuf_unoverride(dr); - } else { - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); - } -} - -static void -dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) -{ - int txgoff = tx->tx_txg & TXG_MASK; - - ASSERT(dmu_tx_is_syncing(tx)); - - /* - * Our contents should have been freed in dnode_sync() by the - * free range record inserted by the caller of dnode_free(). 
- */ - ASSERT0(DN_USED_BYTES(dn->dn_phys)); - ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); - - dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); - dnode_evict_dbufs(dn); - - /* - * XXX - It would be nice to assert this, but we may still - * have residual holds from async evictions from the arc... - * - * zfs_obj_to_path() also depends on this being - * commented out. - * - * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1); - */ - - /* Undirty next bits */ - dn->dn_next_nlevels[txgoff] = 0; - dn->dn_next_indblkshift[txgoff] = 0; - dn->dn_next_blksz[txgoff] = 0; - - /* ASSERT(blkptrs are zero); */ - ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(dn->dn_type != DMU_OT_NONE); - - ASSERT(dn->dn_free_txg > 0); - if (dn->dn_allocated_txg != dn->dn_free_txg) - dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); - bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); - dnode_free_interior_slots(dn); - - mutex_enter(&dn->dn_mtx); - dn->dn_type = DMU_OT_NONE; - dn->dn_maxblkid = 0; - dn->dn_allocated_txg = 0; - dn->dn_free_txg = 0; - dn->dn_have_spill = B_FALSE; - dn->dn_num_slots = 1; - mutex_exit(&dn->dn_mtx); - - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - - dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); - /* - * Now that we've released our hold, the dnode may - * be evicted, so we musn't access it. - */ -} - -/* - * Write out the dnode's dirty buffers. 
- */ -void -dnode_sync(dnode_t *dn, dmu_tx_t *tx) -{ - dnode_phys_t *dnp = dn->dn_phys; - int txgoff = tx->tx_txg & TXG_MASK; - list_t *list = &dn->dn_dirty_records[txgoff]; - static const dnode_phys_t zerodn = { 0 }; - boolean_t kill_spill = B_FALSE; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); - ASSERT(dnp->dn_type != DMU_OT_NONE || - bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); - DNODE_VERIFY(dn); - - ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); - - if (dmu_objset_userused_enabled(dn->dn_objset) && - !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - mutex_enter(&dn->dn_mtx); - dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); - dn->dn_oldflags = dn->dn_phys->dn_flags; - dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; - mutex_exit(&dn->dn_mtx); - dmu_objset_userquota_get_ids(dn, B_FALSE, tx); - } else { - /* Once we account for it, we should always account for it. */ - ASSERT(!(dn->dn_phys->dn_flags & - DNODE_FLAG_USERUSED_ACCOUNTED)); - } - - mutex_enter(&dn->dn_mtx); - if (dn->dn_allocated_txg == tx->tx_txg) { - /* The dnode is newly allocated or reallocated */ - if (dnp->dn_type == DMU_OT_NONE) { - /* this is a first alloc, not a realloc */ - dnp->dn_nlevels = 1; - dnp->dn_nblkptr = dn->dn_nblkptr; - } - - dnp->dn_type = dn->dn_type; - dnp->dn_bonustype = dn->dn_bonustype; - dnp->dn_bonuslen = dn->dn_bonuslen; - } - - dnp->dn_extra_slots = dn->dn_num_slots - 1; - - ASSERT(dnp->dn_nlevels > 1 || - BP_IS_HOLE(&dnp->dn_blkptr[0]) || - BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || - BP_GET_LSIZE(&dnp->dn_blkptr[0]) == - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); - ASSERT(dnp->dn_nlevels < 2 || - BP_IS_HOLE(&dnp->dn_blkptr[0]) || - BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); - - if (dn->dn_next_type[txgoff] != 0) { - dnp->dn_type = dn->dn_type; - dn->dn_next_type[txgoff] = 0; - } - - if (dn->dn_next_blksz[txgoff] != 0) { - ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], - SPA_MINBLOCKSIZE) == 
0); - ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - dn->dn_maxblkid == 0 || list_head(list) != NULL || - dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == - dnp->dn_datablkszsec || - !range_tree_is_empty(dn->dn_free_ranges[txgoff])); - dnp->dn_datablkszsec = - dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; - dn->dn_next_blksz[txgoff] = 0; - } - - if (dn->dn_next_bonuslen[txgoff] != 0) { - if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) - dnp->dn_bonuslen = 0; - else - dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; - ASSERT(dnp->dn_bonuslen <= - DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); - dn->dn_next_bonuslen[txgoff] = 0; - } - - if (dn->dn_next_bonustype[txgoff] != 0) { - ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); - dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; - dn->dn_next_bonustype[txgoff] = 0; - } - - boolean_t freeing_dnode = dn->dn_free_txg > 0 && - dn->dn_free_txg <= tx->tx_txg; - - /* - * Remove the spill block if we have been explicitly asked to - * remove it, or if the object is being removed. - */ - if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) - kill_spill = B_TRUE; - dn->dn_rm_spillblk[txgoff] = 0; - } - - if (dn->dn_next_indblkshift[txgoff] != 0) { - ASSERT(dnp->dn_nlevels == 1); - dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; - dn->dn_next_indblkshift[txgoff] = 0; - } - - /* - * Just take the live (open-context) values for checksum and compress. - * Strictly speaking it's a future leak, but nothing bad happens if we - * start using the new checksum or compress algorithm a little early. 
- */ - dnp->dn_checksum = dn->dn_checksum; - dnp->dn_compress = dn->dn_compress; - - mutex_exit(&dn->dn_mtx); - - if (kill_spill) { - free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); - mutex_enter(&dn->dn_mtx); - dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; - mutex_exit(&dn->dn_mtx); - } - - /* process all the "freed" ranges in the file */ - if (dn->dn_free_ranges[txgoff] != NULL) { - dnode_sync_free_range_arg_t dsfra; - dsfra.dsfra_dnode = dn; - dsfra.dsfra_tx = tx; - dsfra.dsfra_free_indirects = freeing_dnode; - if (freeing_dnode) { - ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], - 0, dn->dn_maxblkid + 1)); - } - mutex_enter(&dn->dn_mtx); - range_tree_vacate(dn->dn_free_ranges[txgoff], - dnode_sync_free_range, &dsfra); - range_tree_destroy(dn->dn_free_ranges[txgoff]); - dn->dn_free_ranges[txgoff] = NULL; - mutex_exit(&dn->dn_mtx); - } - - if (freeing_dnode) { - dn->dn_objset->os_freed_dnodes++; - dnode_sync_free(dn, tx); - return; - } - - if (dn->dn_num_slots > DNODE_MIN_SLOTS) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - mutex_enter(&ds->ds_lock); - ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] = - B_TRUE; - mutex_exit(&ds->ds_lock); - } - - if (dn->dn_next_nlevels[txgoff]) { - dnode_increase_indirection(dn, tx); - dn->dn_next_nlevels[txgoff] = 0; - } - - if (dn->dn_next_nblkptr[txgoff]) { - /* this should only happen on a realloc */ - ASSERT(dn->dn_allocated_txg == tx->tx_txg); - if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { - /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, - sizeof (blkptr_t) * - (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); -#ifdef ZFS_DEBUG - } else { - int i; - ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); - /* the blkptrs we are losing better be unallocated */ - for (i = dn->dn_next_nblkptr[txgoff]; - i < dnp->dn_nblkptr; i++) - ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); -#endif - } - mutex_enter(&dn->dn_mtx); - dnp->dn_nblkptr = 
dn->dn_next_nblkptr[txgoff]; - dn->dn_next_nblkptr[txgoff] = 0; - mutex_exit(&dn->dn_mtx); - } - - dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); - - if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - ASSERT3P(list_head(list), ==, NULL); - dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); - } - - /* - * Although we have dropped our reference to the dnode, it - * can't be evicted until its written, and we haven't yet - * initiated the IO for the dnode's dbuf. - */ -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c deleted file mode 100644 index cae6d00ca2ce..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c +++ /dev/null @@ -1,566 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int -dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, - dsl_dataset_t **dsp, void *tag, char **shortnamep) -{ - char buf[ZFS_MAX_DATASET_NAME_LEN]; - char *hashp; - - if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - hashp = strchr(fullname, '#'); - if (hashp == NULL) - return (SET_ERROR(EINVAL)); - - *shortnamep = hashp + 1; - if (zfs_component_namecheck(*shortnamep, NULL, NULL)) - return (SET_ERROR(EINVAL)); - (void) strlcpy(buf, fullname, hashp - fullname + 1); - return (dsl_dataset_hold(dp, buf, tag, dsp)); -} - -/* - * Returns ESRCH if bookmark is not found. - */ -static int -dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, - zfs_bookmark_phys_t *bmark_phys) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; - matchtype_t mt = 0; - int err; - - if (bmark_zapobj == 0) - return (SET_ERROR(ESRCH)); - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), - sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, - NULL, 0, NULL); - - return (err == ENOENT ? ESRCH : err); -} - -/* - * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark - * does not represents an earlier point in later_ds's timeline. - * - * Returns ENOENT if the dataset containing the bookmark does not exist. - * Returns ESRCH if the dataset exists but the bookmark was not found in it. 
- */ -int -dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, - dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp) -{ - char *shortname; - dsl_dataset_t *ds; - int error; - - error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); - if (error != 0) - return (error); - - error = dsl_dataset_bmark_lookup(ds, shortname, bmp); - if (error == 0 && later_ds != NULL) { - if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) - error = SET_ERROR(EXDEV); - } - dsl_dataset_rele(ds, FTAG); - return (error); -} - -typedef struct dsl_bookmark_create_arg { - nvlist_t *dbca_bmarks; - nvlist_t *dbca_errors; -} dsl_bookmark_create_arg_t; - -static int -dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *bmark_fs; - char *shortname; - int error; - zfs_bookmark_phys_t bmark_phys; - - if (!snapds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - error = dsl_bookmark_hold_ds(dp, bookmark_name, - &bmark_fs, FTAG, &shortname); - if (error != 0) - return (error); - - if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) { - dsl_dataset_rele(bmark_fs, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_dataset_bmark_lookup(bmark_fs, shortname, - &bmark_phys); - dsl_dataset_rele(bmark_fs, FTAG); - if (error == 0) - return (SET_ERROR(EEXIST)); - if (error == ESRCH) - return (0); - return (error); -} - -static int -dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_create_arg_t *dbca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - int rv = 0; - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) - return (SET_ERROR(ENOTSUP)); - - for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t *snapds; - int error; - - /* note: validity of nvlist checked by ioctl layer */ - error = dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds); - 
if (error == 0) { - error = dsl_bookmark_create_check_impl(snapds, - nvpair_name(pair), tx); - dsl_dataset_rele(snapds, FTAG); - } - if (error != 0) { - fnvlist_add_int32(dbca->dbca_errors, - nvpair_name(pair), error); - rv = error; - } - } - - return (rv); -} - -static void -dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_create_arg_t *dbca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - - ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); - - for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t *snapds, *bmark_fs; - zfs_bookmark_phys_t bmark_phys; - char *shortname; - - VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds)); - VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), - &bmark_fs, FTAG, &shortname)); - if (bmark_fs->ds_bookmarks == 0) { - bmark_fs->ds_bookmarks = - zap_create_norm(mos, U8_TEXTPREP_TOUPPER, - DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); - spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - - dsl_dataset_zapify(bmark_fs, tx); - VERIFY0(zap_add(mos, bmark_fs->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (bmark_fs->ds_bookmarks), 1, - &bmark_fs->ds_bookmarks, tx)); - } - - bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; - bmark_phys.zbm_creation_txg = - dsl_dataset_phys(snapds)->ds_creation_txg; - bmark_phys.zbm_creation_time = - dsl_dataset_phys(snapds)->ds_creation_time; - - VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, - shortname, sizeof (uint64_t), - sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), - &bmark_phys, tx)); - - spa_history_log_internal_ds(bmark_fs, "bookmark", tx, - "name=%s creation_txg=%llu target_snap=%llu", - shortname, - (longlong_t)bmark_phys.zbm_creation_txg, - (longlong_t)snapds->ds_object); - - dsl_dataset_rele(bmark_fs, FTAG); - dsl_dataset_rele(snapds, FTAG); - } -} - -/* - * The bookmarks must all be in the 
same pool. - */ -int -dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) -{ - nvpair_t *pair; - dsl_bookmark_create_arg_t dbca; - - pair = nvlist_next_nvpair(bmarks, NULL); - if (pair == NULL) - return (0); - - dbca.dbca_bmarks = bmarks; - dbca.dbca_errors = errors; - - return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check, - dsl_bookmark_create_sync, &dbca, - fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); -} - -int -dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) -{ - int err = 0; - zap_cursor_t zc; - zap_attribute_t attr; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - uint64_t bmark_zapobj = ds->ds_bookmarks; - if (bmark_zapobj == 0) - return (0); - - for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - char *bmark_name = attr.za_name; - zfs_bookmark_phys_t bmark_phys; - - err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); - ASSERT3U(err, !=, ENOENT); - if (err != 0) - break; - - nvlist_t *out_props = fnvlist_alloc(); - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_GUID))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_GUID, bmark_phys.zbm_guid); - } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATETXG))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); - } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATION))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); - } - - fnvlist_add_nvlist(outnvl, bmark_name, out_props); - fnvlist_free(out_props); - } - zap_cursor_fini(&zc); - return (err); -} - -/* - * Retrieve the bookmarks that exist in the specified dataset, and the - * requested properties of each bookmark. - * - * The "props" nvlist specifies which properties are requested. - * See lzc_get_bookmarks() for the list of valid properties. 
- */ -int -dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(dsname, FTAG, &dp); - if (err != 0) - return (err); - err = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - err = dsl_get_bookmarks_impl(ds, props, outnvl); - - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); -} - -typedef struct dsl_bookmark_destroy_arg { - nvlist_t *dbda_bmarks; - nvlist_t *dbda_success; - nvlist_t *dbda_errors; -} dsl_bookmark_destroy_arg_t; - -static int -dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; - matchtype_t mt = 0; - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); -} - -static int -dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_destroy_arg_t *dbda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - int rv = 0; - - ASSERT(nvlist_empty(dbda->dbda_success)); - ASSERT(nvlist_empty(dbda->dbda_errors)); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) - return (0); - - for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) { - const char *fullname = nvpair_name(pair); - dsl_dataset_t *ds; - zfs_bookmark_phys_t bm; - int error; - char *shortname; - - error = dsl_bookmark_hold_ds(dp, fullname, &ds, - FTAG, &shortname); - if (error == ENOENT) { - /* ignore it; the bookmark is "already destroyed" */ - continue; - } - if (error == 0) { - error = dsl_dataset_bmark_lookup(ds, shortname, &bm); - dsl_dataset_rele(ds, FTAG); - if (error == ESRCH) { - /* - * ignore it; the bookmark is - * "already destroyed" - */ - continue; - } - } - if (error == 0) { - if 
(dmu_tx_is_syncing(tx)) { - fnvlist_add_boolean(dbda->dbda_success, - fullname); - } - } else { - fnvlist_add_int32(dbda->dbda_errors, fullname, error); - rv = error; - } - } - return (rv); -} - -static void -dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_destroy_arg_t *dbda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - - for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) { - dsl_dataset_t *ds; - char *shortname; - uint64_t zap_cnt; - - VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), - &ds, FTAG, &shortname)); - VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); - - /* - * If all of this dataset's bookmarks have been destroyed, - * free the zap object and decrement the feature's use count. - */ - VERIFY0(zap_count(mos, ds->ds_bookmarks, - &zap_cnt)); - if (zap_cnt == 0) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); - ds->ds_bookmarks = 0; - spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - VERIFY0(zap_remove(mos, ds->ds_object, - DS_FIELD_BOOKMARK_NAMES, tx)); - } - - spa_history_log_internal_ds(ds, "remove bookmark", tx, - "name=%s", shortname); - - dsl_dataset_rele(ds, FTAG); - } -} - -/* - * The bookmarks must all be in the same pool. 
- */ -int -dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) -{ - int rv; - dsl_bookmark_destroy_arg_t dbda; - nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); - if (pair == NULL) - return (0); - - dbda.dbda_bmarks = bmarks; - dbda.dbda_errors = errors; - dbda.dbda_success = fnvlist_alloc(); - - rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check, - dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks), - ZFS_SPACE_CHECK_RESERVED); - fnvlist_free(dbda.dbda_success); - return (rv); -} - -typedef struct dsl_bookmark_rename_arg { - const char *dbra_fsname; - const char *dbra_oldname; - const char *dbra_newname; -} dsl_bookmark_rename_arg_t; - -static int -dsl_bookmark_rename_check(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_rename_arg_t *dbra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - zfs_bookmark_phys_t bmark_phys; - int error; - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) - return (SET_ERROR(ENOTSUP)); - - /* Check validity and the full length of the new bookmark name. 
*/ - if (zfs_component_namecheck(dbra->dbra_newname, NULL, NULL)) - return (SET_ERROR(EINVAL)); - if (strlen(dbra->dbra_fsname) + strlen(dbra->dbra_newname) + 1 >= - ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - error = dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds); - if (error != 0) - return (error); - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - error = dsl_dataset_bmark_lookup(ds, dbra->dbra_oldname, &bmark_phys); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - error = dsl_dataset_bmark_lookup(ds, dbra->dbra_newname, &bmark_phys); - dsl_dataset_rele(ds, FTAG); - if (error == 0) - return (SET_ERROR(EEXIST)); - if (error != ESRCH) - return (error); - return (0); -} - -static void -dsl_bookmark_rename_sync(void *arg, dmu_tx_t *tx) -{ - zfs_bookmark_phys_t bmark_phys; - dsl_bookmark_rename_arg_t *dbra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos; - dsl_dataset_t *ds; - uint64_t bmark_zapobj; - uint64_t int_size, num_ints; - matchtype_t mt = 0; - int error; - - ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); - VERIFY0(dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds)); - - mos = ds->ds_dir->dd_pool->dp_meta_objset; - bmark_zapobj = ds->ds_bookmarks; - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - VERIFY0(zap_length(mos, bmark_zapobj, dbra->dbra_oldname, - &int_size, &num_ints)); - ASSERT3U(int_size, ==, sizeof (uint64_t)); - VERIFY0(zap_lookup_norm(mos, bmark_zapobj, dbra->dbra_oldname, int_size, - num_ints, &bmark_phys, mt, NULL, 0, NULL)); - VERIFY0(zap_remove_norm(mos, bmark_zapobj, dbra->dbra_oldname, mt, tx)); - - VERIFY0(zap_add(mos, bmark_zapobj, dbra->dbra_newname, int_size, - num_ints, &bmark_phys, tx)); - - spa_history_log_internal_ds(ds, "rename bookmark", tx, - "#%s -> #%s creation_txg=%llu", - dbra->dbra_oldname, dbra->dbra_newname, - (longlong_t)bmark_phys.zbm_creation_txg); - - 
dsl_dataset_rele(ds, FTAG); -} - -/* - * The bookmarks must all be in the same pool. - */ -int -dsl_bookmark_rename(const char *fsname, const char *oldbmark, - const char *newbmark) -{ - dsl_bookmark_rename_arg_t dbra; - - dbra.dbra_fsname = fsname; - dbra.dbra_oldname = oldbmark; - dbra.dbra_newname = newbmark; - - return (dsl_sync_task(fsname, dsl_bookmark_rename_check, - dsl_bookmark_rename_sync, &dbra, 1, ZFS_SPACE_CHECK_NORMAL)); -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c deleted file mode 100644 index f226c0244004..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ /dev/null @@ -1,4252 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2011 Martin Matuska - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 RackTop Systems. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); - -/* - * The SPA supports block sizes up to 16MB. However, very large blocks - * can have an impact on i/o latency (e.g. tying up a spinning disk for - * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. - */ -int zfs_max_recordsize = 1 * 1024 * 1024; -SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN, - &zfs_max_recordsize, 0, - "Maximum block size. Expect dragons when tuning this."); - -#define SWITCH64(x, y) \ - { \ - uint64_t __tmp = (x); \ - (x) = (y); \ - (y) = __tmp; \ - } - -#define DS_REF_MAX (1ULL << 62) - -extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); - -static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, - uint64_t obj, dmu_tx_t *tx); -static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, - dmu_tx_t *tx); - -extern int spa_asize_inflation; - -static zil_header_t zero_zil; - -/* - * Figure out how much of this delta should be propogated to the dsl_dir - * layer. If there's a refreservation, that space has already been - * partially accounted for in our ancestors. 
- */ -static int64_t -parent_delta(dsl_dataset_t *ds, int64_t delta) -{ - dsl_dataset_phys_t *ds_phys; - uint64_t old_bytes, new_bytes; - - if (ds->ds_reserved == 0) - return (delta); - - ds_phys = dsl_dataset_phys(ds); - old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved); - new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved); - - ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); - return (new_bytes - old_bytes); -} - -void -dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) -{ - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - int64_t delta; - - dprintf_bp(bp, "ds=%p", ds); - - ASSERT(dmu_tx_is_syncing(tx)); - /* It could have been compressed away to nothing */ - if (BP_IS_HOLE(bp)) - return; - ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); - if (ds == NULL) { - dsl_pool_mos_diduse_space(tx->tx_pool, - used, compressed, uncompressed); - return; - } - - ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_lock); - delta = parent_delta(ds, used); - dsl_dataset_phys(ds)->ds_referenced_bytes += used; - dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; - dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; - dsl_dataset_phys(ds)->ds_unique_bytes += used; - - if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { - ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] = - B_TRUE; - } - - spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); - if (f != SPA_FEATURE_NONE) - ds->ds_feature_activation_needed[f] = B_TRUE; - - mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, - compressed, uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, used - delta, - DD_USED_REFRSRV, DD_USED_HEAD, NULL); -} - -/* - * Called when the specified segment has been remapped, and 
is thus no - * longer referenced in the head dataset. The vdev must be indirect. - * - * If the segment is referenced by a snapshot, put it on the remap deadlist. - * Otherwise, add this segment to the obsolete spacemap. - */ -void -dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, - uint64_t size, uint64_t birth, dmu_tx_t *tx) -{ - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(birth <= tx->tx_txg); - ASSERT(!ds->ds_is_snapshot); - - if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { - spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); - } else { - blkptr_t fakebp; - dva_t *dva = &fakebp.blk_dva[0]; - - ASSERT(ds != NULL); - - mutex_enter(&ds->ds_remap_deadlist_lock); - if (!dsl_dataset_remap_deadlist_exists(ds)) { - dsl_dataset_create_remap_deadlist(ds, tx); - } - mutex_exit(&ds->ds_remap_deadlist_lock); - - BP_ZERO(&fakebp); - fakebp.blk_birth = birth; - DVA_SET_VDEV(dva, vdev); - DVA_SET_OFFSET(dva, offset); - DVA_SET_ASIZE(dva, size); - - dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); - } -} - -int -dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, - boolean_t async) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - int used = bp_get_dsize_sync(spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - - if (BP_IS_HOLE(bp)) - return (0); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(bp->blk_birth <= tx->tx_txg); - - if (ds == NULL) { - dsl_free(tx->tx_pool, tx->tx_txg, bp); - dsl_pool_mos_diduse_space(tx->tx_pool, - -used, -compressed, -uncompressed); - return (used); - } - ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - - ASSERT(!ds->ds_is_snapshot); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { - int64_t delta; - - dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); - dsl_free(tx->tx_pool, tx->tx_txg, bp); - - mutex_enter(&ds->ds_lock); - 
ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || - !DS_UNIQUE_IS_ACCURATE(ds)); - delta = parent_delta(ds, -used); - dsl_dataset_phys(ds)->ds_unique_bytes -= used; - mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - delta, -compressed, -uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, -used - delta, - DD_USED_REFRSRV, DD_USED_HEAD, NULL); - } else { - dprintf_bp(bp, "putting on dead list: %s", ""); - if (async) { - /* - * We are here as part of zio's write done callback, - * which means we're a zio interrupt thread. We can't - * call dsl_deadlist_insert() now because it may block - * waiting for I/O. Instead, put bp on the deferred - * queue and let dsl_pool_sync() finish the job. - */ - bplist_append(&ds->ds_pending_deadlist, bp); - } else { - dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); - } - ASSERT3U(ds->ds_prev->ds_object, ==, - dsl_dataset_phys(ds)->ds_prev_snap_obj); - ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); - /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > - dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - mutex_enter(&ds->ds_prev->ds_lock); - dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; - mutex_exit(&ds->ds_prev->ds_lock); - } - if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { - dsl_dir_transfer_space(ds->ds_dir, used, - DD_USED_HEAD, DD_USED_SNAP, tx); - } - } - mutex_enter(&ds->ds_lock); - ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); - dsl_dataset_phys(ds)->ds_referenced_bytes -= used; - ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed); - dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed; - ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed); - dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed; - mutex_exit(&ds->ds_lock); - - return (used); -} - -/* - * 
We have to release the fsid syncronously or we risk that a subsequent - * mount of the same dataset will fail to unique_insert the fsid. This - * failure would manifest itself as the fsid of this dataset changing - * between mounts which makes NFS clients quite unhappy. - */ -static void -dsl_dataset_evict_sync(void *dbu) -{ - dsl_dataset_t *ds = dbu; - - ASSERT(ds->ds_owner == NULL); - - unique_remove(ds->ds_fsid_guid); -} - -static void -dsl_dataset_evict_async(void *dbu) -{ - dsl_dataset_t *ds = dbu; - - ASSERT(ds->ds_owner == NULL); - - ds->ds_dbuf = NULL; - - if (ds->ds_objset != NULL) - dmu_objset_evict(ds->ds_objset); - - if (ds->ds_prev) { - dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = NULL; - } - - bplist_destroy(&ds->ds_pending_deadlist); - if (dsl_deadlist_is_open(&ds->ds_deadlist)) - dsl_deadlist_close(&ds->ds_deadlist); - if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) - dsl_deadlist_close(&ds->ds_remap_deadlist); - if (ds->ds_dir) - dsl_dir_async_rele(ds->ds_dir, ds); - - ASSERT(!list_link_active(&ds->ds_synced_link)); - - list_destroy(&ds->ds_prop_cbs); - if (mutex_owned(&ds->ds_lock)) - mutex_exit(&ds->ds_lock); - mutex_destroy(&ds->ds_lock); - if (mutex_owned(&ds->ds_opening_lock)) - mutex_exit(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_sendstream_lock); - mutex_destroy(&ds->ds_remap_deadlist_lock); - zfs_refcount_destroy(&ds->ds_longholds); - rrw_destroy(&ds->ds_bp_rwlock); - - kmem_free(ds, sizeof (dsl_dataset_t)); -} - -int -dsl_dataset_get_snapname(dsl_dataset_t *ds) -{ - dsl_dataset_phys_t *headphys; - int err; - dmu_buf_t *headdbuf; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - - if (ds->ds_snapname[0]) - return (0); - if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) - return (0); - - err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, - FTAG, &headdbuf); - if (err != 0) - return (err); - headphys = headdbuf->db_data; - err = 
zap_value_search(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); - dmu_buf_rele(headdbuf, FTAG); - return (err); -} - -int -dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - matchtype_t mt = 0; - int err; - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - err = zap_lookup_norm(mos, snapobj, name, 8, 1, - value, mt, NULL, 0, NULL); - if (err == ENOTSUP && (mt & MT_NORMALIZE)) - err = zap_lookup(mos, snapobj, name, 8, 1, value); - return (err); -} - -int -dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, - boolean_t adj_cnt) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - matchtype_t mt = 0; - int err; - - dsl_dir_snap_cmtime_update(ds->ds_dir); - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - err = zap_remove_norm(mos, snapobj, name, mt, tx); - if (err == ENOTSUP && (mt & MT_NORMALIZE)) - err = zap_remove(mos, snapobj, name, tx); - - if (err == 0 && adj_cnt) - dsl_fs_ss_count_adjust(ds->ds_dir, -1, - DD_FIELD_SNAPSHOT_COUNT, tx); - - return (err); -} - -boolean_t -dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) -{ - dmu_buf_t *dbuf = ds->ds_dbuf; - boolean_t result = B_FALSE; - - if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset, - ds->ds_object, DMU_BONUS_BLKID, tag)) { - - if (ds == dmu_buf_get_user(dbuf)) - result = B_TRUE; - else - dmu_buf_rele(dbuf, tag); - } - - return (result); -} - -int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) -{ - objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; - dsl_dataset_t *ds; - int err; - dmu_object_info_t doi; - - ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(mos, dsobj, 
tag, &dbuf); - if (err != 0) - return (err); - - /* Make sure dsobj has the correct object type. */ - dmu_object_info_from_db(dbuf, &doi); - if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) { - dmu_buf_rele(dbuf, tag); - return (SET_ERROR(EINVAL)); - } - - ds = dmu_buf_get_user(dbuf); - if (ds == NULL) { - dsl_dataset_t *winner = NULL; - - ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); - ds->ds_dbuf = dbuf; - ds->ds_object = dsobj; - ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; - - err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, - NULL, ds, &ds->ds_dir); - if (err != 0) { - kmem_free(ds, sizeof (dsl_dataset_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } - - mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_remap_deadlist_lock, - NULL, MUTEX_DEFAULT, NULL); - rrw_init(&ds->ds_bp_rwlock, B_FALSE); - zfs_refcount_create(&ds->ds_longholds); - - bplist_create(&ds->ds_pending_deadlist); - - list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), - offsetof(dmu_sendarg_t, dsa_link)); - - list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), - offsetof(dsl_prop_cb_record_t, cbr_ds_node)); - - if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (!(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET)) - continue; - err = zap_contains(mos, dsobj, - spa_feature_table[f].fi_guid); - if (err == 0) { - ds->ds_feature_inuse[f] = B_TRUE; - } else { - ASSERT3U(err, ==, ENOENT); - err = 0; - } - } - } - - if (!ds->ds_is_snapshot) { - ds->ds_snapname[0] = '\0'; - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, - ds, &ds->ds_prev); - } - if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - int zaperr = zap_lookup(mos, ds->ds_object, - DS_FIELD_BOOKMARK_NAMES, - 
sizeof (ds->ds_bookmarks), 1, - &ds->ds_bookmarks); - if (zaperr != ENOENT) - VERIFY0(zaperr); - } - } else { - if (zfs_flags & ZFS_DEBUG_SNAPNAMES) - err = dsl_dataset_get_snapname(ds); - if (err == 0 && - dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { - err = zap_count( - ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_userrefs_obj, - &ds->ds_userrefs); - } - } - - if (err == 0 && !ds->ds_is_snapshot) { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - &ds->ds_reserved); - if (err == 0) { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), - &ds->ds_quota); - } - } else { - ds->ds_reserved = ds->ds_quota = 0; - } - - dsl_deadlist_open(&ds->ds_deadlist, - mos, dsl_dataset_phys(ds)->ds_deadlist_obj); - uint64_t remap_deadlist_obj = - dsl_dataset_get_remap_deadlist_object(ds); - if (remap_deadlist_obj != 0) { - dsl_deadlist_open(&ds->ds_remap_deadlist, mos, - remap_deadlist_obj); - } - - dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, - dsl_dataset_evict_async, &ds->ds_dbuf); - if (err == 0) - winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); - - if (err != 0 || winner != NULL) { - bplist_destroy(&ds->ds_pending_deadlist); - dsl_deadlist_close(&ds->ds_deadlist); - if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) - dsl_deadlist_close(&ds->ds_remap_deadlist); - if (ds->ds_prev) - dsl_dataset_rele(ds->ds_prev, ds); - dsl_dir_rele(ds->ds_dir, ds); - list_destroy(&ds->ds_prop_cbs); - list_destroy(&ds->ds_sendstreams); - mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_sendstream_lock); - mutex_destroy(&ds->ds_remap_deadlist_lock); - zfs_refcount_destroy(&ds->ds_longholds); - rrw_destroy(&ds->ds_bp_rwlock); - kmem_free(ds, sizeof (dsl_dataset_t)); - if (err != 0) { - dmu_buf_rele(dbuf, tag); - return (err); - } - ds = winner; - } else { - ds->ds_fsid_guid = - unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); - if (ds->ds_fsid_guid != - 
dsl_dataset_phys(ds)->ds_fsid_guid) { - zfs_dbgmsg("ds_fsid_guid changed from " - "%llx to %llx for pool %s dataset id %llu", - (long long) - dsl_dataset_phys(ds)->ds_fsid_guid, - (long long)ds->ds_fsid_guid, - spa_name(dp->dp_spa), - dsobj); - } - } - } - ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); - ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || - spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || - dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); - *dsp = ds; - return (0); -} - -int -dsl_dataset_hold(dsl_pool_t *dp, const char *name, - void *tag, dsl_dataset_t **dsp) -{ - dsl_dir_t *dd; - const char *snapname; - uint64_t obj; - int err = 0; - dsl_dataset_t *ds; - - err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); - if (err != 0) - return (err); - - ASSERT(dsl_pool_config_held(dp)); - obj = dsl_dir_phys(dd)->dd_head_dataset_obj; - if (obj != 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - else - err = SET_ERROR(ENOENT); - - /* we may be looking for a snapshot */ - if (err == 0 && snapname != NULL) { - dsl_dataset_t *snap_ds; - - if (*snapname++ != '@') { - dsl_dataset_rele(ds, tag); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(ENOENT)); - } - - dprintf("looking for snapshot '%s'\n", snapname); - err = dsl_dataset_snap_lookup(ds, snapname, &obj); - if (err == 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); - dsl_dataset_rele(ds, tag); - - if (err == 0) { - mutex_enter(&snap_ds->ds_lock); - if (snap_ds->ds_snapname[0] == 0) - (void) strlcpy(snap_ds->ds_snapname, snapname, - sizeof (snap_ds->ds_snapname)); - mutex_exit(&snap_ds->ds_lock); - ds = snap_ds; - } - } - if (err == 0) - *dsp = ds; - dsl_dir_rele(dd, FTAG); - return (err); -} - -int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **dsp) -{ - int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); - if (err != 0) - return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); - *dsp = 
NULL; - return (SET_ERROR(EBUSY)); - } - return (0); -} - -int -dsl_dataset_own(dsl_pool_t *dp, const char *name, - void *tag, dsl_dataset_t **dsp) -{ - int err = dsl_dataset_hold(dp, name, tag, dsp); - if (err != 0) - return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); - return (SET_ERROR(EBUSY)); - } - return (0); -} - -/* - * See the comment above dsl_pool_hold() for details. In summary, a long - * hold is used to prevent destruction of a dataset while the pool hold - * is dropped, allowing other concurrent operations (e.g. spa_sync()). - * - * The dataset and pool must be held when this function is called. After it - * is called, the pool hold may be released while the dataset is still held - * and accessed. - */ -void -dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) -{ - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - (void) zfs_refcount_add(&ds->ds_longholds, tag); -} - -void -dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) -{ - (void) zfs_refcount_remove(&ds->ds_longholds, tag); -} - -/* Return B_TRUE if there are any long holds on this dataset. */ -boolean_t -dsl_dataset_long_held(dsl_dataset_t *ds) -{ - return (!zfs_refcount_is_zero(&ds->ds_longholds)); -} - -void -dsl_dataset_name(dsl_dataset_t *ds, char *name) -{ - if (ds == NULL) { - (void) strcpy(name, "mos"); - } else { - dsl_dir_name(ds->ds_dir, name); - VERIFY0(dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { - VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - /* - * We use a "recursive" mutex so that we - * can call dprintf_ds() with ds_lock held. 
- */ - if (!MUTEX_HELD(&ds->ds_lock)) { - mutex_enter(&ds->ds_lock); - VERIFY3U(strlcat(name, ds->ds_snapname, - ZFS_MAX_DATASET_NAME_LEN), <, - ZFS_MAX_DATASET_NAME_LEN); - mutex_exit(&ds->ds_lock); - } else { - VERIFY3U(strlcat(name, ds->ds_snapname, - ZFS_MAX_DATASET_NAME_LEN), <, - ZFS_MAX_DATASET_NAME_LEN); - } - } - } -} - -int -dsl_dataset_namelen(dsl_dataset_t *ds) -{ - VERIFY0(dsl_dataset_get_snapname(ds)); - mutex_enter(&ds->ds_lock); - int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname); - mutex_exit(&ds->ds_lock); - return (len); -} - -void -dsl_dataset_rele(dsl_dataset_t *ds, void *tag) -{ - dmu_buf_rele(ds->ds_dbuf, tag); -} - -void -dsl_dataset_disown(dsl_dataset_t *ds, void *tag) -{ - ASSERT3P(ds->ds_owner, ==, tag); - ASSERT(ds->ds_dbuf != NULL); - - mutex_enter(&ds->ds_lock); - ds->ds_owner = NULL; - mutex_exit(&ds->ds_lock); - dsl_dataset_long_rele(ds, tag); - dsl_dataset_rele(ds, tag); -} - -boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) -{ - boolean_t gotit = FALSE; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - mutex_enter(&ds->ds_lock); - if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { - ds->ds_owner = tag; - dsl_dataset_long_hold(ds, tag); - gotit = TRUE; - } - mutex_exit(&ds->ds_lock); - return (gotit); -} - -boolean_t -dsl_dataset_has_owner(dsl_dataset_t *ds) -{ - boolean_t rv; - mutex_enter(&ds->ds_lock); - rv = (ds->ds_owner != NULL); - mutex_exit(&ds->ds_lock); - return (rv); -} - -static void -dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; - uint64_t zero = 0; - - VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); - - spa_feature_incr(spa, f, tx); - dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); - - VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, - sizeof (zero), 1, &zero, tx)); -} - -void -dsl_dataset_deactivate_feature(uint64_t dsobj, 
spa_feature_t f, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; - - VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); - - VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); - spa_feature_decr(spa, f, tx); -} - -uint64_t -dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dd->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - uint64_t dsobj; - objset_t *mos = dp->dp_meta_objset; - - if (origin == NULL) - origin = dp->dp_origin_snap; - - ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); - ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); - - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); - dsphys->ds_dir_obj = dd->dd_object; - dsphys->ds_flags = flags; - dsphys->ds_fsid_guid = unique_create(); - do { - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - } while (dsphys->ds_guid == 0); - dsphys->ds_snapnames_zapobj = - zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, - DMU_OT_NONE, 0, tx); - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 
1 : tx->tx_txg; - - if (origin == NULL) { - dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); - } else { - dsl_dataset_t *ohds; /* head of the origin snapshot */ - - dsphys->ds_prev_snap_obj = origin->ds_object; - dsphys->ds_prev_snap_txg = - dsl_dataset_phys(origin)->ds_creation_txg; - dsphys->ds_referenced_bytes = - dsl_dataset_phys(origin)->ds_referenced_bytes; - dsphys->ds_compressed_bytes = - dsl_dataset_phys(origin)->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - dsl_dataset_phys(origin)->ds_uncompressed_bytes; - rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG); - dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; - rrw_exit(&origin->ds_bp_rwlock, FTAG); - - /* - * Inherit flags that describe the dataset's contents - * (INCONSISTENT) or properties (Case Insensitive). - */ - dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & - (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (origin->ds_feature_inuse[f]) - dsl_dataset_activate_feature(dsobj, f, tx); - } - - dmu_buf_will_dirty(origin->ds_dbuf, tx); - dsl_dataset_phys(origin)->ds_num_children++; - - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, - FTAG, &ohds)); - dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, - dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); - dsl_dataset_rele(ohds, FTAG); - - if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { - if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { - dsl_dataset_phys(origin)->ds_next_clones_obj = - zap_create(mos, - DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(mos, - dsl_dataset_phys(origin)->ds_next_clones_obj, - dsobj, tx)); - } - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { - dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); - 
dsl_dir_phys(origin->ds_dir)->dd_clones = - zap_create(mos, - DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(mos, - dsl_dir_phys(origin->ds_dir)->dd_clones, - dsobj, tx)); - } - } - - if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) - dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - - dmu_buf_rele(dbuf, FTAG); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; - - return (dsobj); -} - -static void -dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - objset_t *os; - - VERIFY0(dmu_objset_from_ds(ds, &os)); - if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - zio_t *zio; - - bzero(&os->os_zil_header, sizeof (os->os_zil_header)); - - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dsl_dataset_sync(ds, zio, tx); - VERIFY0(zio_wait(zio)); - - /* dsl_dataset_sync_done will drop this reference. */ - dmu_buf_add_ref(ds->ds_dbuf, ds); - dsl_dataset_sync_done(ds, tx); - } -} - -uint64_t -dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = pdd->dd_pool; - uint64_t dsobj, ddobj; - dsl_dir_t *dd; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(lastname[0] != '@'); - - ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); - VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - - dsobj = dsl_dataset_create_sync_dd(dd, origin, - flags & ~DS_CREATE_FLAG_NODIRTY, tx); - - dsl_deleg_set_create_perms(dd, tx, cr); - - /* - * Since we're creating a new node we know it's a leaf, so we can - * initialize the counts if the limit feature is active. 
- */ - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { - uint64_t cnt = 0; - objset_t *os = dd->dd_pool->dp_meta_objset; - - dsl_dir_zapify(dd, tx); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, - sizeof (cnt), 1, &cnt, tx)); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, - sizeof (cnt), 1, &cnt, tx)); - } - - dsl_dir_rele(dd, FTAG); - - /* - * If we are creating a clone, make sure we zero out any stale - * data from the origin snapshots zil header. - */ - if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_zero_zil(ds, tx); - dsl_dataset_rele(ds, FTAG); - } - - return (dsobj); -} - -#ifdef __FreeBSD__ -/* FreeBSD ioctl compat begin */ -struct destroyarg { - nvlist_t *nvl; - const char *snapname; -}; - -static int -dsl_check_snap_cb(const char *name, void *arg) -{ - struct destroyarg *da = arg; - dsl_dataset_t *ds; - char *dsname; - - dsname = kmem_asprintf("%s@%s", name, da->snapname); - fnvlist_add_boolean(da->nvl, dsname); - kmem_free(dsname, strlen(dsname) + 1); - - return (0); -} - -int -dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, - nvlist_t *snaps) -{ - struct destroyarg *da; - int err; - - da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP); - da->nvl = snaps; - da->snapname = snapname; - err = dmu_objset_find(fsname, dsl_check_snap_cb, da, - DS_FIND_CHILDREN); - kmem_free(da, sizeof (struct destroyarg)); - - return (err); -} -/* FreeBSD ioctl compat end */ -#endif /* __FreeBSD__ */ - -/* - * The unique space in the head dataset can be calculated by subtracting - * the space used in the most recent snapshot, that is still being used - * in this file system, from the space currently in use. 
To figure out - * the space in the most recent snapshot still in use, we need to take - * the total space used in the snapshot and subtract out the space that - * has been freed up since the snapshot was taken. - */ -void -dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) -{ - uint64_t mrs_used; - uint64_t dlused, dlcomp, dluncomp; - - ASSERT(!ds->ds_is_snapshot); - - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) - mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; - else - mrs_used = 0; - - dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); - - ASSERT3U(dlused, <=, mrs_used); - dsl_dataset_phys(ds)->ds_unique_bytes = - dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); - - if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_UNIQUE_ACCURATE) - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; -} - -void -dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, - dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; - int err; - - ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); - err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, - obj, tx); - /* - * The err should not be ENOENT, but a bug in a previous version - * of the code could cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a missing entry. - * If we knew that the pool was created after - * SPA_VERSION_NEXT_CLONES, we could assert that it isn't - * ENOENT. However, at least we can check that we don't have - * too many entries in the next_clones_obj even after failing to - * remove this one. 
- */ - if (err != ENOENT) - VERIFY0(err); - ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, - &count)); - ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); -} - - -blkptr_t * -dsl_dataset_get_blkptr(dsl_dataset_t *ds) -{ - return (&dsl_dataset_phys(ds)->ds_bp); -} - -spa_t * -dsl_dataset_get_spa(dsl_dataset_t *ds) -{ - return (ds->ds_dir->dd_pool->dp_spa); -} - -void -dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp; - - if (ds == NULL) /* this is the meta-objset */ - return; - - ASSERT(ds->ds_objset != NULL); - - if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) - panic("dirtying snapshot!"); - - /* Must not dirty a dataset in the same txg where it got snapshotted. */ - ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); - - dp = ds->ds_dir->dd_pool; - if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(ds->ds_dbuf, ds); - } -} - -boolean_t -dsl_dataset_is_dirty(dsl_dataset_t *ds) -{ - for (int t = 0; t < TXG_SIZE; t++) { - if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, - ds, t)) - return (B_TRUE); - } - return (B_FALSE); -} - -static int -dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t asize; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - /* - * If there's an fs-only reservation, any blocks that might become - * owned by the snapshot dataset must be accommodated by space - * outside of the reservation. - */ - ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); - asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (SET_ERROR(ENOSPC)); - - /* - * Propagate any reserved space for this snapshot to other - * snapshot checks in this sync group. 
- */ - if (asize > 0) - dsl_dir_willuse_space(ds->ds_dir, asize, tx); - - return (0); -} - -int -dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr) -{ - int error; - uint64_t value; - - ds->ds_trysnap_txg = tx->tx_txg; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - /* - * We don't allow multiple snapshots of the same txg. If there - * is already one, try again. - */ - if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) - return (SET_ERROR(EAGAIN)); - - /* - * Check for conflicting snapshot name. - */ - error = dsl_dataset_snap_lookup(ds, snapname, &value); - if (error == 0) - return (SET_ERROR(EEXIST)); - if (error != ENOENT) - return (error); - - /* - * We don't allow taking snapshots of inconsistent datasets, such as - * those into which we are currently receiving. However, if we are - * creating this snapshot as part of a receive, this check will be - * executed atomically with respect to the completion of the receive - * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this - * case we ignore this, knowing it will be fixed up for us shortly in - * dmu_recv_end_sync(). - */ - if (!recv && DS_IS_INCONSISTENT(ds)) - return (SET_ERROR(EBUSY)); - - /* - * Skip the check for temporary snapshots or if we have already checked - * the counts in dsl_dataset_snapshot_check. This means we really only - * check the count here when we're receiving a stream. 
- */ - if (cnt != 0 && cr != NULL) { - error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr); - if (error != 0) - return (error); - } - - error = dsl_dataset_snapshot_reserve_space(ds, tx); - if (error != 0) - return (error); - - return (0); -} - -int -dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_arg_t *ddsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - nvpair_t *pair; - int rv = 0; - - /* - * Pre-compute how many total new snapshots will be created for each - * level in the tree and below. This is needed for validating the - * snapshot limit when either taking a recursive snapshot or when - * taking multiple snapshots. - * - * The problem is that the counts are not actually adjusted when - * we are checking, only when we finally sync. For a single snapshot, - * this is easy, the count will increase by 1 at each node up the tree, - * but its more complicated for the recursive/multiple snapshot case. - * - * The dsl_fs_ss_limit_check function does recursively check the count - * at each level up the tree but since it is validating each snapshot - * independently we need to be sure that we are validating the complete - * count for the entire set of snapshots. We do this by rolling up the - * counts for each component of the name into an nvlist and then - * checking each of those cases with the aggregated count. - * - * This approach properly handles not only the recursive snapshot - * case (where we get all of those on the ddsa_snaps list) but also - * the sibling case (e.g. snapshot a/b and a/c so that we will also - * validate the limit on 'a' using a count of 2). - * - * We validate the snapshot names in the third loop and only report - * name errors once. 
- */ - if (dmu_tx_is_syncing(tx)) { - nvlist_t *cnt_track = NULL; - cnt_track = fnvlist_alloc(); - - /* Rollup aggregated counts into the cnt_track list */ - for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); - pair != NULL; - pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { - char *pdelim; - uint64_t val; - char nm[MAXPATHLEN]; - - (void) strlcpy(nm, nvpair_name(pair), sizeof (nm)); - pdelim = strchr(nm, '@'); - if (pdelim == NULL) - continue; - *pdelim = '\0'; - - do { - if (nvlist_lookup_uint64(cnt_track, nm, - &val) == 0) { - /* update existing entry */ - fnvlist_add_uint64(cnt_track, nm, - val + 1); - } else { - /* add to list */ - fnvlist_add_uint64(cnt_track, nm, 1); - } - - pdelim = strrchr(nm, '/'); - if (pdelim != NULL) - *pdelim = '\0'; - } while (pdelim != NULL); - } - - /* Check aggregated counts at each level */ - for (pair = nvlist_next_nvpair(cnt_track, NULL); - pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { - int error = 0; - char *name; - uint64_t cnt = 0; - dsl_dataset_t *ds; - - name = nvpair_name(pair); - cnt = fnvpair_value_uint64(pair); - ASSERT(cnt > 0); - - error = dsl_dataset_hold(dp, name, FTAG, &ds); - if (error == 0) { - error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, - ddsa->ddsa_cr); - dsl_dataset_rele(ds, FTAG); - } - - if (error != 0) { - if (ddsa->ddsa_errors != NULL) - fnvlist_add_int32(ddsa->ddsa_errors, - name, error); - rv = error; - /* only report one error for this check */ - break; - } - } - nvlist_free(cnt_track); - } - - for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { - int error = 0; - dsl_dataset_t *ds; - char *name, *atp; - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - - name = nvpair_name(pair); - if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN) - error = SET_ERROR(ENAMETOOLONG); - if (error == 0) { - atp = strchr(name, '@'); - if (atp == NULL) - error = SET_ERROR(EINVAL); - if (error == 0) - 
(void) strlcpy(dsname, name, atp - name + 1); - } - if (error == 0) - error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error == 0) { - /* passing 0/NULL skips dsl_fs_ss_limit_check */ - error = dsl_dataset_snapshot_check_impl(ds, - atp + 1, tx, B_FALSE, 0, NULL); - dsl_dataset_rele(ds, FTAG); - } - - if (error != 0) { - if (ddsa->ddsa_errors != NULL) { - fnvlist_add_int32(ddsa->ddsa_errors, - name, error); - } - rv = error; - } - } - - return (rv); -} - -void -dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - uint64_t dsobj, crtxg; - objset_t *mos = dp->dp_meta_objset; - objset_t *os; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - /* - * If we are on an old pool, the zil must not be active, in which - * case it will be zeroed. Usually zil_suspend() accomplishes this. - */ - ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || - dmu_objset_from_ds(ds, &os) != 0 || - bcmp(&os->os_phys->os_zil_header, &zero_zil, - sizeof (zero_zil)) == 0); - - /* Should not snapshot a dirty dataset. 
*/ - ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, - ds, tx->tx_txg)); - - dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); - - /* - * The origin's ds_creation_txg has to be < TXG_INITIAL - */ - if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) - crtxg = 1; - else - crtxg = tx->tx_txg; - - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); - dsphys->ds_dir_obj = ds->ds_dir->dd_object; - dsphys->ds_fsid_guid = unique_create(); - do { - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - } while (dsphys->ds_guid == 0); - dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsphys->ds_next_snap_obj = ds->ds_object; - dsphys->ds_num_children = 1; - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = crtxg; - dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; - dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; - dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - dsl_dataset_phys(ds)->ds_uncompressed_bytes; - dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dmu_buf_rele(dbuf, FTAG); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) - dsl_dataset_activate_feature(dsobj, f, tx); - } - - ASSERT3U(ds->ds_prev != 0, ==, - dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); - if (ds->ds_prev) { - uint64_t next_clones_obj = - dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; - ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - 
ds->ds_object || - dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); - if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, - dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); - dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; - } else if (next_clones_obj != 0) { - dsl_dataset_remove_from_next_clones(ds->ds_prev, - dsphys->ds_next_snap_obj, tx); - VERIFY0(zap_add_int(mos, - next_clones_obj, dsobj, tx)); - } - } - - /* - * If we have a reference-reservation on this dataset, we will - * need to increase the amount of refreservation being charged - * since our unique space is going to zero. - */ - if (ds->ds_reserved) { - int64_t delta; - ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); - delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, - ds->ds_reserved); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, - delta, 0, 0, tx); - } - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_deadlist_obj = - dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, - dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_open(&ds->ds_deadlist, mos, - dsl_dataset_phys(ds)->ds_deadlist_obj); - dsl_deadlist_add_key(&ds->ds_deadlist, - dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); - - if (dsl_dataset_remap_deadlist_exists(ds)) { - uint64_t remap_deadlist_obj = - dsl_dataset_get_remap_deadlist_object(ds); - /* - * Move the remap_deadlist to the snapshot. The head - * will create a new remap deadlist on demand, from - * dsl_dataset_block_remapped(). 
- */ - dsl_dataset_unset_remap_deadlist_object(ds, tx); - dsl_deadlist_close(&ds->ds_remap_deadlist); - - dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST, - sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); - } - - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); - dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; - dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; - dsl_dataset_phys(ds)->ds_unique_bytes = 0; - - if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - - VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, - snapname, 8, 1, &dsobj, tx)); - - if (ds->ds_prev) - dsl_dataset_rele(ds->ds_prev, ds); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); - - dsl_scan_ds_snapshotted(ds, tx); - - dsl_dir_snap_cmtime_update(ds->ds_dir); - - spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); -} - -void -dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_arg_t *ddsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - nvpair_t *pair; - - for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { - dsl_dataset_t *ds; - char *name, *atp; - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - - name = nvpair_name(pair); - atp = strchr(name, '@'); - (void) strlcpy(dsname, name, atp - name + 1); - VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); - - dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); - if (ddsa->ddsa_props != NULL) { - dsl_props_set_sync_impl(ds->ds_prev, - ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); - } -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, name); -#endif - dsl_dataset_rele(ds, FTAG); - } -} - -/* - * The snapshots must all be in the same pool. - * All-or-nothing: if there are any failures, nothing will be modified. 
- */ -int -dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) -{ - dsl_dataset_snapshot_arg_t ddsa; - nvpair_t *pair; - boolean_t needsuspend; - int error; - spa_t *spa; - char *firstname; - nvlist_t *suspended = NULL; - - pair = nvlist_next_nvpair(snaps, NULL); - if (pair == NULL) - return (0); - firstname = nvpair_name(pair); - - error = spa_open(firstname, &spa, FTAG); - if (error != 0) - return (error); - needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - spa_close(spa, FTAG); - - if (needsuspend) { - suspended = fnvlist_alloc(); - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - char *snapname = nvpair_name(pair); - char *atp; - void *cookie; - - atp = strchr(snapname, '@'); - if (atp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - (void) strlcpy(fsname, snapname, atp - snapname + 1); - - error = zil_suspend(fsname, &cookie); - if (error != 0) - break; - fnvlist_add_uint64(suspended, fsname, - (uintptr_t)cookie); - } - } - - ddsa.ddsa_snaps = snaps; - ddsa.ddsa_props = props; - ddsa.ddsa_errors = errors; - ddsa.ddsa_cr = CRED(); - - if (error == 0) { - error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, &ddsa, - fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); - } - - if (suspended != NULL) { - for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; - pair = nvlist_next_nvpair(suspended, pair)) { - zil_resume((void *)(uintptr_t) - fnvpair_value_uint64(pair)); - } - fnvlist_free(suspended); - } - - return (error); -} - -typedef struct dsl_dataset_snapshot_tmp_arg { - const char *ddsta_fsname; - const char *ddsta_snapname; - minor_t ddsta_cleanup_minor; - const char *ddsta_htag; -} dsl_dataset_snapshot_tmp_arg_t; - -static int -dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - 
dsl_dataset_t *ds; - int error; - - error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); - if (error != 0) - return (error); - - /* NULL cred means no limit check for tmp snapshot */ - error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, - tx, B_FALSE, 0, NULL); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOTSUP)); - } - error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, - B_TRUE, tx); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); - - dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); - dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, - ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); - dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); - - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, - minor_t cleanup_minor, const char *htag) -{ - dsl_dataset_snapshot_tmp_arg_t ddsta; - int error; - spa_t *spa; - boolean_t needsuspend; - void *cookie; - - ddsta.ddsta_fsname = fsname; - ddsta.ddsta_snapname = snapname; - ddsta.ddsta_cleanup_minor = cleanup_minor; - ddsta.ddsta_htag = htag; - - error = spa_open(fsname, &spa, FTAG); - if (error != 0) - return (error); - needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - spa_close(spa, FTAG); - - if (needsuspend) { - error = zil_suspend(fsname, &cookie); - if (error != 0) - return (error); - } - - error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, - dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); - - if 
(needsuspend) - zil_resume(cookie); - return (error); -} - -void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_objset != NULL); - ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); - - /* - * in case we had to change ds_fsid_guid when we opened it, - * sync it out now. - */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; - - if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { - VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, - &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); - VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, - &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); - VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, - &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); - ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; - ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; - ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; - } - - dmu_objset_sync(ds->ds_objset, zio, tx); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_activation_needed[f]) { - if (ds->ds_feature_inuse[f]) - continue; - dsl_dataset_activate_feature(ds->ds_object, f, tx); - ds->ds_feature_inuse[f] = B_TRUE; - } - } -} - -static int -deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); - return (0); -} - -void -dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - objset_t *os = ds->ds_objset; - - bplist_iterate(&ds->ds_pending_deadlist, - deadlist_enqueue_cb, &ds->ds_deadlist, tx); - - if (os->os_synced_dnodes != NULL) { - multilist_destroy(os->os_synced_dnodes); - os->os_synced_dnodes = NULL; - } - - ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); - - dmu_buf_rele(ds->ds_dbuf, ds); -} - -int 
-get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val) -{ - uint64_t count = 0; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* - * There may be missing entries in ds_next_clones_obj - * due to a bug in a previous version of the code. - * Only trust it if it has the right number of entries. - */ - if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { - VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, - &count)); - } - if (count != dsl_dataset_phys(ds)->ds_num_children - 1) { - return (ENOENT); - } - for (zap_cursor_init(&zc, mos, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; - char buf[ZFS_MAX_DATASET_NAME_LEN]; - VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - dsl_dir_name(clone->ds_dir, buf); - fnvlist_add_boolean(val, buf); - dsl_dataset_rele(clone, FTAG); - } - zap_cursor_fini(&zc); - return (0); -} - -void -get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) -{ - nvlist_t *propval = fnvlist_alloc(); - nvlist_t *val; - - /* - * We use nvlist_alloc() instead of fnvlist_alloc() because the - * latter would allocate the list with NV_UNIQUE_NAME flag. - * As a result, every time a clone name is appended to the list - * it would be (linearly) searched for for a duplicate name. - * We already know that all clone names must be unique and we - * want avoid the quadratic complexity of double-checking that - * because we can have a large number of clones. - */ - VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP)); - - if (get_clones_stat_impl(ds, val) == 0) { - fnvlist_add_nvlist(propval, ZPROP_VALUE, val); - fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), - propval); - } - - nvlist_free(val); - nvlist_free(propval); -} - -/* - * Returns a string that represents the receive resume stats token. 
It should - * be freed with strfree(). - */ -char * -get_receive_resume_stats_impl(dsl_dataset_t *ds) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dsl_dataset_has_resume_receive_state(ds)) { - char *str; - void *packed; - uint8_t *compressed; - uint64_t val; - nvlist_t *token_nv = fnvlist_alloc(); - size_t packed_size, compressed_size; - - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "fromguid", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "object", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "offset", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "bytes", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "toguid", val); - } - char buf[256]; - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { - fnvlist_add_string(token_nv, "toname", buf); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_LARGEBLOCK) == 0) { - fnvlist_add_boolean(token_nv, "largeblockok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_EMBEDOK) == 0) { - fnvlist_add_boolean(token_nv, "embedok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_COMPRESSOK) == 0) { - fnvlist_add_boolean(token_nv, "compressok"); - } - packed = fnvlist_pack(token_nv, &packed_size); - fnvlist_free(token_nv); - compressed = kmem_alloc(packed_size, KM_SLEEP); - - compressed_size = gzip_compress(packed, compressed, - packed_size, packed_size, 6); - - zio_cksum_t cksum; - 
fletcher_4_native(compressed, compressed_size, NULL, &cksum); - - str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); - for (int i = 0; i < compressed_size; i++) { - (void) sprintf(str + i * 2, "%02x", compressed[i]); - } - str[compressed_size * 2] = '\0'; - char *propval = kmem_asprintf("%u-%llx-%llx-%s", - ZFS_SEND_RESUME_TOKEN_VERSION, - (longlong_t)cksum.zc_word[0], - (longlong_t)packed_size, str); - kmem_free(packed, packed_size); - kmem_free(str, compressed_size * 2 + 1); - kmem_free(compressed, packed_size); - return (propval); - } - return (spa_strdup("")); -} - -/* - * Returns a string that represents the receive resume stats token of the - * dataset's child. It should be freed with strfree(). - */ -char * -get_child_receive_stats(dsl_dataset_t *ds) -{ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG, - &recv_ds) == 0) { - char *propval = get_receive_resume_stats_impl(recv_ds); - dsl_dataset_rele(recv_ds, FTAG); - return (propval); - } - return (spa_strdup("")); -} - -static void -get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) -{ - char *propval = get_receive_resume_stats_impl(ds); - if (strcmp(propval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); - } else { - char *childval = get_child_receive_stats(ds); - if (strcmp(childval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); - } - strfree(childval); - } - strfree(propval); -} - -uint64_t -dsl_get_refratio(dsl_dataset_t *ds) -{ - uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 
100 : - (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / - dsl_dataset_phys(ds)->ds_compressed_bytes); - return (ratio); -} - -uint64_t -dsl_get_logicalreferenced(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_uncompressed_bytes); -} - -uint64_t -dsl_get_compressratio(dsl_dataset_t *ds) -{ - if (ds->ds_is_snapshot) { - return (dsl_get_refratio(ds)); - } else { - dsl_dir_t *dd = ds->ds_dir; - mutex_enter(&dd->dd_lock); - uint64_t val = dsl_dir_get_compressratio(dd); - mutex_exit(&dd->dd_lock); - return (val); - } -} - -uint64_t -dsl_get_used(dsl_dataset_t *ds) -{ - if (ds->ds_is_snapshot) { - return (dsl_dataset_phys(ds)->ds_unique_bytes); - } else { - dsl_dir_t *dd = ds->ds_dir; - mutex_enter(&dd->dd_lock); - uint64_t val = dsl_dir_get_used(dd); - mutex_exit(&dd->dd_lock); - return (val); - } -} - -uint64_t -dsl_get_creation(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_creation_time); -} - -uint64_t -dsl_get_creationtxg(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_creation_txg); -} - -uint64_t -dsl_get_refquota(dsl_dataset_t *ds) -{ - return (ds->ds_quota); -} - -uint64_t -dsl_get_refreservation(dsl_dataset_t *ds) -{ - return (ds->ds_reserved); -} - -uint64_t -dsl_get_guid(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_guid); -} - -uint64_t -dsl_get_unique(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_unique_bytes); -} - -uint64_t -dsl_get_objsetid(dsl_dataset_t *ds) -{ - return (ds->ds_object); -} - -uint64_t -dsl_get_userrefs(dsl_dataset_t *ds) -{ - return (ds->ds_userrefs); -} - -uint64_t -dsl_get_defer_destroy(dsl_dataset_t *ds) -{ - return (DS_IS_DEFER_DESTROY(ds) ? 
1 : 0); -} - -uint64_t -dsl_get_referenced(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_referenced_bytes); -} - -uint64_t -dsl_get_numclones(dsl_dataset_t *ds) -{ - ASSERT(ds->ds_is_snapshot); - return (dsl_dataset_phys(ds)->ds_num_children - 1); -} - -uint64_t -dsl_get_inconsistent(dsl_dataset_t *ds) -{ - return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ? - 1 : 0); -} - -uint64_t -dsl_get_available(dsl_dataset_t *ds) -{ - uint64_t refdbytes = dsl_get_referenced(ds); - uint64_t availbytes = dsl_dir_space_available(ds->ds_dir, - NULL, 0, TRUE); - if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { - availbytes += - ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; - } - if (ds->ds_quota != 0) { - /* - * Adjust available bytes according to refquota - */ - if (refdbytes < ds->ds_quota) { - availbytes = MIN(availbytes, - ds->ds_quota - refdbytes); - } else { - availbytes = 0; - } - } - return (availbytes); -} - -int -dsl_get_written(dsl_dataset_t *ds, uint64_t *written) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_dataset_t *prev; - int err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - if (err == 0) { - uint64_t comp, uncomp; - err = dsl_dataset_space_written(prev, ds, written, - &comp, &uncomp); - dsl_dataset_rele(prev, FTAG); - } - return (err); -} - -/* - * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN. - */ -int -dsl_get_prev_snap(dsl_dataset_t *ds, char *snap) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { - dsl_dataset_name(ds->ds_prev, snap); - return (0); - } else { - return (ENOENT); - } -} - -/* - * Returns the mountpoint property and source for the given dataset in the value - * and source buffers. The value buffer must be at least as large as MAXPATHLEN - * and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN. - * Returns 0 on success and an error on failure. 
- */ -int -dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, - char *source) -{ - int error; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* Retrieve the mountpoint value stored in the zap opbject */ - error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1, - ZAP_MAXVALUELEN, value, source); - if (error != 0) { - return (error); - } - - /* - * Process the dsname and source to find the full mountpoint string. - * Can be skipped for 'legacy' or 'none'. - */ - if (value[0] == '/') { - char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - char *root = buf; - const char *relpath; - - /* - * If we inherit the mountpoint, even from a dataset - * with a received value, the source will be the path of - * the dataset we inherit from. If source is - * ZPROP_SOURCE_VAL_RECVD, the received value is not - * inherited. - */ - if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { - relpath = ""; - } else { - ASSERT0(strncmp(dsname, source, strlen(source))); - relpath = dsname + strlen(source); - if (relpath[0] == '/') - relpath++; - } - - spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN); - - /* - * Special case an alternate root of '/'. This will - * avoid having multiple leading slashes in the - * mountpoint path. - */ - if (strcmp(root, "/") == 0) - root++; - - /* - * If the mountpoint is '/' then skip over this - * if we are obtaining either an alternate root or - * an inherited mountpoint. - */ - char *mnt = value; - if (value[1] == '\0' && (root[0] != '\0' || - relpath[0] != '\0')) - mnt = value + 1; - - if (relpath[0] == '\0') { - (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s", - root, mnt); - } else { - (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s", - root, mnt, relpath[0] == '@' ? 
"" : "/", - relpath); - } - kmem_free(buf, ZAP_MAXVALUELEN); - } - - return (0); -} - -void -dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - ASSERT(dsl_pool_config_held(dp)); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, - dsl_get_refratio(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, - dsl_get_logicalreferenced(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - dsl_get_compressratio(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, - dsl_get_used(ds)); - - if (ds->ds_is_snapshot) { - get_clones_stat(ds, nv); - } else { - char buf[ZFS_MAX_DATASET_NAME_LEN]; - if (dsl_get_prev_snap(ds, buf) == 0) - dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, - buf); - dsl_dir_stats(ds->ds_dir, nv); - } - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, - dsl_get_available(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, - dsl_get_referenced(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, - dsl_get_creation(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, - dsl_get_creationtxg(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, - dsl_get_refquota(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, - dsl_get_refreservation(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, - dsl_get_guid(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, - dsl_get_unique(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, - dsl_get_objsetid(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, - dsl_get_userrefs(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, - dsl_get_defer_destroy(ds)); - - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - uint64_t written; - if (dsl_get_written(ds, &written) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, - written); - } - } - - if (!dsl_dataset_is_snapshot(ds)) { - /* - * A failed "newfs" (e.g. full) resumable receive leaves - * the stats set on this dataset. 
Check here for the prop. - */ - get_receive_resume_stats(ds, nv); - - /* - * A failed incremental resumable receive leaves the - * stats set on our child named "%recv". Check the child - * for the prop. - */ - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { - get_receive_resume_stats(recv_ds, nv); - dsl_dataset_rele(recv_ds, FTAG); - } - } -} - -void -dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - ASSERT(dsl_pool_config_held(dp)); - - stat->dds_creation_txg = dsl_get_creationtxg(ds); - stat->dds_inconsistent = dsl_get_inconsistent(ds); - stat->dds_guid = dsl_get_guid(ds); - stat->dds_origin[0] = '\0'; - if (ds->ds_is_snapshot) { - stat->dds_is_snapshot = B_TRUE; - stat->dds_num_clones = dsl_get_numclones(ds); - } else { - stat->dds_is_snapshot = B_FALSE; - stat->dds_num_clones = 0; - - if (dsl_dir_is_clone(ds->ds_dir)) { - dsl_dir_get_origin(ds->ds_dir, stat->dds_origin); - } - } -} - -uint64_t -dsl_dataset_fsid_guid(dsl_dataset_t *ds) -{ - return (ds->ds_fsid_guid); -} - -void -dsl_dataset_space(dsl_dataset_t *ds, - uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp) -{ - *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; - *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); - if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) - *availbytesp += - ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; - if (ds->ds_quota != 0) { - /* - * Adjust available bytes according to refquota - */ - if (*refdbytesp < ds->ds_quota) - *availbytesp = MIN(*availbytesp, - ds->ds_quota - *refdbytesp); - else - *availbytesp = 0; - } - 
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - *availobjsp = DN_MAX_OBJECT - *usedobjsp; -} - -boolean_t -dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - uint64_t birth; - - ASSERT(dsl_pool_config_held(dp)); - if (snap == NULL) - return (B_FALSE); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - birth = dsl_dataset_get_blkptr(ds)->blk_birth; - rrw_exit(&ds->ds_bp_rwlock, FTAG); - if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { - objset_t *os, *os_snap; - /* - * It may be that only the ZIL differs, because it was - * reset in the head. Don't count that as being - * modified. - */ - if (dmu_objset_from_ds(ds, &os) != 0) - return (B_TRUE); - if (dmu_objset_from_ds(snap, &os_snap) != 0) - return (B_TRUE); - return (bcmp(&os->os_phys->os_meta_dnode, - &os_snap->os_phys->os_meta_dnode, - sizeof (os->os_phys->os_meta_dnode)) != 0); - } - return (B_FALSE); -} - -typedef struct dsl_dataset_rename_snapshot_arg { - const char *ddrsa_fsname; - const char *ddrsa_oldsnapname; - const char *ddrsa_newsnapname; - boolean_t ddrsa_recursive; - dmu_tx_t *ddrsa_tx; -} dsl_dataset_rename_snapshot_arg_t; - -/* ARGSUSED */ -static int -dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, - dsl_dataset_t *hds, void *arg) -{ - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - int error; - uint64_t val; - - error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); - if (error != 0) { - /* ignore nonexistent snapshots */ - return (error == ENOENT ? 
0 : error); - } - - /* new name should not exist */ - error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); - if (error == 0) - error = SET_ERROR(EEXIST); - else if (error == ENOENT) - error = 0; - - /* dataset name + 1 for the "@" + the new snapshot name must fit */ - if (dsl_dir_namelen(hds->ds_dir) + 1 + - strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN) - error = SET_ERROR(ENAMETOOLONG); - - return (error); -} - -static int -dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - int error; - - error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); - if (error != 0) - return (error); - - if (ddrsa->ddrsa_recursive) { - error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, - dsl_dataset_rename_snapshot_check_impl, ddrsa, - DS_FIND_CHILDREN); - } else { - error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); - } - dsl_dataset_rele(hds, FTAG); - return (error); -} - -static int -dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, - dsl_dataset_t *hds, void *arg) -{ -#ifdef __FreeBSD__ -#ifdef _KERNEL - char *oldname, *newname; -#endif -#endif - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - dsl_dataset_t *ds; - uint64_t val; - dmu_tx_t *tx = ddrsa->ddrsa_tx; - int error; - - error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); - ASSERT(error == 0 || error == ENOENT); - if (error == ENOENT) { - /* ignore nonexistent snapshots */ - return (0); - } - - VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); - - /* log before we change the name */ - spa_history_log_internal_ds(ds, "rename", tx, - "-> @%s", ddrsa->ddrsa_newsnapname); - - VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, - B_FALSE)); - mutex_enter(&ds->ds_lock); - (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); - mutex_exit(&ds->ds_lock); - VERIFY0(zap_add(dp->dp_meta_objset, - 
dsl_dataset_phys(hds)->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &ds->ds_object, tx)); - -#ifdef __FreeBSD__ -#ifdef _KERNEL - oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - snprintf(oldname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", - ddrsa->ddrsa_fsname, ddrsa->ddrsa_oldsnapname); - snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", - ddrsa->ddrsa_fsname, ddrsa->ddrsa_newsnapname); - zfsvfs_update_fromname(oldname, newname); - zvol_rename_minors(dp->dp_spa, oldname, newname); - kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN); -#endif -#endif - dsl_dataset_rele(ds, FTAG); - - return (0); -} - -static void -dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - - VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); - ddrsa->ddrsa_tx = tx; - if (ddrsa->ddrsa_recursive) { - VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, - dsl_dataset_rename_snapshot_sync_impl, ddrsa, - DS_FIND_CHILDREN)); - } else { - VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); - } - dsl_dataset_rele(hds, FTAG); -} - -int -dsl_dataset_rename_snapshot(const char *fsname, - const char *oldsnapname, const char *newsnapname, boolean_t recursive) -{ - dsl_dataset_rename_snapshot_arg_t ddrsa; - - ddrsa.ddrsa_fsname = fsname; - ddrsa.ddrsa_oldsnapname = oldsnapname; - ddrsa.ddrsa_newsnapname = newsnapname; - ddrsa.ddrsa_recursive = recursive; - - return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, - dsl_dataset_rename_snapshot_sync, &ddrsa, - 1, ZFS_SPACE_CHECK_RESERVED)); -} - -/* - * If we're doing an ownership handoff, we need to make sure that there is - * only one long hold on the dataset. We're not allowed to change anything here - * so we don't permanently release the long hold or regular hold here. 
We want - * to do this only when syncing to avoid the dataset unexpectedly going away - * when we release the long hold. - */ -static int -dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) -{ - boolean_t held; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - if (owner != NULL) { - VERIFY3P(ds->ds_owner, ==, owner); - dsl_dataset_long_rele(ds, owner); - } - - held = dsl_dataset_long_held(ds); - - if (owner != NULL) - dsl_dataset_long_hold(ds, owner); - - if (held) - return (SET_ERROR(EBUSY)); - - return (0); -} - -int -dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rollback_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int64_t unused_refres_delta; - int error; - - error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); - if (error != 0) - return (error); - - /* must not be a snapshot */ - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* must have a most recent snapshot */ - if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ESRCH)); - } - - /* - * No rollback to a snapshot created in the current txg, because - * the rollback may dirty the dataset and create blocks that are - * not reachable from the rootbp while having a birth txg that - * falls into the snapshot's range. - */ - if (dmu_tx_is_syncing(tx) && - dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EAGAIN)); - } - - /* - * If the expected target snapshot is specified, then check that - * the latest snapshot is it. - */ - if (ddra->ddra_tosnap != NULL) { - dsl_dataset_t *snapds; - - /* Check if the target snapshot exists at all. 
*/ - error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds); - if (error != 0) { - /* - * ESRCH is used to signal that the target snapshot does - * not exist, while ENOENT is used to report that - * the rolled back dataset does not exist. - * ESRCH is also used to cover other cases where the - * target snapshot is not related to the dataset being - * rolled back such as being in a different pool. - */ - if (error == ENOENT || error == EXDEV) - error = SET_ERROR(ESRCH); - dsl_dataset_rele(ds, FTAG); - return (error); - } - ASSERT(snapds->ds_is_snapshot); - - /* Check if the snapshot is the latest snapshot indeed. */ - if (snapds != ds->ds_prev) { - /* - * Distinguish between the case where the only problem - * is intervening snapshots (EEXIST) vs the snapshot - * not being a valid target for rollback (ESRCH). - */ - if (snapds->ds_dir == ds->ds_dir || - (dsl_dir_is_clone(ds->ds_dir) && - dsl_dir_phys(ds->ds_dir)->dd_origin_obj == - snapds->ds_object)) { - error = SET_ERROR(EEXIST); - } else { - error = SET_ERROR(ESRCH); - } - dsl_dataset_rele(snapds, FTAG); - dsl_dataset_rele(ds, FTAG); - return (error); - } - dsl_dataset_rele(snapds, FTAG); - } - - /* must not have any bookmarks after the most recent snapshot */ - nvlist_t *proprequest = fnvlist_alloc(); - fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG)); - nvlist_t *bookmarks = fnvlist_alloc(); - error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks); - fnvlist_free(proprequest); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) { - nvlist_t *valuenv = - fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), - zfs_prop_to_name(ZFS_PROP_CREATETXG)); - uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); - if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) { - fnvlist_free(bookmarks); - dsl_dataset_rele(ds, FTAG); - return 
(SET_ERROR(EEXIST)); - } - } - fnvlist_free(bookmarks); - - error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* - * Check if the snap we are rolling back to uses more than - * the refquota. - */ - if (ds->ds_quota != 0 && - dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EDQUOT)); - } - - /* - * When we do the clone swap, we will temporarily use more space - * due to the refreservation (the head will no longer have any - * unique space, so the entire amount of the refreservation will need - * to be free). We will immediately destroy the clone, freeing - * this space, but the freeing happens over many txg's. - */ - unused_refres_delta = (int64_t)MIN(ds->ds_reserved, - dsl_dataset_phys(ds)->ds_unique_bytes); - - if (unused_refres_delta > 0 && - unused_refres_delta > - dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOSPC)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -void -dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rollback_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds, *clone; - uint64_t cloneobj; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); - - dsl_dataset_name(ds->ds_prev, namebuf); - fnvlist_add_string(ddra->ddra_result, "target", namebuf); - - cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", - ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); - - VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); - - dsl_dataset_clone_swap_sync_impl(clone, ds, tx); - dsl_dataset_zero_zil(ds, tx); - - dsl_destroy_head_sync_impl(clone, tx); - - dsl_dataset_rele(clone, FTAG); - dsl_dataset_rele(ds, FTAG); -} - -/* - * Rolls back the given filesystem or volume to the most recent snapshot. 
- * The name of the most recent snapshot will be returned under key "target" - * in the result nvlist. - * - * If owner != NULL: - * - The existing dataset MUST be owned by the specified owner at entry - * - Upon return, dataset will still be held by the same owner, whether we - * succeed or not. - * - * This mode is required any time the existing filesystem is mounted. See - * notes above zfs_suspend_fs() for further details. - */ -int -dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, - nvlist_t *result) -{ - dsl_dataset_rollback_arg_t ddra; - - ddra.ddra_fsname = fsname; - ddra.ddra_tosnap = tosnap; - ddra.ddra_owner = owner; - ddra.ddra_result = result; - - return (dsl_sync_task(fsname, dsl_dataset_rollback_check, - dsl_dataset_rollback_sync, &ddra, - 1, ZFS_SPACE_CHECK_RESERVED)); -} - -struct promotenode { - list_node_t link; - dsl_dataset_t *ds; -}; - -static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); -static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, - void *tag); -static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); - -int -dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_promote_arg_t *ddpa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - struct promotenode *snap; - dsl_dataset_t *origin_ds; - int err; - uint64_t unused; - uint64_t ss_mv_cnt; - size_t max_snap_len; - boolean_t conflicting_snaps; - - err = promote_hold(ddpa, dp, FTAG); - if (err != 0) - return (err); - - hds = ddpa->ddpa_clone; - snap = list_head(&ddpa->shared_snaps); - origin_ds = snap->ds; - max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1; - - snap = list_head(&ddpa->origin_snaps); - - if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) { - promote_rele(ddpa, FTAG); - return (SET_ERROR(EXDEV)); - } - - /* - * Compute and check the amount of space to transfer. Since this is - * so expensive, don't do the preliminary check. 
- */ - if (!dmu_tx_is_syncing(tx)) { - promote_rele(ddpa, FTAG); - return (0); - } - - /* compute origin's new unique space */ - snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, - origin_ds->ds_object); - dsl_deadlist_space_range(&snap->ds->ds_deadlist, - dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX, - &ddpa->unique, &unused, &unused); - - /* - * Walk the snapshots that we are moving - * - * Compute space to transfer. Consider the incremental changes - * to used by each snapshot: - * (my used) = (prev's used) + (blocks born) - (blocks killed) - * So each snapshot gave birth to: - * (blocks born) = (my used) - (prev's used) + (blocks killed) - * So a sequence would look like: - * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) - * Which simplifies to: - * uN + kN + kN-1 + ... + k1 + k0 - * Note however, if we stop before we reach the ORIGIN we get: - * uN + kN + kN-1 + ... + kM - uM-1 - */ - conflicting_snaps = B_FALSE; - ss_mv_cnt = 0; - ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes; - ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes; - ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes; - for (snap = list_head(&ddpa->shared_snaps); snap; - snap = list_next(&ddpa->shared_snaps, snap)) { - uint64_t val, dlused, dlcomp, dluncomp; - dsl_dataset_t *ds = snap->ds; - - ss_mv_cnt++; - - /* - * If there are long holds, we won't be able to evict - * the objset. 
- */ - if (dsl_dataset_long_held(ds)) { - err = SET_ERROR(EBUSY); - goto out; - } - - /* Check that the snapshot name does not conflict */ - VERIFY0(dsl_dataset_get_snapname(ds)); - if (strlen(ds->ds_snapname) >= max_snap_len) { - err = SET_ERROR(ENAMETOOLONG); - goto out; - } - err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); - if (err == 0) { - fnvlist_add_boolean(ddpa->err_ds, - snap->ds->ds_snapname); - conflicting_snaps = B_TRUE; - } else if (err != ENOENT) { - goto out; - } - - /* The very first snapshot does not have a deadlist */ - if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) - continue; - - dsl_deadlist_space(&ds->ds_deadlist, - &dlused, &dlcomp, &dluncomp); - ddpa->used += dlused; - ddpa->comp += dlcomp; - ddpa->uncomp += dluncomp; - } - - /* - * In order to return the full list of conflicting snapshots, we check - * whether there was a conflict after traversing all of them. - */ - if (conflicting_snaps) { - err = SET_ERROR(EEXIST); - goto out; - } - - /* - * If we are a clone of a clone then we never reached ORIGIN, - * so we need to subtract out the clone origin's used space. - */ - if (ddpa->origin_origin) { - ddpa->used -= - dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; - ddpa->comp -= - dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; - ddpa->uncomp -= - dsl_dataset_phys(ddpa->origin_origin)-> - ds_uncompressed_bytes; - } - - /* Check that there is enough space and limit headroom here */ - err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - 0, ss_mv_cnt, ddpa->used, ddpa->cr); - if (err != 0) - goto out; - - /* - * Compute the amounts of space that will be used by snapshots - * after the promotion (for both origin and clone). For each, - * it is the amount of space that will be on all of their - * deadlists (that was not born before their new origin). 
- */ - if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - uint64_t space; - - /* - * Note, typically this will not be a clone of a clone, - * so dd_origin_txg will be < TXG_INITIAL, so - * these snaplist_space() -> dsl_deadlist_space_range() - * calls will be fast because they do not have to - * iterate over all bps. - */ - snap = list_head(&ddpa->origin_snaps); - err = snaplist_space(&ddpa->shared_snaps, - snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); - if (err != 0) - goto out; - - err = snaplist_space(&ddpa->clone_snaps, - snap->ds->ds_dir->dd_origin_txg, &space); - if (err != 0) - goto out; - ddpa->cloneusedsnap += space; - } - if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & - DD_FLAG_USED_BREAKDOWN) { - err = snaplist_space(&ddpa->origin_snaps, - dsl_dataset_phys(origin_ds)->ds_creation_txg, - &ddpa->originusedsnap); - if (err != 0) - goto out; - } - -out: - promote_rele(ddpa, FTAG); - return (err); -} - -void -dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_promote_arg_t *ddpa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - struct promotenode *snap; - dsl_dataset_t *origin_ds; - dsl_dataset_t *origin_head; - dsl_dir_t *dd; - dsl_dir_t *odd = NULL; - uint64_t oldnext_obj; - int64_t delta; -#if defined(__FreeBSD__) && defined(_KERNEL) - char *oldname, *newname; -#endif - - VERIFY0(promote_hold(ddpa, dp, FTAG)); - hds = ddpa->ddpa_clone; - - ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); - - snap = list_head(&ddpa->shared_snaps); - origin_ds = snap->ds; - dd = hds->ds_dir; - - snap = list_head(&ddpa->origin_snaps); - origin_head = snap->ds; - - /* - * We need to explicitly open odd, since origin_ds's dd will be - * changing. 
- */ - VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, - NULL, FTAG, &odd)); - - /* change origin's next snap */ - dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); - oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; - snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, - origin_ds->ds_object); - dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; - - /* change the origin's next clone */ - if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { - dsl_dataset_remove_from_next_clones(origin_ds, - snap->ds->ds_object, tx); - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dataset_phys(origin_ds)->ds_next_clones_obj, - oldnext_obj, tx)); - } - - /* change origin */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); - dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; - dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; - dmu_buf_will_dirty(odd->dd_dbuf, tx); - dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; - origin_head->ds_dir->dd_origin_txg = - dsl_dataset_phys(origin_ds)->ds_creation_txg; - - /* change dd_clone entries */ - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY0(zap_remove_int(dp->dp_meta_objset, - dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, - hds->ds_object, tx)); - - VERIFY0(zap_remove_int(dp->dp_meta_objset, - dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, - origin_head->ds_object, tx)); - if (dsl_dir_phys(dd)->dd_clones == 0) { - dsl_dir_phys(dd)->dd_clones = - zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, - DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - newname = 
kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); -#endif - - /* move snapshots to this dir */ - for (snap = list_head(&ddpa->shared_snaps); snap; - snap = list_next(&ddpa->shared_snaps, snap)) { - dsl_dataset_t *ds = snap->ds; - - /* - * Property callbacks are registered to a particular - * dsl_dir. Since ours is changing, evict the objset - * so that they will be unregistered from the old dsl_dir. - */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - dsl_dataset_name(ds, oldname); -#endif - - /* move snap name entry */ - VERIFY0(dsl_dataset_get_snapname(ds)); - VERIFY0(dsl_dataset_snap_remove(origin_head, - ds->ds_snapname, tx, B_TRUE)); - VERIFY0(zap_add(dp->dp_meta_objset, - dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, - 8, 1, &ds->ds_object, tx)); - dsl_fs_ss_count_adjust(hds->ds_dir, 1, - DD_FIELD_SNAPSHOT_COUNT, tx); - - /* change containing dsl_dir */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); - dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; - ASSERT3P(ds->ds_dir, ==, odd); - dsl_dir_rele(ds->ds_dir, ds); - VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, - NULL, ds, &ds->ds_dir)); - -#if defined(__FreeBSD__) && defined(_KERNEL) - dsl_dataset_name(ds, newname); - zfsvfs_update_fromname(oldname, newname); - zvol_rename_minors(dp->dp_spa, oldname, newname); -#endif - - /* move any clone references */ - if (dsl_dataset_phys(ds)->ds_next_clones_obj && - spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - zap_cursor_t zc; - zap_attribute_t za; - - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *cnds; - uint64_t o; - - if (za.za_first_integer == oldnext_obj) { - /* - * We've already moved the - * origin's reference. 
- */ - continue; - } - - VERIFY0(dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &cnds)); - o = dsl_dir_phys(cnds->ds_dir)-> - dd_head_dataset_obj; - - VERIFY0(zap_remove_int(dp->dp_meta_objset, - dsl_dir_phys(odd)->dd_clones, o, tx)); - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_clones, o, tx)); - dsl_dataset_rele(cnds, FTAG); - } - zap_cursor_fini(&zc); - } - - ASSERT(!dsl_prop_hascb(ds)); - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN); -#endif - /* - * Change space accounting. - * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either - * both be valid, or both be 0 (resulting in delta == 0). This - * is true for each of {clone,origin} independently. - */ - - delta = ddpa->cloneusedsnap - - dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, >=, 0); - ASSERT3U(ddpa->used, >=, delta); - dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(dd, DD_USED_HEAD, - ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); - - delta = ddpa->originusedsnap - - dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, <=, 0); - ASSERT3U(ddpa->used, >=, -delta); - dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(odd, DD_USED_HEAD, - -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); - - dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; - - /* log history record */ - spa_history_log_internal_ds(hds, "promote", tx, ""); - - dsl_dir_rele(odd, FTAG); - promote_rele(ddpa, FTAG); -} - -/* - * Make a list of dsl_dataset_t's for the snapshots between first_obj - * (exclusive) and last_obj (inclusive). The list will be in reverse - * order (last_obj will be the list_head()). If first_obj == 0, do all - * snapshots back to this dataset's origin. 
- */ -static int -snaplist_make(dsl_pool_t *dp, - uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) -{ - uint64_t obj = last_obj; - - list_create(l, sizeof (struct promotenode), - offsetof(struct promotenode, link)); - - while (obj != first_obj) { - dsl_dataset_t *ds; - struct promotenode *snap; - int err; - - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - ASSERT(err != ENOENT); - if (err != 0) - return (err); - - if (first_obj == 0) - first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; - - snap = kmem_alloc(sizeof (*snap), KM_SLEEP); - snap->ds = ds; - list_insert_tail(l, snap); - obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - } - - return (0); -} - -static int -snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) -{ - struct promotenode *snap; - - *spacep = 0; - for (snap = list_head(l); snap; snap = list_next(l, snap)) { - uint64_t used, comp, uncomp; - dsl_deadlist_space_range(&snap->ds->ds_deadlist, - mintxg, UINT64_MAX, &used, &comp, &uncomp); - *spacep += used; - } - return (0); -} - -static void -snaplist_destroy(list_t *l, void *tag) -{ - struct promotenode *snap; - - if (l == NULL || !list_link_active(&l->list_head)) - return; - - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); - dsl_dataset_rele(snap->ds, tag); - kmem_free(snap, sizeof (*snap)); - } - list_destroy(l); -} - -static int -promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) -{ - int error; - dsl_dir_t *dd; - struct promotenode *snap; - - error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, - &ddpa->ddpa_clone); - if (error != 0) - return (error); - dd = ddpa->ddpa_clone->ds_dir; - - if (ddpa->ddpa_clone->ds_is_snapshot || - !dsl_dir_is_clone(dd)) { - dsl_dataset_rele(ddpa->ddpa_clone, tag); - return (SET_ERROR(EINVAL)); - } - - error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, - &ddpa->shared_snaps, tag); - if (error != 0) - goto out; - - error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, - &ddpa->clone_snaps, 
tag); - if (error != 0) - goto out; - - snap = list_head(&ddpa->shared_snaps); - ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); - error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, - dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, - &ddpa->origin_snaps, tag); - if (error != 0) - goto out; - - if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { - error = dsl_dataset_hold_obj(dp, - dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, - tag, &ddpa->origin_origin); - if (error != 0) - goto out; - } -out: - if (error != 0) - promote_rele(ddpa, tag); - return (error); -} - -static void -promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) -{ - snaplist_destroy(&ddpa->shared_snaps, tag); - snaplist_destroy(&ddpa->clone_snaps, tag); - snaplist_destroy(&ddpa->origin_snaps, tag); - if (ddpa->origin_origin != NULL) - dsl_dataset_rele(ddpa->origin_origin, tag); - dsl_dataset_rele(ddpa->ddpa_clone, tag); -} - -/* - * Promote a clone. - * - * If it fails due to a conflicting snapshot name, "conflsnap" will be filled - * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.) - */ -int -dsl_dataset_promote(const char *name, char *conflsnap) -{ - dsl_dataset_promote_arg_t ddpa = { 0 }; - uint64_t numsnaps; - int error; - nvpair_t *snap_pair; - objset_t *os; - - /* - * We will modify space proportional to the number of - * snapshots. Compute numsnaps. - */ - error = dmu_objset_hold(name, FTAG, &os); - if (error != 0) - return (error); - error = zap_count(dmu_objset_pool(os)->dp_meta_objset, - dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, - &numsnaps); - dmu_objset_rele(os, FTAG); - if (error != 0) - return (error); - - ddpa.ddpa_clonename = name; - ddpa.err_ds = fnvlist_alloc(); - ddpa.cr = CRED(); - - error = dsl_sync_task(name, dsl_dataset_promote_check, - dsl_dataset_promote_sync, &ddpa, - 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED); - - /* - * Return the first conflicting snapshot found. 
- */ - snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL); - if (snap_pair != NULL && conflsnap != NULL) - (void) strcpy(conflsnap, nvpair_name(snap_pair)); - - fnvlist_free(ddpa.err_ds); - return (error); -} - -int -dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) -{ - /* - * "slack" factor for received datasets with refquota set on them. - * See the bottom of this function for details on its use. - */ - uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation; - int64_t unused_refres_delta; - - /* they should both be heads */ - if (clone->ds_is_snapshot || - origin_head->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* if we are not forcing, the branch point should be just before them */ - if (!force && clone->ds_prev != origin_head->ds_prev) - return (SET_ERROR(EINVAL)); - - /* clone should be the clone (unless they are unrelated) */ - if (clone->ds_prev != NULL && - clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && - origin_head->ds_dir != clone->ds_prev->ds_dir) - return (SET_ERROR(EINVAL)); - - /* the clone should be a child of the origin */ - if (clone->ds_dir->dd_parent != origin_head->ds_dir) - return (SET_ERROR(EINVAL)); - - /* origin_head shouldn't be modified unless 'force' */ - if (!force && - dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev)) - return (SET_ERROR(ETXTBSY)); - - /* origin_head should have no long holds (e.g. 
is not mounted) */ - if (dsl_dataset_handoff_check(origin_head, owner, tx)) - return (SET_ERROR(EBUSY)); - - /* check amount of any unconsumed refreservation */ - unused_refres_delta = - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(origin_head)->ds_unique_bytes) - - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(clone)->ds_unique_bytes); - - if (unused_refres_delta > 0 && - unused_refres_delta > - dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) - return (SET_ERROR(ENOSPC)); - - /* - * The clone can't be too much over the head's refquota. - * - * To ensure that the entire refquota can be used, we allow one - * transaction to exceed the the refquota. Therefore, this check - * needs to also allow for the space referenced to be more than the - * refquota. The maximum amount of space that one transaction can use - * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this - * overage ensures that we are able to receive a filesystem that - * exceeds the refquota on the source system. - * - * So that overage is the refquota_slack we use below. 
- */ - if (origin_head->ds_quota != 0 && - dsl_dataset_phys(clone)->ds_referenced_bytes > - origin_head->ds_quota + refquota_slack) - return (SET_ERROR(EDQUOT)); - - return (0); -} - -static void -dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, - dsl_dataset_t *origin, dmu_tx_t *tx) -{ - uint64_t clone_remap_dl_obj, origin_remap_dl_obj; - dsl_pool_t *dp = dmu_tx_pool(tx); - - ASSERT(dsl_pool_sync_context(dp)); - - clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone); - origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin); - - if (clone_remap_dl_obj != 0) { - dsl_deadlist_close(&clone->ds_remap_deadlist); - dsl_dataset_unset_remap_deadlist_object(clone, tx); - } - if (origin_remap_dl_obj != 0) { - dsl_deadlist_close(&origin->ds_remap_deadlist); - dsl_dataset_unset_remap_deadlist_object(origin, tx); - } - - if (clone_remap_dl_obj != 0) { - dsl_dataset_set_remap_deadlist_object(origin, - clone_remap_dl_obj, tx); - dsl_deadlist_open(&origin->ds_remap_deadlist, - dp->dp_meta_objset, clone_remap_dl_obj); - } - if (origin_remap_dl_obj != 0) { - dsl_dataset_set_remap_deadlist_object(clone, - origin_remap_dl_obj, tx); - dsl_deadlist_open(&clone->ds_remap_deadlist, - dp->dp_meta_objset, origin_remap_dl_obj); - } -} - -void -dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - int64_t unused_refres_delta; - - ASSERT(clone->ds_reserved == 0); - /* - * NOTE: On DEBUG kernels there could be a race between this and - * the check function if spa_asize_inflation is adjusted... - */ - ASSERT(origin_head->ds_quota == 0 || - dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + - DMU_MAX_ACCESS * spa_asize_inflation); - ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); - - /* - * Swap per-dataset feature flags. 
- */ - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (!(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET)) { - ASSERT(!clone->ds_feature_inuse[f]); - ASSERT(!origin_head->ds_feature_inuse[f]); - continue; - } - - boolean_t clone_inuse = clone->ds_feature_inuse[f]; - boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f]; - - if (clone_inuse) { - dsl_dataset_deactivate_feature(clone->ds_object, f, tx); - clone->ds_feature_inuse[f] = B_FALSE; - } - if (origin_head_inuse) { - dsl_dataset_deactivate_feature(origin_head->ds_object, - f, tx); - origin_head->ds_feature_inuse[f] = B_FALSE; - } - if (clone_inuse) { - dsl_dataset_activate_feature(origin_head->ds_object, - f, tx); - origin_head->ds_feature_inuse[f] = B_TRUE; - } - if (origin_head_inuse) { - dsl_dataset_activate_feature(clone->ds_object, f, tx); - clone->ds_feature_inuse[f] = B_TRUE; - } - } - - dmu_buf_will_dirty(clone->ds_dbuf, tx); - dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - - if (clone->ds_objset != NULL) { - dmu_objset_evict(clone->ds_objset); - clone->ds_objset = NULL; - } - - if (origin_head->ds_objset != NULL) { - dmu_objset_evict(origin_head->ds_objset); - origin_head->ds_objset = NULL; - } - - unused_refres_delta = - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(origin_head)->ds_unique_bytes) - - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(clone)->ds_unique_bytes); - - /* - * Reset origin's unique bytes, if it exists. 
- */ - if (clone->ds_prev) { - dsl_dataset_t *origin = clone->ds_prev; - uint64_t comp, uncomp; - - dmu_buf_will_dirty(origin->ds_dbuf, tx); - dsl_deadlist_space_range(&clone->ds_deadlist, - dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, - &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); - } - - /* swap blkptrs */ - { - rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG); - rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG); - blkptr_t tmp; - tmp = dsl_dataset_phys(origin_head)->ds_bp; - dsl_dataset_phys(origin_head)->ds_bp = - dsl_dataset_phys(clone)->ds_bp; - dsl_dataset_phys(clone)->ds_bp = tmp; - rrw_exit(&origin_head->ds_bp_rwlock, FTAG); - rrw_exit(&clone->ds_bp_rwlock, FTAG); - } - - /* set dd_*_bytes */ - { - int64_t dused, dcomp, duncomp; - uint64_t cdl_used, cdl_comp, cdl_uncomp; - uint64_t odl_used, odl_comp, odl_uncomp; - - ASSERT3U(dsl_dir_phys(clone->ds_dir)-> - dd_used_breakdown[DD_USED_SNAP], ==, 0); - - dsl_deadlist_space(&clone->ds_deadlist, - &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space(&origin_head->ds_deadlist, - &odl_used, &odl_comp, &odl_uncomp); - - dused = dsl_dataset_phys(clone)->ds_referenced_bytes + - cdl_used - - (dsl_dataset_phys(origin_head)->ds_referenced_bytes + - odl_used); - dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + - cdl_comp - - (dsl_dataset_phys(origin_head)->ds_compressed_bytes + - odl_comp); - duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + - cdl_uncomp - - (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + - odl_uncomp); - - dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, - dused, dcomp, duncomp, tx); - dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, - -dused, -dcomp, -duncomp, tx); - - /* - * The difference in the space used by snapshots is the - * difference in snapshot space due to the head's - * deadlist (since that's the only thing that's - * changing that affects the snapused). 
- */ - dsl_deadlist_space_range(&clone->ds_deadlist, - origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space_range(&origin_head->ds_deadlist, - origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &odl_used, &odl_comp, &odl_uncomp); - dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, - DD_USED_HEAD, DD_USED_SNAP, NULL); - } - - /* swap ds_*_bytes */ - SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, - dsl_dataset_phys(clone)->ds_referenced_bytes); - SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, - dsl_dataset_phys(clone)->ds_compressed_bytes); - SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, - dsl_dataset_phys(clone)->ds_uncompressed_bytes); - SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, - dsl_dataset_phys(clone)->ds_unique_bytes); - - /* apply any parent delta for change in unconsumed refreservation */ - dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, - unused_refres_delta, 0, 0, tx); - - /* - * Swap deadlists. - */ - dsl_deadlist_close(&clone->ds_deadlist); - dsl_deadlist_close(&origin_head->ds_deadlist); - SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, - dsl_dataset_phys(clone)->ds_deadlist_obj); - dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, - dsl_dataset_phys(clone)->ds_deadlist_obj); - dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, - dsl_dataset_phys(origin_head)->ds_deadlist_obj); - dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); - - dsl_scan_ds_clone_swapped(origin_head, clone, tx); - - spa_history_log_internal_ds(clone, "clone swap", tx, - "parent=%s", origin_head->ds_dir->dd_myname); -} - -/* - * Given a pool name and a dataset object number in that pool, - * return the name of that dataset. 
- */ -int -dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int error; - - error = dsl_pool_hold(pname, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); - if (error == 0) { - dsl_dataset_name(ds, buf); - dsl_dataset_rele(ds, FTAG); - } - dsl_pool_rele(dp, FTAG); - - return (error); -} - -int -dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) -{ - int error = 0; - - ASSERT3S(asize, >, 0); - - /* - * *ref_rsrv is the portion of asize that will come from any - * unconsumed refreservation space. - */ - *ref_rsrv = 0; - - mutex_enter(&ds->ds_lock); - /* - * Make a space adjustment for reserved bytes. - */ - if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { - ASSERT3U(*used, >=, - ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); - *used -= - (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); - *ref_rsrv = - asize - MIN(asize, parent_delta(ds, asize + inflight)); - } - - if (!check_quota || ds->ds_quota == 0) { - mutex_exit(&ds->ds_lock); - return (0); - } - /* - * If they are requesting more space, and our current estimate - * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). 
- */ - if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= - ds->ds_quota) { - if (inflight > 0 || - dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) - error = SET_ERROR(ERESTART); - else - error = SET_ERROR(EDQUOT); - } - mutex_exit(&ds->ds_lock); - - return (error); -} - -typedef struct dsl_dataset_set_qr_arg { - const char *ddsqra_name; - zprop_source_t ddsqra_source; - uint64_t ddsqra_value; -} dsl_dataset_set_qr_arg_t; - - -/* ARGSUSED */ -static int -dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - uint64_t newval; - - if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) - return (SET_ERROR(ENOTSUP)); - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_prop_predict(ds->ds_dir, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (newval == 0) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || - newval < ds->ds_reserved) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOSPC)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t newval; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - - dsl_prop_set_sync_impl(ds, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - &ddsqra->ddsqra_value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); - - if (ds->ds_quota != newval) { - 
dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_quota = newval; - } - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, - uint64_t refquota) -{ - dsl_dataset_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = dsname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = refquota; - - return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, - dsl_dataset_set_refquota_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -static int -dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - uint64_t newval, unique; - - if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) - return (SET_ERROR(ENOTSUP)); - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_prop_predict(ds->ds_dir, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* - * If we are doing the preliminary check in open context, the - * space estimates may be inaccurate. 
- */ - if (!dmu_tx_is_syncing(tx)) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - mutex_enter(&ds->ds_lock); - if (!DS_UNIQUE_IS_ACCURATE(ds)) - dsl_dataset_recalc_head_uniq(ds); - unique = dsl_dataset_phys(ds)->ds_unique_bytes; - mutex_exit(&ds->ds_lock); - - if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { - uint64_t delta = MAX(unique, newval) - - MAX(unique, ds->ds_reserved); - - if (delta > - dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || - (ds->ds_quota > 0 && newval > ds->ds_quota)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOSPC)); - } - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -void -dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, - zprop_source_t source, uint64_t value, dmu_tx_t *tx) -{ - uint64_t newval; - uint64_t unique; - int64_t delta; - - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - source, sizeof (value), 1, &value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_dir->dd_lock); - mutex_enter(&ds->ds_lock); - ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); - unique = dsl_dataset_phys(ds)->ds_unique_bytes; - delta = MAX(0, (int64_t)(newval - unique)) - - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = newval; - mutex_exit(&ds->ds_lock); - - dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); - mutex_exit(&ds->ds_dir->dd_lock); -} - -static void -dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - dsl_dataset_set_refreservation_sync_impl(ds, - ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, - uint64_t refreservation) -{ - 
dsl_dataset_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = dsname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = refreservation; - - return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, - dsl_dataset_set_refreservation_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -/* - * Return (in *usedp) the amount of space written in new that is not - * present in oldsnap. New may be a snapshot or the head. Old must be - * a snapshot before new, in new's filesystem (or its origin). If not then - * fail and return EINVAL. - * - * The written space is calculated by considering two components: First, we - * ignore any freed space, and calculate the written as new's used space - * minus old's used space. Next, we add in the amount of space that was freed - * between the two snapshots, thus reducing new's used space relative to old's. - * Specifically, this is the space that was born before old->ds_creation_txg, - * and freed before new (ie. on new's deadlist or a previous deadlist). 
- * - * space freed [---------------------] - * snapshots ---O-------O--------O-------O------ - * oldsnap new - */ -int -dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - int err = 0; - uint64_t snapobj; - dsl_pool_t *dp = new->ds_dir->dd_pool; - - ASSERT(dsl_pool_config_held(dp)); - - *usedp = 0; - *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; - *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; - - *compp = 0; - *compp += dsl_dataset_phys(new)->ds_compressed_bytes; - *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; - - *uncompp = 0; - *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; - *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; - - snapobj = new->ds_object; - while (snapobj != oldsnap->ds_object) { - dsl_dataset_t *snap; - uint64_t used, comp, uncomp; - - if (snapobj == new->ds_object) { - snap = new; - } else { - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; - } - - if (dsl_dataset_phys(snap)->ds_prev_snap_txg == - dsl_dataset_phys(oldsnap)->ds_creation_txg) { - /* - * The blocks in the deadlist can not be born after - * ds_prev_snap_txg, so get the whole deadlist space, - * which is more efficient (especially for old-format - * deadlists). Unfortunately the deadlist code - * doesn't have enough information to make this - * optimization itself. - */ - dsl_deadlist_space(&snap->ds_deadlist, - &used, &comp, &uncomp); - } else { - dsl_deadlist_space_range(&snap->ds_deadlist, - 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, - &used, &comp, &uncomp); - } - *usedp += used; - *compp += comp; - *uncompp += uncomp; - - /* - * If we get to the beginning of the chain of snapshots - * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap - * was not a snapshot of/before new. 
- */ - snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - if (snap != new) - dsl_dataset_rele(snap, FTAG); - if (snapobj == 0) { - err = SET_ERROR(EINVAL); - break; - } - - } - return (err); -} - -/* - * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, - * lastsnap, and all snapshots in between are deleted. - * - * blocks that would be freed [---------------------------] - * snapshots ---O-------O--------O-------O--------O - * firstsnap lastsnap - * - * This is the set of blocks that were born after the snap before firstsnap, - * (birth > firstsnap->prev_snap_txg) and died before the snap after the - * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). - * We calculate this by iterating over the relevant deadlists (from the snap - * after lastsnap, backward to the snap after firstsnap), summing up the - * space on the deadlist that was born after the snap before firstsnap. - */ -int -dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - dsl_dataset_t *lastsnap, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - int err = 0; - uint64_t snapobj; - dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; - - ASSERT(firstsnap->ds_is_snapshot); - ASSERT(lastsnap->ds_is_snapshot); - - /* - * Check that the snapshots are in the same dsl_dir, and firstsnap - * is before lastsnap. 
- */ - if (firstsnap->ds_dir != lastsnap->ds_dir || - dsl_dataset_phys(firstsnap)->ds_creation_txg > - dsl_dataset_phys(lastsnap)->ds_creation_txg) - return (SET_ERROR(EINVAL)); - - *usedp = *compp = *uncompp = 0; - - snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; - while (snapobj != firstsnap->ds_object) { - dsl_dataset_t *ds; - uint64_t used, comp, uncomp; - - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); - if (err != 0) - break; - - dsl_deadlist_space_range(&ds->ds_deadlist, - dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, - &used, &comp, &uncomp); - *usedp += used; - *compp += comp; - *uncompp += uncomp; - - snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - ASSERT3U(snapobj, !=, 0); - dsl_dataset_rele(ds, FTAG); - } - return (err); -} - -/* - * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. - * For example, they could both be snapshots of the same filesystem, and - * 'earlier' is before 'later'. Or 'earlier' could be the origin of - * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's - * filesystem. Or 'earlier' could be the origin's origin. - * - * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. 
- */ -boolean_t -dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, - uint64_t earlier_txg) -{ - dsl_pool_t *dp = later->ds_dir->dd_pool; - int error; - boolean_t ret; - - ASSERT(dsl_pool_config_held(dp)); - ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); - - if (earlier_txg == 0) - earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; - - if (later->ds_is_snapshot && - earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) - return (B_FALSE); - - if (later->ds_dir == earlier->ds_dir) - return (B_TRUE); - if (!dsl_dir_is_clone(later->ds_dir)) - return (B_FALSE); - - if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) - return (B_TRUE); - dsl_dataset_t *origin; - error = dsl_dataset_hold_obj(dp, - dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); - if (error != 0) - return (B_FALSE); - ret = dsl_dataset_is_before(origin, earlier, earlier_txg); - dsl_dataset_rele(origin, FTAG); - return (ret); -} - -void -dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); -} - -boolean_t -dsl_dataset_is_zapified(dsl_dataset_t *ds) -{ - dmu_object_info_t doi; - - dmu_object_info_from_db(ds->ds_dbuf, &doi); - return (doi.doi_type == DMU_OTN_ZAP_METADATA); -} - -boolean_t -dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) -{ - return (dsl_dataset_is_zapified(ds) && - zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); -} - -uint64_t -dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) -{ - uint64_t remap_deadlist_obj; - int err; - - if (!dsl_dataset_is_zapified(ds)) - return (0); - - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, - DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, - &remap_deadlist_obj); - - if (err != 0) { - VERIFY3S(err, ==, ENOENT); - return (0); - } - - ASSERT(remap_deadlist_obj != 0); - return 
(remap_deadlist_obj); -} - -boolean_t -dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds) -{ - EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist), - dsl_dataset_get_remap_deadlist_object(ds) != 0); - return (dsl_deadlist_is_open(&ds->ds_remap_deadlist)); -} - -static void -dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, - dmu_tx_t *tx) -{ - ASSERT(obj != 0); - dsl_dataset_zapify(ds, tx); - VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, - DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx)); -} - -static void -dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx)); -} - -void -dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t remap_deadlist_object; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_dataset_remap_deadlist_exists(ds)); - - remap_deadlist_object = ds->ds_remap_deadlist.dl_object; - dsl_deadlist_close(&ds->ds_remap_deadlist); - dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx); - dsl_dataset_unset_remap_deadlist_object(ds, tx); - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); -} - -void -dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t remap_deadlist_obj; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock)); - /* - * Currently we only create remap deadlists when there are indirect - * vdevs with referenced mappings. 
- */ - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - remap_deadlist_obj = dsl_deadlist_clone( - &ds->ds_deadlist, UINT64_MAX, - dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); - dsl_dataset_set_remap_deadlist_object(ds, - remap_deadlist_obj, tx); - dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), - remap_deadlist_obj); - spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c deleted file mode 100644 index 2f3647bc8e86..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c +++ /dev/null @@ -1,561 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include - -/* - * Deadlist concurrency: - * - * Deadlists can only be modified from the syncing thread. 
- * - * Except for dsl_deadlist_insert(), it can only be modified with the - * dp_config_rwlock held with RW_WRITER. - * - * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can - * be called concurrently, from open context, with the dl_config_rwlock held - * with RW_READER. - * - * Therefore, we only need to provide locking between dsl_deadlist_insert() and - * the accessors, protecting: - * dl_phys->dl_used,comp,uncomp - * and protecting the dl_tree from being loaded. - * The locking is provided by dl_lock. Note that locking on the bpobj_t - * provides its own locking, and dl_oldfmt is immutable. - */ - -static int -dsl_deadlist_compare(const void *arg1, const void *arg2) -{ - const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; - const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; - - return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); -} - -static void -dsl_deadlist_load_tree(dsl_deadlist_t *dl) -{ - zap_cursor_t zc; - zap_attribute_t za; - - ASSERT(MUTEX_HELD(&dl->dl_lock)); - - ASSERT(!dl->dl_oldfmt); - if (dl->dl_havetree) - return; - - avl_create(&dl->dl_tree, dsl_deadlist_compare, - sizeof (dsl_deadlist_entry_t), - offsetof(dsl_deadlist_entry_t, dle_node)); - for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); - dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, - za.za_first_integer)); - avl_add(&dl->dl_tree, dle); - } - zap_cursor_fini(&zc); - dl->dl_havetree = B_TRUE; -} - -void -dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) -{ - dmu_object_info_t doi; - - ASSERT(!dsl_deadlist_is_open(dl)); - - mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); - dl->dl_os = os; - dl->dl_object = object; - VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); - 
dmu_object_info_from_db(dl->dl_dbuf, &doi); - if (doi.doi_type == DMU_OT_BPOBJ) { - dmu_buf_rele(dl->dl_dbuf, dl); - dl->dl_dbuf = NULL; - dl->dl_oldfmt = B_TRUE; - VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); - return; - } - - dl->dl_oldfmt = B_FALSE; - dl->dl_phys = dl->dl_dbuf->db_data; - dl->dl_havetree = B_FALSE; -} - -boolean_t -dsl_deadlist_is_open(dsl_deadlist_t *dl) -{ - return (dl->dl_os != NULL); -} - -void -dsl_deadlist_close(dsl_deadlist_t *dl) -{ - void *cookie = NULL; - dsl_deadlist_entry_t *dle; - - ASSERT(dsl_deadlist_is_open(dl)); - - if (dl->dl_oldfmt) { - dl->dl_oldfmt = B_FALSE; - bpobj_close(&dl->dl_bpobj); - dl->dl_os = NULL; - dl->dl_object = 0; - return; - } - - if (dl->dl_havetree) { - while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) - != NULL) { - bpobj_close(&dle->dle_bpobj); - kmem_free(dle, sizeof (*dle)); - } - avl_destroy(&dl->dl_tree); - } - dmu_buf_rele(dl->dl_dbuf, dl); - mutex_destroy(&dl->dl_lock); - dl->dl_dbuf = NULL; - dl->dl_phys = NULL; - dl->dl_os = NULL; - dl->dl_object = 0; -} - -uint64_t -dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) -{ - if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) - return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); - return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, - sizeof (dsl_deadlist_phys_t), tx)); -} - -void -dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) -{ - dmu_object_info_t doi; - zap_cursor_t zc; - zap_attribute_t za; - - VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); - if (doi.doi_type == DMU_OT_BPOBJ) { - bpobj_free(os, dlobj, tx); - return; - } - - for (zap_cursor_init(&zc, os, dlobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t obj = za.za_first_integer; - if (obj == dmu_objset_pool(os)->dp_empty_bpobj) - bpobj_decr_empty(os, tx); - else - bpobj_free(os, obj, tx); - } - zap_cursor_fini(&zc); - VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); -} - -static void -dle_enqueue(dsl_deadlist_t 
*dl, dsl_deadlist_entry_t *dle, - const blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(MUTEX_HELD(&dl->dl_lock)); - if (dle->dle_bpobj.bpo_object == - dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { - uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - bpobj_close(&dle->dle_bpobj); - bpobj_decr_empty(dl->dl_os, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, - dle->dle_mintxg, obj, tx)); - } - bpobj_enqueue(&dle->dle_bpobj, bp, tx); -} - -static void -dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, - uint64_t obj, dmu_tx_t *tx) -{ - ASSERT(MUTEX_HELD(&dl->dl_lock)); - if (dle->dle_bpobj.bpo_object != - dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { - bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); - } else { - bpobj_close(&dle->dle_bpobj); - bpobj_decr_empty(dl->dl_os, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, - dle->dle_mintxg, obj, tx)); - } -} - -void -dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; - avl_index_t where; - - if (dl->dl_oldfmt) { - bpobj_enqueue(&dl->dl_bpobj, bp, tx); - return; - } - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - dmu_buf_will_dirty(dl->dl_dbuf, tx); - dl->dl_phys->dl_used += - bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); - dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); - dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); - - dle_tofind.dle_mintxg = bp->blk_birth; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - if (dle == NULL) - dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); - else - dle = AVL_PREV(&dl->dl_tree, dle); - dle_enqueue(dl, dle, bp, tx); - mutex_exit(&dl->dl_lock); -} - -/* - * Insert new key in deadlist, which must be > all current entries. - * mintxg is not inclusive. 
- */ -void -dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) -{ - uint64_t obj; - dsl_deadlist_entry_t *dle; - - if (dl->dl_oldfmt) - return; - - dle = kmem_alloc(sizeof (*dle), KM_SLEEP); - dle->dle_mintxg = mintxg; - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - avl_add(&dl->dl_tree, dle); - - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, - mintxg, obj, tx)); - mutex_exit(&dl->dl_lock); -} - -/* - * Remove this key, merging its entries into the previous key. - */ -void -dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle, *dle_prev; - - if (dl->dl_oldfmt) - return; - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); - dle_prev = AVL_PREV(&dl->dl_tree, dle); - - dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); - - avl_remove(&dl->dl_tree, dle); - bpobj_close(&dle->dle_bpobj); - kmem_free(dle, sizeof (*dle)); - - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); - mutex_exit(&dl->dl_lock); -} - -/* - * Walk ds's snapshots to regenerate generate ZAP & AVL. 
- */ -static void -dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, - uint64_t mrs_obj, dmu_tx_t *tx) -{ - dsl_deadlist_t dl = { 0 }; - dsl_pool_t *dp = dmu_objset_pool(os); - - dsl_deadlist_open(&dl, os, dlobj); - if (dl.dl_oldfmt) { - dsl_deadlist_close(&dl); - return; - } - - while (mrs_obj != 0) { - dsl_dataset_t *ds; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); - dsl_deadlist_add_key(&dl, - dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); - mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - dsl_dataset_rele(ds, FTAG); - } - dsl_deadlist_close(&dl); -} - -uint64_t -dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, - uint64_t mrs_obj, dmu_tx_t *tx) -{ - dsl_deadlist_entry_t *dle; - uint64_t newobj; - - newobj = dsl_deadlist_alloc(dl->dl_os, tx); - - if (dl->dl_oldfmt) { - dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); - return (newobj); - } - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - uint64_t obj; - - if (dle->dle_mintxg >= maxtxg) - break; - - obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, - dle->dle_mintxg, obj, tx)); - } - mutex_exit(&dl->dl_lock); - return (newobj); -} - -void -dsl_deadlist_space(dsl_deadlist_t *dl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - ASSERT(dsl_deadlist_is_open(dl)); - if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, - usedp, compp, uncompp)); - return; - } - - mutex_enter(&dl->dl_lock); - *usedp = dl->dl_phys->dl_used; - *compp = dl->dl_phys->dl_comp; - *uncompp = dl->dl_phys->dl_uncomp; - mutex_exit(&dl->dl_lock); -} - -/* - * return space used in the range (mintxg, maxtxg]. - * Includes maxtxg, does not include mintxg. - * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is - * larger than any bp in the deadlist (eg. UINT64_MAX)). 
- */ -void -dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - dsl_deadlist_entry_t *dle; - dsl_deadlist_entry_t dle_tofind; - avl_index_t where; - - if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, - mintxg, maxtxg, usedp, compp, uncompp)); - return; - } - - *usedp = *compp = *uncompp = 0; - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - /* - * If we don't find this mintxg, there shouldn't be anything - * after it either. - */ - ASSERT(dle != NULL || - avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); - - for (; dle && dle->dle_mintxg < maxtxg; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - uint64_t used, comp, uncomp; - - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, - &used, &comp, &uncomp)); - - *usedp += used; - *compp += comp; - *uncompp += uncomp; - } - mutex_exit(&dl->dl_lock); -} - -static void -dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, - dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; - avl_index_t where; - uint64_t used, comp, uncomp; - bpobj_t bpo; - - ASSERT(MUTEX_HELD(&dl->dl_lock)); - - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); - bpobj_close(&bpo); - - dsl_deadlist_load_tree(dl); - - dmu_buf_will_dirty(dl->dl_dbuf, tx); - dl->dl_phys->dl_used += used; - dl->dl_phys->dl_comp += comp; - dl->dl_phys->dl_uncomp += uncomp; - - dle_tofind.dle_mintxg = birth; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - if (dle == NULL) - dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); - dle_enqueue_subobj(dl, dle, obj, tx); -} - -static int -dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); - return (0); -} - -/* - * Merge the deadlist 
pointed to by 'obj' into dl. obj will be left as - * an empty deadlist. - */ -void -dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) -{ - zap_cursor_t zc; - zap_attribute_t za; - dmu_buf_t *bonus; - dsl_deadlist_phys_t *dlp; - dmu_object_info_t doi; - - VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); - if (doi.doi_type == DMU_OT_BPOBJ) { - bpobj_t bpo; - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_iterate(&bpo, - dsl_deadlist_insert_cb, dl, tx)); - bpobj_close(&bpo); - return; - } - - mutex_enter(&dl->dl_lock); - for (zap_cursor_init(&zc, dl->dl_os, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t mintxg = zfs_strtonum(za.za_name, NULL); - dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); - } - zap_cursor_fini(&zc); - - VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); - dlp = bonus->db_data; - dmu_buf_will_dirty(bonus, tx); - bzero(dlp, sizeof (*dlp)); - dmu_buf_rele(bonus, FTAG); - mutex_exit(&dl->dl_lock); -} - -/* - * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
- */ -void -dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, - dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; - avl_index_t where; - - ASSERT(!dl->dl_oldfmt); - - mutex_enter(&dl->dl_lock); - dmu_buf_will_dirty(dl->dl_dbuf, tx); - dsl_deadlist_load_tree(dl); - - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - if (dle == NULL) - dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); - while (dle) { - uint64_t used, comp, uncomp; - dsl_deadlist_entry_t *dle_next; - - bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); - - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, - &used, &comp, &uncomp)); - ASSERT3U(dl->dl_phys->dl_used, >=, used); - ASSERT3U(dl->dl_phys->dl_comp, >=, comp); - ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); - dl->dl_phys->dl_used -= used; - dl->dl_phys->dl_comp -= comp; - dl->dl_phys->dl_uncomp -= uncomp; - - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, - dle->dle_mintxg, tx)); - - dle_next = AVL_NEXT(&dl->dl_tree, dle); - avl_remove(&dl->dl_tree, dle); - bpobj_close(&dle->dle_bpobj); - kmem_free(dle, sizeof (*dle)); - dle = dle_next; - } - mutex_exit(&dl->dl_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c deleted file mode 100644 index 0ad658f910ec..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c +++ /dev/null @@ -1,760 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - */ - -/* - * DSL permissions are stored in a two level zap attribute - * mechanism. The first level identifies the "class" of - * entry. The class is identified by the first 2 letters of - * the attribute. The second letter "l" or "d" identifies whether - * it is a local or descendent permission. The first letter - * identifies the type of entry. - * - * ul$ identifies permissions granted locally for this userid. - * ud$ identifies permissions granted on descendent datasets for - * this userid. - * Ul$ identifies permission sets granted locally for this userid. - * Ud$ identifies permission sets granted on descendent datasets for - * this userid. - * gl$ identifies permissions granted locally for this groupid. - * gd$ identifies permissions granted on descendent datasets for - * this groupid. - * Gl$ identifies permission sets granted locally for this groupid. - * Gd$ identifies permission sets granted on descendent datasets for - * this groupid. - * el$ identifies permissions granted locally for everyone. - * ed$ identifies permissions granted on descendent datasets - * for everyone. - * El$ identifies permission sets granted locally for everyone. - * Ed$ identifies permission sets granted to descendent datasets for - * everyone. - * c-$ identifies permission to create at dataset creation time. - * C-$ identifies permission sets to grant locally at dataset creation - * time. 
- * s-$@ permissions defined in specified set @ - * S-$@ Sets defined in named set @ - * - * Each of the above entities points to another zap attribute that contains one - * attribute for each allowed permission, such as create, destroy,... - * All of the "upper" case class types will specify permission set names - * rather than permissions. - * - * Basically it looks something like this: - * ul$12 -> ZAP OBJ -> permissions... - * - * The ZAP OBJ is referred to as the jump object. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_deleg.h" - -/* - * Validate that user is allowed to delegate specified permissions. - * - * In order to delegate "create" you must have "create" - * and "allow". - */ -int -dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) -{ - nvpair_t *whopair = NULL; - int error; - - if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) - return (error); - - while (whopair = nvlist_next_nvpair(nvp, whopair)) { - nvlist_t *perms; - nvpair_t *permpair = NULL; - - VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); - - while (permpair = nvlist_next_nvpair(perms, permpair)) { - const char *perm = nvpair_name(permpair); - - if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) - return (SET_ERROR(EPERM)); - - if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) - return (error); - } - } - return (0); -} - -/* - * Validate that user is allowed to unallow specified permissions. They - * must have the 'allow' permission, and even then can only unallow - * perms for their uid. 
- */ -int -dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) -{ - nvpair_t *whopair = NULL; - int error; - char idstr[32]; - - if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) - return (error); - - (void) snprintf(idstr, sizeof (idstr), "%lld", - (longlong_t)crgetuid(cr)); - - while (whopair = nvlist_next_nvpair(nvp, whopair)) { - zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; - - if (type != ZFS_DELEG_USER && - type != ZFS_DELEG_USER_SETS) - return (SET_ERROR(EPERM)); - - if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) - return (SET_ERROR(EPERM)); - } - return (0); -} - -typedef struct dsl_deleg_arg { - const char *dda_name; - nvlist_t *dda_nvlist; -} dsl_deleg_arg_t; - -static void -dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) -{ - dsl_deleg_arg_t *dda = arg; - dsl_dir_t *dd; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; - uint64_t zapobj; - - VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - if (zapobj == 0) { - dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, - DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); - } - - while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { - const char *whokey = nvpair_name(whopair); - nvlist_t *perms; - nvpair_t *permpair = NULL; - uint64_t jumpobj; - - perms = fnvpair_value_nvlist(whopair); - - if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { - jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, - zapobj, whokey, tx); - } - - while (permpair = nvlist_next_nvpair(perms, permpair)) { - const char *perm = nvpair_name(permpair); - uint64_t n = 0; - - VERIFY(zap_update(mos, jumpobj, - perm, 8, 1, &n, tx) == 0); - spa_history_log_internal_dd(dd, "permission update", tx, - "%s %s", whokey, perm); - } - } - dsl_dir_rele(dd, FTAG); -} - -static void -dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) -{ - dsl_deleg_arg_t *dda = 
arg; - dsl_dir_t *dd; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; - uint64_t zapobj; - - VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - if (zapobj == 0) { - dsl_dir_rele(dd, FTAG); - return; - } - - while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { - const char *whokey = nvpair_name(whopair); - nvlist_t *perms; - nvpair_t *permpair = NULL; - uint64_t jumpobj; - - if (nvpair_value_nvlist(whopair, &perms) != 0) { - if (zap_lookup(mos, zapobj, whokey, 8, - 1, &jumpobj) == 0) { - (void) zap_remove(mos, zapobj, whokey, tx); - VERIFY(0 == zap_destroy(mos, jumpobj, tx)); - } - spa_history_log_internal_dd(dd, "permission who remove", - tx, "%s", whokey); - continue; - } - - if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) - continue; - - while (permpair = nvlist_next_nvpair(perms, permpair)) { - const char *perm = nvpair_name(permpair); - uint64_t n = 0; - - (void) zap_remove(mos, jumpobj, perm, tx); - if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { - (void) zap_remove(mos, zapobj, - whokey, tx); - VERIFY(0 == zap_destroy(mos, - jumpobj, tx)); - } - spa_history_log_internal_dd(dd, "permission remove", tx, - "%s %s", whokey, perm); - } - } - dsl_dir_rele(dd, FTAG); -} - -static int -dsl_deleg_check(void *arg, dmu_tx_t *tx) -{ - dsl_deleg_arg_t *dda = arg; - dsl_dir_t *dd; - int error; - - if (spa_version(dmu_tx_pool(tx)->dp_spa) < - SPA_VERSION_DELEGATED_PERMS) { - return (SET_ERROR(ENOTSUP)); - } - - error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); - if (error == 0) - dsl_dir_rele(dd, FTAG); - return (error); -} - -int -dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) -{ - dsl_deleg_arg_t dda; - - /* nvp must already have been verified to be valid */ - - dda.dda_name = ddname; - dda.dda_nvlist = nvp; - - return (dsl_sync_task(ddname, dsl_deleg_check, - unset ? 
dsl_deleg_unset_sync : dsl_deleg_set_sync, - &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED)); -} - -/* - * Find all 'allow' permissions from a given point and then continue - * traversing up to the root. - * - * This function constructs an nvlist of nvlists. - * each setpoint is an nvlist composed of an nvlist of an nvlist - * of the individual * users/groups/everyone/create - * permissions. - * - * The nvlist will look like this. - * - * { source fsname -> { whokeys { permissions,...}, ...}} - * - * The fsname nvpairs will be arranged in a bottom up order. For example, - * if we have the following structure a/b/c then the nvpairs for the fsnames - * will be ordered a/b/c, a/b, a. - */ -int -dsl_deleg_get(const char *ddname, nvlist_t **nvp) -{ - dsl_dir_t *dd, *startdd; - dsl_pool_t *dp; - int error; - objset_t *mos; - - error = dsl_pool_hold(ddname, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - dp = startdd->dd_pool; - mos = dp->dp_meta_objset; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - for (dd = startdd; dd != NULL; dd = dd->dd_parent) { - zap_cursor_t basezc; - zap_attribute_t baseza; - nvlist_t *sp_nvp; - uint64_t n; - char source[ZFS_MAX_DATASET_NAME_LEN]; - - if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 || - zap_count(mos, - dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0) - continue; - - sp_nvp = fnvlist_alloc(); - for (zap_cursor_init(&basezc, mos, - dsl_dir_phys(dd)->dd_deleg_zapobj); - zap_cursor_retrieve(&basezc, &baseza) == 0; - zap_cursor_advance(&basezc)) { - zap_cursor_t zc; - zap_attribute_t za; - nvlist_t *perms_nvp; - - ASSERT(baseza.za_integer_length == 8); - ASSERT(baseza.za_num_integers == 1); - - perms_nvp = fnvlist_alloc(); - for (zap_cursor_init(&zc, mos, baseza.za_first_integer); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - 
fnvlist_add_boolean(perms_nvp, za.za_name); - } - zap_cursor_fini(&zc); - fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp); - fnvlist_free(perms_nvp); - } - - zap_cursor_fini(&basezc); - - dsl_dir_name(dd, source); - fnvlist_add_nvlist(*nvp, source, sp_nvp); - nvlist_free(sp_nvp); - } - - dsl_dir_rele(startdd, FTAG); - dsl_pool_rele(dp, FTAG); - return (0); -} - -/* - * Routines for dsl_deleg_access() -- access checking. - */ -typedef struct perm_set { - avl_node_t p_node; - boolean_t p_matched; - char p_setname[ZFS_MAX_DELEG_NAME]; -} perm_set_t; - -static int -perm_set_compare(const void *arg1, const void *arg2) -{ - const perm_set_t *node1 = (const perm_set_t *)arg1; - const perm_set_t *node2 = (const perm_set_t *)arg2; - int val; - - val = strcmp(node1->p_setname, node2->p_setname); - - return (AVL_ISIGN(val)); -} - -/* - * Determine whether a specified permission exists. - * - * First the base attribute has to be retrieved. i.e. ul$12 - * Once the base object has been retrieved the actual permission - * is lookup up in the zap object the base object points to. - * - * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if - * there is no perm in that jumpobj. 
- */ -static int -dsl_check_access(objset_t *mos, uint64_t zapobj, - char type, char checkflag, void *valp, const char *perm) -{ - int error; - uint64_t jumpobj, zero; - char whokey[ZFS_MAX_DELEG_NAME]; - - zfs_deleg_whokey(whokey, type, checkflag, valp); - error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); - if (error == 0) { - error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); - if (error == ENOENT) - error = SET_ERROR(EPERM); - } - return (error); -} - -/* - * check a specified user/group for a requested permission - */ -static int -dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, - int checkflag, cred_t *cr) -{ - const gid_t *gids; - int ngids; - int i; - uint64_t id; - - /* check for user */ - id = crgetuid(cr); - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_USER, checkflag, &id, perm) == 0) - return (0); - - /* check for users primary group */ - id = crgetgid(cr); - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) - return (0); - - /* check for everyone entry */ - id = -1; - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) - return (0); - - /* check each supplemental group user is a member of */ - ngids = crgetngroups(cr); - gids = crgetgroups(cr); - for (i = 0; i != ngids; i++) { - id = gids[i]; - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) - return (0); - } - - return (SET_ERROR(EPERM)); -} - -/* - * Iterate over the sets specified in the specified zapobj - * and load them into the permsets avl tree. 
- */ -static int -dsl_load_sets(objset_t *mos, uint64_t zapobj, - char type, char checkflag, void *valp, avl_tree_t *avl) -{ - zap_cursor_t zc; - zap_attribute_t za; - perm_set_t *permnode; - avl_index_t idx; - uint64_t jumpobj; - int error; - char whokey[ZFS_MAX_DELEG_NAME]; - - zfs_deleg_whokey(whokey, type, checkflag, valp); - - error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); - if (error != 0) - return (error); - - for (zap_cursor_init(&zc, mos, jumpobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); - (void) strlcpy(permnode->p_setname, za.za_name, - sizeof (permnode->p_setname)); - permnode->p_matched = B_FALSE; - - if (avl_find(avl, permnode, &idx) == NULL) { - avl_insert(avl, permnode, idx); - } else { - kmem_free(permnode, sizeof (perm_set_t)); - } - } - zap_cursor_fini(&zc); - return (0); -} - -/* - * Load all permissions user based on cred belongs to. - */ -static void -dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, - char checkflag, cred_t *cr) -{ - const gid_t *gids; - int ngids, i; - uint64_t id; - - id = crgetuid(cr); - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_USER_SETS, checkflag, &id, avl); - - id = crgetgid(cr); - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); - - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); - - ngids = crgetngroups(cr); - gids = crgetgroups(cr); - for (i = 0; i != ngids; i++) { - id = gids[i]; - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); - } -} - -/* - * Check if user has requested permission. 
- */ -int -dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) -{ - dsl_dir_t *dd; - dsl_pool_t *dp; - void *cookie; - int error; - char checkflag; - objset_t *mos; - avl_tree_t permsets; - perm_set_t *setnode; - - dp = ds->ds_dir->dd_pool; - mos = dp->dp_meta_objset; - - if (dsl_delegation_on(mos) == B_FALSE) - return (SET_ERROR(ECANCELED)); - - if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < - SPA_VERSION_DELEGATED_PERMS) - return (SET_ERROR(EPERM)); - - if (ds->ds_is_snapshot) { - /* - * Snapshots are treated as descendents only, - * local permissions do not apply. - */ - checkflag = ZFS_DELEG_DESCENDENT; - } else { - checkflag = ZFS_DELEG_LOCAL; - } - - avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), - offsetof(perm_set_t, p_node)); - - ASSERT(dsl_pool_config_held(dp)); - for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, - checkflag = ZFS_DELEG_DESCENDENT) { - uint64_t zapobj; - boolean_t expanded; - - /* - * If not in global zone then make sure - * the zoned property is set - */ - if (!INGLOBALZONE(curthread)) { - uint64_t zoned; - - if (dsl_prop_get_dd(dd, - zfs_prop_to_name(ZFS_PROP_ZONED), - 8, 1, &zoned, NULL, B_FALSE) != 0) - break; - if (!zoned) - break; - } - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - - if (zapobj == 0) - continue; - - dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); -again: - expanded = B_FALSE; - for (setnode = avl_first(&permsets); setnode; - setnode = AVL_NEXT(&permsets, setnode)) { - if (setnode->p_matched == B_TRUE) - continue; - - /* See if this set directly grants this permission */ - error = dsl_check_access(mos, zapobj, - ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); - if (error == 0) - goto success; - if (error == EPERM) - setnode->p_matched = B_TRUE; - - /* See if this set includes other sets */ - error = dsl_load_sets(mos, zapobj, - ZFS_DELEG_NAMED_SET_SETS, 0, - setnode->p_setname, &permsets); - if (error == 0) - setnode->p_matched = expanded = B_TRUE; - } - /* - 
* If we expanded any sets, that will define more sets, - * which we need to check. - */ - if (expanded) - goto again; - - error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); - if (error == 0) - goto success; - } - error = SET_ERROR(EPERM); -success: - - cookie = NULL; - while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) - kmem_free(setnode, sizeof (perm_set_t)); - - return (error); -} - -int -dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int error; - - error = dsl_pool_hold(dsname, FTAG, &dp); - if (error != 0) - return (error); - error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error == 0) { - error = dsl_deleg_access_impl(ds, perm, cr); - dsl_dataset_rele(ds, FTAG); - } - dsl_pool_rele(dp, FTAG); - - return (error); -} - -/* - * Other routines. - */ - -static void -copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, - boolean_t dosets, uint64_t uid, dmu_tx_t *tx) -{ - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t jumpobj, pjumpobj; - uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - zap_cursor_t zc; - zap_attribute_t za; - char whokey[ZFS_MAX_DELEG_NAME]; - - zfs_deleg_whokey(whokey, - dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, - ZFS_DELEG_LOCAL, NULL); - if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) - return; - - if (zapobj == 0) { - dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, - DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); - } - - zfs_deleg_whokey(whokey, - dosets ? 
ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, - ZFS_DELEG_LOCAL, &uid); - if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { - jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); - } - - for (zap_cursor_init(&zc, mos, pjumpobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t zero = 0; - ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); - - VERIFY(zap_update(mos, jumpobj, za.za_name, - 8, 1, &zero, tx) == 0); - } - zap_cursor_fini(&zc); -} - -/* - * set all create time permission on new dataset. - */ -void -dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) -{ - dsl_dir_t *dd; - uint64_t uid = crgetuid(cr); - - if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < - SPA_VERSION_DELEGATED_PERMS) - return; - - for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { - uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - - if (pzapobj == 0) - continue; - - copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); - copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); - } -} - -int -dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) -{ - zap_cursor_t zc; - zap_attribute_t za; - - if (zapobj == 0) - return (0); - - for (zap_cursor_init(&zc, mos, zapobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); - VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); - } - zap_cursor_fini(&zc); - VERIFY(0 == zap_destroy(mos, zapobj, tx)); - return (0); -} - -boolean_t -dsl_delegation_on(objset_t *os) -{ - return (!!spa_delegation(os->os_spa)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c deleted file mode 100644 index 41b016a1d8ae..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ /dev/null @@ -1,1097 +0,0 @@ 
-/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2013 by Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__FreeBSD__) && defined(_KERNEL) -#include -#endif - - -int -dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) -{ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - if (dsl_dataset_long_held(ds)) - return (SET_ERROR(EBUSY)); - - /* - * Only allow deferred destroy on pools that support it. - * NOTE: deferred destroy is only supported on snapshots. - */ - if (defer) { - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_USERREFS) - return (SET_ERROR(ENOTSUP)); - return (0); - } - - /* - * If this snapshot has an elevated user reference count, - * we can't destroy it yet. 
- */ - if (ds->ds_userrefs > 0) - return (SET_ERROR(EBUSY)); - - /* - * Can't delete a branch point. - */ - if (dsl_dataset_phys(ds)->ds_num_children > 1) - return (SET_ERROR(EEXIST)); - - return (0); -} - -int -dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_snapshot_arg_t *ddsa = arg; - const char *dsname = ddsa->ddsa_name; - boolean_t defer = ddsa->ddsa_defer; - - dsl_pool_t *dp = dmu_tx_pool(tx); - int error = 0; - dsl_dataset_t *ds; - - error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - - /* - * If the snapshot does not exist, silently ignore it, and - * dsl_destroy_snapshot_sync() will be a no-op - * (it's "already destroyed"). - */ - if (error == ENOENT) - return (0); - - if (error == 0) { - error = dsl_destroy_snapshot_check_impl(ds, defer); - dsl_dataset_rele(ds, FTAG); - } - - return (error); -} - -struct process_old_arg { - dsl_dataset_t *ds; - dsl_dataset_t *ds_prev; - boolean_t after_branch_point; - zio_t *pio; - uint64_t used, comp, uncomp; -}; - -static int -process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - struct process_old_arg *poa = arg; - dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; - - ASSERT(!BP_IS_HOLE(bp)); - - if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { - dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); - if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > - dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { - dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += - bp_get_dsize_sync(dp->dp_spa, bp); - } - } else { - poa->used += bp_get_dsize_sync(dp->dp_spa, bp); - poa->comp += BP_GET_PSIZE(bp); - poa->uncomp += BP_GET_UCSIZE(bp); - dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); - } - return (0); -} - -static void -process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, - dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) -{ - struct process_old_arg poa = { 0 }; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - 
uint64_t deadlist_obj; - - ASSERT(ds->ds_deadlist.dl_oldfmt); - ASSERT(ds_next->ds_deadlist.dl_oldfmt); - - poa.ds = ds; - poa.ds_prev = ds_prev; - poa.after_branch_point = after_branch_point; - poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, - process_old_cb, &poa, tx)); - VERIFY0(zio_wait(poa.pio)); - ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes); - - /* change snapused */ - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -poa.used, -poa.comp, -poa.uncomp, tx); - - /* swap next's deadlist to our deadlist */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_close(&ds_next->ds_deadlist); - deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; - dsl_dataset_phys(ds)->ds_deadlist_obj = - dsl_dataset_phys(ds_next)->ds_deadlist_obj; - dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; - dsl_deadlist_open(&ds->ds_deadlist, mos, - dsl_dataset_phys(ds)->ds_deadlist_obj); - dsl_deadlist_open(&ds_next->ds_deadlist, mos, - dsl_dataset_phys(ds_next)->ds_deadlist_obj); -} - -static void -dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - - /* - * If it is the old version, dd_clones doesn't exist so we can't - * find the clones, but dsl_deadlist_remove_key() is a no-op so it - * doesn't matter. 
- */ - if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0) - return; - - for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; - - VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - if (clone->ds_dir->dd_origin_txg > mintxg) { - dsl_deadlist_remove_key(&clone->ds_deadlist, - mintxg, tx); - if (dsl_dataset_remap_deadlist_exists(clone)) { - dsl_deadlist_remove_key( - &clone->ds_remap_deadlist, mintxg, tx); - } - dsl_dataset_remove_clones_key(clone, mintxg, tx); - } - dsl_dataset_rele(clone, FTAG); - } - zap_cursor_fini(&zc); -} - -static void -dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* Move blocks to be obsoleted to pool's obsolete list. */ - if (dsl_dataset_remap_deadlist_exists(ds_next)) { - if (!bpobj_is_open(&dp->dp_obsolete_bpobj)) - dsl_pool_create_obsolete_bpobj(dp, tx); - - dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist, - &dp->dp_obsolete_bpobj, - dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); - } - - /* Merge our deadlist into next's and free it. 
*/ - if (dsl_dataset_remap_deadlist_exists(ds)) { - uint64_t remap_deadlist_object = - dsl_dataset_get_remap_deadlist_object(ds); - ASSERT(remap_deadlist_object != 0); - - mutex_enter(&ds_next->ds_remap_deadlist_lock); - if (!dsl_dataset_remap_deadlist_exists(ds_next)) - dsl_dataset_create_remap_deadlist(ds_next, tx); - mutex_exit(&ds_next->ds_remap_deadlist_lock); - - dsl_deadlist_merge(&ds_next->ds_remap_deadlist, - remap_deadlist_object, tx); - dsl_dataset_destroy_remap_deadlist(ds, tx); - } -} - -void -dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) -{ - int err; - int after_branch_point = FALSE; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - dsl_dataset_t *ds_prev = NULL; - uint64_t obj; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); - - if (defer && - (ds->ds_userrefs > 0 || - dsl_dataset_phys(ds)->ds_num_children > 1)) { - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); - return; - } - - ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - - /* We need to log before removing it from the namespace. 
*/ - spa_history_log_internal_ds(ds, "destroy", tx, ""); - - dsl_scan_ds_destroyed(ds, tx); - - obj = ds->ds_object; - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) { - dsl_dataset_deactivate_feature(obj, f, tx); - ds->ds_feature_inuse[f] = B_FALSE; - } - } - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - ASSERT3P(ds->ds_prev, ==, NULL); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); - after_branch_point = - (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj); - - dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); - if (after_branch_point && - dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) { - dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); - if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { - VERIFY0(zap_add_int(mos, - dsl_dataset_phys(ds_prev)-> - ds_next_clones_obj, - dsl_dataset_phys(ds)->ds_next_snap_obj, - tx)); - } - } - if (!after_branch_point) { - dsl_dataset_phys(ds_prev)->ds_next_snap_obj = - dsl_dataset_phys(ds)->ds_next_snap_obj; - } - } - - dsl_dataset_t *ds_next; - uint64_t old_unique; - uint64_t used = 0, comp = 0, uncomp = 0; - - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next)); - ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj); - - old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; - - dmu_buf_will_dirty(ds_next->ds_dbuf, tx); - dsl_dataset_phys(ds_next)->ds_prev_snap_obj = - dsl_dataset_phys(ds)->ds_prev_snap_obj; - dsl_dataset_phys(ds_next)->ds_prev_snap_txg = - dsl_dataset_phys(ds)->ds_prev_snap_txg; - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, - ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0); - - if (ds_next->ds_deadlist.dl_oldfmt) { - process_old_deadlist(ds, ds_prev, ds_next, - after_branch_point, tx); - } else { - /* Adjust prev's unique space. 
*/ - if (ds_prev && !after_branch_point) { - dsl_deadlist_space_range(&ds_next->ds_deadlist, - dsl_dataset_phys(ds_prev)->ds_prev_snap_txg, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - &used, &comp, &uncomp); - dsl_dataset_phys(ds_prev)->ds_unique_bytes += used; - } - - /* Adjust snapused. */ - dsl_deadlist_space_range(&ds_next->ds_deadlist, - dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX, - &used, &comp, &uncomp); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -used, -comp, -uncomp, tx); - - /* Move blocks to be freed to pool's free list. */ - dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, - &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg, - tx); - dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, - DD_USED_HEAD, used, comp, uncomp, tx); - - /* Merge our deadlist into next's and free it. */ - dsl_deadlist_merge(&ds_next->ds_deadlist, - dsl_dataset_phys(ds)->ds_deadlist_obj, tx); - } - - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_deadlist_obj = 0; - - dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); - - /* Collapse range in clone heads */ - dsl_dataset_remove_clones_key(ds, - dsl_dataset_phys(ds)->ds_creation_txg, tx); - - if (ds_next->ds_is_snapshot) { - dsl_dataset_t *ds_nextnext; - - /* - * Update next's unique to include blocks which - * were previously shared by only this snapshot - * and it. Those blocks will be born after the - * prev snap and before this snap, and will have - * died after the next snap and before the one - * after that (ie. be on the snap after next's - * deadlist). 
- */ - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds_next)->ds_next_snap_obj, - FTAG, &ds_nextnext)); - dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - dsl_dataset_phys(ds)->ds_creation_txg, - &used, &comp, &uncomp); - dsl_dataset_phys(ds_next)->ds_unique_bytes += used; - dsl_dataset_rele(ds_nextnext, FTAG); - ASSERT3P(ds_next->ds_prev, ==, NULL); - - /* Collapse range in this head. */ - dsl_dataset_t *hds; - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); - dsl_deadlist_remove_key(&hds->ds_deadlist, - dsl_dataset_phys(ds)->ds_creation_txg, tx); - if (dsl_dataset_remap_deadlist_exists(hds)) { - dsl_deadlist_remove_key(&hds->ds_remap_deadlist, - dsl_dataset_phys(ds)->ds_creation_txg, tx); - } - dsl_dataset_rele(hds, FTAG); - - } else { - ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_rele(ds_next->ds_prev, ds_next); - ds_next->ds_prev = NULL; - if (ds_prev) { - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, - ds_next, &ds_next->ds_prev)); - } - - dsl_dataset_recalc_head_uniq(ds_next); - - /* - * Reduce the amount of our unconsumed refreservation - * being charged to our parent by the amount of - * new unique data we have gained. - */ - if (old_unique < ds_next->ds_reserved) { - int64_t mrsdelta; - uint64_t new_unique = - dsl_dataset_phys(ds_next)->ds_unique_bytes; - - ASSERT(old_unique <= new_unique); - mrsdelta = MIN(new_unique - old_unique, - ds_next->ds_reserved - old_unique); - dsl_dir_diduse_space(ds->ds_dir, - DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); - } - } - dsl_dataset_rele(ds_next, FTAG); - - /* - * This must be done after the dsl_traverse(), because it will - * re-open the objset. 
- */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - - /* remove from snapshot namespace */ - dsl_dataset_t *ds_head; - ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); - VERIFY0(dsl_dataset_get_snapname(ds)); -#ifdef ZFS_DEBUG - { - uint64_t val; - - err = dsl_dataset_snap_lookup(ds_head, - ds->ds_snapname, &val); - ASSERT0(err); - ASSERT3U(val, ==, obj); - } -#endif - VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); - dsl_dataset_rele(ds_head, FTAG); - - if (ds_prev != NULL) - dsl_dataset_rele(ds_prev, FTAG); - - spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - - if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { - uint64_t count; - ASSERT0(zap_count(mos, - dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && - count == 0); - VERIFY0(dmu_object_free(mos, - dsl_dataset_phys(ds)->ds_next_clones_obj, tx)); - } - if (dsl_dataset_phys(ds)->ds_props_obj != 0) - VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj, - tx)); - if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) - VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, - tx)); - -#if defined(__FreeBSD__) && defined(_KERNEL) - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - - dsl_dataset_name(ds, dsname); - zvol_remove_minors(dp->dp_spa, dsname); -#endif - - dsl_dir_rele(ds->ds_dir, ds); - ds->ds_dir = NULL; - dmu_object_free_zapified(mos, obj, tx); -} - -void -dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_snapshot_arg_t *ddsa = arg; - const char *dsname = ddsa->ddsa_name; - boolean_t defer = ddsa->ddsa_defer; - - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - int error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error == ENOENT) - return; - ASSERT0(error); - dsl_destroy_snapshot_sync_impl(ds, defer, tx); - dsl_dataset_rele(ds, FTAG); -} - -/* - * The semantics of this function are 
described in the comment above - * lzc_destroy_snaps(). To summarize: - * - * The snapshots must all be in the same pool. - * - * Snapshots that don't exist will be silently ignored (considered to be - * "already deleted"). - * - * On success, all snaps will be destroyed and this will return 0. - * On failure, no snaps will be destroyed, the errlist will be filled in, - * and this will return an errno. - */ -int -dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, - nvlist_t *errlist) -{ - if (nvlist_next_nvpair(snaps, NULL) == NULL) - return (0); - - /* - * lzc_destroy_snaps() is documented to take an nvlist whose - * values "don't matter". We need to convert that nvlist to - * one that we know can be converted to LUA. We also don't - * care about any duplicate entries because the nvlist will - * be converted to a LUA table which should take care of this. - */ - nvlist_t *snaps_normalized; - VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP)); - for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL); - pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { - fnvlist_add_boolean_value(snaps_normalized, - nvpair_name(pair), B_TRUE); - } - - nvlist_t *arg; - VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP)); - fnvlist_add_nvlist(arg, "snaps", snaps_normalized); - fnvlist_free(snaps_normalized); - fnvlist_add_boolean_value(arg, "defer", defer); - - nvlist_t *wrapper; - VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP)); - fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg); - fnvlist_free(arg); - - const char *program = - "arg = ...\n" - "snaps = arg['snaps']\n" - "defer = arg['defer']\n" - "errors = { }\n" - "has_errors = false\n" - "for snap, v in pairs(snaps) do\n" - " errno = zfs.check.destroy{snap, defer=defer}\n" - " zfs.debug('snap: ' .. snap .. ' errno: ' .. 
errno)\n" - " if errno == ENOENT then\n" - " snaps[snap] = nil\n" - " elseif errno ~= 0 then\n" - " errors[snap] = errno\n" - " has_errors = true\n" - " end\n" - "end\n" - "if has_errors then\n" - " return errors\n" - "end\n" - "for snap, v in pairs(snaps) do\n" - " errno = zfs.sync.destroy{snap, defer=defer}\n" - " assert(errno == 0)\n" - "end\n" - "return { }\n"; - - nvlist_t *result = fnvlist_alloc(); - int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)), - program, - B_TRUE, - 0, - zfs_lua_max_memlimit, - nvlist_next_nvpair(wrapper, NULL), result); - if (error != 0) { - char *errorstr = NULL; - (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); - if (errorstr != NULL) { - zfs_dbgmsg(errorstr); - } - return (error); - } - fnvlist_free(wrapper); - - /* - * lzc_destroy_snaps() is documented to fill the errlist with - * int32 values, so we need to covert the int64 values that are - * returned from LUA. - */ - int rv = 0; - nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN); - for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL); - pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) { - int32_t val = (int32_t)fnvpair_value_int64(pair); - if (rv == 0) - rv = val; - fnvlist_add_int32(errlist, nvpair_name(pair), val); - } - fnvlist_free(result); - return (rv); -} - -int -dsl_destroy_snapshot(const char *name, boolean_t defer) -{ - int error; - nvlist_t *nvl = fnvlist_alloc(); - nvlist_t *errlist = fnvlist_alloc(); - - fnvlist_add_boolean(nvl, name); - error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); - fnvlist_free(errlist); - fnvlist_free(nvl); - return (error); -} - -struct killarg { - dsl_dataset_t *ds; - dmu_tx_t *tx; -}; - -/* ARGSUSED */ -static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct killarg *ka = arg; - dmu_tx_t *tx = ka->tx; - - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return 
(0); - - if (zb->zb_level == ZB_ZIL_LEVEL) { - ASSERT(zilog != NULL); - /* - * It's a block in the intent log. It has no - * accounting, so just free it. - */ - dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); - } else { - ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, - dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); - } - - return (0); -} - -static void -old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - struct killarg ka; - - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. - */ - ka.ds = ds; - ka.tx = tx; - VERIFY0(traverse_dataset(ds, - dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST, - kill_blkptr, &ka)); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - dsl_dataset_phys(ds)->ds_unique_bytes == 0); -} - -int -dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) -{ - int error; - uint64_t count; - objset_t *mos; - - ASSERT(!ds->ds_is_snapshot); - if (ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - if (zfs_refcount_count(&ds->ds_longholds) != expected_holds) - return (SET_ERROR(EBUSY)); - - mos = ds->ds_dir->dd_pool->dp_meta_objset; - - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) - */ - if (ds->ds_prev != NULL && - dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) - return (SET_ERROR(EBUSY)); - - /* - * Can't delete if there are children of this fs. 
- */ - error = zap_count(mos, - dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); - if (error != 0) - return (error); - if (count != 0) - return (SET_ERROR(EEXIST)); - - if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && - dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && - ds->ds_prev->ds_userrefs == 0) { - /* We need to remove the origin snapshot as well. */ - if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds)) - return (SET_ERROR(EBUSY)); - } - return (0); -} - -int -dsl_destroy_head_check(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_head_arg_t *ddha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - - error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); - if (error != 0) - return (error); - - error = dsl_destroy_head_check_impl(ds, 0); - dsl_dataset_rele(ds, FTAG); - return (error); -} - -static void -dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) -{ - dsl_dir_t *dd; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - dd_used_t t; - - ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); - - VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); - - ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); - - /* - * Decrement the filesystem count for all parent filesystems. - * - * When we receive an incremental stream into a filesystem that already - * exists, a temporary clone is created. We never count this temporary - * clone, whose name begins with a '%'. - */ - if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) - dsl_fs_ss_count_adjust(dd->dd_parent, -1, - DD_FIELD_FILESYSTEM_COUNT, tx); - - /* - * Remove our reservation. The impl() routine avoids setting the - * actual property, which would require the (already destroyed) ds. 
- */ - dsl_dir_set_reservation_sync_impl(dd, 0, tx); - - ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); - ASSERT0(dsl_dir_phys(dd)->dd_reserved); - for (t = 0; t < DD_USED_NUM; t++) - ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); - - VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); - VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); - if (dsl_dir_phys(dd)->dd_clones != 0) - VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx)); - VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); - VERIFY0(zap_remove(mos, - dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, - dd->dd_myname, tx)); - - dsl_dir_rele(dd, FTAG); - dmu_object_free_zapified(mos, ddobj, tx); -} - -void -dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - uint64_t obj, ddobj, prevobj = 0; - boolean_t rmorigin; - - ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - ASSERT(ds->ds_prev == NULL || - dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - /* We need to log before removing it from the namespace. */ - spa_history_log_internal_ds(ds, "destroy", tx, ""); - - rmorigin = (dsl_dir_is_clone(ds->ds_dir) && - DS_IS_DEFER_DESTROY(ds->ds_prev) && - dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && - ds->ds_prev->ds_userrefs == 0); - - /* Remove our reservation. 
*/ - if (ds->ds_reserved != 0) { - dsl_dataset_set_refreservation_sync_impl(ds, - (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), - 0, tx); - ASSERT0(ds->ds_reserved); - } - - obj = ds->ds_object; - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) { - dsl_dataset_deactivate_feature(obj, f, tx); - ds->ds_feature_inuse[f] = B_FALSE; - } - } - - dsl_scan_ds_destroyed(ds, tx); - - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - /* This is a clone */ - ASSERT(ds->ds_prev != NULL); - ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=, - obj); - ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); - - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) { - dsl_dataset_remove_from_next_clones(ds->ds_prev, - obj, tx); - } - - ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1); - dsl_dataset_phys(ds->ds_prev)->ds_num_children--; - } - - /* - * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty since the dataset has no snapshots. - * (If it's a clone, it's safe to ignore the deadlist contents - * since they are still referenced by the origin snapshot.) - */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_deadlist_obj = 0; - - if (dsl_dataset_remap_deadlist_exists(ds)) - dsl_dataset_destroy_remap_deadlist(ds, tx); - - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { - old_synchronous_dataset_destroy(ds, tx); - } else { - /* - * Move the bptree into the pool's list of trees to - * clean up and update space accounting information. 
- */ - uint64_t used, comp, uncomp; - - zil_destroy_sync(dmu_objset_zil(os), tx); - - if (!spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_ASYNC_DESTROY)) { - dsl_scan_t *scn = dp->dp_scan; - spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, - tx); - dp->dp_bptree_obj = bptree_alloc(mos, tx); - VERIFY0(zap_add(mos, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj, tx)); - ASSERT(!scn->scn_async_destroying); - scn->scn_async_destroying = B_TRUE; - } - - used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; - comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; - uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - dsl_dataset_phys(ds)->ds_unique_bytes == used); - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - bptree_add(mos, dp->dp_bptree_obj, - &dsl_dataset_phys(ds)->ds_bp, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - used, comp, uncomp, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - -used, -comp, -uncomp, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - used, comp, uncomp, tx); - } - - if (ds->ds_prev != NULL) { - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY0(zap_remove_int(mos, - dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones, - ds->ds_object, tx)); - } - prevobj = ds->ds_prev->ds_object; - dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = NULL; - } - - /* - * This must be done after the dsl_traverse(), because it will - * re-open the objset. 
- */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - - /* Erase the link in the dir */ - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0; - ddobj = ds->ds_dir->dd_object; - ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0); - VERIFY0(zap_destroy(mos, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); - - if (ds->ds_bookmarks != 0) { - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); - spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - } - - spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - - ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj); - ASSERT0(dsl_dataset_phys(ds)->ds_props_obj); - ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj); - dsl_dir_rele(ds->ds_dir, ds); - ds->ds_dir = NULL; - dmu_object_free_zapified(mos, obj, tx); - - dsl_dir_destroy_sync(ddobj, tx); - - if (rmorigin) { - dsl_dataset_t *prev; - VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); - dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); - dsl_dataset_rele(prev, FTAG); - } -} - -void -dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_head_arg_t *ddha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); - dsl_destroy_head_sync_impl(ds, tx); -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_remove_minors(dp->dp_spa, ddha->ddha_name); -#endif - dsl_dataset_rele(ds, FTAG); -} - -static void -dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_head_arg_t *ddha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); - - /* Mark it as inconsistent on-disk, in case we crash */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_log_internal_ds(ds, "destroy begin", tx, ""); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_destroy_head(const char *name) -{ 
- dsl_destroy_head_arg_t ddha; - int error; - spa_t *spa; - boolean_t isenabled; - -#ifdef _KERNEL - zfs_destroy_unmount_origin(name); -#endif - - error = spa_open(name, &spa, FTAG); - if (error != 0) - return (error); - isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY); - spa_close(spa, FTAG); - - ddha.ddha_name = name; - - if (!isenabled) { - objset_t *os; - - error = dsl_sync_task(name, dsl_destroy_head_check, - dsl_destroy_head_begin_sync, &ddha, - 0, ZFS_SPACE_CHECK_DESTROY); - if (error != 0) - return (error); - - /* - * Head deletion is processed in one txg on old pools; - * remove the objects from open context so that the txg sync - * is not too long. - */ - error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); - if (error == 0) { - uint64_t prev_snap_txg = - dsl_dataset_phys(dmu_objset_ds(os))-> - ds_prev_snap_txg; - for (uint64_t obj = 0; error == 0; - error = dmu_object_next(os, &obj, FALSE, - prev_snap_txg)) - (void) dmu_free_long_object(os, obj); - /* sync out all frees */ - txg_wait_synced(dmu_objset_pool(os), 0); - dmu_objset_disown(os, FTAG); - } - } - - return (dsl_sync_task(name, dsl_destroy_head_check, - dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY)); -} - -/* - * Note, this function is used as the callback for dmu_objset_find(). We - * always return 0 so that we will continue to find and process - * inconsistent datasets, even if we encounter an error trying to - * process one of them. - */ -/* ARGSUSED */ -int -dsl_destroy_inconsistent(const char *dsname, void *arg) -{ - objset_t *os; - - if (dmu_objset_hold(dsname, FTAG, &os) == 0) { - boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); - - /* - * If the dataset is inconsistent because a resumable receive - * has failed, then do not destroy it. 
- */ - if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) - need_destroy = B_FALSE; - - dmu_objset_rele(os, FTAG); - if (need_destroy) - (void) dsl_destroy_head(dsname); - } - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c deleted file mode 100644 index 2f43aabf7c82..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ /dev/null @@ -1,2184 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek . - * All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2018, loli10K . All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#endif -#include -#include -#include -#include "zfs_namecheck.h" -#include "zfs_prop.h" - -/* - * Filesystem and Snapshot Limits - * ------------------------------ - * - * These limits are used to restrict the number of filesystems and/or snapshots - * that can be created at a given level in the tree or below. A typical - * use-case is with a delegated dataset where the administrator wants to ensure - * that a user within the zone is not creating too many additional filesystems - * or snapshots, even though they're not exceeding their space quota. - * - * The filesystem and snapshot counts are stored as extensible properties. This - * capability is controlled by a feature flag and must be enabled to be used. - * Once enabled, the feature is not active until the first limit is set. At - * that point, future operations to create/destroy filesystems or snapshots - * will validate and update the counts. - * - * Because the count properties will not exist before the feature is active, - * the counts are updated when a limit is first set on an uninitialized - * dsl_dir node in the tree (The filesystem/snapshot count on a node includes - * all of the nested filesystems/snapshots. Thus, a new leaf node has a - * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and - * snapshot count properties on a node indicate uninitialized counts on that - * node.) When first setting a limit on an uninitialized node, the code starts - * at the filesystem with the new limit and descends into all sub-filesystems - * to add the count properties. - * - * In practice this is lightweight since a limit is typically set when the - * filesystem is created and thus has no children. 
Once valid, changing the - * limit value won't require a re-traversal since the counts are already valid. - * When recursively fixing the counts, if a node with a limit is encountered - * during the descent, the counts are known to be valid and there is no need to - * descend into that filesystem's children. The counts on filesystems above the - * one with the new limit will still be uninitialized, unless a limit is - * eventually set on one of those filesystems. The counts are always recursively - * updated when a limit is set on a dataset, unless there is already a limit. - * When a new limit value is set on a filesystem with an existing limit, it is - * possible for the new limit to be less than the current count at that level - * since a user who can change the limit is also allowed to exceed the limit. - * - * Once the feature is active, then whenever a filesystem or snapshot is - * created, the code recurses up the tree, validating the new count against the - * limit at each initialized level. In practice, most levels will not have a - * limit set. If there is a limit at any initialized level up the tree, the - * check must pass or the creation will fail. Likewise, when a filesystem or - * snapshot is destroyed, the counts are recursively adjusted all the way up - * the initizized nodes in the tree. Renaming a filesystem into different point - * in the tree will first validate, then update the counts on each branch up to - * the common ancestor. A receive will also validate the counts and then update - * them. - * - * An exception to the above behavior is that the limit is not enforced if the - * user has permission to modify the limit. This is primarily so that - * recursive snapshots in the global zone always work. We want to prevent a - * denial-of-service in which a lower level delegated dataset could max out its - * limit and thus block recursive snapshots from being taken in the global zone. 
- * Because of this, it is possible for the snapshot count to be over the limit - * and snapshots taken in the global zone could cause a lower level dataset to - * hit or exceed its limit. The administrator taking the global zone recursive - * snapshot should be aware of this side-effect and behave accordingly. - * For consistency, the filesystem limit is also not enforced if the user can - * modify the limit. - * - * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() - * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in - * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by - * dsl_dir_init_fs_ss_count(). - * - * There is a special case when we receive a filesystem that already exists. In - * this case a temporary clone name of %X is created (see dmu_recv_begin). We - * never update the filesystem counts for temporary clones. - * - * Likewise, we do not update the snapshot counts for temporary snapshots, - * such as those created by zfs diff. 
- */ - -extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); - -static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); - -typedef struct ddulrt_arg { - dsl_dir_t *ddulrta_dd; - uint64_t ddlrta_txg; -} ddulrt_arg_t; - -static void -dsl_dir_evict_async(void *dbu) -{ - dsl_dir_t *dd = dbu; - dsl_pool_t *dp = dd->dd_pool; - int t; - - dd->dd_dbuf = NULL; - - for (t = 0; t < TXG_SIZE; t++) { - ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); - ASSERT(dd->dd_tempreserved[t] == 0); - ASSERT(dd->dd_space_towrite[t] == 0); - } - - if (dd->dd_parent) - dsl_dir_async_rele(dd->dd_parent, dd); - - spa_async_close(dd->dd_pool->dp_spa, dd); - - dsl_prop_fini(dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); -} - -int -dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **ddp) -{ - dmu_buf_t *dbuf; - dsl_dir_t *dd; - int err; - - ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); - if (err != 0) - return (err); - dd = dmu_buf_get_user(dbuf); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbuf, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); - ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); - } -#endif - if (dd == NULL) { - dsl_dir_t *winner; - - dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); - dd->dd_object = ddobj; - dd->dd_dbuf = dbuf; - dd->dd_pool = dp; - mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); - dsl_prop_init(dd); - - dsl_dir_snap_cmtime_update(dd); - - if (dsl_dir_phys(dd)->dd_parent_obj) { - err = dsl_dir_hold_obj(dp, - dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, - &dd->dd_parent); - if (err != 0) - goto errout; - if (tail) { -#ifdef ZFS_DEBUG - uint64_t foundobj; - - err = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(dd->dd_parent)-> - dd_child_dir_zapobj, tail, - sizeof (foundobj), 1, &foundobj); - ASSERT(err || foundobj == ddobj); -#endif - (void) strcpy(dd->dd_myname, tail); - } else { - err = 
zap_value_search(dp->dp_meta_objset, - dsl_dir_phys(dd->dd_parent)-> - dd_child_dir_zapobj, - ddobj, 0, dd->dd_myname); - } - if (err != 0) - goto errout; - } else { - (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); - } - - if (dsl_dir_is_clone(dd)) { - dmu_buf_t *origin_bonus; - dsl_dataset_phys_t *origin_phys; - - /* - * We can't open the origin dataset, because - * that would require opening this dsl_dir. - * Just look at its phys directly instead. - */ - err = dmu_bonus_hold(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_origin_obj, FTAG, - &origin_bonus); - if (err != 0) - goto errout; - origin_phys = origin_bonus->db_data; - dd->dd_origin_txg = - origin_phys->ds_creation_txg; - dmu_buf_rele(origin_bonus, FTAG); - } - - dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, - &dd->dd_dbuf); - winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); - if (winner != NULL) { - if (dd->dd_parent) - dsl_dir_rele(dd->dd_parent, dd); - dsl_prop_fini(dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dd = winner; - } else { - spa_open_ref(dp->dp_spa, dd); - } - } - - /* - * The dsl_dir_t has both open-to-close and instantiate-to-evict - * holds on the spa. We need the open-to-close holds because - * otherwise the spa_refcnt wouldn't change when we open a - * dir which the spa also has open, so we could incorrectly - * think it was OK to unload/export/destroy the pool. We need - * the instantiate-to-evict hold because the dsl_dir_t has a - * pointer to the dd_pool, which has a pointer to the spa_t. 
- */ - spa_open_ref(dp->dp_spa, tag); - ASSERT3P(dd->dd_pool, ==, dp); - ASSERT3U(dd->dd_object, ==, ddobj); - ASSERT3P(dd->dd_dbuf, ==, dbuf); - *ddp = dd; - return (0); - -errout: - if (dd->dd_parent) - dsl_dir_rele(dd->dd_parent, dd); - dsl_prop_fini(dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); -} - -void -dsl_dir_rele(dsl_dir_t *dd, void *tag) -{ - dprintf_dd(dd, "%s\n", ""); - spa_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele(dd->dd_dbuf, tag); -} - -/* - * Remove a reference to the given dsl dir that is being asynchronously - * released. Async releases occur from a taskq performing eviction of - * dsl datasets and dirs. This process is identical to a normal release - * with the exception of using the async API for releasing the reference on - * the spa. - */ -void -dsl_dir_async_rele(dsl_dir_t *dd, void *tag) -{ - dprintf_dd(dd, "%s\n", ""); - spa_async_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele(dd->dd_dbuf, tag); -} - -/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ -void -dsl_dir_name(dsl_dir_t *dd, char *buf) -{ - if (dd->dd_parent) { - dsl_dir_name(dd->dd_parent, buf); - VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, - ZFS_MAX_DATASET_NAME_LEN); - } else { - buf[0] = '\0'; - } - if (!MUTEX_HELD(&dd->dd_lock)) { - /* - * recursive mutex so that we can use - * dprintf_dd() with dd_lock held - */ - mutex_enter(&dd->dd_lock); - VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - mutex_exit(&dd->dd_lock); - } else { - VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - } -} - -/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ -int -dsl_dir_namelen(dsl_dir_t *dd) -{ - int result = 0; - - if (dd->dd_parent) { - /* parent's name + 1 for the "/" */ - result = dsl_dir_namelen(dd->dd_parent) + 1; - } - - if (!MUTEX_HELD(&dd->dd_lock)) { - /* see 
dsl_dir_name */ - mutex_enter(&dd->dd_lock); - result += strlen(dd->dd_myname); - mutex_exit(&dd->dd_lock); - } else { - result += strlen(dd->dd_myname); - } - - return (result); -} - -static int -getcomponent(const char *path, char *component, const char **nextp) -{ - char *p; - - if ((path == NULL) || (path[0] == '\0')) - return (SET_ERROR(ENOENT)); - /* This would be a good place to reserve some namespace... */ - p = strpbrk(path, "/@"); - if (p && (p[1] == '/' || p[1] == '@')) { - /* two separators in a row */ - return (SET_ERROR(EINVAL)); - } - if (p == NULL || p == path) { - /* - * if the first thing is an @ or /, it had better be an - * @ and it had better not have any more ats or slashes, - * and it had better have something after the @. - */ - if (p != NULL && - (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) - return (SET_ERROR(EINVAL)); - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strcpy(component, path); - p = NULL; - } else if (p[0] == '/') { - if (p - path >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; - p++; - } else if (p[0] == '@') { - /* - * if the next separator is an @, there better not be - * any more slashes. - */ - if (strchr(path, '/')) - return (SET_ERROR(EINVAL)); - if (p - path >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; - } else { - panic("invalid p=%p", (void *)p); - } - *nextp = p; - return (0); -} - -/* - * Return the dsl_dir_t, and possibly the last component which couldn't - * be found in *tail. The name must be in the specified dsl_pool_t. This - * thread must hold the dp_config_rwlock for the pool. Returns NULL if the - * path is bogus, or if tail==NULL and we couldn't parse the whole name. - * (*tail)[0] == '@' means that the last component is a snapshot. 
- */ -int -dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, - dsl_dir_t **ddp, const char **tailp) -{ - char buf[ZFS_MAX_DATASET_NAME_LEN]; - const char *spaname, *next, *nextnext = NULL; - int err; - dsl_dir_t *dd; - uint64_t ddobj; - - err = getcomponent(name, buf, &next); - if (err != 0) - return (err); - - /* Make sure the name is in the specified pool. */ - spaname = spa_name(dp->dp_spa); - if (strcmp(buf, spaname) != 0) - return (SET_ERROR(EXDEV)); - - ASSERT(dsl_pool_config_held(dp)); - - err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); - if (err != 0) { - return (err); - } - - while (next != NULL) { - dsl_dir_t *child_dd; - err = getcomponent(next, buf, &nextnext); - if (err != 0) - break; - ASSERT(next[0] != '\0'); - if (next[0] == '@') - break; - dprintf("looking up %s in obj%lld\n", - buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); - - err = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj, - buf, sizeof (ddobj), 1, &ddobj); - if (err != 0) { - if (err == ENOENT) - err = 0; - break; - } - - err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); - if (err != 0) - break; - dsl_dir_rele(dd, tag); - dd = child_dd; - next = nextnext; - } - - if (err != 0) { - dsl_dir_rele(dd, tag); - return (err); - } - - /* - * It's an error if there's more than one component left, or - * tailp==NULL and there's any component left. - */ - if (next != NULL && - (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { - /* bad path name */ - dsl_dir_rele(dd, tag); - dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); - err = SET_ERROR(ENOENT); - } - if (tailp != NULL) - *tailp = next; - *ddp = dd; - return (err); -} - -/* - * If the counts are already initialized for this filesystem and its - * descendants then do nothing, otherwise initialize the counts. 
- * - * The counts on this filesystem, and those below, may be uninitialized due to - * either the use of a pre-existing pool which did not support the - * filesystem/snapshot limit feature, or one in which the feature had not yet - * been enabled. - * - * Recursively descend the filesystem tree and update the filesystem/snapshot - * counts on each filesystem below, then update the cumulative count on the - * current filesystem. If the filesystem already has a count set on it, - * then we know that its counts, and the counts on the filesystems below it, - * are already correct, so we don't have to update this filesystem. - */ -static void -dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) -{ - uint64_t my_fs_cnt = 0; - uint64_t my_ss_cnt = 0; - dsl_pool_t *dp = dd->dd_pool; - objset_t *os = dp->dp_meta_objset; - zap_cursor_t *zc; - zap_attribute_t *za; - dsl_dataset_t *ds; - - ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); - ASSERT(dsl_pool_config_held(dp)); - ASSERT(dmu_tx_is_syncing(tx)); - - dsl_dir_zapify(dd, tx); - - /* - * If the filesystem count has already been initialized then we - * don't need to recurse down any further. - */ - if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) - return; - - zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - - /* Iterate my child dirs */ - for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { - dsl_dir_t *chld_dd; - uint64_t count; - - VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, - &chld_dd)); - - /* - * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and - * temporary datasets. 
- */ - if (chld_dd->dd_myname[0] == '$' || - chld_dd->dd_myname[0] == '%') { - dsl_dir_rele(chld_dd, FTAG); - continue; - } - - my_fs_cnt++; /* count this child */ - - dsl_dir_init_fs_ss_count(chld_dd, tx); - - VERIFY0(zap_lookup(os, chld_dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); - my_fs_cnt += count; - VERIFY0(zap_lookup(os, chld_dd->dd_object, - DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); - my_ss_cnt += count; - - dsl_dir_rele(chld_dd, FTAG); - } - zap_cursor_fini(zc); - /* Count my snapshots (we counted children's snapshots above) */ - VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, - dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); - - for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); - zap_cursor_retrieve(zc, za) == 0; - zap_cursor_advance(zc)) { - /* Don't count temporary snapshots */ - if (za->za_name[0] != '%') - my_ss_cnt++; - } - zap_cursor_fini(zc); - - dsl_dataset_rele(ds, FTAG); - - kmem_free(zc, sizeof (zap_cursor_t)); - kmem_free(za, sizeof (zap_attribute_t)); - - /* we're in a sync task, update counts */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, - sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, - sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); -} - -static int -dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) -{ - char *ddname = (char *)arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - dsl_dir_t *dd; - int error; - - error = dsl_dataset_hold(dp, ddname, FTAG, &ds); - if (error != 0) - return (error); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - dd = ds->ds_dir; - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && - dsl_dir_is_zapified(dd) && - zap_contains(dp->dp_meta_objset, dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT) == 0) { - dsl_dataset_rele(ds, 
FTAG); - return (SET_ERROR(EALREADY)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) -{ - char *ddname = (char *)arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - spa_t *spa; - - VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); - - spa = dsl_dataset_get_spa(ds); - - if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { - /* - * Since the feature was not active and we're now setting a - * limit, increment the feature-active counter so that the - * feature becomes active for the first time. - * - * We are already in a sync task so we can update the MOS. - */ - spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); - } - - /* - * Since we are now setting a non-UINT64_MAX limit on the filesystem, - * we need to ensure the counts are correct. Descend down the tree from - * this point and update all of the counts to be accurate. - */ - dsl_dir_init_fs_ss_count(ds->ds_dir, tx); - - dsl_dataset_rele(ds, FTAG); -} - -/* - * Make sure the feature is enabled and activate it if necessary. - * Since we're setting a limit, ensure the on-disk counts are valid. - * This is only called by the ioctl path when setting a limit value. - * - * We do not need to validate the new limit, since users who can change the - * limit are also allowed to exceed the limit. - */ -int -dsl_dir_activate_fs_ss_limit(const char *ddname) -{ - int error; - - error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, - dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, - ZFS_SPACE_CHECK_RESERVED); - - if (error == EALREADY) - error = 0; - - return (error); -} - -/* - * Used to determine if the filesystem_limit or snapshot_limit should be - * enforced. We allow the limit to be exceeded if the user has permission to - * write the property value. We pass in the creds that we got in the open - * context since we will always be the GZ root in syncing context. 
We also have - * to handle the case where we are allowed to change the limit on the current - * dataset, but there may be another limit in the tree above. - * - * We can never modify these two properties within a non-global zone. In - * addition, the other checks are modeled on zfs_secpolicy_write_perms. We - * can't use that function since we are already holding the dp_config_rwlock. - * In addition, we already have the dd and dealing with snapshots is simplified - * in this code. - */ - -typedef enum { - ENFORCE_ALWAYS, - ENFORCE_NEVER, - ENFORCE_ABOVE -} enforce_res_t; - -static enforce_res_t -dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) -{ - enforce_res_t enforce = ENFORCE_ALWAYS; - uint64_t obj; - dsl_dataset_t *ds; - uint64_t zoned; - - ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || - prop == ZFS_PROP_SNAPSHOT_LIMIT); - -#ifdef _KERNEL -#ifdef __FreeBSD__ - if (jailed(cr)) -#else - if (crgetzoneid(cr) != GLOBAL_ZONEID) -#endif - return (ENFORCE_ALWAYS); - - if (secpolicy_zfs(cr) == 0) - return (ENFORCE_NEVER); -#endif - - if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) - return (ENFORCE_ALWAYS); - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - - if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) - return (ENFORCE_ALWAYS); - - if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { - /* Only root can access zoned fs's from the GZ */ - enforce = ENFORCE_ALWAYS; - } else { - if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) - enforce = ENFORCE_ABOVE; - } - - dsl_dataset_rele(ds, FTAG); - return (enforce); -} - -static void -dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx) -{ - ddulrt_arg_t *arg = varg; - uint64_t last_remap_txg; - dsl_dir_t *dd = arg->ddulrta_dd; - objset_t *mos = dd->dd_pool->dp_meta_objset; - - dsl_dir_zapify(dd, tx); - if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (last_remap_txg), 1, &last_remap_txg) != 0 || - last_remap_txg < arg->ddlrta_txg) { - 
VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx)); - } -} - -int -dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) -{ - ddulrt_arg_t arg; - arg.ddulrta_dd = dd; - arg.ddlrta_txg = txg; - - return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa), - NULL, dsl_dir_update_last_remap_txg_sync, &arg, - 1, ZFS_SPACE_CHECK_RESERVED)); -} - -/* - * Check if adding additional child filesystem(s) would exceed any filesystem - * limits or adding additional snapshot(s) would exceed any snapshot limits. - * The prop argument indicates which limit to check. - * - * Note that all filesystem limits up to the root (or the highest - * initialized) filesystem or the given ancestor must be satisfied. - */ -int -dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, - dsl_dir_t *ancestor, cred_t *cr) -{ - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t limit, count; - char *count_prop; - enforce_res_t enforce; - int err = 0; - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || - prop == ZFS_PROP_SNAPSHOT_LIMIT); - - /* - * If we're allowed to change the limit, don't enforce the limit - * e.g. this can happen if a snapshot is taken by an administrative - * user in the global zone (i.e. a recursive snapshot by root). - * However, we must handle the case of delegated permissions where we - * are allowed to change the limit on the current dataset, but there - * is another limit in the tree above. - */ - enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); - if (enforce == ENFORCE_NEVER) - return (0); - - /* - * e.g. if renaming a dataset with no snapshots, count adjustment - * is 0. - */ - if (delta == 0) - return (0); - - if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { - /* - * We don't enforce the limit for temporary snapshots. This is - * indicated by a NULL cred_t argument. 
- */ - if (cr == NULL) - return (0); - - count_prop = DD_FIELD_SNAPSHOT_COUNT; - } else { - count_prop = DD_FIELD_FILESYSTEM_COUNT; - } - - /* - * If an ancestor has been provided, stop checking the limit once we - * hit that dir. We need this during rename so that we don't overcount - * the check once we recurse up to the common ancestor. - */ - if (ancestor == dd) - return (0); - - /* - * If we hit an uninitialized node while recursing up the tree, we can - * stop since we know there is no limit here (or above). The counts are - * not valid on this node and we know we won't touch this node's counts. - */ - if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, - count_prop, sizeof (count), 1, &count) == ENOENT) - return (0); - - err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, - B_FALSE); - if (err != 0) - return (err); - - /* Is there a limit which we've hit? */ - if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) - return (SET_ERROR(EDQUOT)); - - if (dd->dd_parent != NULL) - err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, - ancestor, cr); - - return (err); -} - -/* - * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all - * parents. When a new filesystem/snapshot is created, increment the count on - * all parents, and when a filesystem/snapshot is destroyed, decrement the - * count. - */ -void -dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, - dmu_tx_t *tx) -{ - int err; - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t count; - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || - strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); - - /* - * When we receive an incremental stream into a filesystem that already - * exists, a temporary clone is created. We don't count this temporary - * clone, whose name begins with a '%'. We also ignore hidden ($FREE, - * $MOS & $ORIGIN) objsets. 
- */ - if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && - strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) - return; - - /* - * e.g. if renaming a dataset with no snapshots, count adjustment is 0 - */ - if (delta == 0) - return; - - /* - * If we hit an uninitialized node while recursing up the tree, we can - * stop since we know the counts are not valid on this node and we - * know we shouldn't touch this node's counts. An uninitialized count - * on the node indicates that either the feature has not yet been - * activated or there are no limits on this part of the tree. - */ - if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, - prop, sizeof (count), 1, &count)) == ENOENT) - return; - VERIFY0(err); - - count += delta; - /* Use a signed verify to make sure we're not neg. */ - VERIFY3S(count, >=, 0); - - VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, - tx)); - - /* Roll up this additional count into our ancestors */ - if (dd->dd_parent != NULL) - dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); -} - -uint64_t -dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, - dmu_tx_t *tx) -{ - objset_t *mos = dp->dp_meta_objset; - uint64_t ddobj; - dsl_dir_phys_t *ddphys; - dmu_buf_t *dbuf; - - ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, - DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - if (pds) { - VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, - name, sizeof (uint64_t), 1, &ddobj, tx)); - } else { - /* it's the root dir */ - VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); - } - VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - ddphys = dbuf->db_data; - - ddphys->dd_creation_time = gethrestime_sec(); - if (pds) { - ddphys->dd_parent_obj = pds->dd_object; - - /* update the filesystem counts */ - dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); - } - ddphys->dd_props_zapobj = 
zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - ddphys->dd_child_dir_zapobj = zap_create(mos, - DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) - ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; - dmu_buf_rele(dbuf, FTAG); - - return (ddobj); -} - -boolean_t -dsl_dir_is_clone(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_origin_obj && - (dd->dd_pool->dp_origin_snap == NULL || - dsl_dir_phys(dd)->dd_origin_obj != - dd->dd_pool->dp_origin_snap->ds_object)); -} - - -uint64_t -dsl_dir_get_used(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_bytes); -} - -uint64_t -dsl_dir_get_compressed(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_compressed_bytes); -} - -uint64_t -dsl_dir_get_quota(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_quota); -} - -uint64_t -dsl_dir_get_reservation(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_reserved); -} - -uint64_t -dsl_dir_get_compressratio(dsl_dir_t *dd) -{ - /* a fixed point number, 100x the ratio */ - return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 
100 : - (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / - dsl_dir_phys(dd)->dd_compressed_bytes)); -} - -uint64_t -dsl_dir_get_logicalused(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_uncompressed_bytes); -} - -uint64_t -dsl_dir_get_usedsnap(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); -} - -uint64_t -dsl_dir_get_usedds(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); -} - -uint64_t -dsl_dir_get_usedrefreserv(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); -} - -uint64_t -dsl_dir_get_usedchild(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + - dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); -} - -void -dsl_dir_get_origin(dsl_dir_t *dd, char *buf) -{ - dsl_dataset_t *ds; - VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, - dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); - - dsl_dataset_name(ds, buf); - - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) -{ - if (dsl_dir_is_zapified(dd)) { - objset_t *os = dd->dd_pool->dp_meta_objset; - return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, - sizeof (*count), 1, count)); - } else { - return (ENOENT); - } -} - -int -dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) -{ - if (dsl_dir_is_zapified(dd)) { - objset_t *os = dd->dd_pool->dp_meta_objset; - return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, - sizeof (*count), 1, count)); - } else { - return (ENOENT); - } -} - -int -dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count) -{ - if (dsl_dir_is_zapified(dd)) { - objset_t *os = dd->dd_pool->dp_meta_objset; - return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (*count), 1, count)); - } else { - return (ENOENT); - } -} - -void -dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) -{ - mutex_enter(&dd->dd_lock); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, - dsl_dir_get_quota(dd)); - 
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, - dsl_dir_get_reservation(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, - dsl_dir_get_logicalused(dd)); - if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, - dsl_dir_get_usedsnap(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, - dsl_dir_get_usedds(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, - dsl_dir_get_usedrefreserv(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, - dsl_dir_get_usedchild(dd)); - } - mutex_exit(&dd->dd_lock); - - uint64_t count; - if (dsl_dir_get_filesystem_count(dd, &count) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, - count); - } - if (dsl_dir_get_snapshot_count(dd, &count) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, - count); - } - if (dsl_dir_get_remaptxg(dd, &count) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG, - count); - } - - if (dsl_dir_is_clone(dd)) { - char buf[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dir_get_origin(dd, buf); - dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); - } - -} - -void -dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dd->dd_pool; - - ASSERT(dsl_dir_phys(dd)); - - if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(dd->dd_dbuf, dd); - } -} - -static int64_t -parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) -{ - uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); - uint64_t new_accounted = - MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); - return (new_accounted - old_accounted); -} - -void -dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - - mutex_enter(&dd->dd_lock); - ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); - dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, - dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); - 
dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; - mutex_exit(&dd->dd_lock); - - /* release the hold from dsl_dir_dirty */ - dmu_buf_rele(dd->dd_dbuf, dd); -} - -static uint64_t -dsl_dir_space_towrite(dsl_dir_t *dd) -{ - uint64_t space = 0; - - ASSERT(MUTEX_HELD(&dd->dd_lock)); - - for (int i = 0; i < TXG_SIZE; i++) { - space += dd->dd_space_towrite[i & TXG_MASK]; - ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); - } - return (space); -} - -/* - * How much space would dd have available if ancestor had delta applied - * to it? If ondiskonly is set, we're only interested in what's - * on-disk, not estimated pending changes. - */ -uint64_t -dsl_dir_space_available(dsl_dir_t *dd, - dsl_dir_t *ancestor, int64_t delta, int ondiskonly) -{ - uint64_t parentspace, myspace, quota, used; - - /* - * If there are no restrictions otherwise, assume we have - * unlimited space available. - */ - quota = UINT64_MAX; - parentspace = UINT64_MAX; - - if (dd->dd_parent != NULL) { - parentspace = dsl_dir_space_available(dd->dd_parent, - ancestor, delta, ondiskonly); - } - - mutex_enter(&dd->dd_lock); - if (dsl_dir_phys(dd)->dd_quota != 0) - quota = dsl_dir_phys(dd)->dd_quota; - used = dsl_dir_phys(dd)->dd_used_bytes; - if (!ondiskonly) - used += dsl_dir_space_towrite(dd); - - if (dd->dd_parent == NULL) { - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, - ZFS_SPACE_CHECK_NORMAL); - quota = MIN(quota, poolsize); - } - - if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { - /* - * We have some space reserved, in addition to what our - * parent gave us. 
- */ - parentspace += dsl_dir_phys(dd)->dd_reserved - used; - } - - if (dd == ancestor) { - ASSERT(delta <= 0); - ASSERT(used >= -delta); - used += delta; - if (parentspace != UINT64_MAX) - parentspace -= delta; - } - - if (used > quota) { - /* over quota */ - myspace = 0; - } else { - /* - * the lesser of the space provided by our parent and - * the space left in our quota - */ - myspace = MIN(parentspace, quota - used); - } - - mutex_exit(&dd->dd_lock); - - return (myspace); -} - -struct tempreserve { - list_node_t tr_node; - dsl_dir_t *tr_ds; - uint64_t tr_size; -}; - -static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - boolean_t ignorequota, list_t *tr_list, - dmu_tx_t *tx, boolean_t first) -{ - uint64_t txg = tx->tx_txg; - uint64_t quota; - struct tempreserve *tr; - int retval = EDQUOT; - uint64_t ref_rsrv = 0; - - ASSERT3U(txg, !=, 0); - ASSERT3S(asize, >, 0); - - mutex_enter(&dd->dd_lock); - - /* - * Check against the dsl_dir's quota. We don't add in the delta - * when checking for over-quota because they get one free hit. - */ - uint64_t est_inflight = dsl_dir_space_towrite(dd); - for (int i = 0; i < TXG_SIZE; i++) - est_inflight += dd->dd_tempreserved[i]; - uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; - - /* - * On the first iteration, fetch the dataset's used-on-disk and - * refreservation values. Also, if checkrefquota is set, test if - * allocating this space would exceed the dataset's refquota. - */ - if (first && tx->tx_objset) { - int error; - dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; - - error = dsl_dataset_check_quota(ds, !netfree, - asize, est_inflight, &used_on_disk, &ref_rsrv); - if (error != 0) { - mutex_exit(&dd->dd_lock); - return (error); - } - } - - /* - * If this transaction will result in a net free of space, - * we want to let it through. 
- */ - if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) - quota = UINT64_MAX; - else - quota = dsl_dir_phys(dd)->dd_quota; - - /* - * Adjust the quota against the actual pool size at the root - * minus any outstanding deferred frees. - * To ensure that it's possible to remove files from a full - * pool without inducing transient overcommits, we throttle - * netfree transactions against a quota that is slightly larger, - * but still within the pool's allocation slop. In cases where - * we're very close to full, this will allow a steady trickle of - * removes to get through. - */ - uint64_t deferred = 0; - if (dd->dd_parent == NULL) { - uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, - (netfree) ? - ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); - - if (avail < quota) { - quota = avail; - retval = ENOSPC; - } - } - - /* - * If they are requesting more space, and our current estimate - * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). 
- */ - if (used_on_disk + est_inflight >= quota) { - if (est_inflight > 0 || used_on_disk < quota || - (retval == ENOSPC && used_on_disk < quota + deferred)) - retval = ERESTART; - dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " - "quota=%lluK tr=%lluK err=%d\n", - used_on_disk>>10, est_inflight>>10, - quota>>10, asize>>10, retval); - mutex_exit(&dd->dd_lock); - return (SET_ERROR(retval)); - } - - /* We need to up our estimated delta before dropping dd_lock */ - dd->dd_tempreserved[txg & TXG_MASK] += asize; - - uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, - asize - ref_rsrv); - mutex_exit(&dd->dd_lock); - - tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_ds = dd; - tr->tr_size = asize; - list_insert_tail(tr_list, tr); - - /* see if it's OK with our parent */ - if (dd->dd_parent != NULL && parent_rsrv != 0) { - boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); - - return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE)); - } else { - return (0); - } -} - -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and dsl_dir_willuse_space() - * has been called), the reservation should be canceled, using - * dsl_dir_tempreserve_clear(). 
- */ -int -dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) -{ - int err; - list_t *tr_list; - - if (asize == 0) { - *tr_cookiep = NULL; - return (0); - } - - tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); - list_create(tr_list, sizeof (struct tempreserve), - offsetof(struct tempreserve, tr_node)); - ASSERT3S(asize, >, 0); - - err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); - if (err == 0) { - struct tempreserve *tr; - - tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_size = lsize; - list_insert_tail(tr_list, tr); - } else { - if (err == EAGAIN) { - /* - * If arc_memory_throttle() detected that pageout - * is running and we are low on memory, we delay new - * non-pageout transactions to give pageout an - * advantage. - * - * It is unfortunate to be delaying while the caller's - * locks are held. - */ - txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); - err = SET_ERROR(ERESTART); - } - } - - if (err == 0) { - err = dsl_dir_tempreserve_impl(dd, asize, netfree, - B_FALSE, tr_list, tx, B_TRUE); - } - - if (err != 0) - dsl_dir_tempreserve_clear(tr_list, tx); - else - *tr_cookiep = tr_list; - - return (err); -} - -/* - * Clear a temporary reservation that we previously made with - * dsl_dir_tempreserve_space(). 
- */ -void -dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) -{ - int txgidx = tx->tx_txg & TXG_MASK; - list_t *tr_list = tr_cookie; - struct tempreserve *tr; - - ASSERT3U(tx->tx_txg, !=, 0); - - if (tr_cookie == NULL) - return; - - while ((tr = list_head(tr_list)) != NULL) { - if (tr->tr_ds) { - mutex_enter(&tr->tr_ds->dd_lock); - ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, - tr->tr_size); - tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; - mutex_exit(&tr->tr_ds->dd_lock); - } else { - arc_tempreserve_clear(tr->tr_size); - } - list_remove(tr_list, tr); - kmem_free(tr, sizeof (struct tempreserve)); - } - - kmem_free(tr_list, sizeof (list_t)); -} - -/* - * This should be called from open context when we think we're going to write - * or free space, for example when dirtying data. Be conservative; it's okay - * to write less space or free more, but we don't want to write more or free - * less than the amount specified. - */ -void -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) -{ - int64_t parent_space; - uint64_t est_used; - - mutex_enter(&dd->dd_lock); - if (space > 0) - dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - - est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; - parent_space = parent_delta(dd, est_used, space); - mutex_exit(&dd->dd_lock); - - /* Make sure that we clean up dd_space_to* */ - dsl_dir_dirty(dd, tx); - - /* XXX this is potentially expensive and unnecessary... 
*/ - if (parent_space && dd->dd_parent) - dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); -} - -/* call from syncing context when we actually write/free space for this dd */ -void -dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) -{ - int64_t accounted_delta; - - /* - * dsl_dataset_set_refreservation_sync_impl() calls this with - * dd_lock held, so that it can atomically update - * ds->ds_reserved and the dsl_dir accounting, so that - * dsl_dataset_check_quota() can see dataset and dir accounting - * consistently. - */ - boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(type < DD_USED_NUM); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - if (needlock) - mutex_enter(&dd->dd_lock); - accounted_delta = - parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); - ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); - ASSERT(compressed >= 0 || - dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); - ASSERT(uncompressed >= 0 || - dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); - dsl_dir_phys(dd)->dd_used_bytes += used; - dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; - dsl_dir_phys(dd)->dd_compressed_bytes += compressed; - - if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - ASSERT(used > 0 || - dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); - dsl_dir_phys(dd)->dd_used_breakdown[type] += used; -#ifdef DEBUG - dd_used_t t; - uint64_t u = 0; - for (t = 0; t < DD_USED_NUM; t++) - u += dsl_dir_phys(dd)->dd_used_breakdown[t]; - ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); -#endif - } - if (needlock) - mutex_exit(&dd->dd_lock); - - if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - accounted_delta, compressed, uncompressed, tx); - dsl_dir_transfer_space(dd->dd_parent, - used - accounted_delta, - DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); - } -} - -void 
-dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) -{ - ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); - ASSERT(oldtype < DD_USED_NUM); - ASSERT(newtype < DD_USED_NUM); - - if (delta == 0 || - !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) - return; - - if (tx != NULL) - dmu_buf_will_dirty(dd->dd_dbuf, tx); - mutex_enter(&dd->dd_lock); - ASSERT(delta > 0 ? - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : - dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); - ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; - dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; - mutex_exit(&dd->dd_lock); -} - -typedef struct dsl_dir_set_qr_arg { - const char *ddsqra_name; - zprop_source_t ddsqra_source; - uint64_t ddsqra_value; -} dsl_dir_set_qr_arg_t; - -static int -dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - uint64_t towrite, newval; - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - - error = dsl_prop_predict(ds->ds_dir, "quota", - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (newval == 0) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - mutex_enter(&ds->ds_dir->dd_lock); - /* - * If we are doing the preliminary check in open context, and - * there are pending changes, then don't fail it, since the - * pending changes could under-estimate the amount of space to be - * freed up. 
- */ - towrite = dsl_dir_space_towrite(ds->ds_dir); - if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || - newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { - error = SET_ERROR(ENOSPC); - } - mutex_exit(&ds->ds_dir->dd_lock); - dsl_dataset_rele(ds, FTAG); - return (error); -} - -static void -dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t newval; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - - if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - &ddsqra->ddsqra_value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); - } else { - newval = ddsqra->ddsqra_value; - spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", - zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); - } - - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - mutex_enter(&ds->ds_dir->dd_lock); - dsl_dir_phys(ds->ds_dir)->dd_quota = newval; - mutex_exit(&ds->ds_dir->dd_lock); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) -{ - dsl_dir_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = ddname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = quota; - - return (dsl_sync_task(ddname, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -int -dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - dsl_dir_t *dd; - uint64_t newval, used, avail; - int error; - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - dd = ds->ds_dir; - - /* - * If we are doing the preliminary check in open 
context, the - * space estimates may be inaccurate. - */ - if (!dmu_tx_is_syncing(tx)) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - error = dsl_prop_predict(ds->ds_dir, - zfs_prop_to_name(ZFS_PROP_RESERVATION), - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - mutex_enter(&dd->dd_lock); - used = dsl_dir_phys(dd)->dd_used_bytes; - mutex_exit(&dd->dd_lock); - - if (dd->dd_parent) { - avail = dsl_dir_space_available(dd->dd_parent, - NULL, 0, FALSE); - } else { - avail = dsl_pool_adjustedsize(dd->dd_pool, - ZFS_SPACE_CHECK_NORMAL) - used; - } - - if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { - uint64_t delta = MAX(used, newval) - - MAX(used, dsl_dir_phys(dd)->dd_reserved); - - if (delta > avail || - (dsl_dir_phys(dd)->dd_quota > 0 && - newval > dsl_dir_phys(dd)->dd_quota)) - error = SET_ERROR(ENOSPC); - } - - dsl_dataset_rele(ds, FTAG); - return (error); -} - -void -dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) -{ - uint64_t used; - int64_t delta; - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - mutex_enter(&dd->dd_lock); - used = dsl_dir_phys(dd)->dd_used_bytes; - delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); - dsl_dir_phys(dd)->dd_reserved = value; - - if (dd->dd_parent != NULL) { - /* Roll up this additional usage into our ancestors */ - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, - delta, 0, 0, tx); - } - mutex_exit(&dd->dd_lock); -} - -static void -dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t newval; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - - if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { - dsl_prop_set_sync_impl(ds, - zfs_prop_to_name(ZFS_PROP_RESERVATION), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - 
&ddsqra->ddsqra_value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); - } else { - newval = ddsqra->ddsqra_value; - spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", - zfs_prop_to_name(ZFS_PROP_RESERVATION), - (longlong_t)newval); - } - - dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dir_set_reservation(const char *ddname, zprop_source_t source, - uint64_t reservation) -{ - dsl_dir_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = ddname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = reservation; - - return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -static dsl_dir_t * -closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) -{ - for (; ds1; ds1 = ds1->dd_parent) { - dsl_dir_t *dd; - for (dd = ds2; dd; dd = dd->dd_parent) { - if (ds1 == dd) - return (dd); - } - } - return (NULL); -} - -/* - * If delta is applied to dd, how much of that delta would be applied to - * ancestor? Syncing context only. 
- */ -static int64_t -would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) -{ - if (dd == ancestor) - return (delta); - - mutex_enter(&dd->dd_lock); - delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); - mutex_exit(&dd->dd_lock); - return (would_change(dd->dd_parent, delta, ancestor)); -} - -typedef struct dsl_dir_rename_arg { - const char *ddra_oldname; - const char *ddra_newname; - cred_t *ddra_cred; -} dsl_dir_rename_arg_t; - -typedef struct dsl_valid_rename_arg { - int char_delta; - int nest_delta; -} dsl_valid_rename_arg_t; - -/* ARGSUSED */ -static int -dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - dsl_valid_rename_arg_t *dvra = arg; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - - dsl_dataset_name(ds, namebuf); - - ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - int namelen = strlen(namebuf) + dvra->char_delta; - int depth = get_dataset_depth(namebuf) + dvra->nest_delta; - - if (namelen >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) - return (SET_ERROR(ENAMETOOLONG)); - return (0); -} - -static int -dsl_dir_rename_check(void *arg, dmu_tx_t *tx) -{ - dsl_dir_rename_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd, *newparent; - dsl_valid_rename_arg_t dvra; - dsl_dataset_t *parentds; - objset_t *parentos; - const char *mynewname; - int error; - - /* target dir should exist */ - error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); - if (error != 0) - return (error); - - /* new parent should exist */ - error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, - &newparent, &mynewname); - if (error != 0) { - dsl_dir_rele(dd, FTAG); - return (error); - } - - /* can't rename to different pool */ - if (dd->dd_pool != newparent->dd_pool) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(EXDEV)); - } - - /* new name should not already exist */ - if 
(mynewname == NULL) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(EEXIST)); - } - - /* can't rename below anything but filesystems (eg. no ZVOLs) */ - error = dsl_dataset_hold_obj(newparent->dd_pool, - dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); - if (error != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - error = dmu_objset_from_ds(parentds, &parentos); - if (error != 0) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - if (dmu_objset_type(parentos) != DMU_OST_ZFS) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - dsl_dataset_rele(parentds, FTAG); - - ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - dvra.char_delta = strlen(ddra->ddra_newname) - - strlen(ddra->ddra_oldname); - dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) - - get_dataset_depth(ddra->ddra_oldname); - - /* if the name length is growing, validate child name lengths */ - if (dvra.char_delta > 0 || dvra.nest_delta > 0) { - error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, - &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - if (error != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - } - - if (dmu_tx_is_syncing(tx)) { - if (spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_FS_SS_LIMIT)) { - /* - * Although this is the check function and we don't - * normally make on-disk changes in check functions, - * we need to do that here. - * - * Ensure this portion of the tree's counts have been - * initialized in case the new parent has limits set. - */ - dsl_dir_init_fs_ss_count(dd, tx); - } - } - - if (newparent != dd->dd_parent) { - /* is there enough space? 
*/ - uint64_t myspace = - MAX(dsl_dir_phys(dd)->dd_used_bytes, - dsl_dir_phys(dd)->dd_reserved); - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t fs_cnt = 0; - uint64_t ss_cnt = 0; - - if (dsl_dir_is_zapified(dd)) { - int err; - - err = zap_lookup(os, dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, - &fs_cnt); - if (err != ENOENT && err != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (err); - } - - /* - * have to add 1 for the filesystem itself that we're - * moving - */ - fs_cnt++; - - err = zap_lookup(os, dd->dd_object, - DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, - &ss_cnt); - if (err != ENOENT && err != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (err); - } - } - - /* no rename into our descendant */ - if (closest_common_ancestor(dd, newparent) == dd) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_dir_transfer_possible(dd->dd_parent, - newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); - if (error != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - } - - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (0); -} - -static void -dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dir_rename_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd, *newparent; - const char *mynewname; - int error; - objset_t *mos = dp->dp_meta_objset; - - VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); - VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, - &mynewname)); - - /* Log this before we change the name. */ - spa_history_log_internal_dd(dd, "rename", tx, - "-> %s", ddra->ddra_newname); - - if (newparent != dd->dd_parent) { - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t fs_cnt = 0; - uint64_t ss_cnt = 0; - - /* - * We already made sure the dd counts were initialized in the - * check function. 
- */ - if (spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_FS_SS_LIMIT)) { - VERIFY0(zap_lookup(os, dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, - &fs_cnt)); - /* add 1 for the filesystem itself that we're moving */ - fs_cnt++; - - VERIFY0(zap_lookup(os, dd->dd_object, - DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, - &ss_cnt)); - } - - dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, - DD_FIELD_FILESYSTEM_COUNT, tx); - dsl_fs_ss_count_adjust(newparent, fs_cnt, - DD_FIELD_FILESYSTEM_COUNT, tx); - - dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, - DD_FIELD_SNAPSHOT_COUNT, tx); - dsl_fs_ss_count_adjust(newparent, ss_cnt, - DD_FIELD_SNAPSHOT_COUNT, tx); - - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - -dsl_dir_phys(dd)->dd_used_bytes, - -dsl_dir_phys(dd)->dd_compressed_bytes, - -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(newparent, DD_USED_CHILD, - dsl_dir_phys(dd)->dd_used_bytes, - dsl_dir_phys(dd)->dd_compressed_bytes, - dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); - - if (dsl_dir_phys(dd)->dd_reserved > - dsl_dir_phys(dd)->dd_used_bytes) { - uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - - dsl_dir_phys(dd)->dd_used_bytes; - - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, - -unused_rsrv, 0, 0, tx); - dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, - unused_rsrv, 0, 0, tx); - } - } - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - /* remove from old parent zapobj */ - error = zap_remove(mos, - dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, - dd->dd_myname, tx); - ASSERT0(error); - - (void) strcpy(dd->dd_myname, mynewname); - dsl_dir_rele(dd->dd_parent, dd); - dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; - VERIFY0(dsl_dir_hold_obj(dp, - newparent->dd_object, NULL, dd, &dd->dd_parent)); - - /* add to new parent zapobj */ - VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, - dd->dd_myname, 8, 1, &dd->dd_object, tx)); - -#ifdef __FreeBSD__ -#ifdef _KERNEL - 
zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); - zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname); -#endif -#endif - - dsl_prop_notify_all(dd); - - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); -} - -int -dsl_dir_rename(const char *oldname, const char *newname) -{ - dsl_dir_rename_arg_t ddra; - - ddra.ddra_oldname = oldname; - ddra.ddra_newname = newname; - ddra.ddra_cred = CRED(); - - return (dsl_sync_task(oldname, - dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, - 3, ZFS_SPACE_CHECK_RESERVED)); -} - -int -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, - uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) -{ - dsl_dir_t *ancestor; - int64_t adelta; - uint64_t avail; - int err; - - ancestor = closest_common_ancestor(sdd, tdd); - adelta = would_change(sdd, -space, ancestor); - avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); - if (avail < space) - return (SET_ERROR(ENOSPC)); - - err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, - ancestor, cr); - if (err != 0) - return (err); - err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, - ancestor, cr); - if (err != 0) - return (err); - - return (0); -} - -timestruc_t -dsl_dir_snap_cmtime(dsl_dir_t *dd) -{ - timestruc_t t; - - mutex_enter(&dd->dd_lock); - t = dd->dd_snap_cmtime; - mutex_exit(&dd->dd_lock); - - return (t); -} - -void -dsl_dir_snap_cmtime_update(dsl_dir_t *dd) -{ - timestruc_t t; - - gethrestime(&t); - mutex_enter(&dd->dd_lock); - dd->dd_snap_cmtime = t; - mutex_exit(&dd->dd_lock); -} - -void -dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) -{ - objset_t *mos = dd->dd_pool->dp_meta_objset; - dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); -} - -boolean_t -dsl_dir_is_zapified(dsl_dir_t *dd) -{ - dmu_object_info_t doi; - - dmu_object_info_from_db(dd->dd_dbuf, &doi); - return (doi.doi_type == DMU_OTN_ZAP_METADATA); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c deleted file mode 100644 index ee0ba4793aad..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ /dev/null @@ -1,1372 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__FreeBSD__) && defined(_KERNEL) -#include -#include -#endif - -/* - * ZFS Write Throttle - * ------------------ - * - * ZFS must limit the rate of incoming writes to the rate at which it is able - * to sync data modifications to the backend storage. 
Throttling by too much - * creates an artificial limit; throttling by too little can only be sustained - * for short periods and would lead to highly lumpy performance. On a per-pool - * basis, ZFS tracks the amount of modified (dirty) data. As operations change - * data, the amount of dirty data increases; as ZFS syncs out data, the amount - * of dirty data decreases. When the amount of dirty data exceeds a - * predetermined threshold further modifications are blocked until the amount - * of dirty data decreases (as data is synced out). - * - * The limit on dirty data is tunable, and should be adjusted according to - * both the IO capacity and available memory of the system. The larger the - * window, the more ZFS is able to aggregate and amortize metadata (and data) - * changes. However, memory is a limited resource, and allowing for more dirty - * data comes at the cost of keeping other useful data in memory (for example - * ZFS data cached by the ARC). - * - * Implementation - * - * As buffers are modified dsl_pool_willuse_space() increments both the per- - * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of - * dirty space used; dsl_pool_dirty_space() decrements those values as data - * is synced out from dsl_pool_sync(). While only the poolwide value is - * relevant, the per-txg value is useful for debugging. The tunable - * zfs_dirty_data_max determines the dirty space limit. Once that value is - * exceeded, new writes are halted until space frees up. - * - * The zfs_dirty_data_sync tunable dictates the threshold at which we - * ensure that there is a txg syncing (see the comment in txg.c for a full - * description of transaction group stages). - * - * The IO scheduler uses both the dirty space limit and current amount of - * dirty data as inputs. Those values affect the number of concurrent IOs ZFS - * issues. See the comment in vdev_queue.c for details of the IO scheduler. 
- * - * The delay is also calculated based on the amount of dirty data. See the - * comment above dmu_tx_delay() for details. - */ - -/* - * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, - * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system. - */ -uint64_t zfs_dirty_data_max; -uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024; -int zfs_dirty_data_max_percent = 10; - -/* - * If there's at least this much dirty data (as a percentage of - * zfs_dirty_data_max), push out a txg. This should be less than - * zfs_vdev_async_write_active_min_dirty_percent. - */ -uint64_t zfs_dirty_data_sync_pct = 20; - -/* - * Once there is this amount of dirty data, the dmu_tx_delay() will kick in - * and delay each transaction. - * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. - */ -int zfs_delay_min_dirty_percent = 60; - -/* - * This controls how quickly the delay approaches infinity. - * Larger values cause it to delay more for a given amount of dirty data. - * Therefore larger values will cause there to be less dirty data for a - * given throughput. - * - * For the smoothest delay, this value should be about 1 billion divided - * by the maximum number of operations per second. This will smoothly - * handle between 10x and 1/10th this number. - * - * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the - * multiply in dmu_tx_delay(). - */ -uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; - -/* - * This determines the number of threads used by the dp_sync_taskq. - */ -int zfs_sync_taskq_batch_pct = 75; - -/* - * These tunables determine the behavior of how zil_itxg_clean() is - * called via zil_clean() in the context of spa_sync(). When an itxg - * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. 
- * If the dispatch fails, the call to zil_itxg_clean() will occur - * synchronously in the context of spa_sync(), which can negatively - * impact the performance of spa_sync() (e.g. in the case of the itxg - * list having a large number of itxs that needs to be cleaned). - * - * Thus, these tunables can be used to manipulate the behavior of the - * taskq used by zil_clean(); they determine the number of taskq entries - * that are pre-populated when the taskq is first created (via the - * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of - * taskq entries that are cached after an on-demand allocation (via the - * "zfs_zil_clean_taskq_maxalloc"). - * - * The idea being, we want to try reasonably hard to ensure there will - * already be a taskq entry pre-allocated by the time that it is needed - * by zil_clean(). This way, we can avoid the possibility of an - * on-demand allocation of a new taskq entry from failing, which would - * result in zil_itxg_clean() being called synchronously from zil_clean() - * (which can adversely affect performance of spa_sync()). - * - * Additionally, the number of threads used by the taskq can be - * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. 
- */ -int zfs_zil_clean_taskq_nthr_pct = 100; -int zfs_zil_clean_taskq_minalloc = 1024; -int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; - -#if defined(__FreeBSD__) && defined(_KERNEL) - -extern int zfs_vdev_async_write_active_max_dirty_percent; - -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN, - &zfs_dirty_data_max, 0, - "The maximum amount of dirty data in bytes after which new writes are " - "halted until space becomes available"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN, - &zfs_dirty_data_max_max, 0, - "The absolute cap on dirty_data_max when auto calculating"); - -static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent, - CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_zfs_dirty_data_max_percent, "I", - "The percent of physical memory used to auto calculate dirty_data_max"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync_pct, CTLFLAG_RWTUN, - &zfs_dirty_data_sync_pct, 0, - "Force a txg if the percent of dirty buffer bytes exceed this value"); - -static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS); -/* No zfs_delay_min_dirty_percent tunable due to limit requirements */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent, - CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int), - sysctl_zfs_delay_min_dirty_percent, "I", - "The limit of outstanding dirty data before transactions are delayed"); - -static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS); -/* No zfs_delay_scale tunable due to limit requirements */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_zfs_delay_scale, "QU", - "Controls how quickly the delay approaches infinity"); - -static int -sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_dirty_data_max_percent; - err = sysctl_handle_int(oidp, &val, 0, 
req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < 0 || val > 100) - return (EINVAL); - - zfs_dirty_data_max_percent = val; - - return (0); -} - -static int -sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_delay_min_dirty_percent; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < zfs_vdev_async_write_active_max_dirty_percent) - return (EINVAL); - - zfs_delay_min_dirty_percent = val; - - return (0); -} - -static int -sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_delay_scale; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val > UINT64_MAX / zfs_dirty_data_max) - return (EINVAL); - - zfs_delay_scale = val; - - return (0); -} -#endif - -int -dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) -{ - uint64_t obj; - int err; - - err = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, - name, sizeof (obj), 1, &obj); - if (err) - return (err); - - return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); -} - -static dsl_pool_t * -dsl_pool_open_impl(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp; - blkptr_t *bp = spa_get_rootblkptr(spa); - - dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); - dp->dp_spa = spa; - dp->dp_meta_rootbp = *bp; - rrw_init(&dp->dp_config_rwlock, B_TRUE); - txg_init(dp, txg); - mmp_init(spa); - - txg_list_create(&dp->dp_dirty_datasets, spa, - offsetof(dsl_dataset_t, ds_dirty_link)); - txg_list_create(&dp->dp_dirty_zilogs, spa, - offsetof(zilog_t, zl_dirty_link)); - txg_list_create(&dp->dp_dirty_dirs, spa, - offsetof(dsl_dir_t, dd_dirty_link)); - txg_list_create(&dp->dp_sync_tasks, spa, - offsetof(dsl_sync_task_t, dst_node)); - txg_list_create(&dp->dp_early_sync_tasks, spa, - offsetof(dsl_sync_task_t, dst_node)); - - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - 
zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); - - dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", - zfs_zil_clean_taskq_nthr_pct, minclsyspri, - zfs_zil_clean_taskq_minalloc, - zfs_zil_clean_taskq_maxalloc, - TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); - - mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - - dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, - 1, 4, 0); - - return (dp); -} - -int -dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) -{ - int err; - dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, - &dp->dp_meta_objset); - if (err != 0) - dsl_pool_close(dp); - else - *dpp = dp; - - return (err); -} - -int -dsl_pool_open(dsl_pool_t *dp) -{ - int err; - dsl_dir_t *dd; - dsl_dataset_t *ds; - uint64_t obj; - - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, - &dp->dp_root_dir_obj); - if (err) - goto out; - - err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir); - if (err) - goto out; - - err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); - if (err) - goto out; - - if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { - err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); - if (err) - goto out; - err = dsl_dataset_hold_obj(dp, - dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); - if (err == 0) { - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, - &dp->dp_origin_snap); - dsl_dataset_rele(ds, FTAG); - } - dsl_dir_rele(dd, dp); - if (err) - goto out; - } - - if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { - err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, - &dp->dp_free_dir); - if (err) - goto out; - - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - 
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); - if (err) - goto out; - VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); - } - - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); - if (err == 0) { - VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, - dp->dp_meta_objset, obj)); - } else if (err == ENOENT) { - /* - * We might not have created the remap bpobj yet. - */ - err = 0; - } else { - goto out; - } - } - - /* - * Note: errors ignored, because the these special dirs, used for - * space accounting, are only created on demand. - */ - (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, - &dp->dp_leak_dir); - - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj); - if (err != 0) - goto out; - } - - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, - &dp->dp_empty_bpobj); - if (err != 0) - goto out; - } - - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, - &dp->dp_tmp_userrefs_obj); - if (err == ENOENT) - err = 0; - if (err) - goto out; - - err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); - -out: - rrw_exit(&dp->dp_config_rwlock, FTAG); - return (err); -} - -void -dsl_pool_close(dsl_pool_t *dp) -{ - /* - * Drop our references from dsl_pool_open(). - * - * Since we held the origin_snap from "syncing" context (which - * includes pool-opening context), it actually only got a "ref" - * and not a hold, so just drop that here. 
- */ - if (dp->dp_origin_snap != NULL) - dsl_dataset_rele(dp->dp_origin_snap, dp); - if (dp->dp_mos_dir != NULL) - dsl_dir_rele(dp->dp_mos_dir, dp); - if (dp->dp_free_dir != NULL) - dsl_dir_rele(dp->dp_free_dir, dp); - if (dp->dp_leak_dir != NULL) - dsl_dir_rele(dp->dp_leak_dir, dp); - if (dp->dp_root_dir != NULL) - dsl_dir_rele(dp->dp_root_dir, dp); - - bpobj_close(&dp->dp_free_bpobj); - bpobj_close(&dp->dp_obsolete_bpobj); - - /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ - if (dp->dp_meta_objset != NULL) - dmu_objset_evict(dp->dp_meta_objset); - - txg_list_destroy(&dp->dp_dirty_datasets); - txg_list_destroy(&dp->dp_dirty_zilogs); - txg_list_destroy(&dp->dp_sync_tasks); - txg_list_destroy(&dp->dp_early_sync_tasks); - txg_list_destroy(&dp->dp_dirty_dirs); - - taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); - - /* - * We can't set retry to TRUE since we're explicitly specifying - * a spa to flush. This is good enough; any missed buffers for - * this spa won't cause trouble, and they'll eventually fall - * out of the ARC just like any other unused buffer. - */ - arc_flush(dp->dp_spa, FALSE); - - mmp_fini(dp->dp_spa); - txg_fini(dp); - dsl_scan_fini(dp); - dmu_buf_user_evict_wait(); - - rrw_destroy(&dp->dp_config_rwlock); - mutex_destroy(&dp->dp_lock); - taskq_destroy(dp->dp_vnrele_taskq); - if (dp->dp_blkstats != NULL) { - mutex_destroy(&dp->dp_blkstats->zab_lock); - kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - } - kmem_free(dp, sizeof (dsl_pool_t)); -} - -void -dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) -{ - uint64_t obj; - /* - * Currently, we only create the obsolete_bpobj where there are - * indirect vdevs with referenced mappings. 
- */ - ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); - /* create and open the obsolete_bpobj */ - obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); - VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); - spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); -} - -void -dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) -{ - spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - VERIFY0(zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ, tx)); - bpobj_free(dp->dp_meta_objset, - dp->dp_obsolete_bpobj.bpo_object, tx); - bpobj_close(&dp->dp_obsolete_bpobj); -} - -dsl_pool_t * -dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) -{ - int err; - dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); - dsl_dataset_t *ds; - uint64_t obj; - - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - - /* create and open the MOS (meta-objset) */ - dp->dp_meta_objset = dmu_objset_create_impl(spa, - NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); - - /* create the pool directory */ - err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); - ASSERT0(err); - - /* Initialize scan structures */ - VERIFY0(dsl_scan_init(dp, txg)); - - /* create and open the root dir */ - dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); - VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir)); - - /* create and open the meta-objset dir */ - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - MOS_DIR_NAME, &dp->dp_mos_dir)); - - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { - /* create and open the free dir */ - (void) dsl_dir_create_sync(dp, 
dp->dp_root_dir, - FREE_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, &dp->dp_free_dir)); - - /* create and open the free_bplist */ - obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); - VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); - } - - if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) - dsl_pool_create_origin(dp, tx); - - /* create the root dataset */ - obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); - - /* create the root objset */ - VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); -#ifdef _KERNEL - { - objset_t *os; - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - os = dmu_objset_create_impl(dp->dp_spa, ds, - dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - zfs_create_fs(os, kcred, zplprops, tx); - } -#endif - dsl_dataset_rele(ds, FTAG); - - dmu_tx_commit(tx); - - rrw_exit(&dp->dp_config_rwlock, FTAG); - - return (dp); -} - -/* - * Account for the meta-objset space in its placeholder dsl_dir. 
- */ -void -dsl_pool_mos_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp) -{ - ASSERT3U(comp, ==, uncomp); /* it's all metadata */ - mutex_enter(&dp->dp_lock); - dp->dp_mos_used_delta += used; - dp->dp_mos_compressed_delta += comp; - dp->dp_mos_uncompressed_delta += uncomp; - mutex_exit(&dp->dp_lock); -} - -static void -dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) -{ - zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(dp->dp_meta_objset, zio, tx); - VERIFY0(zio_wait(zio)); - dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); -} - -static void -dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) -{ - ASSERT(MUTEX_HELD(&dp->dp_lock)); - - if (delta < 0) - ASSERT3U(-delta, <=, dp->dp_dirty_total); - - dp->dp_dirty_total += delta; - - /* - * Note: we signal even when increasing dp_dirty_total. - * This ensures forward progress -- each thread wakes the next waiter. 
- */ - if (dp->dp_dirty_total < zfs_dirty_data_max) - cv_signal(&dp->dp_spaceavail_cv); -} - -static boolean_t -dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) -{ - spa_t *spa = dp->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - txg_list_t *tl = &vd->vdev_ms_list; - metaslab_t *ms; - - for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; - ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { - VERIFY(range_tree_is_empty(ms->ms_freeing)); - VERIFY(range_tree_is_empty(ms->ms_checkpointing)); - } - } - - return (B_TRUE); -} - -void -dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) -{ - zio_t *zio; - dmu_tx_t *tx; - dsl_dir_t *dd; - dsl_dataset_t *ds; - objset_t *mos = dp->dp_meta_objset; - list_t synced_datasets; - - list_create(&synced_datasets, sizeof (dsl_dataset_t), - offsetof(dsl_dataset_t, ds_synced_link)); - - tx = dmu_tx_create_assigned(dp, txg); - - /* - * Run all early sync tasks before writing out any dirty blocks. - * For more info on early sync tasks see block comment in - * dsl_early_sync_task(). - */ - if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { - dsl_sync_task_t *dst; - - ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); - while ((dst = - txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { - ASSERT(dsl_early_sync_task_verify(dp, txg)); - dsl_sync_task_sync(dst, tx); - } - ASSERT(dsl_early_sync_task_verify(dp, txg)); - } - - /* - * Write out all dirty blocks of dirty datasets. - */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - /* - * We must not sync any non-MOS datasets twice, because - * we may have taken a snapshot of them. However, we - * may sync newly-created datasets on pass 2. 
- */ - ASSERT(!list_link_active(&ds->ds_synced_link)); - list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); - } - VERIFY0(zio_wait(zio)); - - /* - * We have written all of the accounted dirty data, so our - * dp_space_towrite should now be zero. However, some seldom-used - * code paths do not adhere to this (e.g. dbuf_undirty(), also - * rounding error in dbuf_write_physdone). - * Shore up the accounting of any dirtied space now. - */ - dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); - - /* - * Update the long range free counter after - * we're done syncing user data - */ - mutex_enter(&dp->dp_lock); - ASSERT(spa_sync_pass(dp->dp_spa) == 1 || - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; - mutex_exit(&dp->dp_lock); - - /* - * After the data blocks have been written (ensured by the zio_wait() - * above), update the user/group space accounting. This happens - * in tasks dispatched to dp_sync_taskq, so wait for them before - * continuing. - */ - for (ds = list_head(&synced_datasets); ds != NULL; - ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); - } - taskq_wait(dp->dp_sync_taskq); - - /* - * Sync the datasets again to push out the changes due to - * userspace updates. This must be done before we process the - * sync tasks, so that any snapshots will have the correct - * user accounting information (and we won't get confused - * about which blocks are part of the snapshot). 
- */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - ASSERT(list_link_active(&ds->ds_synced_link)); - dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); - } - VERIFY0(zio_wait(zio)); - - /* - * Now that the datasets have been completely synced, we can - * clean up our in-memory structures accumulated while syncing: - * - * - move dead blocks from the pending deadlist to the on-disk deadlist - * - release hold from dsl_dataset_dirty() - */ - while ((ds = list_remove_head(&synced_datasets)) != NULL) { - dsl_dataset_sync_done(ds, tx); - } - while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { - dsl_dir_sync(dd, tx); - } - - /* - * The MOS's space is accounted for in the pool/$MOS - * (dp_mos_dir). We can't modify the mos while we're syncing - * it, so we remember the deltas and apply them here. - */ - if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || - dp->dp_mos_uncompressed_delta != 0) { - dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, - dp->dp_mos_used_delta, - dp->dp_mos_compressed_delta, - dp->dp_mos_uncompressed_delta, tx); - dp->dp_mos_used_delta = 0; - dp->dp_mos_compressed_delta = 0; - dp->dp_mos_uncompressed_delta = 0; - } - - if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) { - dsl_pool_sync_mos(dp, tx); - } - - /* - * If we modify a dataset in the same txg that we want to destroy it, - * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. - * dsl_dir_destroy_check() will fail if there are unexpected holds. - * Therefore, we want to sync the MOS (thus syncing the dd_dbuf - * and clearing the hold on it) before we process the sync_tasks. - * The MOS data dirtied by the sync_tasks will be synced on the next - * pass. - */ - if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { - dsl_sync_task_t *dst; - /* - * No more sync tasks should have been added while we - * were syncing. 
- */ - ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); - while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) - dsl_sync_task_sync(dst, tx); - } - - dmu_tx_commit(tx); - - DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); -} - -void -dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) -{ - zilog_t *zilog; - - while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) { - dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - /* - * We don't remove the zilog from the dp_dirty_zilogs - * list until after we've cleaned it. This ensures that - * callers of zilog_is_dirty() receive an accurate - * answer when they are racing with the spa sync thread. - */ - zil_clean(zilog, txg); - (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); - ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); - dmu_buf_rele(ds->ds_dbuf, zilog); - } - ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); -} - -/* - * TRUE if the current thread is the tx_sync_thread or if we - * are being called from SPA context during pool initialization. - */ -int -dsl_pool_sync_context(dsl_pool_t *dp) -{ - return (curthread == dp->dp_tx.tx_sync_thread || - spa_is_initializing(dp->dp_spa) || - taskq_member(dp->dp_sync_taskq, curthread)); -} - -/* - * This function returns the amount of allocatable space in the pool - * minus whatever space is currently reserved by ZFS for specific - * purposes. Specifically: - * - * 1] Any reserved SLOP space - * 2] Any space used by the checkpoint - * 3] Any space used for deferred frees - * - * The latter 2 are especially important because they are needed to - * rectify the SPA's and DMU's different understanding of how much space - * is used. Now the DMU is aware of that extra space tracked by the SPA - * without having to maintain a separate special dir (e.g similar to - * $MOS, $FREEING, and $LEAKED). 
- * - * Note: By deferred frees here, we mean the frees that were deferred - * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the - * segments placed in ms_defer trees during metaslab_sync_done(). - */ -uint64_t -dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) -{ - spa_t *spa = dp->dp_spa; - uint64_t space, resv, adjustedsize; - uint64_t spa_deferred_frees = - spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; - - space = spa_get_dspace(spa) - - spa_get_checkpoint_space(spa) - spa_deferred_frees; - resv = spa_get_slop_space(spa); - - switch (slop_policy) { - case ZFS_SPACE_CHECK_NORMAL: - break; - case ZFS_SPACE_CHECK_RESERVED: - resv >>= 1; - break; - case ZFS_SPACE_CHECK_EXTRA_RESERVED: - resv >>= 2; - break; - case ZFS_SPACE_CHECK_NONE: - resv = 0; - break; - default: - panic("invalid slop policy value: %d", slop_policy); - break; - } - adjustedsize = (space >= resv) ? (space - resv) : 0; - - return (adjustedsize); -} - -uint64_t -dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) -{ - uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); - uint64_t deferred = - metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); - uint64_t quota = (poolsize >= deferred) ? 
(poolsize - deferred) : 0; - return (quota); -} - -boolean_t -dsl_pool_need_dirty_delay(dsl_pool_t *dp) -{ - uint64_t delay_min_bytes = - zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100; - boolean_t rv; - - mutex_enter(&dp->dp_lock); - if (dp->dp_dirty_total > dirty_min_bytes) - txg_kick(dp); - rv = (dp->dp_dirty_total > delay_min_bytes); - mutex_exit(&dp->dp_lock); - return (rv); -} - -void -dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) -{ - if (space > 0) { - mutex_enter(&dp->dp_lock); - dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; - dsl_pool_dirty_delta(dp, space); - mutex_exit(&dp->dp_lock); - } -} - -void -dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) -{ - ASSERT3S(space, >=, 0); - if (space == 0) - return; - mutex_enter(&dp->dp_lock); - if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { - /* XXX writing something we didn't dirty? */ - space = dp->dp_dirty_pertxg[txg & TXG_MASK]; - } - ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); - dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; - ASSERT3U(dp->dp_dirty_total, >=, space); - dsl_pool_dirty_delta(dp, -space); - mutex_exit(&dp->dp_lock); -} - -/* ARGSUSED */ -static int -upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) -{ - dmu_tx_t *tx = arg; - dsl_dataset_t *ds, *prev = NULL; - int err; - - err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) - return (err); - - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) - break; - dsl_dataset_rele(ds, FTAG); - ds = prev; - prev = NULL; - } - - if (prev == NULL) { - prev = dp->dp_origin_snap; - - /* - * The $ORIGIN can't have any data, or the accounting - * will be wrong. 
- */ - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - /* The origin doesn't get attached to itself */ - if (ds->ds_object == prev->ds_object) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; - dsl_dataset_phys(ds)->ds_prev_snap_txg = - dsl_dataset_phys(prev)->ds_creation_txg; - - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; - - dmu_buf_will_dirty(prev->ds_dbuf, tx); - dsl_dataset_phys(prev)->ds_num_children++; - - if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { - ASSERT(ds->ds_prev == NULL); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, - ds, &ds->ds_prev)); - } - } - - ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); - - if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { - dmu_buf_will_dirty(prev->ds_dbuf, tx); - dsl_dataset_phys(prev)->ds_next_clones_obj = - zap_create(dp->dp_meta_objset, - DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); - - dsl_dataset_rele(ds, FTAG); - if (prev != dp->dp_origin_snap) - dsl_dataset_rele(prev, FTAG); - return (0); -} - -void -dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dp->dp_origin_snap != NULL); - - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, - tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); -} - -/* ARGSUSED */ -static int -upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - dmu_tx_t *tx = arg; - objset_t *mos = dp->dp_meta_objset; - - if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { - dsl_dataset_t *origin; - - 
VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); - - if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { - dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); - dsl_dir_phys(origin->ds_dir)->dd_clones = - zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, - 0, tx); - } - - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(origin->ds_dir)->dd_clones, - ds->ds_object, tx)); - - dsl_dataset_rele(origin, FTAG); - } - return (0); -} - -void -dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - uint64_t obj; - - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, &dp->dp_free_dir)); - - /* - * We can't use bpobj_alloc(), because spa_version() still - * returns the old version, and we need a new-version bpobj with - * subobj support. So call dmu_object_alloc() directly. - */ - obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, - SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); - VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); - VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); - - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); -} - -void -dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) -{ - uint64_t dsobj; - dsl_dataset_t *ds; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dp->dp_origin_snap == NULL); - ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); - - /* create the origin dir, ds, & snap-ds */ - dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, - NULL, 0, kcred, tx); - VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); - VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, - dp, &dp->dp_origin_snap)); - dsl_dataset_rele(ds, 
FTAG); -} - -taskq_t * -dsl_pool_vnrele_taskq(dsl_pool_t *dp) -{ - return (dp->dp_vnrele_taskq); -} - -/* - * Walk through the pool-wide zap object of temporary snapshot user holds - * and release them. - */ -void -dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) -{ - zap_attribute_t za; - zap_cursor_t zc; - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj = dp->dp_tmp_userrefs_obj; - nvlist_t *holds; - - if (zapobj == 0) - return; - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - - holds = fnvlist_alloc(); - - for (zap_cursor_init(&zc, mos, zapobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - char *htag; - nvlist_t *tags; - - htag = strchr(za.za_name, '-'); - *htag = '\0'; - ++htag; - if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { - tags = fnvlist_alloc(); - fnvlist_add_boolean(tags, htag); - fnvlist_add_nvlist(holds, za.za_name, tags); - fnvlist_free(tags); - } else { - fnvlist_add_boolean(tags, htag); - } - } - dsl_dataset_user_release_tmp(dp, holds); - fnvlist_free(holds); - zap_cursor_fini(&zc); -} - -/* - * Create the pool-wide zap object for storing temporary snapshot holds. - */ -void -dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) -{ - objset_t *mos = dp->dp_meta_objset; - - ASSERT(dp->dp_tmp_userrefs_obj == 0); - ASSERT(dmu_tx_is_syncing(tx)); - - dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); -} - -static int -dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) -{ - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj = dp->dp_tmp_userrefs_obj; - char *name; - int error; - - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - ASSERT(dmu_tx_is_syncing(tx)); - - /* - * If the pool was created prior to SPA_VERSION_USERREFS, the - * zap object for temporary holds might not exist yet. 
- */ - if (zapobj == 0) { - if (holding) { - dsl_pool_user_hold_create_obj(dp, tx); - zapobj = dp->dp_tmp_userrefs_obj; - } else { - return (SET_ERROR(ENOENT)); - } - } - - name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); - if (holding) - error = zap_add(mos, zapobj, name, 8, 1, &now, tx); - else - error = zap_remove(mos, zapobj, name, tx); - strfree(name); - - return (error); -} - -/* - * Add a temporary hold for the given dataset object and tag. - */ -int -dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - uint64_t now, dmu_tx_t *tx) -{ - return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); -} - -/* - * Release a temporary hold for the given dataset object and tag. - */ -int -dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - dmu_tx_t *tx) -{ - return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); -} - -/* - * DSL Pool Configuration Lock - * - * The dp_config_rwlock protects against changes to DSL state (e.g. dataset - * creation / destruction / rename / property setting). It must be held for - * read to hold a dataset or dsl_dir. I.e. you must call - * dsl_pool_config_enter() or dsl_pool_hold() before calling - * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock - * must be held continuously until all datasets and dsl_dirs are released. - * - * The only exception to this rule is that if a "long hold" is placed on - * a dataset, then the dp_config_rwlock may be dropped while the dataset - * is still held. The long hold will prevent the dataset from being - * destroyed -- the destroy will fail with EBUSY. A long hold can be - * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset - * (by calling dsl_{dataset,objset}_{try}own{_obj}). - * - * Legitimate long-holders (including owners) should be long-running, cancelable - * tasks that should cause "zfs destroy" to fail. This includes DMU - * consumers (i.e. 
a ZPL filesystem being mounted or ZVOL being open), - * "zfs send", and "zfs diff". There are several other long-holders whose - * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). - * - * The usual formula for long-holding would be: - * dsl_pool_hold() - * dsl_dataset_hold() - * ... perform checks ... - * dsl_dataset_long_hold() - * dsl_pool_rele() - * ... perform long-running task ... - * dsl_dataset_long_rele() - * dsl_dataset_rele() - * - * Note that when the long hold is released, the dataset is still held but - * the pool is not held. The dataset may change arbitrarily during this time - * (e.g. it could be destroyed). Therefore you shouldn't do anything to the - * dataset except release it. - * - * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only - * or modifying operations. - * - * Modifying operations should generally use dsl_sync_task(). The synctask - * infrastructure enforces proper locking strategy with respect to the - * dp_config_rwlock. See the comment above dsl_sync_task() for details. - * - * Read-only operations will manually hold the pool, then the dataset, obtain - * information from the dataset, then release the pool and dataset. - * dmu_objset_{hold,rele}() are convenience routines that also do the pool - * hold/rele. - */ - -int -dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) -{ - spa_t *spa; - int error; - - error = spa_open(name, &spa, tag); - if (error == 0) { - *dp = spa_get_dsl(spa); - dsl_pool_config_enter(*dp, tag); - } - return (error); -} - -void -dsl_pool_rele(dsl_pool_t *dp, void *tag) -{ - dsl_pool_config_exit(dp, tag); - spa_close(dp->dp_spa, tag); -} - -void -dsl_pool_config_enter(dsl_pool_t *dp, void *tag) -{ - /* - * We use a "reentrant" reader-writer lock, but not reentrantly. 
- * - * The rrwlock can (with the track_all flag) track all reading threads, - * which is very useful for debugging which code path failed to release - * the lock, and for verifying that the *current* thread does hold - * the lock. - * - * (Unlike a rwlock, which knows that N threads hold it for - * read, but not *which* threads, so rw_held(RW_READER) returns TRUE - * if any thread holds it for read, even if this thread doesn't). - */ - ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); - rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); -} - -void -dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) -{ - ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); - rrw_enter_read_prio(&dp->dp_config_rwlock, tag); -} - -void -dsl_pool_config_exit(dsl_pool_t *dp, void *tag) -{ - rrw_exit(&dp->dp_config_rwlock, tag); -} - -boolean_t -dsl_pool_config_held(dsl_pool_t *dp) -{ - return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); -} - -boolean_t -dsl_pool_config_held_writer(dsl_pool_t *dp) -{ - return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c deleted file mode 100644 index 50aef5b618f9..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c +++ /dev/null @@ -1,1211 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright 2015, Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" - -#define ZPROP_INHERIT_SUFFIX "$inherit" -#define ZPROP_RECVD_SUFFIX "$recvd" - -static int -dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) -{ - /* - * The setonce properties are read-only, BUT they still - * have a default value that can be used as the initial - * value. - */ - if (prop == ZPROP_INVAL || - (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) - return (SET_ERROR(ENOENT)); - - if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { - if (zfs_prop_default_string(prop) == NULL) - return (SET_ERROR(ENOENT)); - if (intsz != 1) - return (SET_ERROR(EOVERFLOW)); - (void) strncpy(buf, zfs_prop_default_string(prop), - numints); - } else { - if (intsz != 8 || numints < 1) - return (SET_ERROR(EOVERFLOW)); - - *(uint64_t *)buf = zfs_prop_default_numeric(prop); - } - - return (0); -} - -int -dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) -{ - int err = ENOENT; - dsl_dir_t *target = dd; - objset_t *mos = dd->dd_pool->dp_meta_objset; - zfs_prop_t prop; - boolean_t inheritable; - boolean_t inheriting = B_FALSE; - char *inheritstr; - char *recvdstr; - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - - if (setpoint) - setpoint[0] = '\0'; - - prop = zfs_name_to_prop(propname); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); - recvdstr = 
kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); - - /* - * Note: dd may become NULL, therefore we shouldn't dereference it - * after this loop. - */ - for (; dd != NULL; dd = dd->dd_parent) { - if (dd != target || snapshot) { - if (!inheritable) - break; - inheriting = B_TRUE; - } - - /* Check for a local value. */ - err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, - propname, intsz, numints, buf); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) - dsl_dir_name(dd, setpoint); - break; - } - - /* - * Skip the check for a received value if there is an explicit - * inheritance entry. - */ - err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, - inheritstr); - if (err != 0 && err != ENOENT) - break; - - if (err == ENOENT) { - /* Check for a received value. */ - err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, - recvdstr, intsz, numints, buf); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) { - if (inheriting) { - dsl_dir_name(dd, setpoint); - } else { - (void) strcpy(setpoint, - ZPROP_SOURCE_VAL_RECVD); - } - } - break; - } - } - - /* - * If we found an explicit inheritance entry, err is zero even - * though we haven't yet found the value, so reinitializing err - * at the end of the loop (instead of at the beginning) ensures - * that err has a valid post-loop value. 
- */ - err = SET_ERROR(ENOENT); - } - - if (err == ENOENT) - err = dodefault(prop, intsz, numints, buf); - - strfree(inheritstr); - strfree(recvdstr); - - return (err); -} - -int -dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - zfs_prop_t prop = zfs_name_to_prop(propname); - boolean_t inheritable; - uint64_t zapobj; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - zapobj = dsl_dataset_phys(ds)->ds_props_obj; - - if (zapobj != 0) { - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - int err; - - ASSERT(ds->ds_is_snapshot); - - /* Check for a local value. */ - err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) - dsl_dataset_name(ds, setpoint); - return (err); - } - - /* - * Skip the check for a received value if there is an explicit - * inheritance entry. - */ - if (inheritable) { - char *inheritstr = kmem_asprintf("%s%s", propname, - ZPROP_INHERIT_SUFFIX); - err = zap_contains(mos, zapobj, inheritstr); - strfree(inheritstr); - if (err != 0 && err != ENOENT) - return (err); - } - - if (err == ENOENT) { - /* Check for a received value. 
*/ - char *recvdstr = kmem_asprintf("%s%s", propname, - ZPROP_RECVD_SUFFIX); - err = zap_lookup(mos, zapobj, recvdstr, - intsz, numints, buf); - strfree(recvdstr); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) - (void) strcpy(setpoint, - ZPROP_SOURCE_VAL_RECVD); - return (err); - } - } - } - - return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numints, buf, setpoint, ds->ds_is_snapshot)); -} - -static dsl_prop_record_t * -dsl_prop_record_find(dsl_dir_t *dd, const char *propname) -{ - dsl_prop_record_t *pr = NULL; - - ASSERT(MUTEX_HELD(&dd->dd_lock)); - - for (pr = list_head(&dd->dd_props); - pr != NULL; pr = list_next(&dd->dd_props, pr)) { - if (strcmp(pr->pr_propname, propname) == 0) - break; - } - - return (pr); -} - -static dsl_prop_record_t * -dsl_prop_record_create(dsl_dir_t *dd, const char *propname) -{ - dsl_prop_record_t *pr; - - ASSERT(MUTEX_HELD(&dd->dd_lock)); - - pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP); - pr->pr_propname = spa_strdup(propname); - list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t), - offsetof(dsl_prop_cb_record_t, cbr_pr_node)); - list_insert_head(&dd->dd_props, pr); - - return (pr); -} - -void -dsl_prop_init(dsl_dir_t *dd) -{ - list_create(&dd->dd_props, sizeof (dsl_prop_record_t), - offsetof(dsl_prop_record_t, pr_node)); -} - -void -dsl_prop_fini(dsl_dir_t *dd) -{ - dsl_prop_record_t *pr; - - while ((pr = list_remove_head(&dd->dd_props)) != NULL) { - list_destroy(&pr->pr_cbs); - strfree((char *)pr->pr_propname); - kmem_free(pr, sizeof (dsl_prop_record_t)); - } - list_destroy(&dd->dd_props); -} - -/* - * Register interest in the named property. We'll call the callback - * once to notify it of the current property value, and again each time - * the property changes, until this callback is unregistered. - * - * Return 0 on success, errno if the prop is not an integer value. 
- */ -int -dsl_prop_register(dsl_dataset_t *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_pool_t *dp = dd->dd_pool; - uint64_t value; - dsl_prop_record_t *pr; - dsl_prop_cb_record_t *cbr; - int err; - - ASSERT(dsl_pool_config_held(dp)); - - err = dsl_prop_get_int_ds(ds, propname, &value); - if (err != 0) - return (err); - - cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); - cbr->cbr_ds = ds; - cbr->cbr_func = callback; - cbr->cbr_arg = cbarg; - - mutex_enter(&dd->dd_lock); - pr = dsl_prop_record_find(dd, propname); - if (pr == NULL) - pr = dsl_prop_record_create(dd, propname); - cbr->cbr_pr = pr; - list_insert_head(&pr->pr_cbs, cbr); - list_insert_head(&ds->ds_prop_cbs, cbr); - mutex_exit(&dd->dd_lock); - - cbr->cbr_func(cbr->cbr_arg, value); - return (0); -} - -int -dsl_prop_get(const char *dsname, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - objset_t *os; - int error; - - error = dmu_objset_hold(dsname, FTAG, &os); - if (error != 0) - return (error); - - error = dsl_prop_get_ds(dmu_objset_ds(os), propname, - intsz, numints, buf, setpoint); - - dmu_objset_rele(os, FTAG); - return (error); -} - -/* - * Get the current property value. It may have changed by the time this - * function returns, so it is NOT safe to follow up with - * dsl_prop_register() and assume that the value has not changed in - * between. - * - * Return 0 on success, ENOENT if ddname is invalid. - */ -int -dsl_prop_get_integer(const char *ddname, const char *propname, - uint64_t *valuep, char *setpoint) -{ - return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); -} - -int -dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, - uint64_t *valuep) -{ - return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); -} - -/* - * Predict the effective value of the given special property if it were set with - * the given value and source. This is not a general purpose function. 
It exists - * only to handle the special requirements of the quota and reservation - * properties. The fact that these properties are non-inheritable greatly - * simplifies the prediction logic. - * - * Returns 0 on success, a positive error code on failure, or -1 if called with - * a property not handled by this function. - */ -int -dsl_prop_predict(dsl_dir_t *dd, const char *propname, - zprop_source_t source, uint64_t value, uint64_t *newvalp) -{ - zfs_prop_t prop = zfs_name_to_prop(propname); - objset_t *mos; - uint64_t zapobj; - uint64_t version; - char *recvdstr; - int err = 0; - - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFQUOTA: - case ZFS_PROP_REFRESERVATION: - break; - default: - return (-1); - } - - mos = dd->dd_pool->dp_meta_objset; - zapobj = dsl_dir_phys(dd)->dd_props_zapobj; - recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); - - version = spa_version(dd->dd_pool->dp_spa); - if (version < SPA_VERSION_RECVD_PROPS) { - if (source & ZPROP_SRC_NONE) - source = ZPROP_SRC_NONE; - else if (source & ZPROP_SRC_RECEIVED) - source = ZPROP_SRC_LOCAL; - } - - switch (source) { - case ZPROP_SRC_NONE: - /* Revert to the received value, if any. */ - err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); - if (err == ENOENT) - *newvalp = 0; - break; - case ZPROP_SRC_LOCAL: - *newvalp = value; - break; - case ZPROP_SRC_RECEIVED: - /* - * If there's no local setting, then the new received value will - * be the effective value. - */ - err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) - *newvalp = value; - break; - case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): - /* - * We're clearing the received value, so the local setting (if - * it exists) remains the effective value. 
- */ - err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) - *newvalp = 0; - break; - default: - panic("unexpected property source: %d", source); - } - - strfree(recvdstr); - - if (err == ENOENT) - return (0); - - return (err); -} - -/* - * Unregister all callbacks that are registered with the - * given callback argument. - */ -void -dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg) -{ - dsl_prop_cb_record_t *cbr, *next_cbr; - - dsl_dir_t *dd = ds->ds_dir; - - mutex_enter(&dd->dd_lock); - next_cbr = list_head(&ds->ds_prop_cbs); - while (next_cbr != NULL) { - cbr = next_cbr; - next_cbr = list_next(&ds->ds_prop_cbs, cbr); - if (cbr->cbr_arg == cbarg) { - list_remove(&ds->ds_prop_cbs, cbr); - list_remove(&cbr->cbr_pr->pr_cbs, cbr); - kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); - } - } - mutex_exit(&dd->dd_lock); -} - -boolean_t -dsl_prop_hascb(dsl_dataset_t *ds) -{ - return (!list_is_empty(&ds->ds_prop_cbs)); -} - -/* ARGSUSED */ -static int -dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_record_t *pr; - dsl_prop_cb_record_t *cbr; - - mutex_enter(&dd->dd_lock); - for (pr = list_head(&dd->dd_props); - pr; pr = list_next(&dd->dd_props, pr)) { - for (cbr = list_head(&pr->pr_cbs); cbr; - cbr = list_next(&pr->pr_cbs, cbr)) { - uint64_t value; - - /* - * Callback entries do not have holds on their - * datasets so that datasets with registered - * callbacks are still eligible for eviction. - * Unlike operations to update properties on a - * single dataset, we are performing a recursive - * descent of related head datasets. The caller - * of this function only has a dataset hold on - * the passed in head dataset, not the snapshots - * associated with this dataset. Without a hold, - * the dataset pointer within callback records - * for snapshots can be invalidated by eviction - * at any time. 
- * - * Use dsl_dataset_try_add_ref() to verify - * that the dataset for a snapshot has not - * begun eviction processing and to prevent - * eviction from occurring for the duration of - * the callback. If the hold attempt fails, - * this object is already being evicted and the - * callback can be safely ignored. - */ - if (ds != cbr->cbr_ds && - !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) - continue; - - if (dsl_prop_get_ds(cbr->cbr_ds, - cbr->cbr_pr->pr_propname, sizeof (value), 1, - &value, NULL) == 0) - cbr->cbr_func(cbr->cbr_arg, value); - - if (ds != cbr->cbr_ds) - dsl_dataset_rele(cbr->cbr_ds, FTAG); - } - } - mutex_exit(&dd->dd_lock); - - return (0); -} - -/* - * Update all property values for ddobj & its descendants. This is used - * when renaming the dir. - */ -void -dsl_prop_notify_all(dsl_dir_t *dd) -{ - dsl_pool_t *dp = dd->dd_pool; - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, - NULL, DS_FIND_CHILDREN); -} - -static void -dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - const char *propname, uint64_t value, int first) -{ - dsl_dir_t *dd; - dsl_prop_record_t *pr; - dsl_prop_cb_record_t *cbr; - objset_t *mos = dp->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t *za; - int err; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); - if (err) - return; - - if (!first) { - /* - * If the prop is set here, then this change is not - * being inherited here or below; stop the recursion. 
- */ - err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, - propname); - if (err == 0) { - dsl_dir_rele(dd, FTAG); - return; - } - ASSERT3U(err, ==, ENOENT); - } - - mutex_enter(&dd->dd_lock); - pr = dsl_prop_record_find(dd, propname); - if (pr != NULL) { - for (cbr = list_head(&pr->pr_cbs); cbr; - cbr = list_next(&pr->pr_cbs, cbr)) { - uint64_t propobj; - - /* - * cbr->cbr_ds may be invalidated due to eviction, - * requiring the use of dsl_dataset_try_add_ref(). - * See comment block in dsl_prop_notify_all_cb() - * for details. - */ - if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) - continue; - - propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; - - /* - * If the property is not set on this ds, then it is - * inherited here; call the callback. - */ - if (propobj == 0 || - zap_contains(mos, propobj, propname) != 0) - cbr->cbr_func(cbr->cbr_arg, value); - - dsl_dataset_rele(cbr->cbr_ds, FTAG); - } - } - mutex_exit(&dd->dd_lock); - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, mos, - dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, za) == 0; - zap_cursor_advance(&zc)) { - dsl_prop_changed_notify(dp, za->za_first_integer, - propname, value, FALSE); - } - kmem_free(za, sizeof (zap_attribute_t)); - zap_cursor_fini(&zc); - dsl_dir_rele(dd, FTAG); -} - -void -dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, - zprop_source_t source, int intsz, int numints, const void *value, - dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj, intval, dummy; - int isint; - char valbuf[32]; - const char *valstr = NULL; - char *inheritstr; - char *recvdstr; - char *tbuf = NULL; - int err; - uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); - - isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0); - - if (ds->ds_is_snapshot) { - ASSERT(version >= SPA_VERSION_SNAP_PROPS); - if (dsl_dataset_phys(ds)->ds_props_obj == 0) { - 
dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_props_obj = - zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - } - zapobj = dsl_dataset_phys(ds)->ds_props_obj; - } else { - zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; - } - - if (version < SPA_VERSION_RECVD_PROPS) { - if (source & ZPROP_SRC_NONE) - source = ZPROP_SRC_NONE; - else if (source & ZPROP_SRC_RECEIVED) - source = ZPROP_SRC_LOCAL; - } - - inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); - recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); - - switch (source) { - case ZPROP_SRC_NONE: - /* - * revert to received value, if any (inherit -S) - * - remove propname - * - remove propname$inherit - */ - err = zap_remove(mos, zapobj, propname, tx); - ASSERT(err == 0 || err == ENOENT); - err = zap_remove(mos, zapobj, inheritstr, tx); - ASSERT(err == 0 || err == ENOENT); - break; - case ZPROP_SRC_LOCAL: - /* - * remove propname$inherit - * set propname -> value - */ - err = zap_remove(mos, zapobj, inheritstr, tx); - ASSERT(err == 0 || err == ENOENT); - VERIFY0(zap_update(mos, zapobj, propname, - intsz, numints, value, tx)); - break; - case ZPROP_SRC_INHERITED: - /* - * explicitly inherit - * - remove propname - * - set propname$inherit - */ - err = zap_remove(mos, zapobj, propname, tx); - ASSERT(err == 0 || err == ENOENT); - if (version >= SPA_VERSION_RECVD_PROPS && - dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { - dummy = 0; - VERIFY0(zap_update(mos, zapobj, inheritstr, - 8, 1, &dummy, tx)); - } - break; - case ZPROP_SRC_RECEIVED: - /* - * set propname$recvd -> value - */ - err = zap_update(mos, zapobj, recvdstr, - intsz, numints, value, tx); - ASSERT(err == 0); - break; - case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): - /* - * clear local and received settings - * - remove propname - * - remove propname$inherit - * - remove propname$recvd - */ - err = zap_remove(mos, zapobj, propname, tx); - ASSERT(err == 0 || err == ENOENT); 
- err = zap_remove(mos, zapobj, inheritstr, tx); - ASSERT(err == 0 || err == ENOENT); - /* FALLTHRU */ - case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): - /* - * remove propname$recvd - */ - err = zap_remove(mos, zapobj, recvdstr, tx); - ASSERT(err == 0 || err == ENOENT); - break; - default: - cmn_err(CE_PANIC, "unexpected property source: %d", source); - } - - strfree(inheritstr); - strfree(recvdstr); - - if (isint) { - VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - - if (ds->ds_is_snapshot) { - dsl_prop_cb_record_t *cbr; - /* - * It's a snapshot; nothing can inherit this - * property, so just look for callbacks on this - * ds here. - */ - mutex_enter(&ds->ds_dir->dd_lock); - for (cbr = list_head(&ds->ds_prop_cbs); cbr; - cbr = list_next(&ds->ds_prop_cbs, cbr)) { - if (strcmp(cbr->cbr_pr->pr_propname, - propname) == 0) - cbr->cbr_func(cbr->cbr_arg, intval); - } - mutex_exit(&ds->ds_dir->dd_lock); - } else { - dsl_prop_changed_notify(ds->ds_dir->dd_pool, - ds->ds_dir->dd_object, propname, intval, TRUE); - } - - (void) snprintf(valbuf, sizeof (valbuf), - "%lld", (longlong_t)intval); - valstr = valbuf; - } else { - if (source == ZPROP_SRC_LOCAL) { - valstr = value; - } else { - tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - if (dsl_prop_get_ds(ds, propname, 1, - ZAP_MAXVALUELEN, tbuf, NULL) == 0) - valstr = tbuf; - } - } - - spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || - source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, - "%s=%s", propname, (valstr == NULL ? 
"" : valstr)); - - if (tbuf != NULL) - kmem_free(tbuf, ZAP_MAXVALUELEN); -} - -int -dsl_prop_set_int(const char *dsname, const char *propname, - zprop_source_t source, uint64_t value) -{ - nvlist_t *nvl = fnvlist_alloc(); - int error; - - fnvlist_add_uint64(nvl, propname, value); - error = dsl_props_set(dsname, source, nvl); - fnvlist_free(nvl); - return (error); -} - -int -dsl_prop_set_string(const char *dsname, const char *propname, - zprop_source_t source, const char *value) -{ - nvlist_t *nvl = fnvlist_alloc(); - int error; - - fnvlist_add_string(nvl, propname, value); - error = dsl_props_set(dsname, source, nvl); - fnvlist_free(nvl); - return (error); -} - -int -dsl_prop_inherit(const char *dsname, const char *propname, - zprop_source_t source) -{ - nvlist_t *nvl = fnvlist_alloc(); - int error; - - fnvlist_add_boolean(nvl, propname); - error = dsl_props_set(dsname, source, nvl); - fnvlist_free(nvl); - return (error); -} - -typedef struct dsl_props_set_arg { - const char *dpsa_dsname; - zprop_source_t dpsa_source; - nvlist_t *dpsa_props; -} dsl_props_set_arg_t; - -static int -dsl_props_set_check(void *arg, dmu_tx_t *tx) -{ - dsl_props_set_arg_t *dpsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t version; - nvpair_t *elem = NULL; - int err; - - err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); - if (err != 0) - return (err); - - version = spa_version(ds->ds_dir->dd_pool->dp_spa); - while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { - if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENAMETOOLONG)); - } - if (nvpair_type(elem) == DATA_TYPE_STRING) { - char *valstr = fnvpair_value_string(elem); - if (strlen(valstr) >= (version < - SPA_VERSION_STMF_PROP ? 
- ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { - dsl_dataset_rele(ds, FTAG); - return (E2BIG); - } - } - } - - if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOTSUP)); - } - dsl_dataset_rele(ds, FTAG); - return (0); -} - -void -dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, - nvlist_t *props, dmu_tx_t *tx) -{ - nvpair_t *elem = NULL; - - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - nvpair_t *pair = elem; - const char *name = nvpair_name(pair); - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - /* - * This usually happens when we reuse the nvlist_t data - * returned by the counterpart dsl_prop_get_all_impl(). - * For instance we do this to restore the original - * received properties when an error occurs in the - * zfs_ioc_recv() codepath. - */ - nvlist_t *attrs = fnvpair_value_nvlist(pair); - pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); - } - - if (nvpair_type(pair) == DATA_TYPE_STRING) { - const char *value = fnvpair_value_string(pair); - dsl_prop_set_sync_impl(ds, name, - source, 1, strlen(value) + 1, value, tx); - } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { - uint64_t intval = fnvpair_value_uint64(pair); - dsl_prop_set_sync_impl(ds, name, - source, sizeof (intval), 1, &intval, tx); - } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { - dsl_prop_set_sync_impl(ds, name, - source, 0, 0, NULL, tx); - } else { - panic("invalid nvpair type"); - } - } -} - -static void -dsl_props_set_sync(void *arg, dmu_tx_t *tx) -{ - dsl_props_set_arg_t *dpsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); - dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); - dsl_dataset_rele(ds, FTAG); -} - -/* - * All-or-nothing; if any prop can't be set, nothing will be modified. 
- */ -int -dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) -{ - dsl_props_set_arg_t dpsa; - int nblks = 0; - - dpsa.dpsa_dsname = dsname; - dpsa.dpsa_source = source; - dpsa.dpsa_props = props; - - /* - * If the source includes NONE, then we will only be removing entries - * from the ZAP object. In that case don't check for ENOSPC. - */ - if ((source & ZPROP_SRC_NONE) == 0) - nblks = 2 * fnvlist_num_pairs(props); - - return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, - &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED)); -} - -typedef enum dsl_prop_getflags { - DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ - DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ - DSL_PROP_GET_LOCAL = 0x4, /* local properties */ - DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ -} dsl_prop_getflags_t; - -static int -dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, - const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) -{ - zap_cursor_t zc; - zap_attribute_t za; - int err = 0; - - for (zap_cursor_init(&zc, mos, propobj); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - nvlist_t *propval; - zfs_prop_t prop; - char buf[ZAP_MAXNAMELEN]; - char *valstr; - const char *suffix; - const char *propname; - const char *source; - - suffix = strchr(za.za_name, '$'); - - if (suffix == NULL) { - /* - * Skip local properties if we only want received - * properties. - */ - if (flags & DSL_PROP_GET_RECEIVED) - continue; - - propname = za.za_name; - source = setpoint; - } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { - /* Skip explicitly inherited entries. */ - continue; - } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { - if (flags & DSL_PROP_GET_LOCAL) - continue; - - (void) strncpy(buf, za.za_name, (suffix - za.za_name)); - buf[suffix - za.za_name] = '\0'; - propname = buf; - - if (!(flags & DSL_PROP_GET_RECEIVED)) { - /* Skip if locally overridden. 
*/ - err = zap_contains(mos, propobj, propname); - if (err == 0) - continue; - if (err != ENOENT) - break; - - /* Skip if explicitly inherited. */ - valstr = kmem_asprintf("%s%s", propname, - ZPROP_INHERIT_SUFFIX); - err = zap_contains(mos, propobj, valstr); - strfree(valstr); - if (err == 0) - continue; - if (err != ENOENT) - break; - } - - source = ((flags & DSL_PROP_GET_INHERITING) ? - setpoint : ZPROP_SOURCE_VAL_RECVD); - } else { - /* - * For backward compatibility, skip suffixes we don't - * recognize. - */ - continue; - } - - prop = zfs_name_to_prop(propname); - - /* Skip non-inheritable properties. */ - if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && - !zfs_prop_inheritable(prop)) - continue; - - /* Skip properties not valid for this type. */ - if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && - !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) - continue; - - /* Skip properties already defined. */ - if (nvlist_exists(nv, propname)) - continue; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (za.za_integer_length == 1) { - /* - * String property - */ - char *tmp = kmem_alloc(za.za_num_integers, - KM_SLEEP); - err = zap_lookup(mos, propobj, - za.za_name, 1, za.za_num_integers, tmp); - if (err != 0) { - kmem_free(tmp, za.za_num_integers); - break; - } - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, - tmp) == 0); - kmem_free(tmp, za.za_num_integers); - } else { - /* - * Integer property - */ - ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, ZPROP_VALUE, - za.za_first_integer); - } - - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); - nvlist_free(propval); - } - zap_cursor_fini(&zc); - if (err == ENOENT) - err = 0; - return (err); -} - -/* - * Iterate over all properties for this dataset and return them in an nvlist. 
- */ -static int -dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, - dsl_prop_getflags_t flags) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err = 0; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - if (ds->ds_is_snapshot) - flags |= DSL_PROP_GET_SNAPSHOT; - - ASSERT(dsl_pool_config_held(dp)); - - if (dsl_dataset_phys(ds)->ds_props_obj != 0) { - ASSERT(flags & DSL_PROP_GET_SNAPSHOT); - dsl_dataset_name(ds, setpoint); - err = dsl_prop_get_all_impl(mos, - dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); - if (err) - goto out; - } - - for (; dd != NULL; dd = dd->dd_parent) { - if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { - if (flags & (DSL_PROP_GET_LOCAL | - DSL_PROP_GET_RECEIVED)) - break; - flags |= DSL_PROP_GET_INHERITING; - } - dsl_dir_name(dd, setpoint); - err = dsl_prop_get_all_impl(mos, - dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); - if (err) - break; - } -out: - return (err); -} - -boolean_t -dsl_prop_get_hasrecvd(const char *dsname) -{ - uint64_t dummy; - - return (0 == - dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); -} - -static int -dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) -{ - uint64_t version; - spa_t *spa; - int error = 0; - - VERIFY0(spa_open(dsname, &spa, FTAG)); - version = spa_version(spa); - spa_close(spa, FTAG); - - if (version >= SPA_VERSION_RECVD_PROPS) - error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); - return (error); -} - -/* - * Call after successfully receiving properties to ensure that only the first - * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. 
- */ -int -dsl_prop_set_hasrecvd(const char *dsname) -{ - int error = 0; - if (!dsl_prop_get_hasrecvd(dsname)) - error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); - return (error); -} - -void -dsl_prop_unset_hasrecvd(const char *dsname) -{ - VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); -} - -int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp) -{ - return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); -} - -int -dsl_prop_get_received(const char *dsname, nvlist_t **nvp) -{ - objset_t *os; - int error; - - /* - * Received properties are not distinguishable from local properties - * until the dataset has received properties on or after - * SPA_VERSION_RECVD_PROPS. - */ - dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? - DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); - - error = dmu_objset_hold(dsname, FTAG, &os); - if (error != 0) - return (error); - error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); - dmu_objset_rele(os, FTAG); - return (error); -} - -void -dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) -{ - nvlist_t *propval; - const char *propname = zfs_prop_to_name(prop); - uint64_t default_value; - - if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); - return; - } - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); - /* Indicate the default source if we can. 
*/ - if (dodefault(prop, 8, 1, &default_value) == 0 && - value == default_value) { - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); - } - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); - nvlist_free(propval); -} - -void -dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) -{ - nvlist_t *propval; - const char *propname = zfs_prop_to_name(prop); - - if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - return; - } - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); - nvlist_free(propval); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c deleted file mode 100644 index f87a0539e9bb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ /dev/null @@ -1,4001 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. 
All rights reserved. - * Copyright 2016 Gary Mills - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#endif - -/* - * Grand theory statement on scan queue sorting - * - * Scanning is implemented by recursively traversing all indirection levels - * in an object and reading all blocks referenced from said objects. This - * results in us approximately traversing the object from lowest logical - * offset to the highest. For best performance, we would want the logical - * blocks to be physically contiguous. However, this is frequently not the - * case with pools given the allocation patterns of copy-on-write filesystems. - * So instead, we put the I/Os into a reordering queue and issue them in a - * way that will most benefit physical disks (LBA-order). - * - * Queue management: - * - * Ideally, we would want to scan all metadata and queue up all block I/O - * prior to starting to issue it, because that allows us to do an optimal - * sorting job. This can however consume large amounts of memory. Therefore - * we continuously monitor the size of the queues and constrain them to 5% - * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this - * limit, we clear out a few of the largest extents at the head of the queues - * to make room for more scanning. Hopefully, these extents will be fairly - * large and contiguous, allowing us to approach sequential I/O throughput - * even without a fully sorted tree. - * - * Metadata scanning takes place in dsl_scan_visit(), which is called from - * dsl_scan_sync() every spa_sync(). 
If we have either fully scanned all - * metadata on the pool, or we need to make room in memory because our - * queues are too large, dsl_scan_visit() is postponed and - * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies - * that metadata scanning and queued I/O issuing are mutually exclusive. This - * allows us to provide maximum sequential I/O throughput for the majority of - * I/O's issued since sequential I/O performance is significantly negatively - * impacted if it is interleaved with random I/O. - * - * Implementation Notes - * - * One side effect of the queued scanning algorithm is that the scanning code - * needs to be notified whenever a block is freed. This is needed to allow - * the scanning code to remove these I/Os from the issuing queue. Additionally, - * we do not attempt to queue gang blocks to be issued sequentially since this - * is very hard to do and would have an extremely limitted performance benefit. - * Instead, we simply issue gang I/Os as soon as we find them using the legacy - * algorithm. - * - * Backwards compatibility - * - * This new algorithm is backwards compatible with the legacy on-disk data - * structures (and therefore does not require a new feature flag). - * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan - * will stop scanning metadata (in logical order) and wait for all outstanding - * sorted I/O to complete. Once this is done, we write out a checkpoint - * bookmark, indicating that we have scanned everything logically before it. - * If the pool is imported on a machine without the new sorting algorithm, - * the scan simply resumes from the last checkpoint using the legacy algorithm. 
- */ - -typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, - const zbookmark_phys_t *); - -static scan_cb_t dsl_scan_scrub_cb; - -static int scan_ds_queue_compare(const void *a, const void *b); -static int scan_prefetch_queue_compare(const void *a, const void *b); -static void scan_ds_queue_clear(dsl_scan_t *scn); -static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, - uint64_t *txg); -static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); -static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); -static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_leaves(vdev_t *vd); - -extern int zfs_vdev_async_write_active_min_dirty_percent; - -/* - * By default zfs will check to ensure it is not over the hard memory - * limit before each txg. If finer-grained control of this is needed - * this value can be set to 1 to enable checking before scanning each - * block. - */ -int zfs_scan_strict_mem_lim = B_FALSE; - -unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */ -unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */ -unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ - -/* - * Maximum number of parallelly executed bytes per leaf vdev. We attempt - * to strike a balance here between keeping the vdev queues full of I/Os - * at all times and not overflowing the queues to cause long latency, - * which would cause long txg sync times. No matter what, we will not - * overload the drives with I/O, since that is protected by - * zfs_vdev_scrub_max_active. 
- */ -unsigned long zfs_scan_vdev_limit = 4 << 20; - -int zfs_scan_issue_strategy = 0; -int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ -uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ - -unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */ -#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval) - -/* - * fill_weight is non-tunable at runtime, so we copy it at module init from - * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would - * break queue sorting. - */ -uint64_t zfs_scan_fill_weight = 3; -static uint64_t fill_weight; - -/* See dsl_scan_should_clear() for details on the memory limit tunables */ -uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ -uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ -int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ -int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ - -unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ -unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ -unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ -unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ -boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, - &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, - &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, - &zfs_scan_idle, 0, "Idle scan window in clock ticks"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, - &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, 
CTLFLAG_RWTUN, - &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, - &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, - &zfs_no_scrub_io, 0, "Disable scrub I/O"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, - &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN, - &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN, - &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval"); - -enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; -/* max number of blocks to free in a single TXG */ -uint64_t zfs_async_block_max_blocks = UINT64_MAX; -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, - &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); - -/* - * We wait a few txgs after importing a pool to begin scanning so that - * the import / mounting code isn't held up by scrub / resilver IO. - * Unfortunately, it is a bit difficult to determine exactly how long - * this will take since userspace will trigger fs mounts asynchronously - * and the kernel will create zvol minors asynchronously. As a result, - * the value provided here is a bit arbitrary, but represents a - * reasonable estimate of how many txgs it will take to finish fully - * importing a pool - */ -#define SCAN_IMPORT_WAIT_TXGS 5 - - -#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ - ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ - (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) - -extern int zfs_txg_timeout; - -/* - * Enable/disable the processing of the free_bpobj object. 
- */ -boolean_t zfs_free_bpobj_enabled = B_TRUE; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, - &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); - -/* the order has to match pool_scan_type */ -static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { - NULL, - dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ - dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ -}; - -/* In core node for the scn->scn_queue. Represents a dataset to be scanned */ -typedef struct { - uint64_t sds_dsobj; - uint64_t sds_txg; - avl_node_t sds_node; -} scan_ds_t; - -/* - * This controls what conditions are placed on dsl_scan_sync_state(): - * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 - * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. - * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise - * write out the scn_phys_cached version. - * See dsl_scan_sync_state for details. - */ -typedef enum { - SYNC_OPTIONAL, - SYNC_MANDATORY, - SYNC_CACHED -} state_sync_type_t; - -/* - * This struct represents the minimum information needed to reconstruct a - * zio for sequential scanning. This is useful because many of these will - * accumulate in the sequential IO queues before being issued, so saving - * memory matters here. 
- */ -typedef struct scan_io { - /* fields from blkptr_t */ - uint64_t sio_offset; - uint64_t sio_blk_prop; - uint64_t sio_phys_birth; - uint64_t sio_birth; - zio_cksum_t sio_cksum; - uint32_t sio_asize; - - /* fields from zio_t */ - int sio_flags; - zbookmark_phys_t sio_zb; - - /* members for queue sorting */ - union { - avl_node_t sio_addr_node; /* link into issueing queue */ - list_node_t sio_list_node; /* link for issuing to disk */ - } sio_nodes; -} scan_io_t; - -struct dsl_scan_io_queue { - dsl_scan_t *q_scn; /* associated dsl_scan_t */ - vdev_t *q_vd; /* top-level vdev that this queue represents */ - - /* trees used for sorting I/Os and extents of I/Os */ - range_tree_t *q_exts_by_addr; - avl_tree_t q_exts_by_size; - avl_tree_t q_sios_by_addr; - - /* members for zio rate limiting */ - uint64_t q_maxinflight_bytes; - uint64_t q_inflight_bytes; - kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ - - /* per txg statistics */ - uint64_t q_total_seg_size_this_txg; - uint64_t q_segs_this_txg; - uint64_t q_total_zio_size_this_txg; - uint64_t q_zios_this_txg; -}; - -/* private data for dsl_scan_prefetch_cb() */ -typedef struct scan_prefetch_ctx { - zfs_refcount_t spc_refcnt; /* refcount for memory management */ - dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ - boolean_t spc_root; /* is this prefetch for an objset? 
*/ - uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ - uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ -} scan_prefetch_ctx_t; - -/* private data for dsl_scan_prefetch() */ -typedef struct scan_prefetch_issue_ctx { - avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ - scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ - blkptr_t spic_bp; /* bp to prefetch */ - zbookmark_phys_t spic_zb; /* bookmark to prefetch */ -} scan_prefetch_issue_ctx_t; - -static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, - const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); -static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, - scan_io_t *sio); - -static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); -static void scan_io_queues_destroy(dsl_scan_t *scn); - -static kmem_cache_t *sio_cache; - -void -scan_init(void) -{ - /* - * This is used in ext_size_compare() to weight segments - * based on how sparse they are. This cannot be changed - * mid-scan and the tree comparison functions don't currently - * have a mechansim for passing additional context to the - * compare functions. 
Thus we store this value globally and - * we only allow it to be set at module intiailization time - */ - fill_weight = zfs_scan_fill_weight; - - sio_cache = kmem_cache_create("sio_cache", - sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -scan_fini(void) -{ - kmem_cache_destroy(sio_cache); -} - -static inline boolean_t -dsl_scan_is_running(const dsl_scan_t *scn) -{ - return (scn->scn_phys.scn_state == DSS_SCANNING); -} - -boolean_t -dsl_scan_resilvering(dsl_pool_t *dp) -{ - return (dsl_scan_is_running(dp->dp_scan) && - dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); -} - -static inline void -sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) -{ - bzero(bp, sizeof (*bp)); - DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); - DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); - bp->blk_prop = sio->sio_blk_prop; - bp->blk_phys_birth = sio->sio_phys_birth; - bp->blk_birth = sio->sio_birth; - bp->blk_fill = 1; /* we always only work with data pointers */ - bp->blk_cksum = sio->sio_cksum; -} - -static inline void -bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) -{ - /* we discard the vdev id, since we can deduce it from the queue */ - sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); - sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); - sio->sio_blk_prop = bp->blk_prop; - sio->sio_phys_birth = bp->blk_phys_birth; - sio->sio_birth = bp->blk_birth; - sio->sio_cksum = bp->blk_cksum; -} - -void -dsl_scan_global_init(void) -{ - /* - * This is used in ext_size_compare() to weight segments - * based on how sparse they are. This cannot be changed - * mid-scan and the tree comparison functions don't currently - * have a mechansim for passing additional context to the - * compare functions. 
Thus we store this value globally and - * we only allow it to be set at module intiailization time - */ - fill_weight = zfs_scan_fill_weight; -} - -int -dsl_scan_init(dsl_pool_t *dp, uint64_t txg) -{ - int err; - dsl_scan_t *scn; - spa_t *spa = dp->dp_spa; - uint64_t f; - - scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); - scn->scn_dp = dp; - - /* - * It's possible that we're resuming a scan after a reboot so - * make sure that the scan_async_destroying flag is initialized - * appropriately. - */ - ASSERT(!scn->scn_async_destroying); - scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_ASYNC_DESTROY); - - /* - * Calculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. - */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20); - - avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), - offsetof(scan_ds_t, sds_node)); - avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, - sizeof (scan_prefetch_issue_ctx_t), - offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); - - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - "scrub_func", sizeof (uint64_t), 1, &f); - if (err == 0) { - /* - * There was an old-style scrub in progress. Restart a - * new-style scrub from the beginning. - */ - scn->scn_restart_txg = txg; - zfs_dbgmsg("old-style scrub was in progress; " - "restarting new-style scrub in txg %llu", - (longlong_t)scn->scn_restart_txg); - - /* - * Load the queue obj from the old location so that it - * can be freed by dsl_scan_done(). 
- */ - (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - "scrub_queue", sizeof (uint64_t), 1, - &scn->scn_phys.scn_queue_obj); - } else { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys); - if (err == ENOENT) - return (0); - else if (err) - return (err); - - /* - * We might be restarting after a reboot, so jump the issued - * counter to how far we've scanned. We know we're consistent - * up to here. - */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined; - - if (dsl_scan_is_running(scn) && - spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { - /* - * A new-type scrub was in progress on an old - * pool, and the pool was accessed by old - * software. Restart from the beginning, since - * the old software may have changed the pool in - * the meantime. - */ - scn->scn_restart_txg = txg; - zfs_dbgmsg("new-style scrub was modified " - "by old software; restarting in txg %llu", - (longlong_t)scn->scn_restart_txg); - } - } - - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); - - /* reload the queue into the in-core state */ - if (scn->scn_phys.scn_queue_obj != 0) { - zap_cursor_t zc; - zap_attribute_t za; - - for (zap_cursor_init(&zc, dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj); - zap_cursor_retrieve(&zc, &za) == 0; - (void) zap_cursor_advance(&zc)) { - scan_ds_queue_insert(scn, - zfs_strtonum(za.za_name, NULL), - za.za_first_integer); - } - zap_cursor_fini(&zc); - } - - spa_scan_stat_init(spa); - return (0); -} - -void -dsl_scan_fini(dsl_pool_t *dp) -{ - if (dp->dp_scan != NULL) { - dsl_scan_t *scn = dp->dp_scan; - - if (scn->scn_taskq != NULL) - taskq_destroy(scn->scn_taskq); - scan_ds_queue_clear(scn); - avl_destroy(&scn->scn_queue); - avl_destroy(&scn->scn_prefetch_queue); - - kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); - dp->dp_scan = NULL; - } -} - -static boolean_t -dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t 
*tx) -{ - return (scn->scn_restart_txg != 0 && - scn->scn_restart_txg <= tx->tx_txg); -} - -boolean_t -dsl_scan_scrubbing(const dsl_pool_t *dp) -{ - dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; - - return (scn_phys->scn_state == DSS_SCANNING && - scn_phys->scn_func == POOL_SCAN_SCRUB); -} - -boolean_t -dsl_scan_is_paused_scrub(const dsl_scan_t *scn) -{ - return (dsl_scan_scrubbing(scn->scn_dp) && - scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); -} - -/* - * Writes out a persistent dsl_scan_phys_t record to the pool directory. - * Because we can be running in the block sorting algorithm, we do not always - * want to write out the record, only when it is "safe" to do so. This safety - * condition is achieved by making sure that the sorting queues are empty - * (scn_bytes_pending == 0). When this condition is not true, the sync'd state - * is inconsistent with how much actual scanning progress has been made. The - * kind of sync to be performed is specified by the sync_type argument. If the - * sync is optional, we only sync if the queues are empty. If the sync is - * mandatory, we do a hard ASSERT to make sure that the queues are empty. The - * third possible state is a "cached" sync. This is done in response to: - * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been - * destroyed, so we wouldn't be able to restart scanning from it. - * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been - * superseded by a newer snapshot. - * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been - * swapped with its clone. - * In all cases, a cached sync simply rewrites the last record we've written, - * just slightly modified. For the modifications that are performed to the - * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, - * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 
- */ -static void -dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) -{ - int i; - spa_t *spa = scn->scn_dp->dp_spa; - - ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); - if (scn->scn_bytes_pending == 0) { - for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; - - if (q == NULL) - continue; - - mutex_enter(&vd->vdev_scan_io_queue_lock); - ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); - ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); - ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); - mutex_exit(&vd->vdev_scan_io_queue_lock); - } - - if (scn->scn_phys.scn_queue_obj != 0) - scan_ds_queue_sync(scn, tx); - VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys, tx)); - bcopy(&scn->scn_phys, &scn->scn_phys_cached, - sizeof (scn->scn_phys)); - - if (scn->scn_checkpointing) - zfs_dbgmsg("finish scan checkpoint"); - - scn->scn_checkpointing = B_FALSE; - scn->scn_last_checkpoint = ddi_get_lbolt(); - } else if (sync_type == SYNC_CACHED) { - VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys_cached, tx)); - } -} - -/* ARGSUSED */ -static int -dsl_scan_setup_check(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (dsl_scan_is_running(scn)) - return (SET_ERROR(EBUSY)); - - return (0); -} - -static void -dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - pool_scan_func_t *funcp = arg; - dmu_object_type_t ot = 0; - dsl_pool_t *dp = scn->scn_dp; - spa_t *spa = dp->dp_spa; - - ASSERT(!dsl_scan_is_running(scn)); - ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); - bzero(&scn->scn_phys, sizeof (scn->scn_phys)); - scn->scn_phys.scn_func = 
*funcp; - scn->scn_phys.scn_state = DSS_SCANNING; - scn->scn_phys.scn_min_txg = 0; - scn->scn_phys.scn_max_txg = tx->tx_txg; - scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ - scn->scn_phys.scn_start_time = gethrestime_sec(); - scn->scn_phys.scn_errors = 0; - scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; - scn->scn_issued_before_pass = 0; - scn->scn_restart_txg = 0; - scn->scn_done_txg = 0; - scn->scn_last_checkpoint = 0; - scn->scn_checkpointing = B_FALSE; - spa_scan_stat_init(spa); - - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; - - /* rewrite all disk labels */ - vdev_config_dirty(spa->spa_root_vdev); - - if (vdev_resilver_needed(spa->spa_root_vdev, - &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, NULL, - ESC_ZFS_RESILVER_START); - } else { - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); - } - - spa->spa_scrub_started = B_TRUE; - /* - * If this is an incremental scrub, limit the DDT scrub phase - * to just the auto-ditto class (for correctness); the rest - * of the scrub should go faster using top-down pruning. - */ - if (scn->scn_phys.scn_min_txg > TXG_INITIAL) - scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; - - } - - /* back to the generic stuff */ - - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - mutex_init(&dp->dp_blkstats->zab_lock, NULL, - MUTEX_DEFAULT, NULL); - } - bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); - - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) - ot = DMU_OT_ZAP_OTHER; - - scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, - ot ? 
ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); - - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); - - dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); - - spa_history_log_internal(spa, "scan setup", tx, - "func=%u mintxg=%llu maxtxg=%llu", - *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); -} - -/* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. - */ -int -dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) -{ - spa_t *spa = dp->dp_spa; - dsl_scan_t *scn = dp->dp_scan; - - /* - * Purge all vdev caches and probe all devices. We do this here - * rather than in sync context because this requires a writer lock - * on the spa_config lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. - */ - spa_vdev_state_enter(spa, SCL_NONE); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); - - if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { - /* got scrub start cmd, resume paused scrub */ - int err = dsl_scrub_set_pause_resume(scn->scn_dp, - POOL_SCRUB_NORMAL); - if (err == 0) { - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); - return (ECANCELED); - } - return (SET_ERROR(err)); - } - - return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, - dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -/* ARGSUSED */ -static void -dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) -{ - static const char *old_names[] = { - "scrub_bookmark", - "scrub_ddt_bookmark", - "scrub_ddt_class_max", - "scrub_queue", - "scrub_min_txg", - "scrub_max_txg", - "scrub_func", - "scrub_errors", - NULL - }; - - dsl_pool_t *dp = scn->scn_dp; - spa_t *spa = dp->dp_spa; - int i; - - /* Remove any remnants of an old-style scrub. 
*/ - for (i = 0; old_names[i]; i++) { - (void) zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); - } - - if (scn->scn_phys.scn_queue_obj != 0) { - VERIFY0(dmu_object_free(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, tx)); - scn->scn_phys.scn_queue_obj = 0; - } - scan_ds_queue_clear(scn); - - scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; - - /* - * If we were "restarted" from a stopped state, don't bother - * with anything else. - */ - if (!dsl_scan_is_running(scn)) { - ASSERT(!scn->scn_is_sorted); - return; - } - - if (scn->scn_is_sorted) { - scan_io_queues_destroy(scn); - scn->scn_is_sorted = B_FALSE; - - if (scn->scn_taskq != NULL) { - taskq_destroy(scn->scn_taskq); - scn->scn_taskq = NULL; - } - } - - scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED; - - if (dsl_scan_restarting(scn, tx)) - spa_history_log_internal(spa, "scan aborted, restarting", tx, - "errors=%llu", spa_get_errlog_size(spa)); - else if (!complete) - spa_history_log_internal(spa, "scan cancelled", tx, - "errors=%llu", spa_get_errlog_size(spa)); - else - spa_history_log_internal(spa, "scan done", tx, - "errors=%llu", spa_get_errlog_size(spa)); - - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - spa->spa_scrub_started = B_FALSE; - spa->spa_scrub_active = B_FALSE; - - /* - * If the scrub/resilver completed, update all DTLs to - * reflect this. Whether it succeeded or not, vacate - * all temporary scrub DTLs. - * - * As the scrub does not currently support traversing - * data that have been freed but are part of a checkpoint, - * we don't mark the scrub as done in the DTLs as faults - * may still exist in those vdevs. - */ - if (complete && - !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - scn->scn_phys.scn_max_txg, B_TRUE); - - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? 
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); - } else { - vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - 0, B_TRUE); - } - spa_errlog_rotate(spa); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); - } - - scn->scn_phys.scn_end_time = gethrestime_sec(); - - ASSERT(!dsl_scan_is_running(scn)); -} - -/* ARGSUSED */ -static int -dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (!dsl_scan_is_running(scn)) - return (SET_ERROR(ENOENT)); - return (0); -} - -/* ARGSUSED */ -static void -dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - dsl_scan_done(scn, B_FALSE, tx); - dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); - spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); -} - -int -dsl_scan_cancel(dsl_pool_t *dp) -{ - return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, - dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); -} - -static int -dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) -{ - pool_scrub_cmd_t *cmd = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_scan_t *scn = dp->dp_scan; - - if (*cmd == POOL_SCRUB_PAUSE) { - /* can't pause a scrub when there is no in-progress scrub */ - if (!dsl_scan_scrubbing(dp)) - return (SET_ERROR(ENOENT)); - - /* can't pause a paused scrub */ - if (dsl_scan_is_paused_scrub(scn)) - return (SET_ERROR(EBUSY)); - } else if (*cmd != POOL_SCRUB_NORMAL) { - return (SET_ERROR(ENOTSUP)); - } - - return (0); -} - -static void -dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) -{ - pool_scrub_cmd_t *cmd = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - spa_t *spa = dp->dp_spa; - dsl_scan_t *scn = dp->dp_scan; - - if (*cmd == POOL_SCRUB_PAUSE) { - /* can't pause a scrub when there is no in-progress scrub */ - spa->spa_scan_pass_scrub_pause = gethrestime_sec(); - 
scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; - scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx, SYNC_CACHED); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); - } else { - ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); - if (dsl_scan_is_paused_scrub(scn)) { - /* - * We need to keep track of how much time we spend - * paused per pass so that we can adjust the scrub rate - * shown in the output of 'zpool status' - */ - spa->spa_scan_pass_scrub_spent_paused += - gethrestime_sec() - spa->spa_scan_pass_scrub_pause; - spa->spa_scan_pass_scrub_pause = 0; - scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; - scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx, SYNC_CACHED); - } - } -} - -/* - * Set scrub pause/resume state if it makes sense to do so - */ -int -dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) -{ - return (dsl_sync_task(spa_name(dp->dp_spa), - dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, - ZFS_SPACE_CHECK_RESERVED)); -} - - -/* start a new scan, or restart an existing one. 
*/ -void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) -{ - if (txg == 0) { - dmu_tx_t *tx; - tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - - txg = dmu_tx_get_txg(tx); - dp->dp_scan->scn_restart_txg = txg; - dmu_tx_commit(tx); - } else { - dp->dp_scan->scn_restart_txg = txg; - } - zfs_dbgmsg("restarting resilver txg=%llu", txg); -} - -void -dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) -{ - zio_free(dp->dp_spa, txg, bp); -} - -void -dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) -{ - ASSERT(dsl_pool_sync_context(dp)); - zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), - pio->io_flags)); -} - -static int -scan_ds_queue_compare(const void *a, const void *b) -{ - const scan_ds_t *sds_a = a, *sds_b = b; - - if (sds_a->sds_dsobj < sds_b->sds_dsobj) - return (-1); - if (sds_a->sds_dsobj == sds_b->sds_dsobj) - return (0); - return (1); -} - -static void -scan_ds_queue_clear(dsl_scan_t *scn) -{ - void *cookie = NULL; - scan_ds_t *sds; - while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { - kmem_free(sds, sizeof (*sds)); - } -} - -static boolean_t -scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) -{ - scan_ds_t srch, *sds; - - srch.sds_dsobj = dsobj; - sds = avl_find(&scn->scn_queue, &srch, NULL); - if (sds != NULL && txg != NULL) - *txg = sds->sds_txg; - return (sds != NULL); -} - -static void -scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) -{ - scan_ds_t *sds; - avl_index_t where; - - sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); - sds->sds_dsobj = dsobj; - sds->sds_txg = txg; - - VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); - avl_insert(&scn->scn_queue, sds, where); -} - -static void -scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) -{ - scan_ds_t srch, *sds; - - srch.sds_dsobj = dsobj; - - sds = avl_find(&scn->scn_queue, &srch, NULL); - VERIFY(sds != NULL); - 
avl_remove(&scn->scn_queue, sds); - kmem_free(sds, sizeof (*sds)); -} - -static void -scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - spa_t *spa = dp->dp_spa; - dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? - DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; - - ASSERT0(scn->scn_bytes_pending); - ASSERT(scn->scn_phys.scn_queue_obj != 0); - - VERIFY0(dmu_object_free(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, tx)); - scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, - DMU_OT_NONE, 0, tx); - for (scan_ds_t *sds = avl_first(&scn->scn_queue); - sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { - VERIFY0(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, sds->sds_dsobj, - sds->sds_txg, tx)); - } -} - -/* - * Computes the memory limit state that we're currently in. A sorted scan - * needs quite a bit of memory to hold the sorting queue, so we need to - * reasonably constrain the size so it doesn't impact overall system - * performance. We compute two limits: - * 1) Hard memory limit: if the amount of memory used by the sorting - * queues on a pool gets above this value, we stop the metadata - * scanning portion and start issuing the queued up and sorted - * I/Os to reduce memory usage. - * This limit is calculated as a fraction of physmem (by default 5%). - * We constrain the lower bound of the hard limit to an absolute - * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain - * the upper bound to 5% of the total pool size - no chance we'll - * ever need that much memory, but just to keep the value in check. - * 2) Soft memory limit: once we hit the hard memory limit, we start - * issuing I/O to reduce queue memory usage, but we don't want to - * completely empty out the queues, since we might be able to find I/Os - * that will fill in the gaps of our non-sequential IOs at some point - * in the future. 
So we stop the issuing of I/Os once the amount of - * memory used drops below the soft limit (at which point we stop issuing - * I/O and start scanning metadata again). - * - * This limit is calculated by subtracting a fraction of the hard - * limit from the hard limit. By default this fraction is 5%, so - * the soft limit is 95% of the hard limit. We cap the size of the - * difference between the hard and soft limits at an absolute - * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is - * sufficient to not cause too frequent switching between the - * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's - * worth of queues is about 1.2 GiB of on-pool data, so scanning - * that should take at least a decent fraction of a second). - */ -static boolean_t -dsl_scan_should_clear(dsl_scan_t *scn) -{ - spa_t *spa = scn->scn_dp->dp_spa; - vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - uint64_t alloc, mlim_hard, mlim_soft, mused; - - alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - alloc += metaslab_class_get_alloc(spa_special_class(spa)); - alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); - - mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, - zfs_scan_mem_lim_min); - mlim_hard = MIN(mlim_hard, alloc / 20); - mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, - zfs_scan_mem_lim_soft_max); - mused = 0; - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *tvd = rvd->vdev_child[i]; - dsl_scan_io_queue_t *queue; - - mutex_enter(&tvd->vdev_scan_io_queue_lock); - queue = tvd->vdev_scan_io_queue; - if (queue != NULL) { - /* #extents in exts_by_size = # in exts_by_addr */ - mused += avl_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_t) + - avl_numnodes(&queue->q_sios_by_addr) * - sizeof (scan_io_t); - } - mutex_exit(&tvd->vdev_scan_io_queue_lock); - } - - dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); - - if (mused == 0) - ASSERT0(scn->scn_bytes_pending); - - /* 
- * If we are above our hard limit, we need to clear out memory. - * If we are below our soft limit, we need to accumulate sequential IOs. - * Otherwise, we should keep doing whatever we are currently doing. - */ - if (mused >= mlim_hard) - return (B_TRUE); - else if (mused < mlim_soft) - return (B_FALSE); - else - return (scn->scn_clearing); -} - -static boolean_t -dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) -{ - /* we never skip user/group accounting objects */ - if (zb && (int64_t)zb->zb_object < 0) - return (B_FALSE); - - if (scn->scn_suspending) - return (B_TRUE); /* we're already suspending */ - - if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) - return (B_FALSE); /* we're resuming */ - - /* We only know how to resume from level-0 blocks. */ - if (zb && zb->zb_level != 0) - return (B_FALSE); - - /* - * We suspend if: - * - we have scanned for at least the minimum time (default 1 sec - * for scrub, 3 sec for resilver), and either we have sufficient - * dirty data that we are starting to write more quickly - * (default 30%), or someone is explicitly waiting for this txg - * to complete. - * or - * - the spa is shutting down because this pool is being exported - * or the machine is rebooting. - * or - * - the scan queue has reached its memory use limit - */ - uint64_t elapsed_nanosecs = gethrtime(); - uint64_t curr_time_ns = gethrtime(); - uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; - uint64_t sync_time_ns = curr_time_ns - - scn->scn_dp->dp_spa->spa_sync_starttime; - - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
- zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; - - if ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || - txg_sync_waiting(scn->scn_dp) || - NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || - spa_shutting_down(scn->scn_dp->dp_spa) || - (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { - if (zb) { - dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - scn->scn_phys.scn_bookmark = *zb; - } else { - dsl_scan_phys_t *scnp = &scn->scn_phys; - - dprintf("suspending at at DDT bookmark " - "%llx/%llx/%llx/%llx\n", - (longlong_t)scnp->scn_ddt_bookmark.ddb_class, - (longlong_t)scnp->scn_ddt_bookmark.ddb_type, - (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, - (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); - } - scn->scn_suspending = B_TRUE; - return (B_TRUE); - } - return (B_FALSE); -} - -typedef struct zil_scan_arg { - dsl_pool_t *zsa_dp; - zil_header_t *zsa_zh; -} zil_scan_arg_t; - -/* ARGSUSED */ -static int -dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zil_scan_arg_t *zsa = arg; - dsl_pool_t *dp = zsa->zsa_dp; - dsl_scan_t *scn = dp->dp_scan; - zil_header_t *zh = zsa->zsa_zh; - zbookmark_phys_t zb; - - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) - return (0); - - /* - * One block ("stubby") can be allocated a long time ago; we - * want to visit that one because it has been allocated - * (on-disk) even if it hasn't been claimed (even though for - * scrub there's nothing to do to it). 
- */ - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); - return (0); -} - -/* ARGSUSED */ -static int -dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - if (lrc->lrc_txtype == TX_WRITE) { - zil_scan_arg_t *zsa = arg; - dsl_pool_t *dp = zsa->zsa_dp; - dsl_scan_t *scn = dp->dp_scan; - zil_header_t *zh = zsa->zsa_zh; - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_phys_t zb; - - if (BP_IS_HOLE(bp) || - bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) - return (0); - - /* - * birth can be < claim_txg if this record's txg is - * already txg sync'ed (but this log block contains - * other records that are not synced) - */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); - - VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); - } - return (0); -} - -static void -dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - zil_scan_arg_t zsa = { dp, zh }; - zilog_t *zilog; - - ASSERT(spa_writeable(dp->dp_spa)); - - /* - * We only want to visit blocks that have been claimed - * but not yet replayed. - */ - if (claim_txg == 0) - return; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, - claim_txg); - - zil_free(zilog); -} - -/* - * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea - * here is to sort the AVL tree by the order each block will be needed. 
- */ -static int -scan_prefetch_queue_compare(const void *a, const void *b) -{ - const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; - const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; - const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; - - return (zbookmark_compare(spc_a->spc_datablkszsec, - spc_a->spc_indblkshift, spc_b->spc_datablkszsec, - spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); -} - -static void -scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) -{ - if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { - zfs_refcount_destroy(&spc->spc_refcnt); - kmem_free(spc, sizeof (scan_prefetch_ctx_t)); - } -} - -static scan_prefetch_ctx_t * -scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) -{ - scan_prefetch_ctx_t *spc; - - spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); - zfs_refcount_create(&spc->spc_refcnt); - zfs_refcount_add(&spc->spc_refcnt, tag); - spc->spc_scn = scn; - if (dnp != NULL) { - spc->spc_datablkszsec = dnp->dn_datablkszsec; - spc->spc_indblkshift = dnp->dn_indblkshift; - spc->spc_root = B_FALSE; - } else { - spc->spc_datablkszsec = 0; - spc->spc_indblkshift = 0; - spc->spc_root = B_TRUE; - } - - return (spc); -} - -static void -scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) -{ - zfs_refcount_add(&spc->spc_refcnt, tag); -} - -static boolean_t -dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, - const zbookmark_phys_t *zb) -{ - zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; - dnode_phys_t tmp_dnp; - dnode_phys_t *dnp = (spc->spc_root) ? 
NULL : &tmp_dnp; - - if (zb->zb_objset != last_zb->zb_objset) - return (B_TRUE); - if ((int64_t)zb->zb_object < 0) - return (B_FALSE); - - tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; - tmp_dnp.dn_indblkshift = spc->spc_indblkshift; - - if (zbookmark_subtree_completed(dnp, zb, last_zb)) - return (B_TRUE); - - return (B_FALSE); -} - -static void -dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) -{ - avl_index_t idx; - dsl_scan_t *scn = spc->spc_scn; - spa_t *spa = scn->scn_dp->dp_spa; - scan_prefetch_issue_ctx_t *spic; - - if (zfs_no_scrub_prefetch) - return; - - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && - BP_GET_TYPE(bp) != DMU_OT_OBJSET)) - return; - - if (dsl_scan_check_prefetch_resume(spc, zb)) - return; - - scan_prefetch_ctx_add_ref(spc, scn); - spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); - spic->spic_spc = spc; - spic->spic_bp = *bp; - spic->spic_zb = *zb; - - /* - * Add the IO to the queue of blocks to prefetch. This allows us to - * prioritize blocks that we will need first for the main traversal - * thread. 
- */ - mutex_enter(&spa->spa_scrub_lock); - if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { - /* this block is already queued for prefetch */ - kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); - scan_prefetch_ctx_rele(spc, scn); - mutex_exit(&spa->spa_scrub_lock); - return; - } - - avl_insert(&scn->scn_prefetch_queue, spic, idx); - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); -} - -static void -dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, - uint64_t objset, uint64_t object) -{ - int i; - zbookmark_phys_t zb; - scan_prefetch_ctx_t *spc; - - if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) - return; - - SET_BOOKMARK(&zb, objset, object, 0, 0); - - spc = scan_prefetch_ctx_create(scn, dnp, FTAG); - - for (i = 0; i < dnp->dn_nblkptr; i++) { - zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); - zb.zb_blkid = i; - dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zb.zb_level = 0; - zb.zb_blkid = DMU_SPILL_BLKID; - dsl_scan_prefetch(spc, &dnp->dn_spill, &zb); - } - - scan_prefetch_ctx_rele(spc, FTAG); -} - -void -dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *private) -{ - scan_prefetch_ctx_t *spc = private; - dsl_scan_t *scn = spc->spc_scn; - spa_t *spa = scn->scn_dp->dp_spa; - - /* broadcast that the IO has completed for rate limitting purposes */ - mutex_enter(&spa->spa_scrub_lock); - ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); - spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); - - /* if there was an error or we are done prefetching, just cleanup */ - if (buf == NULL || scn->scn_suspending) - goto out; - - if (BP_GET_LEVEL(bp) > 0) { - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - zbookmark_phys_t czb; - - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - 
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, zb->zb_blkid * epb + i); - dsl_scan_prefetch(spc, cbp, &czb); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - dnode_phys_t *cdnp = buf->b_data; - int i; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - for (i = 0, cdnp = buf->b_data; i < epb; - i += cdnp->dn_extra_slots + 1, - cdnp += cdnp->dn_extra_slots + 1) { - dsl_scan_prefetch_dnode(scn, cdnp, - zb->zb_objset, zb->zb_blkid * epb + i); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - objset_phys_t *osp = buf->b_data; - - dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, - zb->zb_objset, DMU_META_DNODE_OBJECT); - - if (OBJSET_BUF_HAS_USERUSED(buf)) { - dsl_scan_prefetch_dnode(scn, - &osp->os_groupused_dnode, zb->zb_objset, - DMU_GROUPUSED_OBJECT); - dsl_scan_prefetch_dnode(scn, - &osp->os_userused_dnode, zb->zb_objset, - DMU_USERUSED_OBJECT); - } - } - -out: - if (buf != NULL) - arc_buf_destroy(buf, private); - scan_prefetch_ctx_rele(spc, scn); -} - -/* ARGSUSED */ -static void -dsl_scan_prefetch_thread(void *arg) -{ - dsl_scan_t *scn = arg; - spa_t *spa = scn->scn_dp->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - scan_prefetch_issue_ctx_t *spic; - - /* loop until we are told to stop */ - while (!scn->scn_prefetch_stop) { - arc_flags_t flags = ARC_FLAG_NOWAIT | - ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; - int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; - - mutex_enter(&spa->spa_scrub_lock); - - /* - * Wait until we have an IO to issue and are not above our - * maximum in flight limit. 
- */ - while (!scn->scn_prefetch_stop && - (avl_numnodes(&scn->scn_prefetch_queue) == 0 || - spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - } - - /* recheck if we should stop since we waited for the cv */ - if (scn->scn_prefetch_stop) { - mutex_exit(&spa->spa_scrub_lock); - break; - } - - /* remove the prefetch IO from the tree */ - spic = avl_first(&scn->scn_prefetch_queue); - spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); - avl_remove(&scn->scn_prefetch_queue, spic); - - mutex_exit(&spa->spa_scrub_lock); - - /* issue the prefetch asynchronously */ - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, - &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); - - kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); - } - - ASSERT(scn->scn_prefetch_stop); - - /* free any prefetches we didn't get to complete */ - mutex_enter(&spa->spa_scrub_lock); - while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { - avl_remove(&scn->scn_prefetch_queue, spic); - scan_prefetch_ctx_rele(spic->spic_spc, scn); - kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); - } - ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); - mutex_exit(&spa->spa_scrub_lock); -} - -static boolean_t -dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, - const zbookmark_phys_t *zb) -{ - /* - * We never skip over user/group accounting objects (obj<0) - */ - if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && - (int64_t)zb->zb_object >= 0) { - /* - * If we already visited this bp & everything below (in - * a prior txg sync), don't bother doing it again. - */ - if (zbookmark_subtree_completed(dnp, zb, - &scn->scn_phys.scn_bookmark)) - return (B_TRUE); - - /* - * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for suspending - * again. 
- */ - if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { - dprintf("resuming at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); - } - } - return (B_FALSE); -} - -static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, - dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, - dmu_objset_type_t ostype, dmu_tx_t *tx); -static void dsl_scan_visitdnode( - dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, - dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); - -/* - * Return nonzero on i/o error. - * Return new buf to write out in *bufp. - */ -static int -dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, - dnode_phys_t *dnp, const blkptr_t *bp, - const zbookmark_phys_t *zb, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; - int err; - - if (BP_GET_LEVEL(bp) > 0) { - arc_flags_t flags = ARC_FLAG_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - arc_buf_t *buf; - - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); - if (err) { - scn->scn_phys.scn_errors++; - return (err); - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - zbookmark_phys_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - dsl_scan_visitbp(cbp, &czb, dnp, - ds, scn, ostype, tx); - } - arc_buf_destroy(buf, &buf); - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - arc_flags_t flags = ARC_FLAG_WAIT; - dnode_phys_t *cdnp; - int i; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - arc_buf_t *buf; - - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); - if (err) { - 
scn->scn_phys.scn_errors++; - return (err); - } - for (i = 0, cdnp = buf->b_data; i < epb; - i += cdnp->dn_extra_slots + 1, - cdnp += cdnp->dn_extra_slots + 1) { - dsl_scan_visitdnode(scn, ds, ostype, - cdnp, zb->zb_blkid * epb + i, tx); - } - - arc_buf_destroy(buf, &buf); - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - arc_flags_t flags = ARC_FLAG_WAIT; - objset_phys_t *osp; - arc_buf_t *buf; - - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); - if (err) { - scn->scn_phys.scn_errors++; - return (err); - } - - osp = buf->b_data; - - dsl_scan_visitdnode(scn, ds, osp->os_type, - &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); - - if (OBJSET_BUF_HAS_USERUSED(buf)) { - /* - * We also always visit user/group accounting - * objects, and never skip them, even if we are - * suspending. This is necessary so that the space - * deltas from this txg get integrated. - */ - dsl_scan_visitdnode(scn, ds, osp->os_type, - &osp->os_groupused_dnode, - DMU_GROUPUSED_OBJECT, tx); - dsl_scan_visitdnode(scn, ds, osp->os_type, - &osp->os_userused_dnode, - DMU_USERUSED_OBJECT, tx); - } - arc_buf_destroy(buf, &buf); - } - - return (0); -} - -static void -dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, - dmu_objset_type_t ostype, dnode_phys_t *dnp, - uint64_t object, dmu_tx_t *tx) -{ - int j; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - zbookmark_phys_t czb; - - SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, - dnp->dn_nlevels - 1, j); - dsl_scan_visitbp(&dnp->dn_blkptr[j], - &czb, dnp, ds, scn, ostype, tx); - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zbookmark_phys_t czb; - SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, - 0, DMU_SPILL_BLKID); - dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), - &czb, dnp, ds, scn, ostype, tx); - } -} - -/* - * The arguments are in this order because mdb can only print the - * first 5; we want them to be useful. 
- */ -static void -dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, - dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, - dmu_objset_type_t ostype, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - blkptr_t *bp_toread = NULL; - - if (dsl_scan_check_suspend(scn, zb)) - return; - - if (dsl_scan_check_resume(scn, dnp, zb)) - return; - - scn->scn_visited_this_txg++; - - dprintf_bp(bp, - "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", - ds, ds ? ds->ds_object : 0, - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, - bp); - - if (BP_IS_HOLE(bp)) { - scn->scn_holes_this_txg++; - return; - } - - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { - scn->scn_lt_min_this_txg++; - return; - } - - bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - *bp_toread = *bp; - - if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) - goto out; - - /* - * If dsl_scan_ddt() has already visited this block, it will have - * already done any translations or scrubbing, so don't call the - * callback again. - */ - if (ddt_class_contains(dp->dp_spa, - scn->scn_phys.scn_ddt_class_max, bp)) { - scn->scn_ddt_contained_this_txg++; - goto out; - } - - /* - * If this block is from the future (after cur_max_txg), then we - * are doing this on behalf of a deleted snapshot, and we will - * revisit the future block on the next pass of this dataset. - * Don't scan it now unless we need to because something - * under it was modified. - */ - if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { - scn->scn_gt_max_this_txg++; - goto out; - } - - scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); -out: - kmem_free(bp_toread, sizeof (blkptr_t)); -} - -static void -dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, - dmu_tx_t *tx) -{ - zbookmark_phys_t zb; - scan_prefetch_ctx_t *spc; - - SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - - if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { - SET_BOOKMARK(&scn->scn_prefetch_bookmark, - zb.zb_objset, 0, 0, 0); - } else { - scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; - } - - scn->scn_objsets_visited_this_txg++; - - spc = scan_prefetch_ctx_create(scn, NULL, FTAG); - dsl_scan_prefetch(spc, bp, &zb); - scan_prefetch_ctx_rele(spc, FTAG); - - dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); - - dprintf_ds(ds, "finished scan%s", ""); -} - -static void -ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) -{ - if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { - if (ds->ds_is_snapshot) { - /* - * Note: - * - scn_cur_{min,max}_txg stays the same. - * - Setting the flag is not really necessary if - * scn_cur_max_txg == scn_max_txg, because there - * is nothing after this snapshot that we care - * about. However, we set it anyway and then - * ignore it when we retraverse it in - * dsl_scan_visitds(). - */ - scn_phys->scn_bookmark.zb_objset = - dsl_dataset_phys(ds)->ds_next_snap_obj; - zfs_dbgmsg("destroying ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)-> - ds_next_snap_obj); - scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; - } else { - SET_BOOKMARK(&scn_phys->scn_bookmark, - ZB_DESTROYED_OBJSET, 0, 0, 0); - zfs_dbgmsg("destroying ds %llu; currently traversing; " - "reset bookmark to -1,0,0,0", - (u_longlong_t)ds->ds_object); - } - } -} - -/* - * Invoked when a dataset is destroyed. We need to make sure that: - * - * 1) If it is the dataset that was currently being scanned, we write - * a new dsl_scan_phys_t and marking the objset reference in it - * as destroyed. - * 2) Remove it from the work queue, if it was present. 
- * - * If the dataset was actually a snapshot, instead of marking the dataset - * as destroyed, we instead substitute the next snapshot in line. - */ -void -dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (!dsl_scan_is_running(scn)) - return; - - ds_destroyed_scn_phys(ds, &scn->scn_phys); - ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); - - if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds->ds_object); - if (ds->ds_is_snapshot) - scan_ds_queue_insert(scn, - dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); - } - - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, &mintxg) == 0) { - ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - if (ds->ds_is_snapshot) { - /* - * We keep the same mintxg; it could be > - * ds_creation_txg if the previous snapshot was - * deleted too. - */ - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, - dsl_dataset_phys(ds)->ds_next_snap_obj, - mintxg, tx) == 0); - zfs_dbgmsg("destroying ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)-> - ds_next_snap_obj); - } else { - zfs_dbgmsg("destroying ds %llu; in queue; removing", - (u_longlong_t)ds->ds_object); - } - } - - /* - * dsl_scan_sync() should be called after this, and should sync - * out our changed state, but just to be safe, do it here. 
- */ - dsl_scan_sync_state(scn, tx, SYNC_CACHED); -} - -static void -ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) -{ - if (scn_bookmark->zb_objset == ds->ds_object) { - scn_bookmark->zb_objset = - dsl_dataset_phys(ds)->ds_prev_snap_obj; - zfs_dbgmsg("snapshotting ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); - } -} - -/* - * Called when a dataset is snapshotted. If we were currently traversing - * this snapshot, we reset our bookmark to point at the newly created - * snapshot. We also modify our work queue to remove the old snapshot and - * replace with the new one. - */ -void -dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (!dsl_scan_is_running(scn)) - return; - - ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); - - ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); - ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); - - if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds->ds_object); - scan_ds_queue_insert(scn, - dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); - } - - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, &mintxg) == 0) { - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, - dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); - zfs_dbgmsg("snapshotting ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); - } - - dsl_scan_sync_state(scn, tx, SYNC_CACHED); -} - -static void -ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, - zbookmark_phys_t *scn_bookmark) -{ - if 
(scn_bookmark->zb_objset == ds1->ds_object) { - scn_bookmark->zb_objset = ds2->ds_object; - zfs_dbgmsg("clone_swap ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds1->ds_object, - (u_longlong_t)ds2->ds_object); - } else if (scn_bookmark->zb_objset == ds2->ds_object) { - scn_bookmark->zb_objset = ds1->ds_object; - zfs_dbgmsg("clone_swap ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds2->ds_object, - (u_longlong_t)ds1->ds_object); - } -} - -/* - * Called when an origin dataset and its clone are swapped. If we were - * currently traversing the dataset, we need to switch to traversing the - * newly promoted clone. - */ -void -dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg1, mintxg2; - boolean_t ds1_queued, ds2_queued; - - if (!dsl_scan_is_running(scn)) - return; - - ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); - ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); - - /* - * Handle the in-memory scan queue. - */ - ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); - ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); - - /* Sanity checking. */ - if (ds1_queued) { - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - if (ds2_queued) { - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - - if (ds1_queued && ds2_queued) { - /* - * If both are queued, we don't need to do anything. - * The swapping code below would not handle this case correctly, - * since we can't insert ds2 if it is already there. That's - * because scan_ds_queue_insert() prohibits a duplicate insert - * and panics. 
- */ - } else if (ds1_queued) { - scan_ds_queue_remove(scn, ds1->ds_object); - scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); - } else if (ds2_queued) { - scan_ds_queue_remove(scn, ds2->ds_object); - scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); - } - - /* - * Handle the on-disk scan queue. - * The on-disk state is an out-of-date version of the in-memory state, - * so the in-memory and on-disk values for ds1_queued and ds2_queued may - * be different. Therefore we need to apply the swap logic to the - * on-disk state independently of the in-memory state. - */ - ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; - ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; - - /* Sanity checking. */ - if (ds1_queued) { - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - if (ds2_queued) { - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - - if (ds1_queued && ds2_queued) { - /* - * If both are queued, we don't need to do anything. - * Alternatively, we could check for EEXIST from - * zap_add_int_key() and back out to the original state, but - * that would be more work than checking for this case upfront. 
- */ - } else if (ds1_queued) { - VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); - VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); - zfs_dbgmsg("clone_swap ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds1->ds_object, - (u_longlong_t)ds2->ds_object); - } else if (ds2_queued) { - VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); - VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); - zfs_dbgmsg("clone_swap ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds2->ds_object, - (u_longlong_t)ds1->ds_object); - } - - dsl_scan_sync_state(scn, tx, SYNC_CACHED); -} - -/* ARGSUSED */ -static int -enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) -{ - uint64_t originobj = *(uint64_t *)arg; - dsl_dataset_t *ds; - int err; - dsl_scan_t *scn = dp->dp_scan; - - if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) - return (0); - - err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) - return (err); - - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - scan_ds_queue_insert(scn, ds->ds_object, - dsl_dataset_phys(ds)->ds_prev_snap_txg); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - dsl_dataset_t *ds; - - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - if (scn->scn_phys.scn_cur_min_txg >= - scn->scn_phys.scn_max_txg) { - /* - * This can happen if this snapshot was created after the - * scan started, and we already completed a previous snapshot - * that 
was created after the scan started. This snapshot - * only references blocks with: - * - * birth < our ds_creation_txg - * cur_min_txg is no less than ds_creation_txg. - * We have already visited these blocks. - * or - * birth > scn_max_txg - * The scan requested not to visit these blocks. - * - * Subsequent snapshots (and clones) can reference our - * blocks, or blocks with even higher birth times. - * Therefore we do not need to visit them either, - * so we do not add them to the work queue. - * - * Note that checking for cur_min_txg >= cur_max_txg - * is not sufficient, because in that case we may need to - * visit subsequent snapshots. This happens when min_txg > 0, - * which raises cur_min_txg. In this case we will visit - * this dataset but skip all of its blocks, because the - * rootbp's birth time is < cur_min_txg. Then we will - * add the next snapshots/clones to the work queue. - */ - char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - dsl_dataset_name(ds, dsname); - zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " - "cur_min_txg (%llu) >= max_txg (%llu)", - (longlong_t)dsobj, dsname, - (longlong_t)scn->scn_phys.scn_cur_min_txg, - (longlong_t)scn->scn_phys.scn_max_txg); - kmem_free(dsname, MAXNAMELEN); - - goto out; - } - - /* - * Only the ZIL in the head (non-snapshot) is valid. Even though - * snapshots can have ZIL block pointers (which may be the same - * BP as in the head), they must be ignored. In addition, $ORIGIN - * doesn't have a objset (i.e. its ds_bp is a hole) so we don't - * need to look for a ZIL in it either. So we traverse the ZIL here, - * rather than in scan_recurse(), because the regular snapshot - * block-sharing rules don't apply to it. 
- */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && - (dp->dp_origin_snap == NULL || - ds->ds_dir != dp->dp_origin_snap->ds_dir)) { - objset_t *os; - if (dmu_objset_from_ds(ds, &os) != 0) { - goto out; - } - dsl_scan_zil(dp, &os->os_zil_header); - } - - /* - * Iterate over the bps in this ds. - */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - dsl_dataset_name(ds, dsname); - zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " - "suspending=%u", - (longlong_t)dsobj, dsname, - (longlong_t)scn->scn_phys.scn_cur_min_txg, - (longlong_t)scn->scn_phys.scn_cur_max_txg, - (int)scn->scn_suspending); - kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); - - if (scn->scn_suspending) - goto out; - - /* - * We've finished this pass over this dataset. - */ - - /* - * If we did not completely visit this dataset, do another pass. - */ - if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { - zfs_dbgmsg("incomplete pass; visiting again"); - scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; - scan_ds_queue_insert(scn, ds->ds_object, - scn->scn_phys.scn_cur_max_txg); - goto out; - } - - /* - * Add descendent datasets to work queue. - */ - if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { - scan_ds_queue_insert(scn, - dsl_dataset_phys(ds)->ds_next_snap_obj, - dsl_dataset_phys(ds)->ds_creation_txg); - } - if (dsl_dataset_phys(ds)->ds_num_children > 1) { - boolean_t usenext = B_FALSE; - if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { - uint64_t count; - /* - * A bug in a previous version of the code could - * cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a - * missing entry. Therefore we can only use the - * next_clones_obj when its count is correct. 
- */ - int err = zap_count(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj, &count); - if (err == 0 && - count == dsl_dataset_phys(ds)->ds_num_children - 1) - usenext = B_TRUE; - } - - if (usenext) { - zap_cursor_t zc; - zap_attribute_t za; - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - (void) zap_cursor_advance(&zc)) { - scan_ds_queue_insert(scn, - zfs_strtonum(za.za_name, NULL), - dsl_dataset_phys(ds)->ds_creation_txg); - } - zap_cursor_fini(&zc); - } else { - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_clones_cb, &ds->ds_object, - DS_FIND_CHILDREN)); - } - } - -out: - dsl_dataset_rele(ds, FTAG); -} - -/* ARGSUSED */ -static int -enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) -{ - dsl_dataset_t *ds; - int err; - dsl_scan_t *scn = dp->dp_scan; - - err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) - return (err); - - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - /* - * If this is a clone, we don't need to worry about it for now. 
- */ - if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(prev, FTAG); - return (0); - } - dsl_dataset_rele(ds, FTAG); - ds = prev; - } - - scan_ds_queue_insert(scn, ds->ds_object, - dsl_dataset_phys(ds)->ds_prev_snap_txg); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -/* ARGSUSED */ -void -dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; - blkptr_t bp; - zbookmark_phys_t zb = { 0 }; - int p; - - if (!dsl_scan_is_running(scn)) - return; - - /* - * This function is special because it is the only thing - * that can add scan_io_t's to the vdev scan queues from - * outside dsl_scan_sync(). For the most part this is ok - * as long as it is called from within syncing context. - * However, dsl_scan_sync() expects that no new sio's will - * be added between when all the work for a scan is done - * and the next txg when the scan is actually marked as - * completed. This check ensures we do not issue new sio's - * during this period. - */ - if (scn->scn_done_txg != 0) - return; - - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) - continue; - ddt_bp_create(checksum, ddk, ddp, &bp); - - scn->scn_visited_this_txg++; - scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); - } -} - -/* - * Scrub/dedup interaction. - * - * If there are N references to a deduped block, we don't want to scrub it - * N times -- ideally, we should scrub it exactly once. - * - * We leverage the fact that the dde's replication class (enum ddt_class) - * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest - * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. 
- * - * To prevent excess scrubbing, the scrub begins by walking the DDT - * to find all blocks with refcnt > 1, and scrubs each of these once. - * Since there are two replication classes which contain blocks with - * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. - * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. - * - * There would be nothing more to say if a block's refcnt couldn't change - * during a scrub, but of course it can so we must account for changes - * in a block's replication class. - * - * Here's an example of what can occur: - * - * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 - * when visited during the top-down scrub phase, it will be scrubbed twice. - * This negates our scrub optimization, but is otherwise harmless. - * - * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 - * on each visit during the top-down scrub phase, it will never be scrubbed. - * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's - * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to - * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 - * while a scrub is in progress, it scrubs the block right then. 
- */ -static void -dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) -{ - ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde = { 0 }; - int error; - uint64_t n = 0; - - while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { - ddt_t *ddt; - - if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) - break; - dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", - (longlong_t)ddb->ddb_class, - (longlong_t)ddb->ddb_type, - (longlong_t)ddb->ddb_checksum, - (longlong_t)ddb->ddb_cursor); - - /* There should be no pending changes to the dedup table */ - ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; - ASSERT(avl_first(&ddt->ddt_tree) == NULL); - - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); - n++; - - if (dsl_scan_check_suspend(scn, NULL)) - break; - } - - zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " - "suspending=%u", (longlong_t)n, - (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); - - ASSERT(error == 0 || error == ENOENT); - ASSERT(error != ENOENT || - ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); -} - -static uint64_t -dsl_scan_ds_maxtxg(dsl_dataset_t *ds) -{ - uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (ds->ds_is_snapshot) - return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); - return (smt); -} - -static void -dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) -{ - scan_ds_t *sds; - dsl_pool_t *dp = scn->scn_dp; - - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= - scn->scn_phys.scn_ddt_class_max) { - scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; - scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; - dsl_scan_ddt(scn, tx); - if (scn->scn_suspending) - return; - } - - if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { - /* First do the MOS & ORIGIN */ - - scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; - scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; - dsl_scan_visit_rootbp(scn, NULL, - &dp->dp_meta_rootbp, tx); - 
spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); - if (scn->scn_suspending) - return; - - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_cb, NULL, DS_FIND_CHILDREN)); - } else { - dsl_scan_visitds(scn, - dp->dp_origin_snap->ds_object, tx); - } - ASSERT(!scn->scn_suspending); - } else if (scn->scn_phys.scn_bookmark.zb_objset != - ZB_DESTROYED_OBJSET) { - uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; - /* - * If we were suspended, continue from here. Note if the - * ds we were suspended on was deleted, the zb_objset may - * be -1, so we will skip this and find a new objset - * below. - */ - dsl_scan_visitds(scn, dsobj, tx); - if (scn->scn_suspending) - return; - } - - /* - * In case we suspended right at the end of the ds, zero the - * bookmark so we don't think that we're still trying to resume. - */ - bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); - - /* - * Keep pulling things out of the dataset avl queue. Updates to the - * persistent zap-object-as-queue happen only at checkpoints. 
- */ - while ((sds = avl_first(&scn->scn_queue)) != NULL) { - dsl_dataset_t *ds; - uint64_t dsobj = sds->sds_dsobj; - uint64_t txg = sds->sds_txg; - - /* dequeue and free the ds from the queue */ - scan_ds_queue_remove(scn, dsobj); - sds = NULL; /* must not be touched after removal */ - - /* Set up min / max txg */ - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - if (txg != 0) { - scn->scn_phys.scn_cur_min_txg = - MAX(scn->scn_phys.scn_min_txg, txg); - } else { - scn->scn_phys.scn_cur_min_txg = - MAX(scn->scn_phys.scn_min_txg, - dsl_dataset_phys(ds)->ds_prev_snap_txg); - } - scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); - dsl_dataset_rele(ds, FTAG); - - dsl_scan_visitds(scn, dsobj, tx); - if (scn->scn_suspending) - return; - } - /* No more objsets to fetch, we're done */ - scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; - ASSERT0(scn->scn_suspending); -} - -static uint64_t -dsl_scan_count_leaves(vdev_t *vd) -{ - uint64_t i, leaves = 0; - - /* we only count leaves that belong to the main pool and are readable */ - if (vd->vdev_islog || vd->vdev_isspare || - vd->vdev_isl2cache || !vdev_readable(vd)) - return (0); - - if (vd->vdev_ops->vdev_op_leaf) - return (1); - - for (i = 0; i < vd->vdev_children; i++) { - leaves += dsl_scan_count_leaves(vd->vdev_child[i]); - } - - return (leaves); -} - - -static void -scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) -{ - int i; - uint64_t cur_size = 0; - - for (i = 0; i < BP_GET_NDVAS(bp); i++) { - cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); - } - - q->q_total_zio_size_this_txg += cur_size; - q->q_zios_this_txg++; -} - -static void -scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, - uint64_t end) -{ - q->q_total_seg_size_this_txg += end - start; - q->q_segs_this_txg++; -} - -static boolean_t -scan_io_queue_check_suspend(dsl_scan_t *scn) -{ - /* See comment in dsl_scan_check_suspend() */ - uint64_t curr_time_ns = gethrtime(); - uint64_t 
scan_time_ns = curr_time_ns - scn->scn_sync_start_time; - uint64_t sync_time_ns = curr_time_ns - - scn->scn_dp->dp_spa->spa_sync_starttime; - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? - zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; - - return ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || - txg_sync_waiting(scn->scn_dp) || - NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || - spa_shutting_down(scn->scn_dp->dp_spa)); -} - -/* - * Given a list of scan_io_t's in io_list, this issues the io's out to - * disk. This consumes the io_list and frees the scan_io_t's. This is - * called when emptying queues, either when we're up against the memory - * limit or when we have finished scanning. Returns B_TRUE if we stopped - * processing the list before we finished. Any zios that were not issued - * will remain in the io_list. - */ -static boolean_t -scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) -{ - dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio; - int64_t bytes_issued = 0; - boolean_t suspended = B_FALSE; - - while ((sio = list_head(io_list)) != NULL) { - blkptr_t bp; - - if (scan_io_queue_check_suspend(scn)) { - suspended = B_TRUE; - break; - } - - sio2bp(sio, &bp, queue->q_vd->vdev_id); - bytes_issued += sio->sio_asize; - scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, - &sio->sio_zb, queue); - (void) list_remove_head(io_list); - scan_io_queues_update_zio_stats(queue, &bp); - kmem_free(sio, sizeof (*sio)); - } - - atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); - - return (suspended); -} - -/* - * Given a range_seg_t (extent) and a list, this function passes over a - * scan queue and gathers up the appropriate ios which fit into that - * scan seg (starting from lowest LBA). At the end, we remove the segment - * from the q_exts_by_addr range tree. 
- */ -static boolean_t -scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) -{ - scan_io_t srch_sio, *sio, *next_sio; - avl_index_t idx; - uint_t num_sios = 0; - int64_t bytes_issued = 0; - - ASSERT(rs != NULL); - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - srch_sio.sio_offset = rs->rs_start; - - /* - * The exact start of the extent might not contain any matching zios, - * so if that's the case, examine the next one in the tree. - */ - sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); - if (sio == NULL) - sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - - while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { - ASSERT3U(sio->sio_offset, >=, rs->rs_start); - ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); - - next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); - avl_remove(&queue->q_sios_by_addr, sio); - - bytes_issued += sio->sio_asize; - num_sios++; - list_insert_tail(list, sio); - sio = next_sio; - } - - /* - * We limit the number of sios we process at once to 32 to avoid - * biting off more than we can chew. If we didn't take everything - * in the segment we update it to reflect the work we were able to - * complete. Otherwise, we remove it from the range tree entirely. - */ - if (sio != NULL && sio->sio_offset < rs->rs_end) { - range_tree_adjust_fill(queue->q_exts_by_addr, rs, - -bytes_issued); - range_tree_resize_segment(queue->q_exts_by_addr, rs, - sio->sio_offset, rs->rs_end - sio->sio_offset); - - return (B_TRUE); - } else { - range_tree_remove(queue->q_exts_by_addr, rs->rs_start, - rs->rs_end - rs->rs_start); - return (B_FALSE); - } -} - - -/* - * This is called from the queue emptying thread and selects the next - * extent from which we are to issue io's. The behavior of this function - * depends on the state of the scan, the current memory consumption and - * whether or not we are performing a scan shutdown. 
- * 1) We select extents in an elevator algorithm (LBA-order) if the scan - * needs to perform a checkpoint - * 2) We select the largest available extent if we are up against the - * memory limit. - * 3) Otherwise we don't select any extents. - */ -static const range_seg_t * -scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) -{ - dsl_scan_t *scn = queue->q_scn; - - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - ASSERT(scn->scn_is_sorted); - - /* handle tunable overrides */ - if (scn->scn_checkpointing || scn->scn_clearing) { - if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(queue->q_exts_by_addr)); - } else if (zfs_scan_issue_strategy == 2) { - return (avl_first(&queue->q_exts_by_size)); - } - } - - /* - * During normal clearing, we want to issue our largest segments - * first, keeping IO as sequential as possible, and leaving the - * smaller extents for later with the hope that they might eventually - * grow to larger sequential segments. However, when the scan is - * checkpointing, no new extents will be added to the sorting queue, - * so the way we are sorted now is as good as it will ever get. - * In this case, we instead switch to issuing extents in LBA order. 
- */ - if (scn->scn_checkpointing) { - return (range_tree_first(queue->q_exts_by_addr)); - } else if (scn->scn_clearing) { - return (avl_first(&queue->q_exts_by_size)); - } else { - return (NULL); - } -} - -static void -scan_io_queues_run_one(void *arg) -{ - dsl_scan_io_queue_t *queue = arg; - kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; - boolean_t suspended = B_FALSE; - range_seg_t *rs = NULL; - scan_io_t *sio = NULL; - list_t sio_list; - uint64_t bytes_per_leaf = zfs_scan_vdev_limit; - uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); - - ASSERT(queue->q_scn->scn_is_sorted); - - list_create(&sio_list, sizeof (scan_io_t), - offsetof(scan_io_t, sio_nodes.sio_list_node)); - mutex_enter(q_lock); - - /* calculate maximum in-flight bytes for this txg (min 1MB) */ - queue->q_maxinflight_bytes = - MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); - - /* reset per-queue scan statistics for this txg */ - queue->q_total_seg_size_this_txg = 0; - queue->q_segs_this_txg = 0; - queue->q_total_zio_size_this_txg = 0; - queue->q_zios_this_txg = 0; - - /* loop until we have run out of time or sios */ - while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) { - uint64_t seg_start = 0, seg_end = 0; - boolean_t more_left = B_TRUE; - - ASSERT(list_is_empty(&sio_list)); - - /* loop while we still have sios left to process in this rs */ - while (more_left) { - scan_io_t *first_sio, *last_sio; - - /* - * We have selected which extent needs to be - * processed next. Gather up the corresponding sios. - */ - more_left = scan_io_queue_gather(queue, rs, &sio_list); - ASSERT(!list_is_empty(&sio_list)); - first_sio = list_head(&sio_list); - last_sio = list_tail(&sio_list); - - seg_end = last_sio->sio_offset + last_sio->sio_asize; - if (seg_start == 0) - seg_start = first_sio->sio_offset; - - /* - * Issuing sios can take a long time so drop the - * queue lock. 
The sio queue won't be updated by - * other threads since we're in syncing context so - * we can be sure that our trees will remain exactly - * as we left them. - */ - mutex_exit(q_lock); - suspended = scan_io_queue_issue(queue, &sio_list); - mutex_enter(q_lock); - - if (suspended) - break; - } - /* update statistics for debugging purposes */ - scan_io_queues_update_seg_stats(queue, seg_start, seg_end); - - if (suspended) - break; - } - - - /* If we were suspended in the middle of processing, - * requeue any unfinished sios and exit. - */ - while ((sio = list_head(&sio_list)) != NULL) { - list_remove(&sio_list, sio); - scan_io_queue_insert_impl(queue, sio); - } - - mutex_exit(q_lock); - list_destroy(&sio_list); -} - -/* - * Performs an emptying run on all scan queues in the pool. This just - * punches out one thread per top-level vdev, each of which processes - * only that vdev's scan queue. We can parallelize the I/O here because - * we know that each queue's io's only affect its own top-level vdev. - * - * This function waits for the queue runs to complete, and must be - * called from dsl_scan_sync (or in general, syncing context). - */ -static void -scan_io_queues_run(dsl_scan_t *scn) -{ - spa_t *spa = scn->scn_dp->dp_spa; - - ASSERT(scn->scn_is_sorted); - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - if (scn->scn_bytes_pending == 0) - return; - - if (scn->scn_taskq == NULL) { - char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16, - KM_SLEEP); - int nthreads = spa->spa_root_vdev->vdev_children; - - /* - * We need to make this taskq *always* execute as many - * threads in parallel as we have top-level vdevs and no - * less, otherwise strange serialization of the calls to - * scan_io_queues_run_one can occur during spa_sync runs - * and that significantly impacts performance. 
- */ - (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16, - "dsl_scan_tq_%s", spa->spa_name); - scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri, - nthreads, nthreads, TASKQ_PREPOPULATE); - kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16); - } - - for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - - mutex_enter(&vd->vdev_scan_io_queue_lock); - if (vd->vdev_scan_io_queue != NULL) { - VERIFY(taskq_dispatch(scn->scn_taskq, - scan_io_queues_run_one, vd->vdev_scan_io_queue, - TQ_SLEEP) != TASKQID_INVALID); - } - mutex_exit(&vd->vdev_scan_io_queue_lock); - } - - /* - * Wait for the queues to finish issuing thir IOs for this run - * before we return. There may still be IOs in flight at this - * point. - */ - taskq_wait(scn->scn_taskq); -} - -static boolean_t -dsl_scan_async_block_should_pause(dsl_scan_t *scn) -{ - uint64_t elapsed_nanosecs; - - if (zfs_recover) - return (B_FALSE); - - if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) - return (B_TRUE); - - elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && - txg_sync_waiting(scn->scn_dp)) || - spa_shutting_down(scn->scn_dp->dp_spa)); -} - -static int -dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_scan_t *scn = arg; - - if (!scn->scn_is_bptree || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { - if (dsl_scan_async_block_should_pause(scn)) - return (SET_ERROR(ERESTART)); - } - - zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, - dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); - dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, - -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), - -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); - scn->scn_visited_this_txg++; - return (0); -} - -static void -dsl_scan_update_stats(dsl_scan_t *scn) -{ - 
spa_t *spa = scn->scn_dp->dp_spa; - uint64_t i; - uint64_t seg_size_total = 0, zio_size_total = 0; - uint64_t seg_count_total = 0, zio_count_total = 0; - - for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; - - if (queue == NULL) - continue; - - seg_size_total += queue->q_total_seg_size_this_txg; - zio_size_total += queue->q_total_zio_size_this_txg; - seg_count_total += queue->q_segs_this_txg; - zio_count_total += queue->q_zios_this_txg; - } - - if (seg_count_total == 0 || zio_count_total == 0) { - scn->scn_avg_seg_size_this_txg = 0; - scn->scn_avg_zio_size_this_txg = 0; - scn->scn_segs_this_txg = 0; - scn->scn_zios_this_txg = 0; - return; - } - - scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; - scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; - scn->scn_segs_this_txg = seg_count_total; - scn->scn_zios_this_txg = zio_count_total; -} - -static int -dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_scan_t *scn = arg; - const dva_t *dva = &bp->blk_dva[0]; - - if (dsl_scan_async_block_should_pause(scn)) - return (SET_ERROR(ERESTART)); - - spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, - DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), - DVA_GET_ASIZE(dva), tx); - scn->scn_visited_this_txg++; - return (0); -} - -boolean_t -dsl_scan_active(dsl_scan_t *scn) -{ - spa_t *spa = scn->scn_dp->dp_spa; - uint64_t used = 0, comp, uncomp; - - if (spa->spa_load_state != SPA_LOAD_NONE) - return (B_FALSE); - if (spa_shutting_down(spa)) - return (B_FALSE); - if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || - (scn->scn_async_destroying && !scn->scn_async_stalled)) - return (B_TRUE); - - if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { - (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, - &used, &comp, &uncomp); - } - return (used != 0); -} - -static boolean_t 
-dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, - uint64_t phys_birth) -{ - vdev_t *vd; - - vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); - - if (vd->vdev_ops == &vdev_indirect_ops) { - /* - * The indirect vdev can point to multiple - * vdevs. For simplicity, always create - * the resilver zio_t. zio_vdev_io_start() - * will bypass the child resilver i/o's if - * they are on vdevs that don't have DTL's. - */ - return (B_TRUE); - } - - if (DVA_GET_GANG(dva)) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. - */ - return (B_TRUE); - } - - /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. - */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); - - /* - * Check if the top-level vdev must resilver this offset. - * When the offset does not intersect with a dirty leaf DTL - * then it may be possible to skip the resilver IO. The psize - * is provided instead of asize to simplify the check for RAIDZ. 
- */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) - return (B_FALSE); - - return (B_TRUE); -} - -static int -dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) -{ - int err = 0; - dsl_scan_t *scn = dp->dp_scan; - spa_t *spa = dp->dp_spa; - - if (spa_suspend_async_destroy(spa)) - return (0); - - if (zfs_free_bpobj_enabled && - spa_version(spa) >= SPA_VERSION_DEADLISTS) { - scn->scn_is_bptree = B_FALSE; - scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; - scn->scn_zio_root = zio_root(spa, NULL, - NULL, ZIO_FLAG_MUSTSUCCEED); - err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_block_cb, scn, tx); - VERIFY0(zio_wait(scn->scn_zio_root)); - scn->scn_zio_root = NULL; - - if (err != 0 && err != ERESTART) - zfs_panic_recover("error %u from bpobj_iterate()", err); - } - - if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { - ASSERT(scn->scn_async_destroying); - scn->scn_is_bptree = B_TRUE; - scn->scn_zio_root = zio_root(spa, NULL, - NULL, ZIO_FLAG_MUSTSUCCEED); - err = bptree_iterate(dp->dp_meta_objset, - dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); - VERIFY0(zio_wait(scn->scn_zio_root)); - scn->scn_zio_root = NULL; - - if (err == EIO || err == ECKSUM) { - err = 0; - } else if (err != 0 && err != ERESTART) { - zfs_panic_recover("error %u from " - "traverse_dataset_destroyed()", err); - } - - if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { - /* finished; deactivate async destroy feature */ - spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); - ASSERT(!spa_feature_is_active(spa, - SPA_FEATURE_ASYNC_DESTROY)); - VERIFY0(zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, tx)); - VERIFY0(bptree_free(dp->dp_meta_objset, - dp->dp_bptree_obj, tx)); - dp->dp_bptree_obj = 0; - scn->scn_async_destroying = B_FALSE; - scn->scn_async_stalled = B_FALSE; - } else { - /* - * If we didn't make progress, mark the async - * destroy as stalled, so that we will not initiate 
- * a spa_sync() on its behalf. Note that we only - * check this if we are not finished, because if the - * bptree had no blocks for us to visit, we can - * finish without "making progress". - */ - scn->scn_async_stalled = - (scn->scn_visited_this_txg == 0); - } - } - if (scn->scn_visited_this_txg) { - zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj/bptree txg %llu; err=%d", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t) - NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), - (longlong_t)tx->tx_txg, err); - scn->scn_visited_this_txg = 0; - - /* - * Write out changes to the DDT that may be required as a - * result of the blocks freed. This ensures that the DDT - * is clean when a scrub/resilver runs. - */ - ddt_sync(spa, tx->tx_txg); - } - if (err != 0) - return (err); - if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && - zfs_free_leak_on_eio && - (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || - dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || - dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { - /* - * We have finished background destroying, but there is still - * some space left in the dp_free_dir. Transfer this leaked - * space to the dp_leak_dir. 
- */ - if (dp->dp_leak_dir == NULL) { - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, - LEAK_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - LEAK_DIR_NAME, &dp->dp_leak_dir)); - rrw_exit(&dp->dp_config_rwlock, FTAG); - } - dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, - dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, - dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, - dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, - -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, - -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); - } - - if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { - /* finished; verify that space accounting went to zero */ - ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); - ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); - ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); - } - - EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), - 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ)); - if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { - ASSERT(spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_OBSOLETE_COUNTS)); - - scn->scn_is_bptree = B_FALSE; - scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; - err = bpobj_iterate(&dp->dp_obsolete_bpobj, - dsl_scan_obsolete_block_cb, scn, tx); - if (err != 0 && err != ERESTART) - zfs_panic_recover("error %u from bpobj_iterate()", err); - - if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) - dsl_pool_destroy_obsolete_bpobj(dp, tx); - } - - return (0); -} - -/* - * This is the primary entry point for scans that is called from syncing - * context. Scans must happen entirely during syncing context so that we - * cna guarantee that blocks we are currently scanning will not change out - * from under us. 
While a scan is active, this funciton controls how quickly - * transaction groups proceed, instead of the normal handling provided by - * txg_sync_thread(). - */ -void -dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dp->dp_scan; - spa_t *spa = dp->dp_spa; - int err = 0; - state_sync_type_t sync_type = SYNC_OPTIONAL; - - /* - * Check for scn_restart_txg before checking spa_load_state, so - * that we can restart an old-style scan while the pool is being - * imported (see dsl_scan_init). - */ - if (dsl_scan_restarting(scn, tx)) { - pool_scan_func_t func = POOL_SCAN_SCRUB; - dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; - zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, (longlong_t)tx->tx_txg); - dsl_scan_setup_sync(&func, tx); - } - - /* - * Only process scans in sync pass 1. - */ - if (spa_sync_pass(dp->dp_spa) > 1) - return; - - /* - * If the spa is shutting down, then stop scanning. This will - * ensure that the scan does not dirty any new data during the - * shutdown phase. - */ - if (spa_shutting_down(spa)) - return; - - /* - * If the scan is inactive due to a stalled async destroy, try again. - */ - if (!scn->scn_async_stalled && !dsl_scan_active(scn)) - return; - - /* reset scan statistics */ - scn->scn_visited_this_txg = 0; - scn->scn_holes_this_txg = 0; - scn->scn_lt_min_this_txg = 0; - scn->scn_gt_max_this_txg = 0; - scn->scn_ddt_contained_this_txg = 0; - scn->scn_objsets_visited_this_txg = 0; - scn->scn_avg_seg_size_this_txg = 0; - scn->scn_segs_this_txg = 0; - scn->scn_avg_zio_size_this_txg = 0; - scn->scn_zios_this_txg = 0; - scn->scn_suspending = B_FALSE; - scn->scn_sync_start_time = gethrtime(); - spa->spa_scrub_active = B_TRUE; - - /* - * First process the async destroys. If we pause, don't do - * any scrubbing or resilvering. 
This ensures that there are no - * async destroys while we are scanning, so the scan code doesn't - * have to worry about traversing it. It is also faster to free the - * blocks than to scrub them. - */ - err = dsl_process_async_destroys(dp, tx); - if (err != 0) - return; - - if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) - return; - - /* - * Wait a few txgs after importing to begin scanning so that - * we can get the pool imported quickly. - */ - if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) - return; - - /* - * It is possible to switch from unsorted to sorted at any time, - * but afterwards the scan will remain sorted unless reloaded from - * a checkpoint after a reboot. - */ - if (!zfs_scan_legacy) { - scn->scn_is_sorted = B_TRUE; - if (scn->scn_last_checkpoint == 0) - scn->scn_last_checkpoint = ddi_get_lbolt(); - } - - /* - * For sorted scans, determine what kind of work we will be doing - * this txg based on our memory limitations and whether or not we - * need to perform a checkpoint. - */ - if (scn->scn_is_sorted) { - /* - * If we are over our checkpoint interval, set scn_clearing - * so that we can begin checkpointing immediately. The - * checkpoint allows us to save a consisent bookmark - * representing how much data we have scrubbed so far. - * Otherwise, use the memory limit to determine if we should - * scan for metadata or start issue scrub IOs. We accumulate - * metadata until we hit our hard memory limit at which point - * we issue scrub IOs until we are at our soft memory limit. 
- */ - if (scn->scn_checkpointing || - ddi_get_lbolt() - scn->scn_last_checkpoint > - SEC_TO_TICK(zfs_scan_checkpoint_intval)) { - if (!scn->scn_checkpointing) - zfs_dbgmsg("begin scan checkpoint"); - - scn->scn_checkpointing = B_TRUE; - scn->scn_clearing = B_TRUE; - } else { - boolean_t should_clear = dsl_scan_should_clear(scn); - if (should_clear && !scn->scn_clearing) { - zfs_dbgmsg("begin scan clearing"); - scn->scn_clearing = B_TRUE; - } else if (!should_clear && scn->scn_clearing) { - zfs_dbgmsg("finish scan clearing"); - scn->scn_clearing = B_FALSE; - } - } - } else { - ASSERT0(scn->scn_checkpointing); - ASSERT0(scn->scn_clearing); - } - - if (!scn->scn_clearing && scn->scn_done_txg == 0) { - /* Need to scan metadata for more blocks to scrub */ - dsl_scan_phys_t *scnp = &scn->scn_phys; - taskqid_t prefetch_tqid; - uint64_t bytes_per_leaf = zfs_scan_vdev_limit; - uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); - - /* - * Recalculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. 
- */ - scn->scn_maxinflight_bytes = - MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); - - if (scnp->scn_ddt_bookmark.ddb_class <= - scnp->scn_ddt_class_max) { - ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); - zfs_dbgmsg("doing scan sync txg %llu; " - "ddt bm=%llu/%llu/%llu/%llx", - (longlong_t)tx->tx_txg, - (longlong_t)scnp->scn_ddt_bookmark.ddb_class, - (longlong_t)scnp->scn_ddt_bookmark.ddb_type, - (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, - (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); - } else { - zfs_dbgmsg("doing scan sync txg %llu; " - "bm=%llu/%llu/%llu/%llu", - (longlong_t)tx->tx_txg, - (longlong_t)scnp->scn_bookmark.zb_objset, - (longlong_t)scnp->scn_bookmark.zb_object, - (longlong_t)scnp->scn_bookmark.zb_level, - (longlong_t)scnp->scn_bookmark.zb_blkid); - } - - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - - scn->scn_prefetch_stop = B_FALSE; - prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, - dsl_scan_prefetch_thread, scn, TQ_SLEEP); - ASSERT(prefetch_tqid != TASKQID_INVALID); - - dsl_pool_config_enter(dp, FTAG); - dsl_scan_visit(scn, tx); - dsl_pool_config_exit(dp, FTAG); - - mutex_enter(&dp->dp_spa->spa_scrub_lock); - scn->scn_prefetch_stop = B_TRUE; - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&dp->dp_spa->spa_scrub_lock); - - taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); - (void) zio_wait(scn->scn_zio_root); - scn->scn_zio_root = NULL; - - zfs_dbgmsg("scan visited %llu blocks in %llums " - "(%llu os's, %llu holes, %llu < mintxg, " - "%llu in ddt, %llu > maxtxg)", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t)NSEC2MSEC(gethrtime() - - scn->scn_sync_start_time), - (longlong_t)scn->scn_objsets_visited_this_txg, - (longlong_t)scn->scn_holes_this_txg, - (longlong_t)scn->scn_lt_min_this_txg, - (longlong_t)scn->scn_ddt_contained_this_txg, - (longlong_t)scn->scn_gt_max_this_txg); - - if (!scn->scn_suspending) { - ASSERT0(avl_numnodes(&scn->scn_queue)); - scn->scn_done_txg = tx->tx_txg + 1; - if 
(scn->scn_is_sorted) { - scn->scn_checkpointing = B_TRUE; - scn->scn_clearing = B_TRUE; - } - zfs_dbgmsg("scan complete txg %llu", - (longlong_t)tx->tx_txg); - } - } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { - ASSERT(scn->scn_clearing); - - /* need to issue scrubbing IOs from per-vdev queues */ - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - scan_io_queues_run(scn); - (void) zio_wait(scn->scn_zio_root); - scn->scn_zio_root = NULL; - - /* calculate and dprintf the current memory usage */ - (void) dsl_scan_should_clear(scn); - dsl_scan_update_stats(scn); - - zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums " - "(avg_block_size = %llu, avg_seg_size = %llu)", - (longlong_t)scn->scn_zios_this_txg, - (longlong_t)scn->scn_segs_this_txg, - (longlong_t)NSEC2MSEC(gethrtime() - - scn->scn_sync_start_time), - (longlong_t)scn->scn_avg_zio_size_this_txg, - (longlong_t)scn->scn_avg_seg_size_this_txg); - } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { - /* Finished with everything. Mark the scrub as complete */ - zfs_dbgmsg("scan issuing complete txg %llu", - (longlong_t)tx->tx_txg); - ASSERT3U(scn->scn_done_txg, !=, 0); - ASSERT0(spa->spa_scrub_inflight); - ASSERT0(scn->scn_bytes_pending); - dsl_scan_done(scn, B_TRUE, tx); - sync_type = SYNC_MANDATORY; - } - - dsl_scan_sync_state(scn, tx, sync_type); -} - -static void -count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) -{ - int i; - - /* update the spa's stats on how many bytes we have issued */ - for (i = 0; i < BP_GET_NDVAS(bp); i++) { - atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); - } - - /* - * If we resume after a reboot, zab will be NULL; don't record - * incomplete stats in that case. - */ - if (zab == NULL) - return; - - mutex_enter(&zab->zab_lock); - - for (i = 0; i < 4; i++) { - int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; - int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; - if (t & DMU_OT_NEWTYPE) - t = DMU_OT_OTHER; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; - int equal; - - zb->zb_count++; - zb->zb_asize += BP_GET_ASIZE(bp); - zb->zb_lsize += BP_GET_LSIZE(bp); - zb->zb_psize += BP_GET_PSIZE(bp); - zb->zb_gangs += BP_COUNT_GANG(bp); - - switch (BP_GET_NDVAS(bp)) { - case 2: - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) - zb->zb_ditto_2_of_2_samevdev++; - break; - case 3: - equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + - (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2])) + - (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal == 1) - zb->zb_ditto_2_of_3_samevdev++; - else if (equal == 3) - zb->zb_ditto_3_of_3_samevdev++; - break; - } - } - - mutex_exit(&zab->zab_lock); -} - -static void -scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) -{ - avl_index_t idx; - int64_t asize = sio->sio_asize; - dsl_scan_t *scn = queue->q_scn; - - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { - /* block is already scheduled for reading */ - atomic_add_64(&scn->scn_bytes_pending, -asize); - kmem_free(sio, sizeof (*sio)); - return; - } - avl_insert(&queue->q_sios_by_addr, sio, idx); - range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); -} - -/* - * Given all the info we got from our metadata scanning process, we - * construct a scan_io_t and insert it into the scan sorting queue. The - * I/O must already be suitable for us to process. This is controlled - * by dsl_scan_enqueue(). 
- */ -static void -scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, - int zio_flags, const zbookmark_phys_t *zb) -{ - dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); - - ASSERT0(BP_IS_GANG(bp)); - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - bp2sio(bp, sio, dva_i); - sio->sio_flags = zio_flags; - sio->sio_zb = *zb; - - /* - * Increment the bytes pending counter now so that we can't - * get an integer underflow in case the worker processes the - * zio before we get to incrementing this counter. - */ - atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); - - scan_io_queue_insert_impl(queue, sio); -} - -/* - * Given a set of I/O parameters as discovered by the metadata traversal - * process, attempts to place the I/O into the sorted queues (if allowed), - * or immediately executes the I/O. - */ -static void -dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, - const zbookmark_phys_t *zb) -{ - spa_t *spa = dp->dp_spa; - - ASSERT(!BP_IS_EMBEDDED(bp)); - - /* - * Gang blocks are hard to issue sequentially, so we just issue them - * here immediately instead of queuing them. 
- */ - if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { - scan_exec_io(dp, bp, zio_flags, zb, NULL); - return; - } - for (int i = 0; i < BP_GET_NDVAS(bp); i++) { - dva_t dva; - vdev_t *vdev; - - dva = bp->blk_dva[i]; - vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); - ASSERT(vdev != NULL); - - mutex_enter(&vdev->vdev_scan_io_queue_lock); - if (vdev->vdev_scan_io_queue == NULL) - vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); - ASSERT(dp->dp_scan != NULL); - scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, - i, zio_flags, zb); - mutex_exit(&vdev->vdev_scan_io_queue_lock); - } -} - -static int -dsl_scan_scrub_cb(dsl_pool_t *dp, - const blkptr_t *bp, const zbookmark_phys_t *zb) -{ - dsl_scan_t *scn = dp->dp_scan; - spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); - size_t psize = BP_GET_PSIZE(bp); - boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int d; - - if (phys_birth <= scn->scn_phys.scn_min_txg || - phys_birth >= scn->scn_phys.scn_max_txg) { - count_block(scn, dp->dp_blkstats, bp); - return (0); - } - - /* Embedded BP's have phys_birth==0, so we reject them above. */ - ASSERT(!BP_IS_EMBEDDED(bp)); - - ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); - if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { - zio_flags |= ZIO_FLAG_SCRUB; - needs_io = B_TRUE; - } else { - ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); - zio_flags |= ZIO_FLAG_RESILVER; - needs_io = B_FALSE; - } - - /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == ZB_ZIL_LEVEL) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - const dva_t *dva = &bp->blk_dva[d]; - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. 
- */ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); - - /* if it's a resilver, this may not be in the target range */ - if (!needs_io) - needs_io = dsl_scan_need_resilver(spa, dva, psize, - phys_birth); - } - - if (needs_io && !zfs_no_scrub_io) { - dsl_scan_enqueue(dp, bp, zio_flags, zb); - } else { - count_block(scn, dp->dp_blkstats, bp); - } - - /* do not relocate this block */ - return (0); -} - -static void -dsl_scan_scrub_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - dsl_scan_io_queue_t *queue = zio->io_private; - - abd_free(zio->io_abd); - - if (queue == NULL) { - mutex_enter(&spa->spa_scrub_lock); - ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); - spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); - } else { - mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); - ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); - queue->q_inflight_bytes -= BP_GET_PSIZE(bp); - cv_broadcast(&queue->q_zio_cv); - mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); - } - - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); - } -} - -/* - * Given a scanning zio's information, executes the zio. The zio need - * not necessarily be only sortable, this function simply executes the - * zio, no matter what it is. The optional queue argument allows the - * caller to specify that they want per top level vdev IO rate limiting - * instead of the legacy global limiting. 
- */ -static void -scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, - const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) -{ - spa_t *spa = dp->dp_spa; - dsl_scan_t *scn = dp->dp_scan; - size_t size = BP_GET_PSIZE(bp); - abd_t *data = abd_alloc_for_io(size, B_FALSE); - unsigned int scan_delay = 0; - - ASSERT3U(scn->scn_maxinflight_bytes, >, 0); - - if (queue == NULL) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight += BP_GET_PSIZE(bp); - mutex_exit(&spa->spa_scrub_lock); - } else { - kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; - - mutex_enter(q_lock); - while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) - cv_wait(&queue->q_zio_cv, q_lock); - queue->q_inflight_bytes += BP_GET_PSIZE(bp); - mutex_exit(q_lock); - } - - if (zio_flags & ZIO_FLAG_RESILVER) - scan_delay = zfs_resilver_delay; - else { - ASSERT(zio_flags & ZIO_FLAG_SCRUB); - scan_delay = zfs_scrub_delay; - } - - if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)) - delay(MAX((int)scan_delay, 0)); - - count_block(dp->dp_scan, dp->dp_blkstats, bp); - zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size, - dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); -} - -/* - * This is the primary extent sorting algorithm. We balance two parameters: - * 1) how many bytes of I/O are in an extent - * 2) how well the extent is filled with I/O (as a fraction of its total size) - * Since we allow extents to have gaps between their constituent I/Os, it's - * possible to have a fairly large extent that contains the same amount of - * I/O bytes than a much smaller extent, which just packs the I/O more tightly. 
- * The algorithm sorts based on a score calculated from the extent's size, - * the relative fill volume (in %) and a "fill weight" parameter that controls - * the split between whether we prefer larger extents or more well populated - * extents: - * - * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) - * - * Example: - * 1) assume extsz = 64 MiB - * 2) assume fill = 32 MiB (extent is half full) - * 3) assume fill_weight = 3 - * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 - * SCORE = 32M + (50 * 3 * 32M) / 100 - * SCORE = 32M + (4800M / 100) - * SCORE = 32M + 48M - * ^ ^ - * | +--- final total relative fill-based score - * +--------- final total fill-based score - * SCORE = 80M - * - * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards - * extents that are more completely filled (in a 3:2 ratio) vs just larger. - * Note that as an optimization, we replace multiplication and division by - * 100 with bitshifting by 7 (which effecitvely multiplies and divides by 128). - */ -static int -ext_size_compare(const void *x, const void *y) -{ - const range_seg_t *rsa = x, *rsb = y; - uint64_t sa = rsa->rs_end - rsa->rs_start, - sb = rsb->rs_end - rsb->rs_start; - uint64_t score_a, score_b; - - score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * - fill_weight * rsa->rs_fill) >> 7); - score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * - fill_weight * rsb->rs_fill) >> 7); - - if (score_a > score_b) - return (-1); - if (score_a == score_b) { - if (rsa->rs_start < rsb->rs_start) - return (-1); - if (rsa->rs_start == rsb->rs_start) - return (0); - return (1); - } - return (1); -} - -/* - * Comparator for the q_sios_by_addr tree. Sorting is simply performed - * based on LBA-order (from lowest to highest). 
- */ -static int -io_addr_compare(const void *x, const void *y) -{ - const scan_io_t *a = x, *b = y; - - if (a->sio_offset < b->sio_offset) - return (-1); - if (a->sio_offset == b->sio_offset) - return (0); - return (1); -} - -/* IO queues are created on demand when they are needed. */ -static dsl_scan_io_queue_t * -scan_io_queue_create(vdev_t *vd) -{ - dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; - dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); - - q->q_scn = scn; - q->q_vd = vd; - cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, - &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); - avl_create(&q->q_sios_by_addr, io_addr_compare, - sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); - - return (q); -} - -/* - * Destroys a scan queue and all segments and scan_io_t's contained in it. - * No further execution of I/O occurs, anything pending in the queue is - * simply freed without being executed. - */ -void -dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) -{ - dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio; - void *cookie = NULL; - int64_t bytes_dequeued = 0; - - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != - NULL) { - ASSERT(range_tree_contains(queue->q_exts_by_addr, - sio->sio_offset, sio->sio_asize)); - bytes_dequeued += sio->sio_asize; - kmem_free(sio, sizeof (*sio)); - } - - atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); - range_tree_vacate(queue->q_exts_by_addr, NULL, queue); - range_tree_destroy(queue->q_exts_by_addr); - avl_destroy(&queue->q_sios_by_addr); - cv_destroy(&queue->q_zio_cv); - - kmem_free(queue, sizeof (*queue)); -} - -/* - * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is - * called on behalf of vdev_top_transfer when creating or destroying - * a mirror vdev due to zpool attach/detach. 
- */ -void -dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) -{ - mutex_enter(&svd->vdev_scan_io_queue_lock); - mutex_enter(&tvd->vdev_scan_io_queue_lock); - - VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); - tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; - svd->vdev_scan_io_queue = NULL; - if (tvd->vdev_scan_io_queue != NULL) - tvd->vdev_scan_io_queue->q_vd = tvd; - - mutex_exit(&tvd->vdev_scan_io_queue_lock); - mutex_exit(&svd->vdev_scan_io_queue_lock); -} - -static void -scan_io_queues_destroy(dsl_scan_t *scn) -{ - vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *tvd = rvd->vdev_child[i]; - - mutex_enter(&tvd->vdev_scan_io_queue_lock); - if (tvd->vdev_scan_io_queue != NULL) - dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); - tvd->vdev_scan_io_queue = NULL; - mutex_exit(&tvd->vdev_scan_io_queue_lock); - } -} - -static void -dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - vdev_t *vdev; - kmutex_t *q_lock; - dsl_scan_io_queue_t *queue; - scan_io_t srch, *sio; - avl_index_t idx; - uint64_t start, size; - - vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); - ASSERT(vdev != NULL); - q_lock = &vdev->vdev_scan_io_queue_lock; - queue = vdev->vdev_scan_io_queue; - - mutex_enter(q_lock); - if (queue == NULL) { - mutex_exit(q_lock); - return; - } - - bp2sio(bp, &srch, dva_i); - start = srch.sio_offset; - size = srch.sio_asize; - - /* - * We can find the zio in two states: - * 1) Cold, just sitting in the queue of zio's to be issued at - * some point in the future. In this case, all we do is - * remove the zio from the q_sios_by_addr tree, decrement - * its data volume from the containing range_seg_t and - * resort the q_exts_by_size tree to reflect that the - * range_seg_t has lost some of its 'fill'. 
We don't shorten - * the range_seg_t - this is usually rare enough not to be - * worth the extra hassle of trying keep track of precise - * extent boundaries. - * 2) Hot, where the zio is currently in-flight in - * dsl_scan_issue_ios. In this case, we can't simply - * reach in and stop the in-flight zio's, so we instead - * block the caller. Eventually, dsl_scan_issue_ios will - * be done with issuing the zio's it gathered and will - * signal us. - */ - sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); - if (sio != NULL) { - int64_t asize = sio->sio_asize; - blkptr_t tmpbp; - - /* Got it while it was cold in the queue */ - ASSERT3U(start, ==, sio->sio_offset); - ASSERT3U(size, ==, asize); - avl_remove(&queue->q_sios_by_addr, sio); - - ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); - range_tree_remove_fill(queue->q_exts_by_addr, start, size); - - /* - * We only update scn_bytes_pending in the cold path, - * otherwise it will already have been accounted for as - * part of the zio's execution. - */ - atomic_add_64(&scn->scn_bytes_pending, -asize); - - /* count the block as though we issued it */ - sio2bp(sio, &tmpbp, dva_i); - count_block(scn, dp->dp_blkstats, &tmpbp); - - kmem_free(sio, sizeof (*sio)); - } - mutex_exit(q_lock); -} - -/* - * Callback invoked when a zio_free() zio is executing. This needs to be - * intercepted to prevent the zio from deallocating a particular portion - * of disk space and it then getting reallocated and written to, while we - * still have it queued up for processing. 
- */ -void -dsl_scan_freed(spa_t *spa, const blkptr_t *bp) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(scn != NULL); - if (!dsl_scan_is_running(scn)) - return; - - for (int i = 0; i < BP_GET_NDVAS(bp); i++) - dsl_scan_freed_dva(spa, bp, i); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c deleted file mode 100644 index a78b4cb030cf..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include - -#define DST_AVG_BLKSHIFT 14 - -/* ARGSUSED */ -static int -dsl_null_checkfunc(void *arg, dmu_tx_t *tx) -{ - return (0); -} - -static int -dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, boolean_t early) -{ - spa_t *spa; - dmu_tx_t *tx; - int err; - dsl_sync_task_t dst = { 0 }; - dsl_pool_t *dp; - - err = spa_open(pool, &spa, FTAG); - if (err != 0) - return (err); - dp = spa_get_dsl(spa); - -top: - tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - - dst.dst_pool = dp; - dst.dst_txg = dmu_tx_get_txg(tx); - dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; - dst.dst_space_check = space_check; - dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc; - dst.dst_syncfunc = syncfunc; - dst.dst_arg = arg; - dst.dst_error = 0; - dst.dst_nowaiter = B_FALSE; - - dsl_pool_config_enter(dp, FTAG); - err = dst.dst_checkfunc(arg, tx); - dsl_pool_config_exit(dp, FTAG); - - if (err != 0) { - dmu_tx_commit(tx); - spa_close(spa, FTAG); - return (err); - } - - txg_list_t *task_list = (early) ? - &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; - VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg)); - - dmu_tx_commit(tx); - - if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) { - /* current contract is to call func once */ - sigfunc(arg, tx); - sigfunc = NULL; /* in case of an EAGAIN retry */ - } - txg_wait_synced(dp, dst.dst_txg); - - if (dst.dst_error == EAGAIN) { - txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); - goto top; - } - - spa_close(spa, FTAG); - return (dst.dst_error); -} - -/* - * Called from open context to perform a callback in syncing context. Waits - * for the operation to complete. - * - * The checkfunc will be called from open context as a preliminary check - * which can quickly fail. 
If it succeeds, it will be called again from - * syncing context. The checkfunc should generally be designed to work - * properly in either context, but if necessary it can check - * dmu_tx_is_syncing(tx). - * - * The synctask infrastructure enforces proper locking strategy with respect - * to the dp_config_rwlock -- the lock will always be held when the callbacks - * are called. It will be held for read during the open-context (preliminary) - * call to the checkfunc, and then held for write from syncing context during - * the calls to the check and sync funcs. - * - * A dataset or pool name can be passed as the first argument. Typically, - * the check func will hold, check the return value of the hold, and then - * release the dataset. The sync func will VERIFYO(hold()) the dataset. - * This is safe because no changes can be made between the check and sync funcs, - * and the sync func will only be called if the check func successfully opened - * the dataset. - */ -int -dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) -{ - return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, - blocks_modified, space_check, B_FALSE)); -} - -/* - * An early synctask works exactly as a standard synctask with one important - * difference on the way it is handled during syncing context. Standard - * synctasks run after we've written out all the dirty blocks of dirty - * datasets. Early synctasks are executed before writing out any dirty data, - * and thus before standard synctasks. - * - * For that reason, early synctasks can affect the process of writing dirty - * changes to disk for the txg that they run and should be used with caution. - * In addition, early synctasks should not dirty any metaslabs as this would - * invalidate the precodition/invariant for subsequent early synctasks. 
- * [see dsl_pool_sync() and dsl_early_sync_task_verify()] - */ -int -dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) -{ - return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, - blocks_modified, space_check, B_TRUE)); -} - -/* - * A standard synctask that can be interrupted from a signal. The sigfunc - * is called once if a signal occurred while waiting for the task to sync. - */ -int -dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) -{ - return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg, - blocks_modified, space_check, B_FALSE)); -} - -static void -dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, - boolean_t early) -{ - dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); - - dst->dst_pool = dp; - dst->dst_txg = dmu_tx_get_txg(tx); - dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; - dst->dst_space_check = space_check; - dst->dst_checkfunc = dsl_null_checkfunc; - dst->dst_syncfunc = syncfunc; - dst->dst_arg = arg; - dst->dst_error = 0; - dst->dst_nowaiter = B_TRUE; - - txg_list_t *task_list = (early) ? 
- &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; - VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg)); -} - -void -dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) -{ - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_FALSE); -} - -void -dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) -{ - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_TRUE); -} - -/* - * Called in syncing context to execute the synctask. - */ -void -dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dst->dst_pool; - - ASSERT0(dst->dst_error); - - /* - * Check for sufficient space. - * - * When the sync task was created, the caller specified the - * type of space checking required. See the comment in - * zfs_space_check_t for details on the semantics of each - * type of space checking. - * - * We just check against what's on-disk; we don't want any - * in-flight accounting to get in our way, because open context - * may have already used up various in-core limits - * (arc_tempreserve, dsl_pool_tempreserve). - */ - if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) { - uint64_t quota = dsl_pool_unreserved_space(dp, - dst->dst_space_check); - uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; - - /* MOS space is triple-dittoed, so we multiply by 3. */ - if (used + dst->dst_space * 3 > quota) { - dst->dst_error = SET_ERROR(ENOSPC); - if (dst->dst_nowaiter) - kmem_free(dst, sizeof (*dst)); - return; - } - } - - /* - * Check for errors by calling checkfunc. 
- */ - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); - if (dst->dst_error == 0) - dst->dst_syncfunc(dst->dst_arg, tx); - rrw_exit(&dp->dp_config_rwlock, FTAG); - if (dst->dst_nowaiter) - kmem_free(dst, sizeof (*dst)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c deleted file mode 100644 index d0274dc4ce39..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c +++ /dev/null @@ -1,667 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef struct dsl_dataset_user_hold_arg { - nvlist_t *dduha_holds; - nvlist_t *dduha_chkholds; - nvlist_t *dduha_errlist; - minor_t dduha_minor; -} dsl_dataset_user_hold_arg_t; - -/* - * If you add new checks here, you may need to add additional checks to the - * "temporary" case in snapshot_check() in dmu_objset.c. - */ -int -dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, - boolean_t temphold, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - int error = 0; - - ASSERT(dsl_pool_config_held(dp)); - - if (strlen(htag) > MAXNAMELEN) - return (SET_ERROR(E2BIG)); - /* Tempholds have a more restricted length */ - if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) - return (SET_ERROR(E2BIG)); - - /* tags must be unique (if ds already exists) */ - if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { - uint64_t value; - - error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, - htag, 8, 1, &value); - if (error == 0) - error = SET_ERROR(EEXIST); - else if (error == ENOENT) - error = 0; - } - - return (error); -} - -static int -dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_hold_arg_t *dduha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - - if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) - return (SET_ERROR(ENOTSUP)); - - if (!dmu_tx_is_syncing(tx)) - return (0); - - for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); - pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { - dsl_dataset_t *ds; - int error = 0; - char *htag, *name; - - /* must be a snapshot */ - name = nvpair_name(pair); - if (strchr(name, '@') == NULL) - error = SET_ERROR(EINVAL); - - if (error == 0) - error = nvpair_value_string(pair, &htag); - - if (error == 0) - error = dsl_dataset_hold(dp, name, FTAG, &ds); - - if (error == 0) { - 
error = dsl_dataset_user_hold_check_one(ds, htag, - dduha->dduha_minor != 0, tx); - dsl_dataset_rele(ds, FTAG); - } - - if (error == 0) { - fnvlist_add_string(dduha->dduha_chkholds, name, htag); - } else { - /* - * We register ENOENT errors so they can be correctly - * reported if needed, such as when all holds fail. - */ - fnvlist_add_int32(dduha->dduha_errlist, name, error); - if (error != ENOENT) - return (error); - } - } - - return (0); -} - - -static void -dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, - const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { - /* - * This is the first user hold for this dataset. Create - * the userrefs zap object. - */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj = - zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); - } else { - zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; - } - ds->ds_userrefs++; - - VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); - - if (minor != 0) { - char name[MAXNAMELEN]; - nvlist_t *tags; - - VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, - htag, now, tx)); - (void) snprintf(name, sizeof (name), "%llx", - (u_longlong_t)ds->ds_object); - - if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { - tags = fnvlist_alloc(); - fnvlist_add_boolean(tags, htag); - fnvlist_add_nvlist(tmpholds, name, tags); - fnvlist_free(tags); - } else { - fnvlist_add_boolean(tags, htag); - } - } - - spa_history_log_internal_ds(ds, "hold", tx, - "tag=%s temp=%d refs=%llu", - htag, minor != 0, ds->ds_userrefs); -} - -typedef struct zfs_hold_cleanup_arg { - char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t zhca_spa_load_guid; - nvlist_t *zhca_holds; -} zfs_hold_cleanup_arg_t; - -static void -dsl_dataset_user_release_onexit(void *arg) -{ 
- zfs_hold_cleanup_arg_t *ca = arg; - spa_t *spa; - int error; - - error = spa_open(ca->zhca_spaname, &spa, FTAG); - if (error != 0) { - zfs_dbgmsg("couldn't release holds on pool=%s " - "because pool is no longer loaded", - ca->zhca_spaname); - return; - } - if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { - zfs_dbgmsg("couldn't release holds on pool=%s " - "because pool is no longer loaded (guid doesn't match)", - ca->zhca_spaname); - spa_close(spa, FTAG); - return; - } - - (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); - fnvlist_free(ca->zhca_holds); - kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); - spa_close(spa, FTAG); -} - -static void -dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) -{ - zfs_hold_cleanup_arg_t *ca; - - if (minor == 0 || nvlist_empty(holds)) { - fnvlist_free(holds); - return; - } - - ASSERT(spa != NULL); - ca = kmem_alloc(sizeof (*ca), KM_SLEEP); - - (void) strlcpy(ca->zhca_spaname, spa_name(spa), - sizeof (ca->zhca_spaname)); - ca->zhca_spa_load_guid = spa_load_guid(spa); - ca->zhca_holds = holds; - VERIFY0(zfs_onexit_add_cb(minor, - dsl_dataset_user_release_onexit, ca, NULL)); -} - -void -dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, - minor_t minor, uint64_t now, dmu_tx_t *tx) -{ - nvlist_t *tmpholds; - - if (minor != 0) - tmpholds = fnvlist_alloc(); - else - tmpholds = NULL; - dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); - dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); -} - -static void -dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_hold_arg_t *dduha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - nvlist_t *tmpholds; - uint64_t now = gethrestime_sec(); - - if (dduha->dduha_minor != 0) - tmpholds = fnvlist_alloc(); - else - tmpholds = NULL; - for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); - pair != NULL; - pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { - 
dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); - dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, - fnvpair_value_string(pair), dduha->dduha_minor, now, tx); - dsl_dataset_rele(ds, FTAG); - } - dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); -} - -/* - * The full semantics of this function are described in the comment above - * lzc_hold(). - * - * To summarize: - * holds is nvl of snapname -> holdname - * errlist will be filled in with snapname -> error - * - * The snaphosts must all be in the same pool. - * - * Holds for snapshots that don't exist will be skipped. - * - * If none of the snapshots for requested holds exist then ENOENT will be - * returned. - * - * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned - * up when the process exits. - * - * On success all the holds, for snapshots that existed, will be created and 0 - * will be returned. - * - * On failure no holds will be created, the errlist will be filled in, - * and an errno will returned. - * - * In all cases the errlist will contain entries for holds where the snapshot - * didn't exist. 
- */ -int -dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) -{ - dsl_dataset_user_hold_arg_t dduha; - nvpair_t *pair; - int ret; - - pair = nvlist_next_nvpair(holds, NULL); - if (pair == NULL) - return (0); - - dduha.dduha_holds = holds; - dduha.dduha_chkholds = fnvlist_alloc(); - dduha.dduha_errlist = errlist; - dduha.dduha_minor = cleanup_minor; - - ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, - dsl_dataset_user_hold_sync, &dduha, - fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED); - fnvlist_free(dduha.dduha_chkholds); - - return (ret); -} - -typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, - dsl_dataset_t **dsp); - -typedef struct dsl_dataset_user_release_arg { - dsl_holdfunc_t *ddura_holdfunc; - nvlist_t *ddura_holds; - nvlist_t *ddura_todelete; - nvlist_t *ddura_errlist; - nvlist_t *ddura_chkholds; -} dsl_dataset_user_release_arg_t; - -/* Place a dataset hold on the snapshot identified by passed dsobj string */ -static int -dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, - dsl_dataset_t **dsp) -{ - return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp)); -} - -static int -dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, - dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) -{ - uint64_t zapobj; - nvlist_t *holds_found; - objset_t *mos; - int numholds; - - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - if (nvlist_empty(holds)) - return (0); - - numholds = 0; - mos = ds->ds_dir->dd_pool->dp_meta_objset; - zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; - holds_found = fnvlist_alloc(); - - for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - uint64_t tmp; - int error; - const char *holdname = nvpair_name(pair); - - if (zapobj != 0) - error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); - else - error = SET_ERROR(ENOENT); - - /* - * 
Non-existent holds are put on the errlist, but don't - * cause an overall failure. - */ - if (error == ENOENT) { - if (ddura->ddura_errlist != NULL) { - char *errtag = kmem_asprintf("%s#%s", - snapname, holdname); - fnvlist_add_int32(ddura->ddura_errlist, errtag, - ENOENT); - strfree(errtag); - } - continue; - } - - if (error != 0) { - fnvlist_free(holds_found); - return (error); - } - - fnvlist_add_boolean(holds_found, holdname); - numholds++; - } - - if (DS_IS_DEFER_DESTROY(ds) && - dsl_dataset_phys(ds)->ds_num_children == 1 && - ds->ds_userrefs == numholds) { - /* we need to destroy the snapshot as well */ - if (dsl_dataset_long_held(ds)) { - fnvlist_free(holds_found); - return (SET_ERROR(EBUSY)); - } - fnvlist_add_boolean(ddura->ddura_todelete, snapname); - } - - if (numholds != 0) { - fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, - holds_found); - } - fnvlist_free(holds_found); - - return (0); -} - -static int -dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_release_arg_t *ddura; - dsl_holdfunc_t *holdfunc; - dsl_pool_t *dp; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - dp = dmu_tx_pool(tx); - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - ddura = arg; - holdfunc = ddura->ddura_holdfunc; - - for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { - int error; - dsl_dataset_t *ds; - nvlist_t *holds; - const char *snapname = nvpair_name(pair); - - error = nvpair_value_nvlist(pair, &holds); - if (error != 0) - error = (SET_ERROR(EINVAL)); - else - error = holdfunc(dp, snapname, FTAG, &ds); - if (error == 0) { - error = dsl_dataset_user_release_check_one(ddura, ds, - holds, snapname); - dsl_dataset_rele(ds, FTAG); - } - if (error != 0) { - if (ddura->ddura_errlist != NULL) { - fnvlist_add_int32(ddura->ddura_errlist, - snapname, error); - } - /* - * Non-existent snapshots are put on the errlist, - * but don't cause an overall failure. 
- */ - if (error != ENOENT) - return (error); - } - } - - return (0); -} - -static void -dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - - for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - int error; - const char *holdname = nvpair_name(pair); - - /* Remove temporary hold if one exists. */ - error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); - VERIFY(error == 0 || error == ENOENT); - - VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, - holdname, tx)); - ds->ds_userrefs--; - - spa_history_log_internal_ds(ds, "release", tx, - "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); - } -} - -static void -dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_release_arg_t *ddura = arg; - dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; - dsl_pool_t *dp = dmu_tx_pool(tx); - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, - pair)) { - dsl_dataset_t *ds; - const char *name = nvpair_name(pair); - - VERIFY0(holdfunc(dp, name, FTAG, &ds)); - - dsl_dataset_user_release_sync_one(ds, - fnvpair_value_nvlist(pair), tx); - if (nvlist_exists(ddura->ddura_todelete, name)) { - ASSERT(ds->ds_userrefs == 0 && - dsl_dataset_phys(ds)->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)); - dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); - } - dsl_dataset_rele(ds, FTAG); - } -} - -/* - * The full semantics of this function are described in the comment above - * lzc_release(). - * - * To summarize: - * Releases holds specified in the nvl holds. - * - * holds is nvl of snapname -> { holdname, ... 
} - * errlist will be filled in with snapname -> error - * - * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, - * otherwise they should be the names of shapshots. - * - * As a release may cause snapshots to be destroyed this trys to ensure they - * aren't mounted. - * - * The release of non-existent holds are skipped. - * - * At least one hold must have been released for the this function to succeed - * and return 0. - */ -static int -dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, - dsl_pool_t *tmpdp) -{ - dsl_dataset_user_release_arg_t ddura; - nvpair_t *pair; - char *pool; - int error; - - pair = nvlist_next_nvpair(holds, NULL); - if (pair == NULL) - return (0); - - /* - * The release may cause snapshots to be destroyed; make sure they - * are not mounted. - */ - if (tmpdp != NULL) { - /* Temporary holds are specified by dsobj string. */ - ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; - pool = spa_name(tmpdp->dp_spa); -#ifdef _KERNEL - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - dsl_dataset_t *ds; - - dsl_pool_config_enter(tmpdp, FTAG); - error = dsl_dataset_hold_obj_string(tmpdp, - nvpair_name(pair), FTAG, &ds); - if (error == 0) { - char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(ds, name); - dsl_pool_config_exit(tmpdp, FTAG); - dsl_dataset_rele(ds, FTAG); - (void) zfs_unmount_snap(name); - } else { - dsl_pool_config_exit(tmpdp, FTAG); - } - } -#endif - } else { - /* Non-temporary holds are specified by name. 
*/ - ddura.ddura_holdfunc = dsl_dataset_hold; - pool = nvpair_name(pair); -#ifdef _KERNEL - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - (void) zfs_unmount_snap(nvpair_name(pair)); - } -#endif - } - - ddura.ddura_holds = holds; - ddura.ddura_errlist = errlist; - ddura.ddura_todelete = fnvlist_alloc(); - ddura.ddura_chkholds = fnvlist_alloc(); - - error = dsl_sync_task(pool, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, &ddura, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED); - fnvlist_free(ddura.ddura_todelete); - fnvlist_free(ddura.ddura_chkholds); - - return (error); -} - -/* - * holds is nvl of snapname -> { holdname, ... } - * errlist will be filled in with snapname -> error - */ -int -dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) -{ - return (dsl_dataset_user_release_impl(holds, errlist, NULL)); -} - -/* - * holds is nvl of snapdsobj -> { holdname, ... } - */ -void -dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) -{ - ASSERT(dp != NULL); - (void) dsl_dataset_user_release_impl(holds, NULL, dp); -} - -int -dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(dsname, FTAG, &dp); - if (err != 0) - return (err); - err = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { - zap_attribute_t *za; - zap_cursor_t zc; - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_userrefs_obj); - zap_cursor_retrieve(&zc, za) == 0; - zap_cursor_advance(&zc)) { - fnvlist_add_uint64(nvl, za->za_name, - za->za_first_integer); - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); - } - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (0); -} diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c deleted file mode 100644 index 9a3430d94668..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ -#include -#include -#include -#include - -#define EDONR_MODE 512 -#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE - -static int -edonr_incremental(void *buf, size_t size, void *arg) -{ - EdonRState *ctx = arg; - EdonRUpdate(ctx, buf, size * 8); - return (0); -} - -/* - * Native zio_checksum interface for the Edon-R hash function. 
- */ -/*ARGSUSED*/ -void -abd_checksum_edonr_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - uint8_t digest[EDONR_MODE / 8]; - EdonRState ctx; - - ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); - (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); - EdonRFinal(&ctx, digest); - bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); -} - -/* - * Byteswapped zio_checksum interface for the Edon-R hash function. - */ -void -abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - zio_cksum_t tmp; - - abd_checksum_edonr_native(abd, size, ctx_template, &tmp); - zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); - zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); - zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); - zcp->zc_word[3] = BSWAP_64(zcp->zc_word[3]); -} - -void * -abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) -{ - EdonRState *ctx; - uint8_t salt_block[EDONR_BLOCK_SIZE]; - - /* - * Edon-R needs all but the last hash invocation to be on full-size - * blocks, but the salt is too small. Rather than simply padding it - * with zeros, we expand the salt into a new salt block of proper - * size by double-hashing it (the new salt block will be composed of - * H(salt) || H(H(salt))). - */ - CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8)); - EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, - salt_block); - EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block + - EDONR_MODE / 8); - - /* - * Feed the new salt block into the hash function - this will serve - * as our MAC key. 
- */ - ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - EdonRInit(ctx, EDONR_MODE); - EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8); - return (ctx); -} - -void -abd_checksum_edonr_tmpl_free(void *ctx_template) -{ - EdonRState *ctx = ctx_template; - - bzero(ctx, sizeof (*ctx)); - kmem_free(ctx, sizeof (*ctx)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c deleted file mode 100644 index b257d4af753c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#include - -#ifdef _KERNEL -#include -#else -#include -#endif - -size_t -gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - size_t dstlen = d_len; - - ASSERT(d_len <= s_len); - - if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) { - if (d_len != s_len) - return (s_len); - - bcopy(s_start, d_start, s_len); - return (s_len); - } - - return (dstlen); -} - -/*ARGSUSED*/ -int -gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - size_t dstlen = d_len; - - ASSERT(d_len >= s_len); - - if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK) - return (-1); - - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs deleted file mode 100644 index 0e22de7a4a18..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs +++ /dev/null @@ -1,80 +0,0 @@ -# -# CDDL HEADER START -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# -# CDDL HEADER END -# - -# -# Copyright (c) 2017 by Delphix. All rights reserved. -# - -Introduction ------------- - -This README describes the Lua interpreter source code that lives in the ZFS -source tree to enable execution of ZFS channel programs, including its -maintenance policy, the modifications that have been made to it, and how it -should (and should not) be used. - -For a description of the Lua language and features exposed by ZFS channel -programs, please refer to the zfs-program(1m) man page instead. 
- - -Maintenance policy ------------------- - -The Lua runtime is considered stable software. Channel programs don't need much -complicated logic, so updates to the Lua runtime from upstream are viewed as -nice-to-have, but not required for channel programs to be well-supported. As -such, the Lua runtime in ZFS should be updated on an as-needed basis for -security vulnerabilities, but not much else. - - -Modifications to Lua --------------------- - -The version of the Lua runtime we're using in ZFS has been modified in a variety -of ways to make it more useful for the specific purpose of running channel -programs. These changes include: - -1. "Normal" Lua uses floating point for all numbers it stores, but those aren't - useful inside ZFS / the kernel. We have changed the runtime to use int64_t - throughout for all numbers. -2. Some of the Lua standard libraries do file I/O or spawn processes, but - neither of these make sense from inside channel programs. We have removed - those libraries rather than reimplementing them using kernel APIs. -3. The "normal" Lua runtime handles errors by failing fatally, but since this - version of Lua runs inside the kernel we must handle these failures and - return meaningful error codes to userland. We have customized the Lua - failure paths so that they aren't fatal. -4. Running poorly-vetted code inside the kernel is always a risk; even if the - ability to do so is restricted to the root user, it's still possible to write - an incorrect program that results in an infinite loop or massive memory use. - We've added new protections into the Lua interpreter to limit the runtime - (measured in number of Lua instructions run) and memory overhead of running - a channel program. -5. The Lua bytecode is not designed to be secure / safe, so it would be easy to - pass invalid bytecode which can panic the kernel. By comparison, the parser - is hardened and fails gracefully on invalid input. 
Therefore, we only accept - Lua source code at the ioctl level and then interpret it inside the kernel. - -Each of these modifications have been tested in the zfs-test suite. If / when -new modifications are made, new tests should be added to the suite located in -zfs-tests/tests/functional/channel_program/lua_core. - - -How to use this Lua interpreter -------------------------------- - -From the above, it should be clear that this is not a general-purpose Lua -interpreter. Additional work would be required to extricate this custom version -of Lua from ZFS and make it usable by other areas of the kernel. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c deleted file mode 100644 index 34820a2d8b44..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c +++ /dev/null @@ -1,1283 +0,0 @@ -/* -** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua API -** See Copyright Notice in lua.h -*/ - - -#include - -#define lapi_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lundump.h" -#include "lvm.h" - - - -const char lua_ident[] = - "$LuaVersion: " LUA_COPYRIGHT " $" - "$LuaAuthors: " LUA_AUTHORS " $"; - - -/* value at a non-valid index */ -#define NONVALIDVALUE cast(TValue *, luaO_nilobject) - -/* corresponding test */ -#define isvalid(o) ((o) != luaO_nilobject) - -/* test for pseudo index */ -#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX) - -/* test for valid but not pseudo index */ -#define isstackindex(i, o) (isvalid(o) && !ispseudo(i)) - -#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index") - -#define api_checkstackindex(L, i, o) \ - api_check(L, isstackindex(i, o), "index not in the stack") - - -static TValue 
*index2addr (lua_State *L, int idx) { - CallInfo *ci = L->ci; - if (idx > 0) { - TValue *o = ci->func + idx; - api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index"); - if (o >= L->top) return NONVALIDVALUE; - else return o; - } - else if (!ispseudo(idx)) { /* negative index */ - api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index"); - return L->top + idx; - } - else if (idx == LUA_REGISTRYINDEX) - return &G(L)->l_registry; - else { /* upvalues */ - idx = LUA_REGISTRYINDEX - idx; - api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large"); - if (ttislcf(ci->func)) /* light C function? */ - return NONVALIDVALUE; /* it has no upvalues */ - else { - CClosure *func = clCvalue(ci->func); - return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE; - } - } -} - - -/* -** to be called by 'lua_checkstack' in protected mode, to grow stack -** capturing memory errors -*/ -static void growstack (lua_State *L, void *ud) { - int size = *(int *)ud; - luaD_growstack(L, size); -} - - -LUA_API int lua_checkstack (lua_State *L, int size) { - int res; - CallInfo *ci = L->ci; - lua_lock(L); - if (L->stack_last - L->top > size) /* stack large enough? */ - res = 1; /* yes; check is OK */ - else { /* no; need to grow stack */ - int inuse = cast_int(L->top - L->stack) + EXTRA_STACK; - if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? 
*/ - res = 0; /* no */ - else /* try to grow stack */ - res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK); - } - if (res && ci->top < L->top + size) - ci->top = L->top + size; /* adjust frame top */ - lua_unlock(L); - return res; -} - - -LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) { - int i; - if (from == to) return; - lua_lock(to); - api_checknelems(from, n); - api_check(from, G(from) == G(to), "moving among independent states"); - api_check(from, to->ci->top - to->top >= n, "not enough elements to move"); - from->top -= n; - for (i = 0; i < n; i++) { - setobj2s(to, to->top++, from->top + i); - } - lua_unlock(to); -} - - -LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) { - lua_CFunction old; - lua_lock(L); - old = G(L)->panic; - G(L)->panic = panicf; - lua_unlock(L); - return old; -} - - -LUA_API const lua_Number *lua_version (lua_State *L) { - static const lua_Number version = LUA_VERSION_NUM; - if (L == NULL) return &version; - else return G(L)->version; -} - - - -/* -** basic stack manipulation -*/ - - -/* -** convert an acceptable stack index into an absolute index -*/ -LUA_API int lua_absindex (lua_State *L, int idx) { - return (idx > 0 || ispseudo(idx)) - ? 
idx - : cast_int(L->top - L->ci->func + idx); -} - - -LUA_API int lua_gettop (lua_State *L) { - return cast_int(L->top - (L->ci->func + 1)); -} - - -LUA_API void lua_settop (lua_State *L, int idx) { - StkId func = L->ci->func; - lua_lock(L); - if (idx >= 0) { - api_check(L, idx <= L->stack_last - (func + 1), "new top too large"); - while (L->top < (func + 1) + idx) - setnilvalue(L->top++); - L->top = (func + 1) + idx; - } - else { - api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top"); - L->top += idx+1; /* `subtract' index (index is negative) */ - } - lua_unlock(L); -} - - -LUA_API void lua_remove (lua_State *L, int idx) { - StkId p; - lua_lock(L); - p = index2addr(L, idx); - api_checkstackindex(L, idx, p); - while (++p < L->top) setobjs2s(L, p-1, p); - L->top--; - lua_unlock(L); -} - - -LUA_API void lua_insert (lua_State *L, int idx) { - StkId p; - StkId q; - lua_lock(L); - p = index2addr(L, idx); - api_checkstackindex(L, idx, p); - for (q = L->top; q > p; q--) /* use L->top as a temporary */ - setobjs2s(L, q, q - 1); - setobjs2s(L, p, L->top); - lua_unlock(L); -} - - -static void moveto (lua_State *L, TValue *fr, int idx) { - TValue *to = index2addr(L, idx); - api_checkvalidindex(L, to); - setobj(L, to, fr); - if (idx < LUA_REGISTRYINDEX) /* function upvalue? 
*/ - luaC_barrier(L, clCvalue(L->ci->func), fr); - /* LUA_REGISTRYINDEX does not need gc barrier - (collector revisits it before finishing collection) */ -} - - -LUA_API void lua_replace (lua_State *L, int idx) { - lua_lock(L); - api_checknelems(L, 1); - moveto(L, L->top - 1, idx); - L->top--; - lua_unlock(L); -} - - -LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) { - TValue *fr; - lua_lock(L); - fr = index2addr(L, fromidx); - moveto(L, fr, toidx); - lua_unlock(L); -} - - -LUA_API void lua_pushvalue (lua_State *L, int idx) { - lua_lock(L); - setobj2s(L, L->top, index2addr(L, idx)); - api_incr_top(L); - lua_unlock(L); -} - - - -/* -** access functions (stack -> C) -*/ - - -LUA_API int lua_type (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - return (isvalid(o) ? ttypenv(o) : LUA_TNONE); -} - - -LUA_API const char *lua_typename (lua_State *L, int t) { - UNUSED(L); - return ttypename(t); -} - - -LUA_API int lua_iscfunction (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - return (ttislcf(o) || (ttisCclosure(o))); -} - - -LUA_API int lua_isnumber (lua_State *L, int idx) { - TValue n; - const TValue *o = index2addr(L, idx); - return tonumber(o, &n); -} - - -LUA_API int lua_isstring (lua_State *L, int idx) { - int t = lua_type(L, idx); - return (t == LUA_TSTRING || t == LUA_TNUMBER); -} - - -LUA_API int lua_isuserdata (lua_State *L, int idx) { - const TValue *o = index2addr(L, idx); - return (ttisuserdata(o) || ttislightuserdata(o)); -} - - -LUA_API int lua_rawequal (lua_State *L, int index1, int index2) { - StkId o1 = index2addr(L, index1); - StkId o2 = index2addr(L, index2); - return (isvalid(o1) && isvalid(o2)) ? 
luaV_rawequalobj(o1, o2) : 0; -} - - -LUA_API void lua_arith (lua_State *L, int op) { - StkId o1; /* 1st operand */ - StkId o2; /* 2nd operand */ - lua_lock(L); - if (op != LUA_OPUNM) /* all other operations expect two operands */ - api_checknelems(L, 2); - else { /* for unary minus, add fake 2nd operand */ - api_checknelems(L, 1); - setobjs2s(L, L->top, L->top - 1); - L->top++; - } - o1 = L->top - 2; - o2 = L->top - 1; - if (ttisnumber(o1) && ttisnumber(o2)) { - setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2))); - } - else - luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD)); - L->top--; - lua_unlock(L); -} - - -LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) { - StkId o1, o2; - int i = 0; - lua_lock(L); /* may call tag method */ - o1 = index2addr(L, index1); - o2 = index2addr(L, index2); - if (isvalid(o1) && isvalid(o2)) { - switch (op) { - case LUA_OPEQ: i = equalobj(L, o1, o2); break; - case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break; - case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break; - default: api_check(L, 0, "invalid option"); - } - } - lua_unlock(L); - return i; -} - - -LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) { - TValue n; - const TValue *o = index2addr(L, idx); - if (tonumber(o, &n)) { - if (isnum) *isnum = 1; - return nvalue(o); - } - else { - if (isnum) *isnum = 0; - return 0; - } -} - - -LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) { - TValue n; - const TValue *o = index2addr(L, idx); - if (tonumber(o, &n)) { - lua_Integer res; - lua_Number num = nvalue(o); - lua_number2integer(res, num); - if (isnum) *isnum = 1; - return res; - } - else { - if (isnum) *isnum = 0; - return 0; - } -} - - -LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) { - TValue n; - const TValue *o = index2addr(L, idx); - if (tonumber(o, &n)) { - lua_Unsigned res; - lua_Number num = nvalue(o); - lua_number2unsigned(res, num); - if (isnum) *isnum = 1; - return 
res; - } - else { - if (isnum) *isnum = 0; - return 0; - } -} - - -LUA_API int lua_toboolean (lua_State *L, int idx) { - const TValue *o = index2addr(L, idx); - return !l_isfalse(o); -} - - -LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) { - StkId o = index2addr(L, idx); - if (!ttisstring(o)) { - lua_lock(L); /* `luaV_tostring' may create a new string */ - if (!luaV_tostring(L, o)) { /* conversion failed? */ - if (len != NULL) *len = 0; - lua_unlock(L); - return NULL; - } - luaC_checkGC(L); - o = index2addr(L, idx); /* previous call may reallocate the stack */ - lua_unlock(L); - } - if (len != NULL) *len = tsvalue(o)->len; - return svalue(o); -} - - -LUA_API size_t lua_rawlen (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - switch (ttypenv(o)) { - case LUA_TSTRING: return tsvalue(o)->len; - case LUA_TUSERDATA: return uvalue(o)->len; - case LUA_TTABLE: return luaH_getn(hvalue(o)); - default: return 0; - } -} - - -LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - if (ttislcf(o)) return fvalue(o); - else if (ttisCclosure(o)) - return clCvalue(o)->f; - else return NULL; /* not a C function */ -} - - -LUA_API void *lua_touserdata (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - switch (ttypenv(o)) { - case LUA_TUSERDATA: return (rawuvalue(o) + 1); - case LUA_TLIGHTUSERDATA: return pvalue(o); - default: return NULL; - } -} - - -LUA_API lua_State *lua_tothread (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - return (!ttisthread(o)) ? 
NULL : thvalue(o); -} - - -LUA_API const void *lua_topointer (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - switch (ttype(o)) { - case LUA_TTABLE: return hvalue(o); - case LUA_TLCL: return clLvalue(o); - case LUA_TCCL: return clCvalue(o); - case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o))); - case LUA_TTHREAD: return thvalue(o); - case LUA_TUSERDATA: - case LUA_TLIGHTUSERDATA: - return lua_touserdata(L, idx); - default: return NULL; - } -} - - - -/* -** push functions (C -> stack) -*/ - - -LUA_API void lua_pushnil (lua_State *L) { - lua_lock(L); - setnilvalue(L->top); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushnumber (lua_State *L, lua_Number n) { - lua_lock(L); - setnvalue(L->top, n); - luai_checknum(L, L->top, - luaG_runerror(L, "C API - attempt to push a signaling NaN")); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) { - lua_lock(L); - setnvalue(L->top, cast_num(n)); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) { - lua_Number n; - lua_lock(L); - n = lua_unsigned2number(u); - setnvalue(L->top, n); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) { - TString *ts; - lua_lock(L); - luaC_checkGC(L); - ts = luaS_newlstr(L, s, len); - setsvalue2s(L, L->top, ts); - api_incr_top(L); - lua_unlock(L); - return getstr(ts); -} - - -LUA_API const char *lua_pushstring (lua_State *L, const char *s) { - if (s == NULL) { - lua_pushnil(L); - return NULL; - } - else { - TString *ts; - lua_lock(L); - luaC_checkGC(L); - ts = luaS_new(L, s); - setsvalue2s(L, L->top, ts); - api_incr_top(L); - lua_unlock(L); - return getstr(ts); - } -} - - -LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt, - va_list argp) { - const char *ret; - lua_lock(L); - luaC_checkGC(L); - ret = luaO_pushvfstring(L, fmt, argp); - lua_unlock(L); - return ret; -} - - 
-LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) { - const char *ret; - va_list argp; - lua_lock(L); - luaC_checkGC(L); - va_start(argp, fmt); - ret = luaO_pushvfstring(L, fmt, argp); - va_end(argp); - lua_unlock(L); - return ret; -} - - -LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) { - lua_lock(L); - if (n == 0) { - setfvalue(L->top, fn); - } - else { - Closure *cl; - api_checknelems(L, n); - api_check(L, n <= MAXUPVAL, "upvalue index too large"); - luaC_checkGC(L); - cl = luaF_newCclosure(L, n); - cl->c.f = fn; - L->top -= n; - while (n--) - setobj2n(L, &cl->c.upvalue[n], L->top + n); - setclCvalue(L, L->top, cl); - } - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushboolean (lua_State *L, int b) { - lua_lock(L); - setbvalue(L->top, (b != 0)); /* ensure that true is 1 */ - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushlightuserdata (lua_State *L, void *p) { - lua_lock(L); - setpvalue(L->top, p); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API int lua_pushthread (lua_State *L) { - lua_lock(L); - setthvalue(L, L->top, L); - api_incr_top(L); - lua_unlock(L); - return (G(L)->mainthread == L); -} - - - -/* -** get functions (Lua -> stack) -*/ - - -LUA_API void lua_getglobal (lua_State *L, const char *var) { - Table *reg = hvalue(&G(L)->l_registry); - const TValue *gt; /* global table */ - lua_lock(L); - gt = luaH_getint(reg, LUA_RIDX_GLOBALS); - setsvalue2s(L, L->top++, luaS_new(L, var)); - luaV_gettable(L, gt, L->top - 1, L->top - 1); - lua_unlock(L); -} - - -LUA_API void lua_gettable (lua_State *L, int idx) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - luaV_gettable(L, t, L->top - 1, L->top - 1); - lua_unlock(L); -} - - -LUA_API void lua_getfield (lua_State *L, int idx, const char *k) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - setsvalue2s(L, L->top, luaS_new(L, k)); - api_incr_top(L); - luaV_gettable(L, t, L->top - 1, L->top - 1); - lua_unlock(L); -} - - 
-LUA_API void lua_rawget (lua_State *L, int idx) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1)); - lua_unlock(L); -} - - -LUA_API void lua_rawgeti (lua_State *L, int idx, int n) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setobj2s(L, L->top, luaH_getint(hvalue(t), n)); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) { - StkId t; - TValue k; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setpvalue(&k, cast(void *, p)); - setobj2s(L, L->top, luaH_get(hvalue(t), &k)); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_createtable (lua_State *L, int narray, int nrec) { - Table *t; - lua_lock(L); - luaC_checkGC(L); - t = luaH_new(L); - sethvalue(L, L->top, t); - api_incr_top(L); - if (narray > 0 || nrec > 0) - luaH_resize(L, t, narray, nrec); - lua_unlock(L); -} - - -LUA_API int lua_getmetatable (lua_State *L, int objindex) { - const TValue *obj; - Table *mt = NULL; - int res; - lua_lock(L); - obj = index2addr(L, objindex); - switch (ttypenv(obj)) { - case LUA_TTABLE: - mt = hvalue(obj)->metatable; - break; - case LUA_TUSERDATA: - mt = uvalue(obj)->metatable; - break; - default: - mt = G(L)->mt[ttypenv(obj)]; - break; - } - if (mt == NULL) - res = 0; - else { - sethvalue(L, L->top, mt); - api_incr_top(L); - res = 1; - } - lua_unlock(L); - return res; -} - - -LUA_API void lua_getuservalue (lua_State *L, int idx) { - StkId o; - lua_lock(L); - o = index2addr(L, idx); - api_check(L, ttisuserdata(o), "userdata expected"); - if (uvalue(o)->env) { - sethvalue(L, L->top, uvalue(o)->env); - } else - setnilvalue(L->top); - api_incr_top(L); - lua_unlock(L); -} - - -/* -** set functions (stack -> Lua) -*/ - - -LUA_API void lua_setglobal (lua_State *L, const char *var) { - Table *reg = 
hvalue(&G(L)->l_registry); - const TValue *gt; /* global table */ - lua_lock(L); - api_checknelems(L, 1); - gt = luaH_getint(reg, LUA_RIDX_GLOBALS); - setsvalue2s(L, L->top++, luaS_new(L, var)); - luaV_settable(L, gt, L->top - 1, L->top - 2); - L->top -= 2; /* pop value and key */ - lua_unlock(L); -} - - -LUA_API void lua_settable (lua_State *L, int idx) { - StkId t; - lua_lock(L); - api_checknelems(L, 2); - t = index2addr(L, idx); - luaV_settable(L, t, L->top - 2, L->top - 1); - L->top -= 2; /* pop index and value */ - lua_unlock(L); -} - - -LUA_API void lua_setfield (lua_State *L, int idx, const char *k) { - StkId t; - lua_lock(L); - api_checknelems(L, 1); - t = index2addr(L, idx); - setsvalue2s(L, L->top++, luaS_new(L, k)); - luaV_settable(L, t, L->top - 1, L->top - 2); - L->top -= 2; /* pop value and key */ - lua_unlock(L); -} - - -LUA_API void lua_rawset (lua_State *L, int idx) { - StkId t; - lua_lock(L); - api_checknelems(L, 2); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1); - invalidateTMcache(hvalue(t)); - luaC_barrierback(L, gcvalue(t), L->top-1); - L->top -= 2; - lua_unlock(L); -} - - -LUA_API void lua_rawseti (lua_State *L, int idx, int n) { - StkId t; - lua_lock(L); - api_checknelems(L, 1); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - luaH_setint(L, hvalue(t), n, L->top - 1); - luaC_barrierback(L, gcvalue(t), L->top-1); - L->top--; - lua_unlock(L); -} - - -LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) { - StkId t; - TValue k; - lua_lock(L); - api_checknelems(L, 1); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setpvalue(&k, cast(void *, p)); - setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1); - luaC_barrierback(L, gcvalue(t), L->top - 1); - L->top--; - lua_unlock(L); -} - - -LUA_API int lua_setmetatable (lua_State *L, int objindex) { - TValue *obj; - Table *mt; - lua_lock(L); - 
api_checknelems(L, 1); - obj = index2addr(L, objindex); - if (ttisnil(L->top - 1)) - mt = NULL; - else { - api_check(L, ttistable(L->top - 1), "table expected"); - mt = hvalue(L->top - 1); - } - switch (ttypenv(obj)) { - case LUA_TTABLE: { - hvalue(obj)->metatable = mt; - if (mt) { - luaC_objbarrierback(L, gcvalue(obj), mt); - luaC_checkfinalizer(L, gcvalue(obj), mt); - } - break; - } - case LUA_TUSERDATA: { - uvalue(obj)->metatable = mt; - if (mt) { - luaC_objbarrier(L, rawuvalue(obj), mt); - luaC_checkfinalizer(L, gcvalue(obj), mt); - } - break; - } - default: { - G(L)->mt[ttypenv(obj)] = mt; - break; - } - } - L->top--; - lua_unlock(L); - return 1; -} - - -LUA_API void lua_setuservalue (lua_State *L, int idx) { - StkId o; - lua_lock(L); - api_checknelems(L, 1); - o = index2addr(L, idx); - api_check(L, ttisuserdata(o), "userdata expected"); - if (ttisnil(L->top - 1)) - uvalue(o)->env = NULL; - else { - api_check(L, ttistable(L->top - 1), "table expected"); - uvalue(o)->env = hvalue(L->top - 1); - luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1)); - } - L->top--; - lua_unlock(L); -} - - -/* -** `load' and `call' functions (run Lua code) -*/ - - -#define checkresults(L,na,nr) \ - api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \ - "results from function overflow current stack size") - - -LUA_API int lua_getctx (lua_State *L, int *ctx) { - if (L->ci->callstatus & CIST_YIELDED) { - if (ctx) *ctx = L->ci->u.c.ctx; - return L->ci->u.c.status; - } - else return LUA_OK; -} - - -LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx, - lua_CFunction k) { - StkId func; - lua_lock(L); - api_check(L, k == NULL || !isLua(L->ci), - "cannot use continuations inside hooks"); - api_checknelems(L, nargs+1); - api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); - checkresults(L, nargs, nresults); - func = L->top - (nargs+1); - if (k != NULL && L->nny == 0) { /* need to prepare continuation? 
*/ - L->ci->u.c.k = k; /* save continuation */ - L->ci->u.c.ctx = ctx; /* save context */ - luaD_call(L, func, nresults, 1); /* do the call */ - } - else /* no continuation or no yieldable */ - luaD_call(L, func, nresults, 0); /* just do the call */ - adjustresults(L, nresults); - lua_unlock(L); -} - - - -/* -** Execute a protected call. -*/ -struct CallS { /* data to `f_call' */ - StkId func; - int nresults; -}; - - -static void f_call (lua_State *L, void *ud) { - struct CallS *c = cast(struct CallS *, ud); - luaD_call(L, c->func, c->nresults, 0); -} - - - -LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc, - int ctx, lua_CFunction k) { - struct CallS c; - int status; - ptrdiff_t func; - lua_lock(L); - api_check(L, k == NULL || !isLua(L->ci), - "cannot use continuations inside hooks"); - api_checknelems(L, nargs+1); - api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); - checkresults(L, nargs, nresults); - if (errfunc == 0) - func = 0; - else { - StkId o = index2addr(L, errfunc); - api_checkstackindex(L, errfunc, o); - func = savestack(L, o); - } - c.func = L->top - (nargs+1); /* function to be called */ - if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? 
*/ - c.nresults = nresults; /* do a 'conventional' protected call */ - status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func); - } - else { /* prepare continuation (call is already protected by 'resume') */ - CallInfo *ci = L->ci; - ci->u.c.k = k; /* save continuation */ - ci->u.c.ctx = ctx; /* save context */ - /* save information for error recovery */ - ci->extra = savestack(L, c.func); - ci->u.c.old_allowhook = L->allowhook; - ci->u.c.old_errfunc = L->errfunc; - L->errfunc = func; - /* mark that function may do error recovery */ - ci->callstatus |= CIST_YPCALL; - luaD_call(L, c.func, nresults, 1); /* do the call */ - ci->callstatus &= ~CIST_YPCALL; - L->errfunc = ci->u.c.old_errfunc; - status = LUA_OK; /* if it is here, there were no errors */ - } - adjustresults(L, nresults); - lua_unlock(L); - return status; -} - - -LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data, - const char *chunkname, const char *mode) { - ZIO z; - int status; - lua_lock(L); - if (!chunkname) chunkname = "?"; - luaZ_init(L, &z, reader, data); - status = luaD_protectedparser(L, &z, chunkname, mode); - if (status == LUA_OK) { /* no errors? */ - LClosure *f = clLvalue(L->top - 1); /* get newly created function */ - if (f->nupvalues == 1) { /* does it have one upvalue? 
*/ - /* get global table from registry */ - Table *reg = hvalue(&G(L)->l_registry); - const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS); - /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */ - setobj(L, f->upvals[0]->v, gt); - luaC_barrier(L, f->upvals[0], gt); - } - } - lua_unlock(L); - return status; -} - - -LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) { - int status; - TValue *o; - lua_lock(L); - api_checknelems(L, 1); - o = L->top - 1; - if (isLfunction(o)) - status = luaU_dump(L, getproto(o), writer, data, 0); - else - status = 1; - lua_unlock(L); - return status; -} - - -LUA_API int lua_status (lua_State *L) { - return L->status; -} - - -/* -** Garbage-collection function -*/ - -LUA_API int lua_gc (lua_State *L, int what, int data) { - int res = 0; - global_State *g; - lua_lock(L); - g = G(L); - switch (what) { - case LUA_GCSTOP: { - g->gcrunning = 0; - break; - } - case LUA_GCRESTART: { - luaE_setdebt(g, 0); - g->gcrunning = 1; - break; - } - case LUA_GCCOLLECT: { - luaC_fullgc(L, 0); - break; - } - case LUA_GCCOUNT: { - /* GC values are expressed in Kbytes: #bytes/2^10 */ - res = cast_int(gettotalbytes(g) >> 10); - break; - } - case LUA_GCCOUNTB: { - res = cast_int(gettotalbytes(g) & 0x3ff); - break; - } - case LUA_GCSTEP: { - if (g->gckind == KGC_GEN) { /* generational mode? */ - res = (g->GCestimate == 0); /* true if it will do major collection */ - luaC_forcestep(L); /* do a single step */ - } - else { - lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE; - if (g->gcrunning) - debt += g->GCdebt; /* include current debt */ - luaE_setdebt(g, debt); - luaC_forcestep(L); - if (g->gcstate == GCSpause) /* end of cycle? 
*/ - res = 1; /* signal it */ - } - break; - } - case LUA_GCSETPAUSE: { - res = g->gcpause; - g->gcpause = data; - break; - } - case LUA_GCSETMAJORINC: { - res = g->gcmajorinc; - g->gcmajorinc = data; - break; - } - case LUA_GCSETSTEPMUL: { - res = g->gcstepmul; - g->gcstepmul = data; - break; - } - case LUA_GCISRUNNING: { - res = g->gcrunning; - break; - } - case LUA_GCGEN: { /* change collector to generational mode */ - luaC_changemode(L, KGC_GEN); - break; - } - case LUA_GCINC: { /* change collector to incremental mode */ - luaC_changemode(L, KGC_NORMAL); - break; - } - default: res = -1; /* invalid option */ - } - lua_unlock(L); - return res; -} - - - -/* -** miscellaneous functions -*/ - - -LUA_API int lua_error (lua_State *L) { - lua_lock(L); - api_checknelems(L, 1); - luaG_errormsg(L); - /* code unreachable; will unlock when control actually leaves the kernel */ - return 0; /* to avoid warnings */ -} - - -LUA_API int lua_next (lua_State *L, int idx) { - StkId t; - int more; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - more = luaH_next(L, hvalue(t), L->top - 1); - if (more) { - api_incr_top(L); - } - else /* no more elements */ - L->top -= 1; /* remove key */ - lua_unlock(L); - return more; -} - - -LUA_API void lua_concat (lua_State *L, int n) { - lua_lock(L); - api_checknelems(L, n); - if (n >= 2) { - luaC_checkGC(L); - luaV_concat(L, n); - } - else if (n == 0) { /* push empty string */ - setsvalue2s(L, L->top, luaS_newlstr(L, "", 0)); - api_incr_top(L); - } - /* else n == 1; nothing to do */ - lua_unlock(L); -} - - -LUA_API void lua_len (lua_State *L, int idx) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - luaV_objlen(L, L->top, t); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) { - lua_Alloc f; - lua_lock(L); - if (ud) *ud = G(L)->ud; - f = G(L)->frealloc; - lua_unlock(L); - return f; -} - - -LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, 
void *ud) { - lua_lock(L); - G(L)->ud = ud; - G(L)->frealloc = f; - lua_unlock(L); -} - - -LUA_API void *lua_newuserdata (lua_State *L, size_t size) { - Udata *u; - lua_lock(L); - luaC_checkGC(L); - u = luaS_newudata(L, size, NULL); - setuvalue(L, L->top, u); - api_incr_top(L); - lua_unlock(L); - return u + 1; -} - - - -static const char *aux_upvalue (StkId fi, int n, TValue **val, - GCObject **owner) { - switch (ttype(fi)) { - case LUA_TCCL: { /* C closure */ - CClosure *f = clCvalue(fi); - if (!(1 <= n && n <= f->nupvalues)) return NULL; - *val = &f->upvalue[n-1]; - if (owner) *owner = obj2gco(f); - return ""; - } - case LUA_TLCL: { /* Lua closure */ - LClosure *f = clLvalue(fi); - TString *name; - Proto *p = f->p; - if (!(1 <= n && n <= p->sizeupvalues)) return NULL; - *val = f->upvals[n-1]->v; - if (owner) *owner = obj2gco(f->upvals[n - 1]); - name = p->upvalues[n-1].name; - return (name == NULL) ? "" : getstr(name); - } - default: return NULL; /* not a closure */ - } -} - - -LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) { - const char *name; - TValue *val = NULL; /* to avoid warnings */ - lua_lock(L); - name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL); - if (name) { - setobj2s(L, L->top, val); - api_incr_top(L); - } - lua_unlock(L); - return name; -} - - -LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) { - const char *name; - TValue *val = NULL; /* to avoid warnings */ - GCObject *owner = NULL; /* to avoid warnings */ - StkId fi; - lua_lock(L); - fi = index2addr(L, funcindex); - api_checknelems(L, 1); - name = aux_upvalue(fi, n, &val, &owner); - if (name) { - L->top--; - setobj(L, val, L->top); - luaC_barrier(L, owner, L->top); - } - lua_unlock(L); - return name; -} - - -static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) { - LClosure *f; - StkId fi = index2addr(L, fidx); - api_check(L, ttisLclosure(fi), "Lua function expected"); - f = clLvalue(fi); - api_check(L, (1 <= n && n 
<= f->p->sizeupvalues), "invalid upvalue index"); - if (pf) *pf = f; - return &f->upvals[n - 1]; /* get its upvalue pointer */ -} - - -LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) { - StkId fi = index2addr(L, fidx); - switch (ttype(fi)) { - case LUA_TLCL: { /* lua closure */ - return *getupvalref(L, fidx, n, NULL); - } - case LUA_TCCL: { /* C closure */ - CClosure *f = clCvalue(fi); - api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index"); - return &f->upvalue[n - 1]; - } - default: { - api_check(L, 0, "closure expected"); - return NULL; - } - } -} - - -LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1, - int fidx2, int n2) { - LClosure *f1; - UpVal **up1 = getupvalref(L, fidx1, n1, &f1); - UpVal **up2 = getupvalref(L, fidx2, n2, NULL); - *up1 = *up2; - luaC_objbarrier(L, f1, *up2); -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h deleted file mode 100644 index c7d34ad84866..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h +++ /dev/null @@ -1,24 +0,0 @@ -/* -** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions from Lua API -** See Copyright Notice in lua.h -*/ - -#ifndef lapi_h -#define lapi_h - - -#include "llimits.h" -#include "lstate.h" - -#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \ - "stack overflow");} - -#define adjustresults(L,nres) \ - { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; } - -#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \ - "not enough elements in the stack") - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c deleted file mode 100644 index 4bd13788b459..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c +++ /dev/null @@ -1,791 +0,0 @@ -/* -** $Id: lauxlib.c,v 
1.248.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions for building Lua libraries -** See Copyright Notice in lua.h -*/ - - -#include - -/* This file uses only the official API of Lua. -** Any function declared here could be written as an application function. -*/ - -#define lauxlib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" - - -/* -** {====================================================== -** Traceback -** ======================================================= -*/ - - -#define LEVELS1 12 /* size of the first part of the stack */ -#define LEVELS2 10 /* size of the second part of the stack */ - - - -/* -** search for 'objidx' in table at index -1. -** return 1 + string at top if find a good name. -*/ -static int findfield (lua_State *L, int objidx, int level) { - if (level == 0 || !lua_istable(L, -1)) - return 0; /* not found */ - lua_pushnil(L); /* start 'next' loop */ - while (lua_next(L, -2)) { /* for each pair in table */ - if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */ - if (lua_rawequal(L, objidx, -1)) { /* found object? */ - lua_pop(L, 1); /* remove value (but keep name) */ - return 1; - } - else if (findfield(L, objidx, level - 1)) { /* try recursively */ - lua_remove(L, -2); /* remove table (but keep name) */ - lua_pushliteral(L, "."); - lua_insert(L, -2); /* place '.' 
between the two names */ - lua_concat(L, 3); - return 1; - } - } - lua_pop(L, 1); /* remove value */ - } - return 0; /* not found */ -} - - -static int pushglobalfuncname (lua_State *L, lua_Debug *ar) { - int top = lua_gettop(L); - lua_getinfo(L, "f", ar); /* push function */ - lua_pushglobaltable(L); - if (findfield(L, top + 1, 2)) { - lua_copy(L, -1, top + 1); /* move name to proper place */ - lua_pop(L, 2); /* remove pushed values */ - return 1; - } - else { - lua_settop(L, top); /* remove function and global table */ - return 0; - } -} - - -static void pushfuncname (lua_State *L, lua_Debug *ar) { - if (*ar->namewhat != '\0') /* is there a name? */ - lua_pushfstring(L, "function " LUA_QS, ar->name); - else if (*ar->what == 'm') /* main? */ - lua_pushliteral(L, "main chunk"); - else if (*ar->what == 'C') { - if (pushglobalfuncname(L, ar)) { - lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1)); - lua_remove(L, -2); /* remove name */ - } - else - lua_pushliteral(L, "?"); - } - else - lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined); -} - - -static int countlevels (lua_State *L) { - lua_Debug ar; - int li = 1, le = 1; - /* find an upper bound */ - while (lua_getstack(L, le, &ar)) { li = le; le *= 2; } - /* do a binary search */ - while (li < le) { - int m = (li + le)/2; - if (lua_getstack(L, m, &ar)) li = m + 1; - else le = m; - } - return le - 1; -} - - -LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1, - const char *msg, int level) { - lua_Debug ar; - int top = lua_gettop(L); - int numlevels = countlevels(L1); - int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0; - if (msg) lua_pushfstring(L, "%s\n", msg); - lua_pushliteral(L, "stack traceback:"); - while (lua_getstack(L1, level++, &ar)) { - if (level == mark) { /* too many levels? */ - lua_pushliteral(L, "\n\t..."); /* add a '...' 
*/ - level = numlevels - LEVELS2; /* and skip to last ones */ - } - else { - lua_getinfo(L1, "Slnt", &ar); - lua_pushfstring(L, "\n\t%s:", ar.short_src); - if (ar.currentline > 0) - lua_pushfstring(L, "%d:", ar.currentline); - lua_pushliteral(L, " in "); - pushfuncname(L, &ar); - if (ar.istailcall) - lua_pushliteral(L, "\n\t(...tail calls...)"); - lua_concat(L, lua_gettop(L) - top); - } - } - lua_concat(L, lua_gettop(L) - top); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Error-report functions -** ======================================================= -*/ - -LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) { - lua_Debug ar; - if (!lua_getstack(L, 0, &ar)) /* no stack frame? */ - return luaL_error(L, "bad argument #%d (%s)", narg, extramsg); - lua_getinfo(L, "n", &ar); - if (strcmp(ar.namewhat, "method") == 0) { - narg--; /* do not count `self' */ - if (narg == 0) /* error is in the self argument itself? */ - return luaL_error(L, "calling " LUA_QS " on bad self (%s)", - ar.name, extramsg); - } - if (ar.name == NULL) - ar.name = (pushglobalfuncname(L, &ar)) ? lua_tostring(L, -1) : "?"; - return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)", - narg, ar.name, extramsg); -} - - -static int typeerror (lua_State *L, int narg, const char *tname) { - const char *msg = lua_pushfstring(L, "%s expected, got %s", - tname, luaL_typename(L, narg)); - return luaL_argerror(L, narg, msg); -} - - -static void tag_error (lua_State *L, int narg, int tag) { - typeerror(L, narg, lua_typename(L, tag)); -} - - -LUALIB_API void luaL_where (lua_State *L, int level) { - lua_Debug ar; - if (lua_getstack(L, level, &ar)) { /* check function at level */ - lua_getinfo(L, "Sl", &ar); /* get info about it */ - if (ar.currentline > 0) { /* is there info? 
*/ - lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline); - return; - } - } - lua_pushliteral(L, ""); /* else, no information available... */ -} - - -LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) { - va_list argp; - va_start(argp, fmt); - luaL_where(L, 1); - lua_pushvfstring(L, fmt, argp); - va_end(argp); - lua_concat(L, 2); - return lua_error(L); -} - - -#if !defined(inspectstat) /* { */ - -#if defined(LUA_USE_POSIX) - -#include - -/* -** use appropriate macros to interpret 'pclose' return status -*/ -#define inspectstat(stat,what) \ - if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \ - else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; } - -#else - -#define inspectstat(stat,what) /* no op */ - -#endif - -#endif /* } */ - - -/* }====================================================== */ - - -/* -** {====================================================== -** Userdata's metatable manipulation -** ======================================================= -*/ - -LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) { - luaL_getmetatable(L, tname); /* try to get metatable */ - if (!lua_isnil(L, -1)) /* name already in use? */ - return 0; /* leave previous value on top, but return 0 */ - lua_pop(L, 1); - lua_newtable(L); /* create metatable */ - lua_pushvalue(L, -1); - lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */ - return 1; -} - - -LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) { - luaL_getmetatable(L, tname); - lua_setmetatable(L, -2); -} - - -LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) { - void *p = lua_touserdata(L, ud); - if (p != NULL) { /* value is a userdata? */ - if (lua_getmetatable(L, ud)) { /* does it have a metatable? */ - luaL_getmetatable(L, tname); /* get correct metatable */ - if (!lua_rawequal(L, -1, -2)) /* not the same? 
*/ - p = NULL; /* value is a userdata with wrong metatable */ - lua_pop(L, 2); /* remove both metatables */ - return p; - } - } - return NULL; /* value is not a userdata with a metatable */ -} - - -LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) { - void *p = luaL_testudata(L, ud, tname); - if (p == NULL) typeerror(L, ud, tname); - return p; -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Argument check functions -** ======================================================= -*/ - -LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def, - const char *const lst[]) { - const char *name = (def) ? luaL_optstring(L, narg, def) : - luaL_checkstring(L, narg); - int i; - for (i=0; lst[i]; i++) - if (strcmp(lst[i], name) == 0) - return i; - return luaL_argerror(L, narg, - lua_pushfstring(L, "invalid option " LUA_QS, name)); -} - - -LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) { - /* keep some extra space to run error routines, if needed */ - const int extra = LUA_MINSTACK; - if (!lua_checkstack(L, space + extra)) { - if (msg) - luaL_error(L, "stack overflow (%s)", msg); - else - luaL_error(L, "stack overflow"); - } -} - - -LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) { - if (lua_type(L, narg) != t) - tag_error(L, narg, t); -} - - -LUALIB_API void luaL_checkany (lua_State *L, int narg) { - if (lua_type(L, narg) == LUA_TNONE) - luaL_argerror(L, narg, "value expected"); -} - - -LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) { - const char *s = lua_tolstring(L, narg, len); - if (!s) tag_error(L, narg, LUA_TSTRING); - return s; -} - - -LUALIB_API const char *luaL_optlstring (lua_State *L, int narg, - const char *def, size_t *len) { - if (lua_isnoneornil(L, narg)) { - if (len) - *len = (def ? 
strlen(def) : 0); - return def; - } - else return luaL_checklstring(L, narg, len); -} - - -LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) { - int isnum; - lua_Number d = lua_tonumberx(L, narg, &isnum); - if (!isnum) - tag_error(L, narg, LUA_TNUMBER); - return d; -} - - -LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) { - return luaL_opt(L, luaL_checknumber, narg, def); -} - - -LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) { - int isnum; - lua_Integer d = lua_tointegerx(L, narg, &isnum); - if (!isnum) - tag_error(L, narg, LUA_TNUMBER); - return d; -} - - -LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) { - int isnum; - lua_Unsigned d = lua_tounsignedx(L, narg, &isnum); - if (!isnum) - tag_error(L, narg, LUA_TNUMBER); - return d; -} - - -LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg, - lua_Integer def) { - return luaL_opt(L, luaL_checkinteger, narg, def); -} - - -LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg, - lua_Unsigned def) { - return luaL_opt(L, luaL_checkunsigned, narg, def); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Generic Buffer manipulation -** ======================================================= -*/ - -/* -** check whether buffer is using a userdata on the stack as a temporary -** buffer -*/ -#define buffonstack(B) ((B)->b != (B)->initb) - - -/* -** returns a pointer to a free area with at least 'sz' bytes -*/ -LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) { - lua_State *L = B->L; - if (B->size - B->n < sz) { /* not enough space? */ - char *newbuff; - size_t newsize = B->size * 2; /* double buffer size */ - if (newsize - B->n < sz) /* not big enough? 
*/ - newsize = B->n + sz; - if (newsize < B->n || newsize - B->n < sz) - luaL_error(L, "buffer too large"); - /* create larger buffer */ - newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char)); - /* move content to new buffer */ - memcpy(newbuff, B->b, B->n * sizeof(char)); - if (buffonstack(B)) - lua_remove(L, -2); /* remove old buffer */ - B->b = newbuff; - B->size = newsize; - } - return &B->b[B->n]; -} - - -LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) { - char *b = luaL_prepbuffsize(B, l); - memcpy(b, s, l * sizeof(char)); - luaL_addsize(B, l); -} - - -LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) { - luaL_addlstring(B, s, strlen(s)); -} - - -LUALIB_API void luaL_pushresult (luaL_Buffer *B) { - lua_State *L = B->L; - lua_pushlstring(L, B->b, B->n); - if (buffonstack(B)) - lua_remove(L, -2); /* remove old buffer */ -} - - -LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) { - luaL_addsize(B, sz); - luaL_pushresult(B); -} - - -LUALIB_API void luaL_addvalue (luaL_Buffer *B) { - lua_State *L = B->L; - size_t l; - const char *s = lua_tolstring(L, -1, &l); - if (buffonstack(B)) - lua_insert(L, -2); /* put value below buffer */ - luaL_addlstring(B, s, l); - lua_remove(L, (buffonstack(B)) ? 
-2 : -1); /* remove value */ -} - - -LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) { - B->L = L; - B->b = B->initb; - B->n = 0; - B->size = LUAL_BUFFERSIZE; -} - - -LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) { - luaL_buffinit(L, B); - return luaL_prepbuffsize(B, sz); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Reference system -** ======================================================= -*/ - -/* index of free-list header */ -#define freelist 0 - - -LUALIB_API int luaL_ref (lua_State *L, int t) { - int ref; - if (lua_isnil(L, -1)) { - lua_pop(L, 1); /* remove from stack */ - return LUA_REFNIL; /* `nil' has a unique fixed reference */ - } - t = lua_absindex(L, t); - lua_rawgeti(L, t, freelist); /* get first free element */ - ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */ - lua_pop(L, 1); /* remove it from stack */ - if (ref != 0) { /* any free element? 
*/ - lua_rawgeti(L, t, ref); /* remove it from list */ - lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */ - } - else /* no free elements */ - ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */ - lua_rawseti(L, t, ref); - return ref; -} - - -LUALIB_API void luaL_unref (lua_State *L, int t, int ref) { - if (ref >= 0) { - t = lua_absindex(L, t); - lua_rawgeti(L, t, freelist); - lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */ - lua_pushinteger(L, ref); - lua_rawseti(L, t, freelist); /* t[freelist] = ref */ - } -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Load functions -** ======================================================= -*/ - -typedef struct LoadS { - const char *s; - size_t size; -} LoadS; - - -static const char *getS (lua_State *L, void *ud, size_t *size) { - LoadS *ls = (LoadS *)ud; - (void)L; /* not used */ - if (ls->size == 0) return NULL; - *size = ls->size; - ls->size = 0; - return ls->s; -} - - -LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size, - const char *name, const char *mode) { - LoadS ls; - ls.s = buff; - ls.size = size; - return lua_load(L, getS, &ls, name, mode); -} - - -LUALIB_API int luaL_loadstring (lua_State *L, const char *s) { - return luaL_loadbuffer(L, s, strlen(s), s); -} - -/* }====================================================== */ - - - -LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) { - if (!lua_getmetatable(L, obj)) /* no metatable? */ - return 0; - lua_pushstring(L, event); - lua_rawget(L, -2); - if (lua_isnil(L, -1)) { - lua_pop(L, 2); /* remove metatable and metafield */ - return 0; - } - else { - lua_remove(L, -2); /* remove only metatable */ - return 1; - } -} - - -LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) { - obj = lua_absindex(L, obj); - if (!luaL_getmetafield(L, obj, event)) /* no metafield? 
*/ - return 0; - lua_pushvalue(L, obj); - lua_call(L, 1, 1); - return 1; -} - - -LUALIB_API int luaL_len (lua_State *L, int idx) { - int l; - int isnum; - lua_len(L, idx); - l = (int)lua_tointegerx(L, -1, &isnum); - if (!isnum) - luaL_error(L, "object length is not a number"); - lua_pop(L, 1); /* remove object */ - return l; -} - - -LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) { - if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */ - switch (lua_type(L, idx)) { - case LUA_TNUMBER: - case LUA_TSTRING: - lua_pushvalue(L, idx); - break; - case LUA_TBOOLEAN: - lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false")); - break; - case LUA_TNIL: - lua_pushliteral(L, "nil"); - break; - default: - lua_pushfstring(L, "%s: %p", luaL_typename(L, idx), - lua_topointer(L, idx)); - break; - } - } - return lua_tolstring(L, -1, len); -} - - -/* -** {====================================================== -** Compatibility with 5.1 module functions -** ======================================================= -*/ -#if defined(LUA_COMPAT_MODULE) - -static const char *luaL_findtable (lua_State *L, int idx, - const char *fname, int szhint) { - const char *e; - if (idx) lua_pushvalue(L, idx); - do { - e = strchr(fname, '.'); - if (e == NULL) e = fname + strlen(fname); - lua_pushlstring(L, fname, e - fname); - lua_rawget(L, -2); - if (lua_isnil(L, -1)) { /* no such field? */ - lua_pop(L, 1); /* remove this nil */ - lua_createtable(L, 0, (*e == '.' ? 1 : szhint)); /* new table for field */ - lua_pushlstring(L, fname, e - fname); - lua_pushvalue(L, -2); - lua_settable(L, -4); /* set new table into field */ - } - else if (!lua_istable(L, -1)) { /* field has a non-table value? */ - lua_pop(L, 2); /* remove table and value */ - return fname; /* return problematic part of the name */ - } - lua_remove(L, -2); /* remove previous table */ - fname = e + 1; - } while (*e == '.'); - return NULL; -} - - -/* -** Count number of elements in a luaL_Reg list. 
-*/ -static int libsize (const luaL_Reg *l) { - int size = 0; - for (; l && l->name; l++) size++; - return size; -} - - -/* -** Find or create a module table with a given name. The function -** first looks at the _LOADED table and, if that fails, try a -** global variable with that name. In any case, leaves on the stack -** the module table. -*/ -LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname, - int sizehint) { - luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */ - lua_getfield(L, -1, modname); /* get _LOADED[modname] */ - if (!lua_istable(L, -1)) { /* not found? */ - lua_pop(L, 1); /* remove previous result */ - /* try global variable (and create one if it does not exist) */ - lua_pushglobaltable(L); - if (luaL_findtable(L, 0, modname, sizehint) != NULL) - luaL_error(L, "name conflict for module " LUA_QS, modname); - lua_pushvalue(L, -1); - lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */ - } - lua_remove(L, -2); /* remove _LOADED table */ -} - - -LUALIB_API void luaL_openlib (lua_State *L, const char *libname, - const luaL_Reg *l, int nup) { - luaL_checkversion(L); - if (libname) { - luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */ - lua_insert(L, -(nup + 1)); /* move library table to below upvalues */ - } - if (l) - luaL_setfuncs(L, l, nup); - else - lua_pop(L, nup); /* remove upvalues */ -} - -#endif -/* }====================================================== */ - -/* -** set functions from list 'l' into table at top - 'nup'; each -** function gets the 'nup' elements at the top as upvalues. -** Returns with only the table at the stack. 
-*/ -LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { - luaL_checkversion(L); - luaL_checkstack(L, nup, "too many upvalues"); - for (; l->name != NULL; l++) { /* fill the table with given functions */ - int i; - for (i = 0; i < nup; i++) /* copy upvalues to the top */ - lua_pushvalue(L, -nup); - lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ - lua_setfield(L, -(nup + 2), l->name); - } - lua_pop(L, nup); /* remove upvalues */ -} - - -/* -** ensure that stack[idx][fname] has a table and push that table -** into the stack -*/ -LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) { - lua_getfield(L, idx, fname); - if (lua_istable(L, -1)) return 1; /* table already there */ - else { - lua_pop(L, 1); /* remove previous result */ - idx = lua_absindex(L, idx); - lua_newtable(L); - lua_pushvalue(L, -1); /* copy to be left at top */ - lua_setfield(L, idx, fname); /* assign new table to field */ - return 0; /* false, because did not find table there */ - } -} - - -/* -** stripped-down 'require'. Calls 'openf' to open a module, -** registers the result in 'package.loaded' table and, if 'glb' -** is true, also registers the result in the global table. -** Leaves resulting module on the top. 
-*/ -LUALIB_API void luaL_requiref (lua_State *L, const char *modname, - lua_CFunction openf, int glb) { - lua_pushcfunction(L, openf); - lua_pushstring(L, modname); /* argument to open function */ - lua_call(L, 1, 1); /* open module */ - luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED"); - lua_pushvalue(L, -2); /* make copy of module (call result) */ - lua_setfield(L, -2, modname); /* _LOADED[modname] = module */ - lua_pop(L, 1); /* remove _LOADED table */ - if (glb) { - lua_pushvalue(L, -1); /* copy of 'mod' */ - lua_setglobal(L, modname); /* _G[modname] = module */ - } -} - - -LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p, - const char *r) { - const char *wild; - size_t l = strlen(p); - luaL_Buffer b; - luaL_buffinit(L, &b); - while ((wild = strstr(s, p)) != NULL) { - luaL_addlstring(&b, s, wild - s); /* push prefix */ - luaL_addstring(&b, r); /* push replacement in place of pattern */ - s = wild + l; /* continue after `p' */ - } - luaL_addstring(&b, s); /* push last suffix */ - luaL_pushresult(&b); - return lua_tostring(L, -1); -} - - -LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) { - const lua_Number *v = lua_version(L); - if (v != lua_version(NULL)) - luaL_error(L, "multiple Lua VMs detected"); - else if (*v != ver) - luaL_error(L, "version mismatch: app. 
needs %f, Lua core provides %f", - ver, *v); - /* check conversions number -> integer types */ - lua_pushnumber(L, -(lua_Number)0x1234); - if (lua_tointeger(L, -1) != -0x1234 || - lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234) - luaL_error(L, "bad conversion number->int;" - " must recompile Lua with proper settings"); - lua_pop(L, 1); -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h deleted file mode 100644 index f6fdac14f50b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h +++ /dev/null @@ -1,176 +0,0 @@ -/* -** $Id: lauxlib.h,v 1.120.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions for building Lua libraries -** See Copyright Notice in lua.h -*/ - - -#ifndef lauxlib_h -#define lauxlib_h - - -#include - -#include "lua.h" - - - -/* extra error code for `luaL_load' */ -#define LUA_ERRFILE (LUA_ERRERR+1) - - -typedef struct luaL_Reg { - const char *name; - lua_CFunction func; -} luaL_Reg; - - -LUALIB_API void (luaL_checkversion_) (lua_State *L, lua_Number ver); -#define luaL_checkversion(L) luaL_checkversion_(L, LUA_VERSION_NUM) - -LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e); -LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e); -LUALIB_API const char *(luaL_tolstring) (lua_State *L, int idx, size_t *len); -LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg); -LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg, - size_t *l); -LUALIB_API const char *(luaL_optlstring) (lua_State *L, int numArg, - const char *def, size_t *l); -LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg); -LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def); - -LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg); -LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg, - lua_Integer def); 
-LUALIB_API lua_Unsigned (luaL_checkunsigned) (lua_State *L, int numArg); -LUALIB_API lua_Unsigned (luaL_optunsigned) (lua_State *L, int numArg, - lua_Unsigned def); - -LUALIB_API void (luaL_checkstack) (lua_State *L, int sz, const char *msg); -LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t); -LUALIB_API void (luaL_checkany) (lua_State *L, int narg); - -LUALIB_API int (luaL_newmetatable) (lua_State *L, const char *tname); -LUALIB_API void (luaL_setmetatable) (lua_State *L, const char *tname); -LUALIB_API void *(luaL_testudata) (lua_State *L, int ud, const char *tname); -LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname); - -LUALIB_API void (luaL_where) (lua_State *L, int lvl); -LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...); - -LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def, - const char *const lst[]); - -/* pre-defined references */ -#define LUA_NOREF (-2) -#define LUA_REFNIL (-1) - -LUALIB_API int (luaL_ref) (lua_State *L, int t); -LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref); - -LUALIB_API int (luaL_loadbufferx) (lua_State *L, const char *buff, size_t sz, - const char *name, const char *mode); -LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s); - -LUALIB_API int (luaL_len) (lua_State *L, int idx); - -LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p, - const char *r); - -LUALIB_API void (luaL_setfuncs) (lua_State *L, const luaL_Reg *l, int nup); - -LUALIB_API int (luaL_getsubtable) (lua_State *L, int idx, const char *fname); - -LUALIB_API void (luaL_traceback) (lua_State *L, lua_State *L1, - const char *msg, int level); - -LUALIB_API void (luaL_requiref) (lua_State *L, const char *modname, - lua_CFunction openf, int glb); - -/* -** =============================================================== -** some useful macros -** =============================================================== -*/ - - -#define 
luaL_newlibtable(L,l) \ - lua_createtable(L, 0, sizeof(l)/sizeof((l)[0]) - 1) - -#define luaL_newlib(L,l) (luaL_newlibtable(L,l), luaL_setfuncs(L,l,0)) - -#define luaL_argcheck(L, cond,numarg,extramsg) \ - ((void)((cond) || luaL_argerror(L, (numarg), (extramsg)))) -#define luaL_checkstring(L,n) (luaL_checklstring(L, (n), NULL)) -#define luaL_optstring(L,n,d) (luaL_optlstring(L, (n), (d), NULL)) -#define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n))) -#define luaL_optint(L,n,d) ((int)luaL_optinteger(L, (n), (d))) -#define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n))) -#define luaL_optlong(L,n,d) ((long)luaL_optinteger(L, (n), (d))) - -#define luaL_typename(L,i) lua_typename(L, lua_type(L,(i))) - -#define luaL_dofile(L, fn) \ - (luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0)) - -#define luaL_dostring(L, s) \ - (luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0)) - -#define luaL_getmetatable(L,n) (lua_getfield(L, LUA_REGISTRYINDEX, (n))) - -#define luaL_opt(L,f,n,d) (lua_isnoneornil(L,(n)) ? 
(d) : f(L,(n))) - -#define luaL_loadbuffer(L,s,sz,n) luaL_loadbufferx(L,s,sz,n,NULL) - - -/* -** {====================================================== -** Generic Buffer manipulation -** ======================================================= -*/ - -typedef struct luaL_Buffer { - char *b; /* buffer address */ - size_t size; /* buffer size */ - size_t n; /* number of characters in buffer */ - lua_State *L; - char initb[LUAL_BUFFERSIZE]; /* initial buffer */ -} luaL_Buffer; - - -#define luaL_addchar(B,c) \ - ((void)((B)->n < (B)->size || luaL_prepbuffsize((B), 1)), \ - ((B)->b[(B)->n++] = (c))) - -#define luaL_addsize(B,s) ((B)->n += (s)) - -LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B); -LUALIB_API char *(luaL_prepbuffsize) (luaL_Buffer *B, size_t sz); -LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l); -LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s); -LUALIB_API void (luaL_addvalue) (luaL_Buffer *B); -LUALIB_API void (luaL_pushresult) (luaL_Buffer *B); -LUALIB_API void (luaL_pushresultsize) (luaL_Buffer *B, size_t sz); -LUALIB_API char *(luaL_buffinitsize) (lua_State *L, luaL_Buffer *B, size_t sz); - -#define luaL_prepbuffer(B) luaL_prepbuffsize(B, LUAL_BUFFERSIZE) - -/* }====================================================== */ - - -/* compatibility with old module system */ -#if defined(LUA_COMPAT_MODULE) - -LUALIB_API void (luaL_pushmodule) (lua_State *L, const char *modname, - int sizehint); -LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname, - const luaL_Reg *l, int nup); - -#define luaL_register(L,n,l) (luaL_openlib(L,(n),(l),0)) - -#endif - - -#endif - - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c deleted file mode 100644 index b580cee1f955..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c +++ /dev/null @@ -1,296 +0,0 @@ -/* -** $Id: lbaselib.c,v 1.276.1.1 
2013/04/12 18:48:47 roberto Exp $ -** Basic library -** See Copyright Notice in lua.h -*/ - -/* The following built-in lua functions have been removed and are not available - * for use in ZFS channel programs: - * - * dofile - * loadfile - * load - * pcall - * print - * xpcall - */ - -#include -#include -#ifdef illumos -#define toupper(C) (((C) >= 'a' && (C) <= 'z')? (C) - 'a' + 'A': (C)) -#else -#define isalnum(C) (isalpha(C) || isdigit(C)) -#endif - -#define lbaselib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - -#define SPACECHARS " \f\n\r\t\v" - -static int luaB_tonumber (lua_State *L) { - if (lua_isnoneornil(L, 2)) { /* standard conversion */ - int isnum; - lua_Number n = lua_tonumberx(L, 1, &isnum); - if (isnum) { - lua_pushnumber(L, n); - return 1; - } /* else not a number; must be something */ - luaL_checkany(L, 1); - } - else { - size_t l; - const char *s = luaL_checklstring(L, 1, &l); - const char *e = s + l; /* end point for 's' */ - int base = luaL_checkint(L, 2); - int neg = 0; - luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range"); - s += strspn(s, SPACECHARS); /* skip initial spaces */ - if (*s == '-') { s++; neg = 1; } /* handle signal */ - else if (*s == '+') s++; - if (isalnum((unsigned char)*s)) { - lua_Number n = 0; - do { - int digit = (isdigit((unsigned char)*s)) ? *s - '0' - : toupper((unsigned char)*s) - 'A' + 10; - if (digit >= base) break; /* invalid numeral; force a fail */ - n = n * (lua_Number)base + (lua_Number)digit; - s++; - } while (isalnum((unsigned char)*s)); - s += strspn(s, SPACECHARS); /* skip trailing spaces */ - if (s == e) { /* no invalid trailing characters? */ - lua_pushnumber(L, (neg) ? 
-n : n); - return 1; - } /* else not a number */ - } /* else not a number */ - } - lua_pushnil(L); /* not a number */ - return 1; -} - - -static int luaB_error (lua_State *L) { - int level = luaL_optint(L, 2, 1); - lua_settop(L, 1); - if (lua_isstring(L, 1) && level > 0) { /* add extra information? */ - luaL_where(L, level); - lua_pushvalue(L, 1); - lua_concat(L, 2); - } - return lua_error(L); -} - - -static int luaB_getmetatable (lua_State *L) { - luaL_checkany(L, 1); - if (!lua_getmetatable(L, 1)) { - lua_pushnil(L); - return 1; /* no metatable */ - } - luaL_getmetafield(L, 1, "__metatable"); - return 1; /* returns either __metatable field (if present) or metatable */ -} - - -static int luaB_setmetatable (lua_State *L) { - int t = lua_type(L, 2); - luaL_checktype(L, 1, LUA_TTABLE); - luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2, - "nil or table expected"); - if (luaL_getmetafield(L, 1, "__metatable")) - return luaL_error(L, "cannot change a protected metatable"); - lua_settop(L, 2); - lua_setmetatable(L, 1); - return 1; -} - - -static int luaB_rawequal (lua_State *L) { - luaL_checkany(L, 1); - luaL_checkany(L, 2); - lua_pushboolean(L, lua_rawequal(L, 1, 2)); - return 1; -} - - -static int luaB_rawlen (lua_State *L) { - int t = lua_type(L, 1); - luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1, - "table or string expected"); - lua_pushinteger(L, lua_rawlen(L, 1)); - return 1; -} - - -static int luaB_rawget (lua_State *L) { - luaL_checktype(L, 1, LUA_TTABLE); - luaL_checkany(L, 2); - lua_settop(L, 2); - lua_rawget(L, 1); - return 1; -} - -static int luaB_rawset (lua_State *L) { - luaL_checktype(L, 1, LUA_TTABLE); - luaL_checkany(L, 2); - luaL_checkany(L, 3); - lua_settop(L, 3); - lua_rawset(L, 1); - return 1; -} - - -static int luaB_collectgarbage (lua_State *L) { - static const char *const opts[] = {"stop", "restart", "collect", - "count", "step", "setpause", "setstepmul", - "setmajorinc", "isrunning", "generational", "incremental", NULL}; - static 
const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT, - LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL, - LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC}; - int o = optsnum[luaL_checkoption(L, 1, "collect", opts)]; - int ex = luaL_optint(L, 2, 0); - int res = lua_gc(L, o, ex); - switch (o) { - case LUA_GCCOUNT: { - int b = lua_gc(L, LUA_GCCOUNTB, 0); - lua_pushnumber(L, res + ((lua_Number)b/1024)); - lua_pushinteger(L, b); - return 2; - } - case LUA_GCSTEP: case LUA_GCISRUNNING: { - lua_pushboolean(L, res); - return 1; - } - default: { - lua_pushinteger(L, res); - return 1; - } - } -} - - -static int luaB_type (lua_State *L) { - luaL_checkany(L, 1); - lua_pushstring(L, luaL_typename(L, 1)); - return 1; -} - - -static int pairsmeta (lua_State *L, const char *method, int iszero, - lua_CFunction iter) { - if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? */ - luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */ - lua_pushcfunction(L, iter); /* will return generator, */ - lua_pushvalue(L, 1); /* state, */ - if (iszero) lua_pushinteger(L, 0); /* and initial value */ - else lua_pushnil(L); - } - else { - lua_pushvalue(L, 1); /* argument 'self' to metamethod */ - lua_call(L, 1, 3); /* get 3 values from metamethod */ - } - return 3; -} - - -static int luaB_next (lua_State *L) { - luaL_checktype(L, 1, LUA_TTABLE); - lua_settop(L, 2); /* create a 2nd argument if there isn't one */ - if (lua_next(L, 1)) - return 2; - else { - lua_pushnil(L); - return 1; - } -} - - -static int luaB_pairs (lua_State *L) { - return pairsmeta(L, "__pairs", 0, luaB_next); -} - - -static int ipairsaux (lua_State *L) { - int i = luaL_checkint(L, 2); - luaL_checktype(L, 1, LUA_TTABLE); - i++; /* next value */ - lua_pushinteger(L, i); - lua_rawgeti(L, 1, i); - return (lua_isnil(L, -1)) ? 
1 : 2; -} - - -static int luaB_ipairs (lua_State *L) { - return pairsmeta(L, "__ipairs", 1, ipairsaux); -} - - -static int luaB_assert (lua_State *L) { - if (!lua_toboolean(L, 1)) - return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!")); - return lua_gettop(L); -} - - -static int luaB_select (lua_State *L) { - int n = lua_gettop(L); - if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') { - lua_pushinteger(L, n-1); - return 1; - } - else { - int i = luaL_checkint(L, 1); - if (i < 0) i = n + i; - else if (i > n) i = n; - luaL_argcheck(L, 1 <= i, 1, "index out of range"); - return n - i; - } -} - -static int luaB_tostring (lua_State *L) { - luaL_checkany(L, 1); - luaL_tolstring(L, 1, NULL); - return 1; -} - -static const luaL_Reg base_funcs[] = { - {"assert", luaB_assert}, - {"collectgarbage", luaB_collectgarbage}, - {"error", luaB_error}, - {"getmetatable", luaB_getmetatable}, - {"ipairs", luaB_ipairs}, -#if defined(LUA_COMPAT_LOADSTRING) - {"loadstring", luaB_load}, -#endif - {"next", luaB_next}, - {"pairs", luaB_pairs}, - {"rawequal", luaB_rawequal}, - {"rawlen", luaB_rawlen}, - {"rawget", luaB_rawget}, - {"rawset", luaB_rawset}, - {"select", luaB_select}, - {"setmetatable", luaB_setmetatable}, - {"tonumber", luaB_tonumber}, - {"tostring", luaB_tostring}, - {"type", luaB_type}, - {NULL, NULL} -}; - - -LUAMOD_API int luaopen_base (lua_State *L) { - /* set global _G */ - lua_pushglobaltable(L); - lua_pushglobaltable(L); - lua_setfield(L, -2, "_G"); - /* open lib into global table */ - luaL_setfuncs(L, base_funcs, 0); - lua_pushliteral(L, LUA_VERSION); - lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */ - return 1; -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c deleted file mode 100644 index 31c7b66f1290..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c +++ /dev/null @@ -1,212 +0,0 @@ -/* -** $Id: 
lbitlib.c,v 1.18.1.2 2013/07/09 18:01:41 roberto Exp $ -** Standard library for bitwise operations -** See Copyright Notice in lua.h -*/ - -#define lbitlib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -/* number of bits to consider in a number */ -#if !defined(LUA_NBITS) -#define LUA_NBITS 32 -#endif - - -#define ALLONES (~(((~(lua_Unsigned)0) << (LUA_NBITS - 1)) << 1)) - -/* macro to trim extra bits */ -#define trim(x) ((x) & ALLONES) - - -/* builds a number with 'n' ones (1 <= n <= LUA_NBITS) */ -#define mask(n) (~((ALLONES << 1) << ((n) - 1))) - - -typedef lua_Unsigned b_uint; - - - -static b_uint andaux (lua_State *L) { - int i, n = lua_gettop(L); - b_uint r = ~(b_uint)0; - for (i = 1; i <= n; i++) - r &= luaL_checkunsigned(L, i); - return trim(r); -} - - -static int b_and (lua_State *L) { - b_uint r = andaux(L); - lua_pushunsigned(L, r); - return 1; -} - - -static int b_test (lua_State *L) { - b_uint r = andaux(L); - lua_pushboolean(L, r != 0); - return 1; -} - - -static int b_or (lua_State *L) { - int i, n = lua_gettop(L); - b_uint r = 0; - for (i = 1; i <= n; i++) - r |= luaL_checkunsigned(L, i); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_xor (lua_State *L) { - int i, n = lua_gettop(L); - b_uint r = 0; - for (i = 1; i <= n; i++) - r ^= luaL_checkunsigned(L, i); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_not (lua_State *L) { - b_uint r = ~luaL_checkunsigned(L, 1); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_shift (lua_State *L, b_uint r, int i) { - if (i < 0) { /* shift right? 
*/ - i = -i; - r = trim(r); - if (i >= LUA_NBITS) r = 0; - else r >>= i; - } - else { /* shift left */ - if (i >= LUA_NBITS) r = 0; - else r <<= i; - r = trim(r); - } - lua_pushunsigned(L, r); - return 1; -} - - -static int b_lshift (lua_State *L) { - return b_shift(L, luaL_checkunsigned(L, 1), luaL_checkint(L, 2)); -} - - -static int b_rshift (lua_State *L) { - return b_shift(L, luaL_checkunsigned(L, 1), -luaL_checkint(L, 2)); -} - - -static int b_arshift (lua_State *L) { - b_uint r = luaL_checkunsigned(L, 1); - int i = luaL_checkint(L, 2); - if (i < 0 || !(r & ((b_uint)1 << (LUA_NBITS - 1)))) - return b_shift(L, r, -i); - else { /* arithmetic shift for 'negative' number */ - if (i >= LUA_NBITS) r = ALLONES; - else - r = trim((r >> i) | ~(~(b_uint)0 >> i)); /* add signal bit */ - lua_pushunsigned(L, r); - return 1; - } -} - - -static int b_rot (lua_State *L, int i) { - b_uint r = luaL_checkunsigned(L, 1); - i &= (LUA_NBITS - 1); /* i = i % NBITS */ - r = trim(r); - if (i != 0) /* avoid undefined shift of LUA_NBITS when i == 0 */ - r = (r << i) | (r >> (LUA_NBITS - i)); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_lrot (lua_State *L) { - return b_rot(L, luaL_checkint(L, 2)); -} - - -static int b_rrot (lua_State *L) { - return b_rot(L, -luaL_checkint(L, 2)); -} - - -/* -** get field and width arguments for field-manipulation functions, -** checking whether they are valid. -** ('luaL_error' called without 'return' to avoid later warnings about -** 'width' being used uninitialized.) 
-*/ -static int fieldargs (lua_State *L, int farg, int *width) { - int f = luaL_checkint(L, farg); - int w = luaL_optint(L, farg + 1, 1); - luaL_argcheck(L, 0 <= f, farg, "field cannot be negative"); - luaL_argcheck(L, 0 < w, farg + 1, "width must be positive"); - if (f + w > LUA_NBITS) - luaL_error(L, "trying to access non-existent bits"); - *width = w; - return f; -} - - -static int b_extract (lua_State *L) { - int w; - b_uint r = luaL_checkunsigned(L, 1); - int f = fieldargs(L, 2, &w); - r = (r >> f) & mask(w); - lua_pushunsigned(L, r); - return 1; -} - - -static int b_replace (lua_State *L) { - int w; - b_uint r = luaL_checkunsigned(L, 1); - b_uint v = luaL_checkunsigned(L, 2); - int f = fieldargs(L, 3, &w); - int m = mask(w); - v &= m; /* erase bits outside given width */ - r = (r & ~(m << f)) | (v << f); - lua_pushunsigned(L, r); - return 1; -} - - -static const luaL_Reg bitlib[] = { - {"arshift", b_arshift}, - {"band", b_and}, - {"bnot", b_not}, - {"bor", b_or}, - {"bxor", b_xor}, - {"btest", b_test}, - {"extract", b_extract}, - {"lrotate", b_lrot}, - {"lshift", b_lshift}, - {"replace", b_replace}, - {"rrotate", b_rrot}, - {"rshift", b_rshift}, - {NULL, NULL} -}; - - - -LUAMOD_API int luaopen_bit32 (lua_State *L) { - luaL_newlib(L, bitlib); - return 1; -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c deleted file mode 100644 index f155014d12c4..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c +++ /dev/null @@ -1,885 +0,0 @@ -/* -** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $ -** Code generator for Lua -** See Copyright Notice in lua.h -*/ - -#include - -#define lcode_c -#define LUA_CORE - -#include "lua.h" - -#include "lcode.h" -#include "ldebug.h" -#include "ldo.h" -#include "lgc.h" -#include "llex.h" -#include "lmem.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" -#include "lstring.h" -#include "ltable.h" 
-#include "lvm.h" - - -#define hasjumps(e) ((e)->t != (e)->f) - - -static int isnumeral(expdesc *e) { - return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP); -} - - -void luaK_nil (FuncState *fs, int from, int n) { - Instruction *previous; - int l = from + n - 1; /* last register to set nil */ - if (fs->pc > fs->lasttarget) { /* no jumps to current position? */ - previous = &fs->f->code[fs->pc-1]; - if (GET_OPCODE(*previous) == OP_LOADNIL) { - int pfrom = GETARG_A(*previous); - int pl = pfrom + GETARG_B(*previous); - if ((pfrom <= from && from <= pl + 1) || - (from <= pfrom && pfrom <= l + 1)) { /* can connect both? */ - if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */ - if (pl > l) l = pl; /* l = max(l, pl) */ - SETARG_A(*previous, from); - SETARG_B(*previous, l - from); - return; - } - } /* else go through */ - } - luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */ -} - - -int luaK_jump (FuncState *fs) { - int jpc = fs->jpc; /* save list of jumps to here */ - int j; - fs->jpc = NO_JUMP; - j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP); - luaK_concat(fs, &j, jpc); /* keep them on hold */ - return j; -} - - -void luaK_ret (FuncState *fs, int first, int nret) { - luaK_codeABC(fs, OP_RETURN, first, nret+1, 0); -} - - -static int condjump (FuncState *fs, OpCode op, int A, int B, int C) { - luaK_codeABC(fs, op, A, B, C); - return luaK_jump(fs); -} - - -static void fixjump (FuncState *fs, int pc, int dest) { - Instruction *jmp = &fs->f->code[pc]; - int offset = dest-(pc+1); - lua_assert(dest != NO_JUMP); - if (abs(offset) > MAXARG_sBx) - luaX_syntaxerror(fs->ls, "control structure too long"); - SETARG_sBx(*jmp, offset); -} - - -/* -** returns current `pc' and marks it as a jump target (to avoid wrong -** optimizations with consecutive instructions not in the same basic block). 
-*/ -int luaK_getlabel (FuncState *fs) { - fs->lasttarget = fs->pc; - return fs->pc; -} - - -static int getjump (FuncState *fs, int pc) { - int offset = GETARG_sBx(fs->f->code[pc]); - if (offset == NO_JUMP) /* point to itself represents end of list */ - return NO_JUMP; /* end of list */ - else - return (pc+1)+offset; /* turn offset into absolute position */ -} - - -static Instruction *getjumpcontrol (FuncState *fs, int pc) { - Instruction *pi = &fs->f->code[pc]; - if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1)))) - return pi-1; - else - return pi; -} - - -/* -** check whether list has any jump that do not produce a value -** (or produce an inverted value) -*/ -static int need_value (FuncState *fs, int list) { - for (; list != NO_JUMP; list = getjump(fs, list)) { - Instruction i = *getjumpcontrol(fs, list); - if (GET_OPCODE(i) != OP_TESTSET) return 1; - } - return 0; /* not found */ -} - - -static int patchtestreg (FuncState *fs, int node, int reg) { - Instruction *i = getjumpcontrol(fs, node); - if (GET_OPCODE(*i) != OP_TESTSET) - return 0; /* cannot patch other instructions */ - if (reg != NO_REG && reg != GETARG_B(*i)) - SETARG_A(*i, reg); - else /* no register to put value or register already has the value */ - *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i)); - - return 1; -} - - -static void removevalues (FuncState *fs, int list) { - for (; list != NO_JUMP; list = getjump(fs, list)) - patchtestreg(fs, list, NO_REG); -} - - -static void patchlistaux (FuncState *fs, int list, int vtarget, int reg, - int dtarget) { - while (list != NO_JUMP) { - int next = getjump(fs, list); - if (patchtestreg(fs, list, reg)) - fixjump(fs, list, vtarget); - else - fixjump(fs, list, dtarget); /* jump to default target */ - list = next; - } -} - - -static void dischargejpc (FuncState *fs) { - patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc); - fs->jpc = NO_JUMP; -} - - -void luaK_patchlist (FuncState *fs, int list, int target) { - if (target == fs->pc) - luaK_patchtohere(fs, 
list); - else { - lua_assert(target < fs->pc); - patchlistaux(fs, list, target, NO_REG, target); - } -} - - -LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) { - level++; /* argument is +1 to reserve 0 as non-op */ - while (list != NO_JUMP) { - int next = getjump(fs, list); - lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP && - (GETARG_A(fs->f->code[list]) == 0 || - GETARG_A(fs->f->code[list]) >= level)); - SETARG_A(fs->f->code[list], level); - list = next; - } -} - - -void luaK_patchtohere (FuncState *fs, int list) { - luaK_getlabel(fs); - luaK_concat(fs, &fs->jpc, list); -} - - -void luaK_concat (FuncState *fs, int *l1, int l2) { - if (l2 == NO_JUMP) return; - else if (*l1 == NO_JUMP) - *l1 = l2; - else { - int list = *l1; - int next; - while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */ - list = next; - fixjump(fs, list, l2); - } -} - - -static int luaK_code (FuncState *fs, Instruction i) { - Proto *f = fs->f; - dischargejpc(fs); /* `pc' will change */ - /* put new instruction in code array */ - luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction, - MAX_INT, "opcodes"); - f->code[fs->pc] = i; - /* save corresponding line information */ - luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int, - MAX_INT, "opcodes"); - f->lineinfo[fs->pc] = fs->ls->lastline; - return fs->pc++; -} - - -int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) { - lua_assert(getOpMode(o) == iABC); - lua_assert(getBMode(o) != OpArgN || b == 0); - lua_assert(getCMode(o) != OpArgN || c == 0); - lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C); - return luaK_code(fs, CREATE_ABC(o, a, b, c)); -} - - -int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) { - lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx); - lua_assert(getCMode(o) == OpArgN); - lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx); - return luaK_code(fs, CREATE_ABx(o, a, bc)); -} - - -static int codeextraarg (FuncState 
*fs, int a) { - lua_assert(a <= MAXARG_Ax); - return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a)); -} - - -int luaK_codek (FuncState *fs, int reg, int k) { - if (k <= MAXARG_Bx) - return luaK_codeABx(fs, OP_LOADK, reg, k); - else { - int p = luaK_codeABx(fs, OP_LOADKX, reg, 0); - codeextraarg(fs, k); - return p; - } -} - - -void luaK_checkstack (FuncState *fs, int n) { - int newstack = fs->freereg + n; - if (newstack > fs->f->maxstacksize) { - if (newstack >= MAXSTACK) - luaX_syntaxerror(fs->ls, "function or expression too complex"); - fs->f->maxstacksize = cast_byte(newstack); - } -} - - -void luaK_reserveregs (FuncState *fs, int n) { - luaK_checkstack(fs, n); - fs->freereg += n; -} - - -static void freereg (FuncState *fs, int reg) { - if (!ISK(reg) && reg >= fs->nactvar) { - fs->freereg--; - lua_assert(reg == fs->freereg); - } -} - - -static void freeexp (FuncState *fs, expdesc *e) { - if (e->k == VNONRELOC) - freereg(fs, e->u.info); -} - - -static int addk (FuncState *fs, TValue *key, TValue *v) { - lua_State *L = fs->ls->L; - TValue *idx = luaH_set(L, fs->h, key); - Proto *f = fs->f; - int k, oldsize; - if (ttisnumber(idx)) { - lua_Number n = nvalue(idx); - lua_number2int(k, n); - if (luaV_rawequalobj(&f->k[k], v)) - return k; - /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0"); - go through and create a new entry for this value */ - } - /* constant not found; create a new entry */ - oldsize = f->sizek; - k = fs->nk; - /* numerical value does not need GC barrier; - table has no metatable, so it does not need to invalidate cache */ - setnvalue(idx, cast_num(k)); - luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants"); - while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]); - setobj(L, &f->k[k], v); - fs->nk++; - luaC_barrier(L, f, v); - return k; -} - - -int luaK_stringK (FuncState *fs, TString *s) { - TValue o; - setsvalue(fs->ls->L, &o, s); - return addk(fs, &o, &o); -} - - -int luaK_numberK (FuncState *fs, lua_Number r) { - 
int n; - lua_State *L = fs->ls->L; - TValue o; - setnvalue(&o, r); - if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */ - /* use raw representation as key to avoid numeric problems */ - setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r))); - n = addk(fs, L->top - 1, &o); - L->top--; - } - else - n = addk(fs, &o, &o); /* regular case */ - return n; -} - - -static int boolK (FuncState *fs, int b) { - TValue o; - setbvalue(&o, b); - return addk(fs, &o, &o); -} - - -static int nilK (FuncState *fs) { - TValue k, v; - setnilvalue(&v); - /* cannot use nil as key; instead use table itself to represent nil */ - sethvalue(fs->ls->L, &k, fs->h); - return addk(fs, &k, &v); -} - - -void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) { - if (e->k == VCALL) { /* expression is an open function call? */ - SETARG_C(getcode(fs, e), nresults+1); - } - else if (e->k == VVARARG) { - SETARG_B(getcode(fs, e), nresults+1); - SETARG_A(getcode(fs, e), fs->freereg); - luaK_reserveregs(fs, 1); - } -} - - -void luaK_setoneret (FuncState *fs, expdesc *e) { - if (e->k == VCALL) { /* expression is an open function call? */ - e->k = VNONRELOC; - e->u.info = GETARG_A(getcode(fs, e)); - } - else if (e->k == VVARARG) { - SETARG_B(getcode(fs, e), 2); - e->k = VRELOCABLE; /* can relocate its simple result */ - } -} - - -void luaK_dischargevars (FuncState *fs, expdesc *e) { - switch (e->k) { - case VLOCAL: { - e->k = VNONRELOC; - break; - } - case VUPVAL: { - e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0); - e->k = VRELOCABLE; - break; - } - case VINDEXED: { - OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */ - freereg(fs, e->u.ind.idx); - if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? 
*/ - freereg(fs, e->u.ind.t); - op = OP_GETTABLE; - } - e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx); - e->k = VRELOCABLE; - break; - } - case VVARARG: - case VCALL: { - luaK_setoneret(fs, e); - break; - } - default: break; /* there is one value available (somewhere) */ - } -} - - -static int code_label (FuncState *fs, int A, int b, int jump) { - luaK_getlabel(fs); /* those instructions may be jump targets */ - return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump); -} - - -static void discharge2reg (FuncState *fs, expdesc *e, int reg) { - luaK_dischargevars(fs, e); - switch (e->k) { - case VNIL: { - luaK_nil(fs, reg, 1); - break; - } - case VFALSE: case VTRUE: { - luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0); - break; - } - case VK: { - luaK_codek(fs, reg, e->u.info); - break; - } - case VKNUM: { - luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval)); - break; - } - case VRELOCABLE: { - Instruction *pc = &getcode(fs, e); - SETARG_A(*pc, reg); - break; - } - case VNONRELOC: { - if (reg != e->u.info) - luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0); - break; - } - default: { - lua_assert(e->k == VVOID || e->k == VJMP); - return; /* nothing to do... */ - } - } - e->u.info = reg; - e->k = VNONRELOC; -} - - -static void discharge2anyreg (FuncState *fs, expdesc *e) { - if (e->k != VNONRELOC) { - luaK_reserveregs(fs, 1); - discharge2reg(fs, e, fs->freereg-1); - } -} - - -static void exp2reg (FuncState *fs, expdesc *e, int reg) { - discharge2reg(fs, e, reg); - if (e->k == VJMP) - luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */ - if (hasjumps(e)) { - int final; /* position after whole expression */ - int p_f = NO_JUMP; /* position of an eventual LOAD false */ - int p_t = NO_JUMP; /* position of an eventual LOAD true */ - if (need_value(fs, e->t) || need_value(fs, e->f)) { - int fj = (e->k == VJMP) ? 
NO_JUMP : luaK_jump(fs); - p_f = code_label(fs, reg, 0, 1); - p_t = code_label(fs, reg, 1, 0); - luaK_patchtohere(fs, fj); - } - final = luaK_getlabel(fs); - patchlistaux(fs, e->f, final, reg, p_f); - patchlistaux(fs, e->t, final, reg, p_t); - } - e->f = e->t = NO_JUMP; - e->u.info = reg; - e->k = VNONRELOC; -} - - -void luaK_exp2nextreg (FuncState *fs, expdesc *e) { - luaK_dischargevars(fs, e); - freeexp(fs, e); - luaK_reserveregs(fs, 1); - exp2reg(fs, e, fs->freereg - 1); -} - - -int luaK_exp2anyreg (FuncState *fs, expdesc *e) { - luaK_dischargevars(fs, e); - if (e->k == VNONRELOC) { - if (!hasjumps(e)) return e->u.info; /* exp is already in a register */ - if (e->u.info >= fs->nactvar) { /* reg. is not a local? */ - exp2reg(fs, e, e->u.info); /* put value on it */ - return e->u.info; - } - } - luaK_exp2nextreg(fs, e); /* default */ - return e->u.info; -} - - -void luaK_exp2anyregup (FuncState *fs, expdesc *e) { - if (e->k != VUPVAL || hasjumps(e)) - luaK_exp2anyreg(fs, e); -} - - -void luaK_exp2val (FuncState *fs, expdesc *e) { - if (hasjumps(e)) - luaK_exp2anyreg(fs, e); - else - luaK_dischargevars(fs, e); -} - - -int luaK_exp2RK (FuncState *fs, expdesc *e) { - luaK_exp2val(fs, e); - switch (e->k) { - case VTRUE: - case VFALSE: - case VNIL: { - if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */ - e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE)); - e->k = VK; - return RKASK(e->u.info); - } - else break; - } - case VKNUM: { - e->u.info = luaK_numberK(fs, e->u.nval); - e->k = VK; - /* go through */ - } - case VK: { - if (e->u.info <= MAXINDEXRK) /* constant fits in argC? 
*/ - return RKASK(e->u.info); - else break; - } - default: break; - } - /* not a constant in the right range: put it in a register */ - return luaK_exp2anyreg(fs, e); -} - - -void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) { - switch (var->k) { - case VLOCAL: { - freeexp(fs, ex); - exp2reg(fs, ex, var->u.info); - return; - } - case VUPVAL: { - int e = luaK_exp2anyreg(fs, ex); - luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0); - break; - } - case VINDEXED: { - OpCode op = (var->u.ind.vt == VLOCAL) ? OP_SETTABLE : OP_SETTABUP; - int e = luaK_exp2RK(fs, ex); - luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e); - break; - } - default: { - lua_assert(0); /* invalid var kind to store */ - break; - } - } - freeexp(fs, ex); -} - - -void luaK_self (FuncState *fs, expdesc *e, expdesc *key) { - int ereg; - luaK_exp2anyreg(fs, e); - ereg = e->u.info; /* register where 'e' was placed */ - freeexp(fs, e); - e->u.info = fs->freereg; /* base register for op_self */ - e->k = VNONRELOC; - luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */ - luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key)); - freeexp(fs, key); -} - - -static void invertjump (FuncState *fs, expdesc *e) { - Instruction *pc = getjumpcontrol(fs, e->u.info); - lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET && - GET_OPCODE(*pc) != OP_TEST); - SETARG_A(*pc, !(GETARG_A(*pc))); -} - - -static int jumponcond (FuncState *fs, expdesc *e, int cond) { - if (e->k == VRELOCABLE) { - Instruction ie = getcode(fs, e); - if (GET_OPCODE(ie) == OP_NOT) { - fs->pc--; /* remove previous OP_NOT */ - return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond); - } - /* else go through */ - } - discharge2anyreg(fs, e); - freeexp(fs, e); - return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond); -} - - -void luaK_goiftrue (FuncState *fs, expdesc *e) { - int pc; /* pc of last jump */ - luaK_dischargevars(fs, e); - switch (e->k) { - case VJMP: { - invertjump(fs, e); - pc = 
e->u.info; - break; - } - case VK: case VKNUM: case VTRUE: { - pc = NO_JUMP; /* always true; do nothing */ - break; - } - default: { - pc = jumponcond(fs, e, 0); - break; - } - } - luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */ - luaK_patchtohere(fs, e->t); - e->t = NO_JUMP; -} - - -void luaK_goiffalse (FuncState *fs, expdesc *e) { - int pc; /* pc of last jump */ - luaK_dischargevars(fs, e); - switch (e->k) { - case VJMP: { - pc = e->u.info; - break; - } - case VNIL: case VFALSE: { - pc = NO_JUMP; /* always false; do nothing */ - break; - } - default: { - pc = jumponcond(fs, e, 1); - break; - } - } - luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */ - luaK_patchtohere(fs, e->f); - e->f = NO_JUMP; -} - - -static void codenot (FuncState *fs, expdesc *e) { - luaK_dischargevars(fs, e); - switch (e->k) { - case VNIL: case VFALSE: { - e->k = VTRUE; - break; - } - case VK: case VKNUM: case VTRUE: { - e->k = VFALSE; - break; - } - case VJMP: { - invertjump(fs, e); - break; - } - case VRELOCABLE: - case VNONRELOC: { - discharge2anyreg(fs, e); - freeexp(fs, e); - e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0); - e->k = VRELOCABLE; - break; - } - default: { - lua_assert(0); /* cannot happen */ - break; - } - } - /* interchange true and false lists */ - { int temp = e->f; e->f = e->t; e->t = temp; } - removevalues(fs, e->f); - removevalues(fs, e->t); -} - - -void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) { - lua_assert(!hasjumps(t)); - t->u.ind.t = t->u.info; - t->u.ind.idx = luaK_exp2RK(fs, k); - t->u.ind.vt = (t->k == VUPVAL) ? 
VUPVAL - : check_exp(vkisinreg(t->k), VLOCAL); - t->k = VINDEXED; -} - - -static int constfolding (OpCode op, expdesc *e1, expdesc *e2) { - lua_Number r; - if (!isnumeral(e1) || !isnumeral(e2)) return 0; - if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0) - return 0; /* do not attempt to divide by 0 */ - /* - * Patched: check for MIN_INT / -1 - */ - if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1) - return 0; - r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval); - e1->u.nval = r; - return 1; -} - - -static void codearith (FuncState *fs, OpCode op, - expdesc *e1, expdesc *e2, int line) { - if (constfolding(op, e1, e2)) - return; - else { - int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0; - int o1 = luaK_exp2RK(fs, e1); - if (o1 > o2) { - freeexp(fs, e1); - freeexp(fs, e2); - } - else { - freeexp(fs, e2); - freeexp(fs, e1); - } - e1->u.info = luaK_codeABC(fs, op, 0, o1, o2); - e1->k = VRELOCABLE; - luaK_fixline(fs, line); - } -} - - -static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1, - expdesc *e2) { - int o1 = luaK_exp2RK(fs, e1); - int o2 = luaK_exp2RK(fs, e2); - freeexp(fs, e2); - freeexp(fs, e1); - if (cond == 0 && op != OP_EQ) { - int temp; /* exchange args to replace by `<' or `<=' */ - temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */ - cond = 1; - } - e1->u.info = condjump(fs, op, cond, o1, o2); - e1->k = VJMP; -} - - -void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) { - expdesc e2; - e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0; - switch (op) { - case OPR_MINUS: { - if (isnumeral(e)) /* minus constant? 
*/ - e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */ - else { - luaK_exp2anyreg(fs, e); - codearith(fs, OP_UNM, e, &e2, line); - } - break; - } - case OPR_NOT: codenot(fs, e); break; - case OPR_LEN: { - luaK_exp2anyreg(fs, e); /* cannot operate on constants */ - codearith(fs, OP_LEN, e, &e2, line); - break; - } - default: lua_assert(0); - } -} - - -void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) { - switch (op) { - case OPR_AND: { - luaK_goiftrue(fs, v); - break; - } - case OPR_OR: { - luaK_goiffalse(fs, v); - break; - } - case OPR_CONCAT: { - luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */ - break; - } - case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: - case OPR_MOD: case OPR_POW: { - if (!isnumeral(v)) luaK_exp2RK(fs, v); - break; - } - default: { - luaK_exp2RK(fs, v); - break; - } - } -} - - -void luaK_posfix (FuncState *fs, BinOpr op, - expdesc *e1, expdesc *e2, int line) { - switch (op) { - case OPR_AND: { - lua_assert(e1->t == NO_JUMP); /* list must be closed */ - luaK_dischargevars(fs, e2); - luaK_concat(fs, &e2->f, e1->f); - *e1 = *e2; - break; - } - case OPR_OR: { - lua_assert(e1->f == NO_JUMP); /* list must be closed */ - luaK_dischargevars(fs, e2); - luaK_concat(fs, &e2->t, e1->t); - *e1 = *e2; - break; - } - case OPR_CONCAT: { - luaK_exp2val(fs, e2); - if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) { - lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1); - freeexp(fs, e1); - SETARG_B(getcode(fs, e2), e1->u.info); - e1->k = VRELOCABLE; e1->u.info = e2->u.info; - } - else { - luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */ - codearith(fs, OP_CONCAT, e1, e2, line); - } - break; - } - case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: - case OPR_MOD: case OPR_POW: { - codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line); - break; - } - case OPR_EQ: case OPR_LT: case OPR_LE: { - codecomp(fs, cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2); - break; - } - case OPR_NE: 
case OPR_GT: case OPR_GE: { - codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2); - break; - } - default: lua_assert(0); - } -} - - -void luaK_fixline (FuncState *fs, int line) { - fs->f->lineinfo[fs->pc - 1] = line; -} - - -void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) { - int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1; - int b = (tostore == LUA_MULTRET) ? 0 : tostore; - lua_assert(tostore != 0); - if (c <= MAXARG_C) - luaK_codeABC(fs, OP_SETLIST, base, b, c); - else if (c <= MAXARG_Ax) { - luaK_codeABC(fs, OP_SETLIST, base, b, 0); - codeextraarg(fs, c); - } - else - luaX_syntaxerror(fs->ls, "constructor too long"); - fs->freereg = base + 1; /* free registers with list values */ -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h deleted file mode 100644 index 6a1424cf5a73..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h +++ /dev/null @@ -1,83 +0,0 @@ -/* -** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $ -** Code generator for Lua -** See Copyright Notice in lua.h -*/ - -#ifndef lcode_h -#define lcode_h - -#include "llex.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" - - -/* -** Marks the end of a patch list. It is an invalid value both as an absolute -** address, and as a list link (would link an element to itself). 
-*/ -#define NO_JUMP (-1) - - -/* -** grep "ORDER OPR" if you change these enums (ORDER OP) -*/ -typedef enum BinOpr { - OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW, - OPR_CONCAT, - OPR_EQ, OPR_LT, OPR_LE, - OPR_NE, OPR_GT, OPR_GE, - OPR_AND, OPR_OR, - OPR_NOBINOPR -} BinOpr; - - -typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr; - - -#define getcode(fs,e) ((fs)->f->code[(e)->u.info]) - -#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx) - -#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET) - -#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t) - -LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx); -LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C); -LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k); -LUAI_FUNC void luaK_fixline (FuncState *fs, int line); -LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n); -LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n); -LUAI_FUNC void luaK_checkstack (FuncState *fs, int n); -LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s); -LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r); -LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e); -LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e); -LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key); -LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k); -LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e); -LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults); -LUAI_FUNC void luaK_setoneret (FuncState 
*fs, expdesc *e); -LUAI_FUNC int luaK_jump (FuncState *fs); -LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret); -LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target); -LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list); -LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level); -LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2); -LUAI_FUNC int luaK_getlabel (FuncState *fs); -LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line); -LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v); -LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1, - expdesc *v2, int line); -LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore); - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c deleted file mode 100644 index 55564ddbd9fd..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#include "lua.h" - -#include - -ssize_t -lcompat_sprintf(char *buf, const char *fmt, ...) 
-{ - ssize_t res; - va_list args; - - va_start(args, fmt); - res = vsnprintf(buf, INT_MAX, fmt, args); - va_end(args); - - return (res); -} - -int64_t -lcompat_strtoll(const char *str, char **ptr) -{ - int base; - const char *cp; - int digits; - int64_t value; - boolean_t is_negative; - - cp = str; - while (*cp == ' ' || *cp == '\t' || *cp == '\n') { - cp++; - } - is_negative = (*cp == '-'); - if (is_negative) { - cp++; - } - base = 10; - - if (*cp == '0') { - base = 8; - cp++; - if (*cp == 'x' || *cp == 'X') { - base = 16; - cp++; - } - } - - value = 0; - for (; *cp != '\0'; cp++) { - if (*cp >= '0' && *cp <= '9') { - digits = *cp - '0'; - } else if (*cp >= 'a' && *cp <= 'f') { - digits = *cp - 'a' + 10; - } else if (*cp >= 'A' && *cp <= 'F') { - digits = *cp - 'A' + 10; - } else { - break; - } - if (digits >= base) { - break; - } - value = (value * base) + digits; - } - - if (ptr != NULL) { - *ptr = (char *)cp; - } - if (is_negative) { - value = -value; - } - return (value); -} - -int64_t -lcompat_pow(int64_t x, int64_t y) -{ - int64_t result = 1; - if (y < 0) - return (0); - - while (y) { - if (y & 1) - result *= x; - y >>= 1; - x *= x; - } - return (result); -} - -int -lcompat_hashnum(int64_t x) -{ - x = (~x) + (x << 18); - x = x ^ (x >> 31); - x = x * 21; - x = x ^ (x >> 11); - x = x + (x << 6); - x = x ^ (x >> 22); - return ((int)x); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c deleted file mode 100644 index 405350bb145b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c +++ /dev/null @@ -1,154 +0,0 @@ -/* -** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $ -** Coroutine Library -** See Copyright Notice in lua.h -*/ - - -#include - -#define lcorolib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -static int auxresume (lua_State *L, lua_State *co, int narg) { - int status; - if 
(!lua_checkstack(co, narg)) { - lua_pushliteral(L, "too many arguments to resume"); - return -1; /* error flag */ - } - if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) { - lua_pushliteral(L, "cannot resume dead coroutine"); - return -1; /* error flag */ - } - lua_xmove(L, co, narg); - status = lua_resume(co, L, narg); - if (status == LUA_OK || status == LUA_YIELD) { - int nres = lua_gettop(co); - if (!lua_checkstack(L, nres + 1)) { - lua_pop(co, nres); /* remove results anyway */ - lua_pushliteral(L, "too many results to resume"); - return -1; /* error flag */ - } - lua_xmove(co, L, nres); /* move yielded values */ - return nres; - } - else { - lua_xmove(co, L, 1); /* move error message */ - return -1; /* error flag */ - } -} - - -static int luaB_coresume (lua_State *L) { - lua_State *co = lua_tothread(L, 1); - int r; - luaL_argcheck(L, co, 1, "coroutine expected"); - r = auxresume(L, co, lua_gettop(L) - 1); - if (r < 0) { - lua_pushboolean(L, 0); - lua_insert(L, -2); - return 2; /* return false + error message */ - } - else { - lua_pushboolean(L, 1); - lua_insert(L, -(r + 1)); - return r + 1; /* return true + `resume' returns */ - } -} - - -static int luaB_auxwrap (lua_State *L) { - lua_State *co = lua_tothread(L, lua_upvalueindex(1)); - int r = auxresume(L, co, lua_gettop(L)); - if (r < 0) { - if (lua_isstring(L, -1)) { /* error object is a string? 
*/ - luaL_where(L, 1); /* add extra info */ - lua_insert(L, -2); - lua_concat(L, 2); - } - return lua_error(L); /* propagate error */ - } - return r; -} - - -static int luaB_cocreate (lua_State *L) { - lua_State *NL; - luaL_checktype(L, 1, LUA_TFUNCTION); - NL = lua_newthread(L); - lua_pushvalue(L, 1); /* move function to top */ - lua_xmove(L, NL, 1); /* move function from L to NL */ - return 1; -} - - -static int luaB_cowrap (lua_State *L) { - luaB_cocreate(L); - lua_pushcclosure(L, luaB_auxwrap, 1); - return 1; -} - - -static int luaB_yield (lua_State *L) { - return lua_yield(L, lua_gettop(L)); -} - - -static int luaB_costatus (lua_State *L) { - lua_State *co = lua_tothread(L, 1); - luaL_argcheck(L, co, 1, "coroutine expected"); - if (L == co) lua_pushliteral(L, "running"); - else { - switch (lua_status(co)) { - case LUA_YIELD: - lua_pushliteral(L, "suspended"); - break; - case LUA_OK: { - lua_Debug ar; - if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */ - lua_pushliteral(L, "normal"); /* it is running */ - else if (lua_gettop(co) == 0) - lua_pushliteral(L, "dead"); - else - lua_pushliteral(L, "suspended"); /* initial state */ - break; - } - default: /* some error occurred */ - lua_pushliteral(L, "dead"); - break; - } - } - return 1; -} - - -static int luaB_corunning (lua_State *L) { - int ismain = lua_pushthread(L); - lua_pushboolean(L, ismain); - return 2; -} - - -static const luaL_Reg co_funcs[] = { - {"create", luaB_cocreate}, - {"resume", luaB_coresume}, - {"running", luaB_corunning}, - {"status", luaB_costatus}, - {"wrap", luaB_cowrap}, - {"yield", luaB_yield}, - {NULL, NULL} -}; - - - -LUAMOD_API int luaopen_coroutine (lua_State *L) { - luaL_newlib(L, co_funcs); - return 1; -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c deleted file mode 100644 index 107859811bfc..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c +++ 
/dev/null @@ -1,52 +0,0 @@ -/* -** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $ -** 'ctype' functions for Lua -** See Copyright Notice in lua.h -*/ - -#define lctype_c -#define LUA_CORE - -#include "lctype.h" - -#if !LUA_USE_CTYPE /* { */ - -#include - -LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = { - 0x00, /* EOZ */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */ - 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */ - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, - 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */ - 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, - 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */ - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. */ - 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05, - 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 6. */ - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 7. */ - 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 9. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e. 
*/ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* f. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -#endif /* } */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h deleted file mode 100644 index 299a59b92e2c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h +++ /dev/null @@ -1,93 +0,0 @@ -/* -** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $ -** 'ctype' functions for Lua -** See Copyright Notice in lua.h -*/ - -#ifndef lctype_h -#define lctype_h - -#include "lua.h" - - -/* -** WARNING: the functions defined here do not necessarily correspond -** to the similar functions in the standard C ctype.h. They are -** optimized for the specific needs of Lua -*/ - -#if !defined(LUA_USE_CTYPE) - -#if 'A' == 65 && '0' == 48 -/* ASCII case: can use its own tables; faster and fixed */ -#define LUA_USE_CTYPE 0 -#else -/* must use standard C ctype */ -#define LUA_USE_CTYPE 1 -#endif - -#endif - - -#if !LUA_USE_CTYPE /* { */ - -#include "llimits.h" - - -#define ALPHABIT 0 -#define DIGITBIT 1 -#define PRINTBIT 2 -#define SPACEBIT 3 -#define XDIGITBIT 4 - - -#define MASK(B) (1 << (B)) - - -/* -** add 1 to char to allow index -1 (EOZ) -*/ -#define testprop(c,p) (luai_ctype_[(c)+1] & (p)) - -/* -** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_' -*/ -#define lislalpha(c) testprop(c, MASK(ALPHABIT)) -#define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT))) -#define lisdigit(c) testprop(c, MASK(DIGITBIT)) -#define lisspace(c) testprop(c, MASK(SPACEBIT)) -#define lisprint(c) testprop(c, MASK(PRINTBIT)) -#define lisxdigit(c) testprop(c, MASK(XDIGITBIT)) - -/* -** this 'ltolower' only works for alphabetic characters -*/ -#define ltolower(c) ((c) | ('A' ^ 'a')) - - -/* two more entries for 0 and -1 (EOZ) */ -LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2]; - 
- -#else /* }{ */ - -/* -** use standard C ctypes -*/ - -#include - - -#define lislalpha(c) (isalpha(c) || (c) == '_') -#define lislalnum(c) (isalnum(c) || (c) == '_') -#define lisdigit(c) (isdigit(c)) -#define lisspace(c) (isspace(c)) -#define lisprint(c) (isprint(c)) -#define lisxdigit(c) (isxdigit(c)) - -#define ltolower(c) (tolower(c)) - -#endif /* } */ - -#endif - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c deleted file mode 100644 index b8ddcff3c6bb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c +++ /dev/null @@ -1,607 +0,0 @@ -/* -** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $ -** Debug Interface -** See Copyright Notice in lua.h -*/ - - -#include - -#define ldebug_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "lcode.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lvm.h" - - - -#define noLuaClosure(f) ((f) == NULL || (f)->c.tt == LUA_TCCL) - - -static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name); - - -static int currentpc (CallInfo *ci) { - lua_assert(isLua(ci)); - return pcRel(ci->u.l.savedpc, ci_func(ci)->p); -} - - -static int currentline (CallInfo *ci) { - return getfuncline(ci_func(ci)->p, currentpc(ci)); -} - - -static void swapextra (lua_State *L) { - if (L->status == LUA_YIELD) { - CallInfo *ci = L->ci; /* get function that yielded */ - StkId temp = ci->func; /* exchange its 'func' and 'extra' values */ - ci->func = restorestack(L, ci->extra); - ci->extra = savestack(L, temp); - } -} - - -/* -** this function can be called asynchronous (e.g. during a signal) -*/ -LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) { - if (func == NULL || mask == 0) { /* turn off hooks? 
*/ - mask = 0; - func = NULL; - } - if (isLua(L->ci)) - L->oldpc = L->ci->u.l.savedpc; - L->hook = func; - L->basehookcount = count; - resethookcount(L); - L->hookmask = cast_byte(mask); - return 1; -} - - -LUA_API lua_Hook lua_gethook (lua_State *L) { - return L->hook; -} - - -LUA_API int lua_gethookmask (lua_State *L) { - return L->hookmask; -} - - -LUA_API int lua_gethookcount (lua_State *L) { - return L->basehookcount; -} - - -LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) { - int status; - CallInfo *ci; - if (level < 0) return 0; /* invalid (negative) level */ - lua_lock(L); - for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous) - level--; - if (level == 0 && ci != &L->base_ci) { /* level found? */ - status = 1; - ar->i_ci = ci; - } - else status = 0; /* no such level */ - lua_unlock(L); - return status; -} - - -static const char *upvalname (Proto *p, int uv) { - TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name); - if (s == NULL) return "?"; - else return getstr(s); -} - - -static const char *findvararg (CallInfo *ci, int n, StkId *pos) { - int nparams = clLvalue(ci->func)->p->numparams; - if (n >= ci->u.l.base - ci->func - nparams) - return NULL; /* no such vararg */ - else { - *pos = ci->func + nparams + n; - return "(*vararg)"; /* generic name for any vararg */ - } -} - - -static const char *findlocal (lua_State *L, CallInfo *ci, int n, - StkId *pos) { - const char *name = NULL; - StkId base; - if (isLua(ci)) { - if (n < 0) /* access to vararg values? */ - return findvararg(ci, -n, pos); - else { - base = ci->u.l.base; - name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci)); - } - } - else - base = ci->func + 1; - if (name == NULL) { /* no 'standard' name? */ - StkId limit = (ci == L->ci) ? L->top : ci->next->func; - if (limit - base >= n && n > 0) /* is 'n' inside 'ci' stack? 
*/ - name = "(*temporary)"; /* generic name for any valid slot */ - else - return NULL; /* no name */ - } - *pos = base + (n - 1); - return name; -} - - -LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) { - const char *name; - lua_lock(L); - swapextra(L); - if (ar == NULL) { /* information about non-active function? */ - if (!isLfunction(L->top - 1)) /* not a Lua function? */ - name = NULL; - else /* consider live variables at function start (parameters) */ - name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0); - } - else { /* active function; get information through 'ar' */ - StkId pos = 0; /* to avoid warnings */ - name = findlocal(L, ar->i_ci, n, &pos); - if (name) { - setobj2s(L, L->top, pos); - api_incr_top(L); - } - } - swapextra(L); - lua_unlock(L); - return name; -} - - -LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) { - StkId pos = 0; /* to avoid warnings */ - const char *name; - lua_lock(L); - swapextra(L); - name = findlocal(L, ar->i_ci, n, &pos); - if (name) - setobjs2s(L, pos, L->top - 1); - L->top--; /* pop value */ - swapextra(L); - lua_unlock(L); - return name; -} - - -static void funcinfo (lua_Debug *ar, Closure *cl) { - if (noLuaClosure(cl)) { - ar->source = "=[C]"; - ar->linedefined = -1; - ar->lastlinedefined = -1; - ar->what = "C"; - } - else { - Proto *p = cl->l.p; - ar->source = p->source ? getstr(p->source) : "=?"; - ar->linedefined = p->linedefined; - ar->lastlinedefined = p->lastlinedefined; - ar->what = (ar->linedefined == 0) ? 
"main" : "Lua"; - } - luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE); -} - - -static void collectvalidlines (lua_State *L, Closure *f) { - if (noLuaClosure(f)) { - setnilvalue(L->top); - api_incr_top(L); - } - else { - int i; - TValue v; - int *lineinfo = f->l.p->lineinfo; - Table *t = luaH_new(L); /* new table to store active lines */ - sethvalue(L, L->top, t); /* push it on stack */ - api_incr_top(L); - setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */ - for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */ - luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */ - } -} - - -static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar, - Closure *f, CallInfo *ci) { - int status = 1; - for (; *what; what++) { - switch (*what) { - case 'S': { - funcinfo(ar, f); - break; - } - case 'l': { - ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1; - break; - } - case 'u': { - ar->nups = (f == NULL) ? 0 : f->c.nupvalues; - if (noLuaClosure(f)) { - ar->isvararg = 1; - ar->nparams = 0; - } - else { - ar->isvararg = f->l.p->is_vararg; - ar->nparams = f->l.p->numparams; - } - break; - } - case 't': { - ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0; - break; - } - case 'n': { - /* calling function is a known Lua function? 
*/ - if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous)) - ar->namewhat = getfuncname(L, ci->previous, &ar->name); - else - ar->namewhat = NULL; - if (ar->namewhat == NULL) { - ar->namewhat = ""; /* not found */ - ar->name = NULL; - } - break; - } - case 'L': - case 'f': /* handled by lua_getinfo */ - break; - default: status = 0; /* invalid option */ - } - } - return status; -} - - -LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) { - int status; - Closure *cl; - CallInfo *ci; - StkId func; - lua_lock(L); - swapextra(L); - if (*what == '>') { - ci = NULL; - func = L->top - 1; - api_check(L, ttisfunction(func), "function expected"); - what++; /* skip the '>' */ - L->top--; /* pop function */ - } - else { - ci = ar->i_ci; - func = ci->func; - lua_assert(ttisfunction(ci->func)); - } - cl = ttisclosure(func) ? clvalue(func) : NULL; - status = auxgetinfo(L, what, ar, cl, ci); - if (strchr(what, 'f')) { - setobjs2s(L, L->top, func); - api_incr_top(L); - } - swapextra(L); - if (strchr(what, 'L')) - collectvalidlines(L, cl); - lua_unlock(L); - return status; -} - - -/* -** {====================================================== -** Symbolic Execution -** ======================================================= -*/ - -static const char *getobjname (Proto *p, int lastpc, int reg, - const char **name); - - -/* -** find a "name" for the RK value 'c' -*/ -static void kname (Proto *p, int pc, int c, const char **name) { - if (ISK(c)) { /* is 'c' a constant? */ - TValue *kvalue = &p->k[INDEXK(c)]; - if (ttisstring(kvalue)) { /* literal constant? */ - *name = svalue(kvalue); /* it is its own name */ - return; - } - /* else no reasonable name found */ - } - else { /* 'c' is a register */ - const char *what = getobjname(p, pc, c, name); /* search for 'c' */ - if (what && *what == 'c') { /* found a constant name? 
*/ - return; /* 'name' already filled */ - } - /* else no reasonable name found */ - } - *name = "?"; /* no reasonable name found */ -} - - -static int filterpc (int pc, int jmptarget) { - if (pc < jmptarget) /* is code conditional (inside a jump)? */ - return -1; /* cannot know who sets that register */ - else return pc; /* current position sets that register */ -} - - -/* -** try to find last instruction before 'lastpc' that modified register 'reg' -*/ -static int findsetreg (Proto *p, int lastpc, int reg) { - int pc; - int setreg = -1; /* keep last instruction that changed 'reg' */ - int jmptarget = 0; /* any code before this address is conditional */ - for (pc = 0; pc < lastpc; pc++) { - Instruction i = p->code[pc]; - OpCode op = GET_OPCODE(i); - int a = GETARG_A(i); - switch (op) { - case OP_LOADNIL: { - int b = GETARG_B(i); - if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */ - setreg = filterpc(pc, jmptarget); - break; - } - case OP_TFORCALL: { - if (reg >= a + 2) /* affect all regs above its base */ - setreg = filterpc(pc, jmptarget); - break; - } - case OP_CALL: - case OP_TAILCALL: { - if (reg >= a) /* affect all registers above base */ - setreg = filterpc(pc, jmptarget); - break; - } - case OP_JMP: { - int b = GETARG_sBx(i); - int dest = pc + 1 + b; - /* jump is forward and do not skip `lastpc'? */ - if (pc < dest && dest <= lastpc) { - if (dest > jmptarget) - jmptarget = dest; /* update 'jmptarget' */ - } - break; - } - case OP_TEST: { - if (reg == a) /* jumped code can change 'a' */ - setreg = filterpc(pc, jmptarget); - break; - } - default: - if (testAMode(op) && reg == a) /* any instruction that set A */ - setreg = filterpc(pc, jmptarget); - break; - } - } - return setreg; -} - - -static const char *getobjname (Proto *p, int lastpc, int reg, - const char **name) { - int pc; - *name = luaF_getlocalname(p, reg + 1, lastpc); - if (*name) /* is a local? 
*/ - return "local"; - /* else try symbolic execution */ - pc = findsetreg(p, lastpc, reg); - if (pc != -1) { /* could find instruction? */ - Instruction i = p->code[pc]; - OpCode op = GET_OPCODE(i); - switch (op) { - case OP_MOVE: { - int b = GETARG_B(i); /* move from 'b' to 'a' */ - if (b < GETARG_A(i)) - return getobjname(p, pc, b, name); /* get name for 'b' */ - break; - } - case OP_GETTABUP: - case OP_GETTABLE: { - int k = GETARG_C(i); /* key index */ - int t = GETARG_B(i); /* table index */ - const char *vn = (op == OP_GETTABLE) /* name of indexed variable */ - ? luaF_getlocalname(p, t + 1, pc) - : upvalname(p, t); - kname(p, pc, k, name); - return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field"; - } - case OP_GETUPVAL: { - *name = upvalname(p, GETARG_B(i)); - return "upvalue"; - } - case OP_LOADK: - case OP_LOADKX: { - int b = (op == OP_LOADK) ? GETARG_Bx(i) - : GETARG_Ax(p->code[pc + 1]); - if (ttisstring(&p->k[b])) { - *name = svalue(&p->k[b]); - return "constant"; - } - break; - } - case OP_SELF: { - int k = GETARG_C(i); /* key index */ - kname(p, pc, k, name); - return "method"; - } - default: break; /* go through to return NULL */ - } - } - return NULL; /* could not find reasonable name */ -} - - -static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) { - TMS tm; - Proto *p = ci_func(ci)->p; /* calling function */ - int pc = currentpc(ci); /* calling instruction index */ - Instruction i = p->code[pc]; /* calling instruction */ - switch (GET_OPCODE(i)) { - case OP_CALL: - case OP_TAILCALL: /* get function name */ - return getobjname(p, pc, GETARG_A(i), name); - case OP_TFORCALL: { /* for iterator */ - *name = "for iterator"; - return "for iterator"; - } - /* all other instructions can call only through metamethods */ - case OP_SELF: - case OP_GETTABUP: - case OP_GETTABLE: tm = TM_INDEX; break; - case OP_SETTABUP: - case OP_SETTABLE: tm = TM_NEWINDEX; break; - case OP_EQ: tm = TM_EQ; break; - case OP_ADD: tm = TM_ADD; break; 
- case OP_SUB: tm = TM_SUB; break; - case OP_MUL: tm = TM_MUL; break; - case OP_DIV: tm = TM_DIV; break; - case OP_MOD: tm = TM_MOD; break; - case OP_POW: tm = TM_POW; break; - case OP_UNM: tm = TM_UNM; break; - case OP_LEN: tm = TM_LEN; break; - case OP_LT: tm = TM_LT; break; - case OP_LE: tm = TM_LE; break; - case OP_CONCAT: tm = TM_CONCAT; break; - default: - return NULL; /* else no useful name can be found */ - } - *name = getstr(G(L)->tmname[tm]); - return "metamethod"; -} - -/* }====================================================== */ - - - -/* -** only ANSI way to check whether a pointer points to an array -** (used only for error messages, so efficiency is not a big concern) -*/ -static int isinstack (CallInfo *ci, const TValue *o) { - StkId p; - for (p = ci->u.l.base; p < ci->top; p++) - if (o == p) return 1; - return 0; -} - - -static const char *getupvalname (CallInfo *ci, const TValue *o, - const char **name) { - LClosure *c = ci_func(ci); - int i; - for (i = 0; i < c->nupvalues; i++) { - if (c->upvals[i]->v == o) { - *name = upvalname(c->p, i); - return "upvalue"; - } - } - return NULL; -} - - -l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) { - CallInfo *ci = L->ci; - const char *name = NULL; - const char *t = objtypename(o); - const char *kind = NULL; - if (isLua(ci)) { - kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */ - if (!kind && isinstack(ci, o)) /* no? 
try a register */ - kind = getobjname(ci_func(ci)->p, currentpc(ci), - cast_int(o - ci->u.l.base), &name); - } - if (kind) - luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)", - op, kind, name, t); - else - luaG_runerror(L, "attempt to %s a %s value", op, t); -} - - -l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) { - if (ttisstring(p1) || ttisnumber(p1)) p1 = p2; - lua_assert(!ttisstring(p1) && !ttisnumber(p1)); - luaG_typeerror(L, p1, "concatenate"); -} - - -l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) { - TValue temp; - if (luaV_tonumber(p1, &temp) == NULL) - p2 = p1; /* first operand is wrong */ - luaG_typeerror(L, p2, "perform arithmetic on"); -} - - -l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) { - const char *t1 = objtypename(p1); - const char *t2 = objtypename(p2); - if (t1 == t2) - luaG_runerror(L, "attempt to compare two %s values", t1); - else - luaG_runerror(L, "attempt to compare %s with %s", t1, t2); -} - - -static void addinfo (lua_State *L, const char *msg) { - CallInfo *ci = L->ci; - if (isLua(ci)) { /* is Lua code? */ - char buff[LUA_IDSIZE]; /* add file:line information */ - int line = currentline(ci); - TString *src = ci_func(ci)->p->source; - if (src) - luaO_chunkid(buff, getstr(src), LUA_IDSIZE); - else { /* no source available; use "?" instead */ - buff[0] = '?'; buff[1] = '\0'; - } - luaO_pushfstring(L, "%s:%d: %s", buff, line, msg); - } -} - - -l_noret luaG_errormsg (lua_State *L) { - if (L->errfunc != 0) { /* is there an error handling function? */ - StkId errfunc = restorestack(L, L->errfunc); - if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR); - setobjs2s(L, L->top, L->top - 1); /* move argument */ - setobjs2s(L, L->top - 1, errfunc); /* push function */ - L->top++; - luaD_call(L, L->top - 2, 1, 0); /* call it */ - } - luaD_throw(L, LUA_ERRRUN); -} - - -l_noret luaG_runerror (lua_State *L, const char *fmt, ...) 
{ - va_list argp; - va_start(argp, fmt); - addinfo(L, luaO_pushvfstring(L, fmt, argp)); - va_end(argp); - luaG_errormsg(L); -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h deleted file mode 100644 index 6445c763ea51..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions from Debug Interface module -** See Copyright Notice in lua.h -*/ - -#ifndef ldebug_h -#define ldebug_h - - -#include "lstate.h" - - -#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1) - -#define getfuncline(f,pc) (((f)->lineinfo) ? (f)->lineinfo[pc] : 0) - -#define resethookcount(L) (L->hookcount = L->basehookcount) - -/* Active Lua function (given call info) */ -#define ci_func(ci) (clLvalue((ci)->func)) - - -LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o, - const char *opname); -LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2); -LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1, - const TValue *p2); -LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1, - const TValue *p2); -LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...); -LUAI_FUNC l_noret luaG_errormsg (lua_State *L); - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c deleted file mode 100644 index cb49cb55e6cf..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c +++ /dev/null @@ -1,691 +0,0 @@ -/* -** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $ -** Stack and Call structure of Lua -** See Copyright Notice in lua.h -*/ - - -#include - -#define ldo_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include 
"lmem.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lundump.h" -#include "lvm.h" -#include "lzio.h" - - - - -/* -** {====================================================== -** Error-recovery functions -** ======================================================= -*/ - -/* -** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By -** default, Lua handles errors with exceptions when compiling as -** C++ code, with _longjmp/_setjmp when asked to use them, and with -** longjmp/setjmp otherwise. -*/ -#if !defined(LUAI_THROW) - -#ifdef _KERNEL -#ifdef illumos -#define LUAI_THROW(L,c) longjmp(&(c)->b) -#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } -#define luai_jmpbuf label_t -#else -#define LUAI_THROW(L,c) longjmp((c)->b, 1) -#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } -#define luai_jmpbuf jmp_buf -#endif -#else -#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP) -/* C++ exceptions */ -#define LUAI_THROW(L,c) throw(c) -#define LUAI_TRY(L,c,a) \ - try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; } -#define luai_jmpbuf int /* dummy variable */ - -#elif defined(LUA_USE_ULONGJMP) -/* in Unix, try _longjmp/_setjmp (more efficient) */ -#define LUAI_THROW(L,c) _longjmp((c)->b, 1) -#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a } -#define luai_jmpbuf jmp_buf - -#else -/* default handling with long jumps */ -#define LUAI_THROW(L,c) longjmp((c)->b, 1) -#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } -#define luai_jmpbuf jmp_buf - -#endif - -#endif - -#endif - - -/* chain list of long jump buffers */ -struct lua_longjmp { - struct lua_longjmp *previous; - luai_jmpbuf b; - volatile int status; /* error code */ -}; - - -static void seterrorobj (lua_State *L, int errcode, StkId oldtop) { - switch (errcode) { - case LUA_ERRMEM: { /* memory error? 
*/ - setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */ - break; - } - case LUA_ERRERR: { - setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling")); - break; - } - default: { - setobjs2s(L, oldtop, L->top - 1); /* error message on current top */ - break; - } - } - L->top = oldtop + 1; -} - - -l_noret luaD_throw (lua_State *L, int errcode) { - if (L->errorJmp) { /* thread has an error handler? */ - L->errorJmp->status = errcode; /* set status */ - LUAI_THROW(L, L->errorJmp); /* jump to it */ - } - else { /* thread has no error handler */ - L->status = cast_byte(errcode); /* mark it as dead */ - if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */ - setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */ - luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */ - } - else { /* no handler at all; abort */ - if (G(L)->panic) { /* panic function? */ - lua_unlock(L); - G(L)->panic(L); /* call it (last chance to jump out) */ - } - panic("no error handler"); - } - } -} - - -int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { - unsigned short oldnCcalls = L->nCcalls; - struct lua_longjmp lj; - lj.status = LUA_OK; - lj.previous = L->errorJmp; /* chain new error handler */ - L->errorJmp = &lj; - LUAI_TRY(L, &lj, - (*f)(L, ud); - ); - L->errorJmp = lj.previous; /* restore old error handler */ - L->nCcalls = oldnCcalls; - return lj.status; -} - -/* }====================================================== */ - - -static void correctstack (lua_State *L, TValue *oldstack) { - CallInfo *ci; - GCObject *up; - L->top = (L->top - oldstack) + L->stack; - for (up = L->openupval; up != NULL; up = up->gch.next) - gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack; - for (ci = L->ci; ci != NULL; ci = ci->previous) { - ci->top = (ci->top - oldstack) + L->stack; - ci->func = (ci->func - oldstack) + L->stack; - if (isLua(ci)) - ci->u.l.base = (ci->u.l.base - oldstack) + L->stack; - } -} - - -/* some 
space for error handling */ -#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200) - - -void luaD_reallocstack (lua_State *L, int newsize) { - TValue *oldstack = L->stack; - int lim = L->stacksize; - lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE); - lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK); - luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue); - for (; lim < newsize; lim++) - setnilvalue(L->stack + lim); /* erase new segment */ - L->stacksize = newsize; - L->stack_last = L->stack + newsize - EXTRA_STACK; - correctstack(L, oldstack); -} - - -void luaD_growstack (lua_State *L, int n) { - int size = L->stacksize; - if (size > LUAI_MAXSTACK) /* error after extra size? */ - luaD_throw(L, LUA_ERRERR); - else { - int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK; - int newsize = 2 * size; - if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK; - if (newsize < needed) newsize = needed; - if (newsize > LUAI_MAXSTACK) { /* stack overflow? */ - luaD_reallocstack(L, ERRORSTACKSIZE); - luaG_runerror(L, "stack overflow"); - } - else - luaD_reallocstack(L, newsize); - } -} - - -static int stackinuse (lua_State *L) { - CallInfo *ci; - StkId lim = L->top; - for (ci = L->ci; ci != NULL; ci = ci->previous) { - lua_assert(ci->top <= L->stack_last); - if (lim < ci->top) lim = ci->top; - } - return cast_int(lim - L->stack) + 1; /* part of stack in use */ -} - - -void luaD_shrinkstack (lua_State *L) { - int inuse = stackinuse(L); - int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK; - if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK; - if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */ - goodsize >= L->stacksize) /* would grow instead of shrink? 
*/ - condmovestack(L); /* don't change stack (change only for debugging) */ - else - luaD_reallocstack(L, goodsize); /* shrink it */ -} - - -void luaD_hook (lua_State *L, int event, int line) { - lua_Hook hook = L->hook; - if (hook && L->allowhook) { - CallInfo *ci = L->ci; - ptrdiff_t top = savestack(L, L->top); - ptrdiff_t ci_top = savestack(L, ci->top); - lua_Debug ar; - ar.event = event; - ar.currentline = line; - ar.i_ci = ci; - luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ - ci->top = L->top + LUA_MINSTACK; - lua_assert(ci->top <= L->stack_last); - L->allowhook = 0; /* cannot call hooks inside a hook */ - ci->callstatus |= CIST_HOOKED; - lua_unlock(L); - (*hook)(L, &ar); - lua_lock(L); - lua_assert(!L->allowhook); - L->allowhook = 1; - ci->top = restorestack(L, ci_top); - L->top = restorestack(L, top); - ci->callstatus &= ~CIST_HOOKED; - } -} - - -static void callhook (lua_State *L, CallInfo *ci) { - int hook = LUA_HOOKCALL; - ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */ - if (isLua(ci->previous) && - GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) { - ci->callstatus |= CIST_TAIL; - hook = LUA_HOOKTAILCALL; - } - luaD_hook(L, hook, -1); - ci->u.l.savedpc--; /* correct 'pc' */ -} - - -static StkId adjust_varargs (lua_State *L, Proto *p, int actual) { - int i; - int nfixargs = p->numparams; - StkId base, fixed; - lua_assert(actual >= nfixargs); - /* move fixed parameters to final position */ - luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */ - fixed = L->top - actual; /* first fixed argument */ - base = L->top; /* final position of first argument */ - for (i=0; itop++, fixed + i); - setnilvalue(fixed + i); - } - return base; -} - - -static StkId tryfuncTM (lua_State *L, StkId func) { - const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL); - StkId p; - ptrdiff_t funcr = savestack(L, func); - if (!ttisfunction(tm)) - luaG_typeerror(L, func, "call"); - /* Open a hole inside the stack at 
`func' */ - for (p = L->top; p > func; p--) setobjs2s(L, p, p-1); - incr_top(L); - func = restorestack(L, funcr); /* previous call may change stack */ - setobj2s(L, func, tm); /* tag method is the new function to be called */ - return func; -} - - - -#define next_ci(L) (L->ci = (L->ci->next ? L->ci->next : luaE_extendCI(L))) - - -/* -** returns true if function has been executed (C function) -*/ -int luaD_precall (lua_State *L, StkId func, int nresults) { - lua_CFunction f; - CallInfo *ci; - int n; /* number of arguments (Lua) or returns (C) */ - ptrdiff_t funcr = savestack(L, func); - switch (ttype(func)) { - case LUA_TLCF: /* light C function */ - f = fvalue(func); - goto Cfunc; - case LUA_TCCL: { /* C closure */ - f = clCvalue(func)->f; - Cfunc: - luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ - ci = next_ci(L); /* now 'enter' new function */ - ci->nresults = nresults; - ci->func = restorestack(L, funcr); - ci->top = L->top + LUA_MINSTACK; - lua_assert(ci->top <= L->stack_last); - ci->callstatus = 0; - luaC_checkGC(L); /* stack grow uses memory */ - if (L->hookmask & LUA_MASKCALL) - luaD_hook(L, LUA_HOOKCALL, -1); - lua_unlock(L); - n = (*f)(L); /* do the actual call */ - lua_lock(L); - api_checknelems(L, n); - luaD_poscall(L, L->top - n); - return 1; - } - case LUA_TLCL: { /* Lua function: prepare its call */ - StkId base; - Proto *p = clLvalue(func)->p; - n = cast_int(L->top - func) - 1; /* number of real arguments */ - luaD_checkstack(L, p->maxstacksize); - for (; n < p->numparams; n++) - setnilvalue(L->top++); /* complete missing arguments */ - if (!p->is_vararg) { - func = restorestack(L, funcr); - base = func + 1; - } - else { - base = adjust_varargs(L, p, n); - func = restorestack(L, funcr); /* previous call can change stack */ - } - ci = next_ci(L); /* now 'enter' new function */ - ci->nresults = nresults; - ci->func = func; - ci->u.l.base = base; - ci->top = base + p->maxstacksize; - lua_assert(ci->top <= L->stack_last); - 
ci->u.l.savedpc = p->code; /* starting point */ - ci->callstatus = CIST_LUA; - L->top = ci->top; - luaC_checkGC(L); /* stack grow uses memory */ - if (L->hookmask & LUA_MASKCALL) - callhook(L, ci); - return 0; - } - default: { /* not a function */ - func = tryfuncTM(L, func); /* retry with 'function' tag method */ - return luaD_precall(L, func, nresults); /* now it must be a function */ - } - } -} - - -int luaD_poscall (lua_State *L, StkId firstResult) { - StkId res; - int wanted, i; - CallInfo *ci = L->ci; - if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) { - if (L->hookmask & LUA_MASKRET) { - ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */ - luaD_hook(L, LUA_HOOKRET, -1); - firstResult = restorestack(L, fr); - } - L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */ - } - res = ci->func; /* res == final position of 1st result */ - wanted = ci->nresults; - L->ci = ci = ci->previous; /* back to caller */ - /* move results to correct place */ - for (i = wanted; i != 0 && firstResult < L->top; i--) - setobjs2s(L, res++, firstResult++); - while (i-- > 0) - setnilvalue(res++); - L->top = res; - return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */ -} - - -/* -** Call a function (C or Lua). The function to be called is at *func. -** The arguments are on the stack, right after the function. -** When returns, all the results are on the stack, starting at the original -** function position. -*/ -void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) { - if (++L->nCcalls >= LUAI_MAXCCALLS) { - if (L->nCcalls == LUAI_MAXCCALLS) - luaG_runerror(L, "C stack overflow"); - else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3))) - luaD_throw(L, LUA_ERRERR); /* error while handing stack error */ - } - if (!allowyield) L->nny++; - if (!luaD_precall(L, func, nResults)) /* is a Lua function? 
*/ - luaV_execute(L); /* call it */ - if (!allowyield) L->nny--; - L->nCcalls--; -} - - -static void finishCcall (lua_State *L) { - CallInfo *ci = L->ci; - int n; - lua_assert(ci->u.c.k != NULL); /* must have a continuation */ - lua_assert(L->nny == 0); - if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */ - ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */ - L->errfunc = ci->u.c.old_errfunc; - } - /* finish 'lua_callk'/'lua_pcall' */ - adjustresults(L, ci->nresults); - /* call continuation function */ - if (!(ci->callstatus & CIST_STAT)) /* no call status? */ - ci->u.c.status = LUA_YIELD; /* 'default' status */ - lua_assert(ci->u.c.status != LUA_OK); - ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED; - lua_unlock(L); - n = (*ci->u.c.k)(L); - lua_lock(L); - api_checknelems(L, n); - /* finish 'luaD_precall' */ - luaD_poscall(L, L->top - n); -} - - -static void unroll (lua_State *L, void *ud) { - UNUSED(ud); - for (;;) { - if (L->ci == &L->base_ci) /* stack is empty? */ - return; /* coroutine finished normally */ - if (!isLua(L->ci)) /* C function? 
*/ - finishCcall(L); - else { /* Lua function */ - luaV_finishOp(L); /* finish interrupted instruction */ - luaV_execute(L); /* execute down to higher C 'boundary' */ - } - } -} - - -/* -** check whether thread has a suspended protected call -*/ -static CallInfo *findpcall (lua_State *L) { - CallInfo *ci; - for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */ - if (ci->callstatus & CIST_YPCALL) - return ci; - } - return NULL; /* no pending pcall */ -} - - -static int recover (lua_State *L, int status) { - StkId oldtop; - CallInfo *ci = findpcall(L); - if (ci == NULL) return 0; /* no recovery point */ - /* "finish" luaD_pcall */ - oldtop = restorestack(L, ci->extra); - luaF_close(L, oldtop); - seterrorobj(L, status, oldtop); - L->ci = ci; - L->allowhook = ci->u.c.old_allowhook; - L->nny = 0; /* should be zero to be yieldable */ - luaD_shrinkstack(L); - L->errfunc = ci->u.c.old_errfunc; - ci->callstatus |= CIST_STAT; /* call has error status */ - ci->u.c.status = status; /* (here it is) */ - return 1; /* continue running the coroutine */ -} - - -/* -** signal an error in the call to 'resume', not in the execution of the -** coroutine itself. (Such errors should not be handled by any coroutine -** error handler and should not kill the coroutine.) -*/ -static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) { - L->top = firstArg; /* remove args from the stack */ - setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */ - api_incr_top(L); - luaD_throw(L, -1); /* jump back to 'lua_resume' */ -} - - -/* -** do the work for 'lua_resume' in protected mode -*/ -static void resume_cb (lua_State *L, void *ud) { - int nCcalls = L->nCcalls; - StkId firstArg = cast(StkId, ud); - CallInfo *ci = L->ci; - if (nCcalls >= LUAI_MAXCCALLS) - resume_error(L, "C stack overflow", firstArg); - if (L->status == LUA_OK) { /* may be starting a coroutine */ - if (ci != &L->base_ci) /* not in base level? 
*/ - resume_error(L, "cannot resume non-suspended coroutine", firstArg); - /* coroutine is in base level; start running it */ - if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */ - luaV_execute(L); /* call it */ - } - else if (L->status != LUA_YIELD) - resume_error(L, "cannot resume dead coroutine", firstArg); - else { /* resuming from previous yield */ - L->status = LUA_OK; - ci->func = restorestack(L, ci->extra); - if (isLua(ci)) /* yielded inside a hook? */ - luaV_execute(L); /* just continue running Lua code */ - else { /* 'common' yield */ - if (ci->u.c.k != NULL) { /* does it have a continuation? */ - int n; - ci->u.c.status = LUA_YIELD; /* 'default' status */ - ci->callstatus |= CIST_YIELDED; - lua_unlock(L); - n = (*ci->u.c.k)(L); /* call continuation */ - lua_lock(L); - api_checknelems(L, n); - firstArg = L->top - n; /* yield results come from continuation */ - } - luaD_poscall(L, firstArg); /* finish 'luaD_precall' */ - } - unroll(L, NULL); - } - lua_assert(nCcalls == L->nCcalls); -} - - -LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) { - int status; - int oldnny = L->nny; /* save 'nny' */ - lua_lock(L); - luai_userstateresume(L, nargs); - L->nCcalls = (from) ? from->nCcalls + 1 : 1; - L->nny = 0; /* allow yields */ - api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs); - status = luaD_rawrunprotected(L, resume_cb, L->top - nargs); - if (status == -1) /* error calling 'lua_resume'? */ - status = LUA_ERRRUN; - else { /* yield or regular error */ - while (status != LUA_OK && status != LUA_YIELD) { /* error? */ - if (recover(L, status)) /* recover point? 
*/ - status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */ - else { /* unrecoverable error */ - L->status = cast_byte(status); /* mark thread as `dead' */ - seterrorobj(L, status, L->top); - L->ci->top = L->top; - break; - } - } - lua_assert(status == L->status); - } - L->nny = oldnny; /* restore 'nny' */ - L->nCcalls--; - lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0)); - lua_unlock(L); - return status; -} - - -LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) { - CallInfo *ci = L->ci; - luai_userstateyield(L, nresults); - lua_lock(L); - api_checknelems(L, nresults); - if (L->nny > 0) { - if (L != G(L)->mainthread) - luaG_runerror(L, "attempt to yield across a C-call boundary"); - else - luaG_runerror(L, "attempt to yield from outside a coroutine"); - } - L->status = LUA_YIELD; - ci->extra = savestack(L, ci->func); /* save current 'func' */ - if (isLua(ci)) { /* inside a hook? */ - api_check(L, k == NULL, "hooks cannot continue after yielding"); - } - else { - if ((ci->u.c.k = k) != NULL) /* is there a continuation? */ - ci->u.c.ctx = ctx; /* save context */ - ci->func = L->top - nresults - 1; /* protect stack below results */ - luaD_throw(L, LUA_YIELD); - } - lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */ - lua_unlock(L); - return 0; /* return to 'luaD_hook' */ -} - - -int luaD_pcall (lua_State *L, Pfunc func, void *u, - ptrdiff_t old_top, ptrdiff_t ef) { - int status; - CallInfo *old_ci = L->ci; - lu_byte old_allowhooks = L->allowhook; - unsigned short old_nny = L->nny; - ptrdiff_t old_errfunc = L->errfunc; - L->errfunc = ef; - status = luaD_rawrunprotected(L, func, u); - if (status != LUA_OK) { /* an error occurred? 
*/ - StkId oldtop = restorestack(L, old_top); - luaF_close(L, oldtop); /* close possible pending closures */ - seterrorobj(L, status, oldtop); - L->ci = old_ci; - L->allowhook = old_allowhooks; - L->nny = old_nny; - luaD_shrinkstack(L); - } - L->errfunc = old_errfunc; - return status; -} - - - -/* -** Execute a protected parser. -*/ -struct SParser { /* data to `f_parser' */ - ZIO *z; - Mbuffer buff; /* dynamic structure used by the scanner */ - Dyndata dyd; /* dynamic structures used by the parser */ - const char *mode; - const char *name; -}; - - -static void checkmode (lua_State *L, const char *mode, const char *x) { - if (mode && strchr(mode, x[0]) == NULL) { - luaO_pushfstring(L, - "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode); - luaD_throw(L, LUA_ERRSYNTAX); - } -} - - -static void f_parser (lua_State *L, void *ud) { - int i; - Closure *cl; - struct SParser *p = cast(struct SParser *, ud); - int c = zgetc(p->z); /* read first character */ - if (c == LUA_SIGNATURE[0]) { - checkmode(L, p->mode, "binary"); - cl = luaU_undump(L, p->z, &p->buff, p->name); - } - else { - checkmode(L, p->mode, "text"); - cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c); - } - lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues); - for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */ - UpVal *up = luaF_newupval(L); - cl->l.upvals[i] = up; - luaC_objbarrier(L, cl, up); - } -} - - -int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, - const char *mode) { - struct SParser p; - int status; - L->nny++; /* cannot yield during parsing */ - p.z = z; p.name = name; p.mode = mode; - p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0; - p.dyd.gt.arr = NULL; p.dyd.gt.size = 0; - p.dyd.label.arr = NULL; p.dyd.label.size = 0; - luaZ_initbuffer(L, &p.buff); - status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc); - luaZ_freebuffer(L, &p.buff); - luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size); - luaM_freearray(L, p.dyd.gt.arr, 
p.dyd.gt.size); - luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size); - L->nny--; - return status; -} - - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h deleted file mode 100644 index d3d3082c9ba3..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $ -** Stack and Call structure of Lua -** See Copyright Notice in lua.h -*/ - -#ifndef ldo_h -#define ldo_h - - -#include "lobject.h" -#include "lstate.h" -#include "lzio.h" - - -#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \ - luaD_growstack(L, n); else condmovestack(L); - - -#define incr_top(L) {L->top++; luaD_checkstack(L,0);} - -#define savestack(L,p) ((char *)(p) - (char *)L->stack) -#define restorestack(L,n) ((TValue *)((char *)L->stack + (n))) - - -/* type of protected functions, to be ran by `runprotected' */ -typedef void (*Pfunc) (lua_State *L, void *ud); - -LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, - const char *mode); -LUAI_FUNC void luaD_hook (lua_State *L, int event, int line); -LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults); -LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults, - int allowyield); -LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u, - ptrdiff_t oldtop, ptrdiff_t ef); -LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult); -LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize); -LUAI_FUNC void luaD_growstack (lua_State *L, int n); -LUAI_FUNC void luaD_shrinkstack (lua_State *L); - -LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode); -LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud); - -#endif - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c deleted file mode 100644 index 
64e564933268..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c +++ /dev/null @@ -1,173 +0,0 @@ -/* -** $Id: ldump.c,v 2.17.1.1 2013/04/12 18:48:47 roberto Exp $ -** save precompiled Lua chunks -** See Copyright Notice in lua.h -*/ - -#include - -#define ldump_c -#define LUA_CORE - -#include "lua.h" - -#include "lobject.h" -#include "lstate.h" -#include "lundump.h" - -typedef struct { - lua_State* L; - lua_Writer writer; - void* data; - int strip; - int status; -} DumpState; - -#define DumpMem(b,n,size,D) DumpBlock(b,(n)*(size),D) -#define DumpVar(x,D) DumpMem(&x,1,sizeof(x),D) - -static void DumpBlock(const void* b, size_t size, DumpState* D) -{ - if (D->status==0) - { - lua_unlock(D->L); - D->status=(*D->writer)(D->L,b,size,D->data); - lua_lock(D->L); - } -} - -static void DumpChar(int y, DumpState* D) -{ - char x=(char)y; - DumpVar(x,D); -} - -static void DumpInt(int x, DumpState* D) -{ - DumpVar(x,D); -} - -static void DumpNumber(lua_Number x, DumpState* D) -{ - DumpVar(x,D); -} - -static void DumpVector(const void* b, int n, size_t size, DumpState* D) -{ - DumpInt(n,D); - DumpMem(b,n,size,D); -} - -static void DumpString(const TString* s, DumpState* D) -{ - if (s==NULL) - { - size_t size=0; - DumpVar(size,D); - } - else - { - size_t size=s->tsv.len+1; /* include trailing '\0' */ - DumpVar(size,D); - DumpBlock(getstr(s),size*sizeof(char),D); - } -} - -#define DumpCode(f,D) DumpVector(f->code,f->sizecode,sizeof(Instruction),D) - -static void DumpFunction(const Proto* f, DumpState* D); - -static void DumpConstants(const Proto* f, DumpState* D) -{ - int i,n=f->sizek; - DumpInt(n,D); - for (i=0; ik[i]; - DumpChar(ttypenv(o),D); - switch (ttypenv(o)) - { - case LUA_TNIL: - break; - case LUA_TBOOLEAN: - DumpChar(bvalue(o),D); - break; - case LUA_TNUMBER: - DumpNumber(nvalue(o),D); - break; - case LUA_TSTRING: - DumpString(rawtsvalue(o),D); - break; - default: lua_assert(0); - } - } - n=f->sizep; - DumpInt(n,D); - for (i=0; ip[i],D); -} - 
-static void DumpUpvalues(const Proto* f, DumpState* D) -{ - int i,n=f->sizeupvalues; - DumpInt(n,D); - for (i=0; iupvalues[i].instack,D); - DumpChar(f->upvalues[i].idx,D); - } -} - -static void DumpDebug(const Proto* f, DumpState* D) -{ - int i,n; - DumpString((D->strip) ? NULL : f->source,D); - n= (D->strip) ? 0 : f->sizelineinfo; - DumpVector(f->lineinfo,n,sizeof(int),D); - n= (D->strip) ? 0 : f->sizelocvars; - DumpInt(n,D); - for (i=0; ilocvars[i].varname,D); - DumpInt(f->locvars[i].startpc,D); - DumpInt(f->locvars[i].endpc,D); - } - n= (D->strip) ? 0 : f->sizeupvalues; - DumpInt(n,D); - for (i=0; iupvalues[i].name,D); -} - -static void DumpFunction(const Proto* f, DumpState* D) -{ - DumpInt(f->linedefined,D); - DumpInt(f->lastlinedefined,D); - DumpChar(f->numparams,D); - DumpChar(f->is_vararg,D); - DumpChar(f->maxstacksize,D); - DumpCode(f,D); - DumpConstants(f,D); - DumpUpvalues(f,D); - DumpDebug(f,D); -} - -static void DumpHeader(DumpState* D) -{ - lu_byte h[LUAC_HEADERSIZE]; - luaU_header(h); - DumpBlock(h,LUAC_HEADERSIZE,D); -} - -/* -** dump Lua function as precompiled chunk -*/ -int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip) -{ - DumpState D; - D.L=L; - D.writer=w; - D.data=data; - D.strip=strip; - D.status=0; - DumpHeader(&D); - DumpFunction(f,&D); - return D.status; -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c deleted file mode 100644 index 684e44709a8f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c +++ /dev/null @@ -1,161 +0,0 @@ -/* -** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions to manipulate prototypes and closures -** See Copyright Notice in lua.h -*/ - - -#include - -#define lfunc_c -#define LUA_CORE - -#include "lua.h" - -#include "lfunc.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" - - - -Closure *luaF_newCclosure 
(lua_State *L, int n) { - Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl; - c->c.nupvalues = cast_byte(n); - return c; -} - - -Closure *luaF_newLclosure (lua_State *L, int n) { - Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl; - c->l.p = NULL; - c->l.nupvalues = cast_byte(n); - while (n--) c->l.upvals[n] = NULL; - return c; -} - - -UpVal *luaF_newupval (lua_State *L) { - UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv; - uv->v = &uv->u.value; - setnilvalue(uv->v); - return uv; -} - - -UpVal *luaF_findupval (lua_State *L, StkId level) { - global_State *g = G(L); - GCObject **pp = &L->openupval; - UpVal *p; - UpVal *uv; - while (*pp != NULL && (p = gco2uv(*pp))->v >= level) { - GCObject *o = obj2gco(p); - lua_assert(p->v != &p->u.value); - lua_assert(!isold(o) || isold(obj2gco(L))); - if (p->v == level) { /* found a corresponding upvalue? */ - if (isdead(g, o)) /* is it dead? */ - changewhite(o); /* resurrect it */ - return p; - } - pp = &p->next; - } - /* not found: create a new one */ - uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv; - uv->v = level; /* current value lives in the stack */ - uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */ - uv->u.l.next = g->uvhead.u.l.next; - uv->u.l.next->u.l.prev = uv; - g->uvhead.u.l.next = uv; - lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); - return uv; -} - - -static void unlinkupval (UpVal *uv) { - lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); - uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */ - uv->u.l.prev->u.l.next = uv->u.l.next; -} - - -void luaF_freeupval (lua_State *L, UpVal *uv) { - if (uv->v != &uv->u.value) /* is it open? 
*/ - unlinkupval(uv); /* remove from open list */ - luaM_free(L, uv); /* free upvalue */ -} - - -void luaF_close (lua_State *L, StkId level) { - UpVal *uv; - global_State *g = G(L); - while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) { - GCObject *o = obj2gco(uv); - lua_assert(!isblack(o) && uv->v != &uv->u.value); - L->openupval = uv->next; /* remove from `open' list */ - if (isdead(g, o)) - luaF_freeupval(L, uv); /* free upvalue */ - else { - unlinkupval(uv); /* remove upvalue from 'uvhead' list */ - setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */ - uv->v = &uv->u.value; /* now current value lives here */ - gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */ - g->allgc = o; - luaC_checkupvalcolor(g, uv); - } - } -} - - -Proto *luaF_newproto (lua_State *L) { - Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p; - f->k = NULL; - f->sizek = 0; - f->p = NULL; - f->sizep = 0; - f->code = NULL; - f->cache = NULL; - f->sizecode = 0; - f->lineinfo = NULL; - f->sizelineinfo = 0; - f->upvalues = NULL; - f->sizeupvalues = 0; - f->numparams = 0; - f->is_vararg = 0; - f->maxstacksize = 0; - f->locvars = NULL; - f->sizelocvars = 0; - f->linedefined = 0; - f->lastlinedefined = 0; - f->source = NULL; - return f; -} - - -void luaF_freeproto (lua_State *L, Proto *f) { - luaM_freearray(L, f->code, f->sizecode); - luaM_freearray(L, f->p, f->sizep); - luaM_freearray(L, f->k, f->sizek); - luaM_freearray(L, f->lineinfo, f->sizelineinfo); - luaM_freearray(L, f->locvars, f->sizelocvars); - luaM_freearray(L, f->upvalues, f->sizeupvalues); - luaM_free(L, f); -} - - -/* -** Look for n-th local variable at line `line' in function `func'. -** Returns NULL if not found. -*/ -const char *luaF_getlocalname (const Proto *f, int local_number, int pc) { - int i; - for (i = 0; isizelocvars && f->locvars[i].startpc <= pc; i++) { - if (pc < f->locvars[i].endpc) { /* is variable active? 
*/ - local_number--; - if (local_number == 0) - return getstr(f->locvars[i].varname); - } - } - return NULL; /* not found */ -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h deleted file mode 100644 index ca0d3a3e0b03..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h +++ /dev/null @@ -1,33 +0,0 @@ -/* -** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions to manipulate prototypes and closures -** See Copyright Notice in lua.h -*/ - -#ifndef lfunc_h -#define lfunc_h - - -#include "lobject.h" - - -#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \ - cast(int, sizeof(TValue)*((n)-1))) - -#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \ - cast(int, sizeof(TValue *)*((n)-1))) - - -LUAI_FUNC Proto *luaF_newproto (lua_State *L); -LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems); -LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems); -LUAI_FUNC UpVal *luaF_newupval (lua_State *L); -LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level); -LUAI_FUNC void luaF_close (lua_State *L, StkId level); -LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f); -LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv); -LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number, - int pc); - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c deleted file mode 100644 index 4a7d25af2083..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c +++ /dev/null @@ -1,1220 +0,0 @@ -/* -** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $ -** Garbage Collector -** See Copyright Notice in lua.h -*/ - -#include - -#define lgc_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "lmem.h" -#include 
"lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" - - - -/* -** cost of sweeping one element (the size of a small object divided -** by some adjust for the sweep speed) -*/ -#define GCSWEEPCOST ((sizeof(TString) + 4) / 4) - -/* maximum number of elements to sweep in each single step */ -#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4)) - -/* maximum number of finalizers to call in each GC step */ -#define GCFINALIZENUM 4 - - -/* -** macro to adjust 'stepmul': 'stepmul' is actually used like -** 'stepmul / STEPMULADJ' (value chosen by tests) -*/ -#define STEPMULADJ 200 - - -/* -** macro to adjust 'pause': 'pause' is actually used like -** 'pause / PAUSEADJ' (value chosen by tests) -*/ -#define PAUSEADJ 100 - - -/* -** 'makewhite' erases all color bits plus the old bit and then -** sets only the current white bit -*/ -#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS)) -#define makewhite(g,x) \ - (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g))) - -#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS) -#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT) - - -#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT) - -#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n))) - - -#define checkconsistency(obj) \ - lua_longassert(!iscollectable(obj) || righttt(obj)) - - -#define markvalue(g,o) { checkconsistency(o); \ - if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); } - -#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \ - reallymarkobject(g, obj2gco(t)); } - -static void reallymarkobject (global_State *g, GCObject *o); - - -/* -** {====================================================== -** Generic functions -** ======================================================= -*/ - - -/* -** one after last element in a hash array -*/ -#define gnodelast(h) gnode(h, cast(size_t, sizenode(h))) - - -/* -** link table 'h' into list pointed by 'p' 
-*/ -#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h)) - - -/* -** if key is not marked, mark its entry as dead (therefore removing it -** from the table) -*/ -static void removeentry (Node *n) { - lua_assert(ttisnil(gval(n))); - if (valiswhite(gkey(n))) - setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */ -} - - -/* -** tells whether a key or value can be cleared from a weak -** table. Non-collectable objects are never removed from weak -** tables. Strings behave as `values', so are never removed too. for -** other objects: if really collected, cannot keep them; for objects -** being finalized, keep them in keys, but not in values -*/ -static int iscleared (global_State *g, const TValue *o) { - if (!iscollectable(o)) return 0; - else if (ttisstring(o)) { - markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */ - return 0; - } - else return iswhite(gcvalue(o)); -} - - -/* -** barrier that moves collector forward, that is, mark the white object -** being pointed by a black object. -*/ -void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) { - global_State *g = G(L); - lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o)); - lua_assert(g->gcstate != GCSpause); - lua_assert(gch(o)->tt != LUA_TTABLE); - if (keepinvariantout(g)) /* must keep invariant? */ - reallymarkobject(g, v); /* restore invariant */ - else { /* sweep phase */ - lua_assert(issweepphase(g)); - makewhite(g, o); /* mark main obj. as white to avoid other barriers */ - } -} - - -/* -** barrier that moves collector backward, that is, mark the black object -** pointing to a white object as gray again. (Current implementation -** only works for tables; access to 'gclist' is not uniform across -** different types.) 
-*/ -void luaC_barrierback_ (lua_State *L, GCObject *o) { - global_State *g = G(L); - lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE); - black2gray(o); /* make object gray (again) */ - gco2t(o)->gclist = g->grayagain; - g->grayagain = o; -} - - -/* -** barrier for prototypes. When creating first closure (cache is -** NULL), use a forward barrier; this may be the only closure of the -** prototype (if it is a "regular" function, with a single instance) -** and the prototype may be big, so it is better to avoid traversing -** it again. Otherwise, use a backward barrier, to avoid marking all -** possible instances. -*/ -LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) { - global_State *g = G(L); - lua_assert(isblack(obj2gco(p))); - if (p->cache == NULL) { /* first time? */ - luaC_objbarrier(L, p, c); - } - else { /* use a backward barrier */ - black2gray(obj2gco(p)); /* make prototype gray (again) */ - p->gclist = g->grayagain; - g->grayagain = obj2gco(p); - } -} - - -/* -** check color (and invariants) for an upvalue that was closed, -** i.e., moved into the 'allgc' list -*/ -void luaC_checkupvalcolor (global_State *g, UpVal *uv) { - GCObject *o = obj2gco(uv); - lua_assert(!isblack(o)); /* open upvalues are never black */ - if (isgray(o)) { - if (keepinvariant(g)) { - resetoldbit(o); /* see MOVE OLD rule */ - gray2black(o); /* it is being visited now */ - markvalue(g, uv->v); - } - else { - lua_assert(issweepphase(g)); - makewhite(g, o); - } - } -} - - -/* -** create a new collectable object (with given type and size) and link -** it to '*list'. 'offset' tells how many bytes to allocate before the -** object itself (used only by states). 
-*/ -GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list, - int offset) { - global_State *g = G(L); - char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz)); - GCObject *o = obj2gco(raw + offset); - if (list == NULL) - list = &g->allgc; /* standard list for collectable objects */ - gch(o)->marked = luaC_white(g); - gch(o)->tt = tt; - gch(o)->next = *list; - *list = o; - return o; -} - -/* }====================================================== */ - - - -/* -** {====================================================== -** Mark functions -** ======================================================= -*/ - - -/* -** mark an object. Userdata, strings, and closed upvalues are visited -** and turned black here. Other objects are marked gray and added -** to appropriate list to be visited (and turned black) later. (Open -** upvalues are already linked in 'headuv' list.) -*/ -static void reallymarkobject (global_State *g, GCObject *o) { - lu_mem size; - white2gray(o); - switch (gch(o)->tt) { - case LUA_TSHRSTR: - case LUA_TLNGSTR: { - size = sizestring(gco2ts(o)); - break; /* nothing else to mark; make it black */ - } - case LUA_TUSERDATA: { - Table *mt = gco2u(o)->metatable; - markobject(g, mt); - markobject(g, gco2u(o)->env); - size = sizeudata(gco2u(o)); - break; - } - case LUA_TUPVAL: { - UpVal *uv = gco2uv(o); - markvalue(g, uv->v); - if (uv->v != &uv->u.value) /* open? 
*/ - return; /* open upvalues remain gray */ - size = sizeof(UpVal); - break; - } - case LUA_TLCL: { - gco2lcl(o)->gclist = g->gray; - g->gray = o; - return; - } - case LUA_TCCL: { - gco2ccl(o)->gclist = g->gray; - g->gray = o; - return; - } - case LUA_TTABLE: { - linktable(gco2t(o), &g->gray); - return; - } - case LUA_TTHREAD: { - gco2th(o)->gclist = g->gray; - g->gray = o; - return; - } - case LUA_TPROTO: { - gco2p(o)->gclist = g->gray; - g->gray = o; - return; - } - default: lua_assert(0); return; - } - gray2black(o); - g->GCmemtrav += size; -} - - -/* -** mark metamethods for basic types -*/ -static void markmt (global_State *g) { - int i; - for (i=0; i < LUA_NUMTAGS; i++) - markobject(g, g->mt[i]); -} - - -/* -** mark all objects in list of being-finalized -*/ -static void markbeingfnz (global_State *g) { - GCObject *o; - for (o = g->tobefnz; o != NULL; o = gch(o)->next) { - makewhite(g, o); - reallymarkobject(g, o); - } -} - - -/* -** mark all values stored in marked open upvalues. (See comment in -** 'lstate.h'.) 
-*/ -static void remarkupvals (global_State *g) { - UpVal *uv; - for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) { - if (isgray(obj2gco(uv))) - markvalue(g, uv->v); - } -} - - -/* -** mark root set and reset all gray lists, to start a new -** incremental (or full) collection -*/ -static void restartcollection (global_State *g) { - g->gray = g->grayagain = NULL; - g->weak = g->allweak = g->ephemeron = NULL; - markobject(g, g->mainthread); - markvalue(g, &g->l_registry); - markmt(g); - markbeingfnz(g); /* mark any finalizing object left from previous cycle */ -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Traverse functions -** ======================================================= -*/ - -static void traverseweakvalue (global_State *g, Table *h) { - Node *n, *limit = gnodelast(h); - /* if there is array part, assume it may have white values (do not - traverse it just to check) */ - int hasclears = (h->sizearray > 0); - for (n = gnode(h, 0); n < limit; n++) { - checkdeadkey(n); - if (ttisnil(gval(n))) /* entry is empty? */ - removeentry(n); /* remove it */ - else { - lua_assert(!ttisnil(gkey(n))); - markvalue(g, gkey(n)); /* mark key */ - if (!hasclears && iscleared(g, gval(n))) /* is there a white value? 
*/ - hasclears = 1; /* table will have to be cleared */ - } - } - if (hasclears) - linktable(h, &g->weak); /* has to be cleared later */ - else /* no white values */ - linktable(h, &g->grayagain); /* no need to clean */ -} - - -static int traverseephemeron (global_State *g, Table *h) { - int marked = 0; /* true if an object is marked in this traversal */ - int hasclears = 0; /* true if table has white keys */ - int prop = 0; /* true if table has entry "white-key -> white-value" */ - Node *n, *limit = gnodelast(h); - int i; - /* traverse array part (numeric keys are 'strong') */ - for (i = 0; i < h->sizearray; i++) { - if (valiswhite(&h->array[i])) { - marked = 1; - reallymarkobject(g, gcvalue(&h->array[i])); - } - } - /* traverse hash part */ - for (n = gnode(h, 0); n < limit; n++) { - checkdeadkey(n); - if (ttisnil(gval(n))) /* entry is empty? */ - removeentry(n); /* remove it */ - else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */ - hasclears = 1; /* table must be cleared */ - if (valiswhite(gval(n))) /* value not marked yet? */ - prop = 1; /* must propagate again */ - } - else if (valiswhite(gval(n))) { /* value not marked yet? */ - marked = 1; - reallymarkobject(g, gcvalue(gval(n))); /* mark it now */ - } - } - if (g->gcstate != GCSatomic || prop) - linktable(h, &g->ephemeron); /* have to propagate again */ - else if (hasclears) /* does table have white keys? */ - linktable(h, &g->allweak); /* may have to clean white keys */ - else /* no white keys */ - linktable(h, &g->grayagain); /* no need to clean */ - return marked; -} - - -static void traversestrongtable (global_State *g, Table *h) { - Node *n, *limit = gnodelast(h); - int i; - for (i = 0; i < h->sizearray; i++) /* traverse array part */ - markvalue(g, &h->array[i]); - for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */ - checkdeadkey(n); - if (ttisnil(gval(n))) /* entry is empty? 
*/ - removeentry(n); /* remove it */ - else { - lua_assert(!ttisnil(gkey(n))); - markvalue(g, gkey(n)); /* mark key */ - markvalue(g, gval(n)); /* mark value */ - } - } -} - - -static lu_mem traversetable (global_State *g, Table *h) { - const char *weakkey, *weakvalue; - const TValue *mode = gfasttm(g, h->metatable, TM_MODE); - markobject(g, h->metatable); - if (mode && ttisstring(mode) && /* is there a weak mode? */ - ((weakkey = strchr(svalue(mode), 'k')), - (weakvalue = strchr(svalue(mode), 'v')), - (weakkey || weakvalue))) { /* is really weak? */ - black2gray(obj2gco(h)); /* keep table gray */ - if (!weakkey) /* strong keys? */ - traverseweakvalue(g, h); - else if (!weakvalue) /* strong values? */ - traverseephemeron(g, h); - else /* all weak */ - linktable(h, &g->allweak); /* nothing to traverse now */ - } - else /* not weak */ - traversestrongtable(g, h); - return sizeof(Table) + sizeof(TValue) * h->sizearray + - sizeof(Node) * cast(size_t, sizenode(h)); -} - - -static int traverseproto (global_State *g, Proto *f) { - int i; - if (f->cache && iswhite(obj2gco(f->cache))) - f->cache = NULL; /* allow cache to be collected */ - markobject(g, f->source); - for (i = 0; i < f->sizek; i++) /* mark literals */ - markvalue(g, &f->k[i]); - for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */ - markobject(g, f->upvalues[i].name); - for (i = 0; i < f->sizep; i++) /* mark nested protos */ - markobject(g, f->p[i]); - for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */ - markobject(g, f->locvars[i].varname); - return sizeof(Proto) + sizeof(Instruction) * f->sizecode + - sizeof(Proto *) * f->sizep + - sizeof(TValue) * f->sizek + - sizeof(int) * f->sizelineinfo + - sizeof(LocVar) * f->sizelocvars + - sizeof(Upvaldesc) * f->sizeupvalues; -} - - -static lu_mem traverseCclosure (global_State *g, CClosure *cl) { - int i; - for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ - markvalue(g, &cl->upvalue[i]); - return sizeCclosure(cl->nupvalues); -} 
- -static lu_mem traverseLclosure (global_State *g, LClosure *cl) { - int i; - markobject(g, cl->p); /* mark its prototype */ - for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ - markobject(g, cl->upvals[i]); - return sizeLclosure(cl->nupvalues); -} - - -static lu_mem traversestack (global_State *g, lua_State *th) { - int n = 0; - StkId o = th->stack; - if (o == NULL) - return 1; /* stack not completely built yet */ - for (; o < th->top; o++) /* mark live elements in the stack */ - markvalue(g, o); - if (g->gcstate == GCSatomic) { /* final traversal? */ - StkId lim = th->stack + th->stacksize; /* real end of stack */ - for (; o < lim; o++) /* clear not-marked stack slice */ - setnilvalue(o); - } - else { /* count call infos to compute size */ - CallInfo *ci; - for (ci = &th->base_ci; ci != th->ci; ci = ci->next) - n++; - } - return sizeof(lua_State) + sizeof(TValue) * th->stacksize + - sizeof(CallInfo) * n; -} - - -/* -** traverse one gray object, turning it to black (except for threads, -** which are always gray). 
-*/ -static void propagatemark (global_State *g) { - lu_mem size; - GCObject *o = g->gray; - lua_assert(isgray(o)); - gray2black(o); - switch (gch(o)->tt) { - case LUA_TTABLE: { - Table *h = gco2t(o); - g->gray = h->gclist; /* remove from 'gray' list */ - size = traversetable(g, h); - break; - } - case LUA_TLCL: { - LClosure *cl = gco2lcl(o); - g->gray = cl->gclist; /* remove from 'gray' list */ - size = traverseLclosure(g, cl); - break; - } - case LUA_TCCL: { - CClosure *cl = gco2ccl(o); - g->gray = cl->gclist; /* remove from 'gray' list */ - size = traverseCclosure(g, cl); - break; - } - case LUA_TTHREAD: { - lua_State *th = gco2th(o); - g->gray = th->gclist; /* remove from 'gray' list */ - th->gclist = g->grayagain; - g->grayagain = o; /* insert into 'grayagain' list */ - black2gray(o); - size = traversestack(g, th); - break; - } - case LUA_TPROTO: { - Proto *p = gco2p(o); - g->gray = p->gclist; /* remove from 'gray' list */ - size = traverseproto(g, p); - break; - } - default: lua_assert(0); return; - } - g->GCmemtrav += size; -} - - -static void propagateall (global_State *g) { - while (g->gray) propagatemark(g); -} - - -static void propagatelist (global_State *g, GCObject *l) { - lua_assert(g->gray == NULL); /* no grays left */ - g->gray = l; - propagateall(g); /* traverse all elements from 'l' */ -} - -/* -** retraverse all gray lists. 
Because tables may be reinserted in other -** lists when traversed, traverse the original lists to avoid traversing -** twice the same table (which is not wrong, but inefficient) -*/ -static void retraversegrays (global_State *g) { - GCObject *weak = g->weak; /* save original lists */ - GCObject *grayagain = g->grayagain; - GCObject *ephemeron = g->ephemeron; - g->weak = g->grayagain = g->ephemeron = NULL; - propagateall(g); /* traverse main gray list */ - propagatelist(g, grayagain); - propagatelist(g, weak); - propagatelist(g, ephemeron); -} - - -static void convergeephemerons (global_State *g) { - int changed; - do { - GCObject *w; - GCObject *next = g->ephemeron; /* get ephemeron list */ - g->ephemeron = NULL; /* tables will return to this list when traversed */ - changed = 0; - while ((w = next) != NULL) { - next = gco2t(w)->gclist; - if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */ - propagateall(g); /* propagate changes */ - changed = 1; /* will have to revisit all ephemeron tables */ - } - } - } while (changed); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Sweep Functions -** ======================================================= -*/ - - -/* -** clear entries with unmarked keys from all weaktables in list 'l' up -** to element 'f' -*/ -static void clearkeys (global_State *g, GCObject *l, GCObject *f) { - for (; l != f; l = gco2t(l)->gclist) { - Table *h = gco2t(l); - Node *n, *limit = gnodelast(h); - for (n = gnode(h, 0); n < limit; n++) { - if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) { - setnilvalue(gval(n)); /* remove value ... 
*/ - removeentry(n); /* and remove entry from table */ - } - } - } -} - - -/* -** clear entries with unmarked values from all weaktables in list 'l' up -** to element 'f' -*/ -static void clearvalues (global_State *g, GCObject *l, GCObject *f) { - for (; l != f; l = gco2t(l)->gclist) { - Table *h = gco2t(l); - Node *n, *limit = gnodelast(h); - int i; - for (i = 0; i < h->sizearray; i++) { - TValue *o = &h->array[i]; - if (iscleared(g, o)) /* value was collected? */ - setnilvalue(o); /* remove value */ - } - for (n = gnode(h, 0); n < limit; n++) { - if (!ttisnil(gval(n)) && iscleared(g, gval(n))) { - setnilvalue(gval(n)); /* remove value ... */ - removeentry(n); /* and remove entry from table */ - } - } - } -} - - -static void freeobj (lua_State *L, GCObject *o) { - switch (gch(o)->tt) { - case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break; - case LUA_TLCL: { - luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues)); - break; - } - case LUA_TCCL: { - luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues)); - break; - } - case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break; - case LUA_TTABLE: luaH_free(L, gco2t(o)); break; - case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break; - case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break; - case LUA_TSHRSTR: - G(L)->strt.nuse--; - /* FALLTHROUGH */ - case LUA_TLNGSTR: { - luaM_freemem(L, o, sizestring(gco2ts(o))); - break; - } - default: lua_assert(0); - } -} - - -#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM) -static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count); - - -/* -** sweep the (open) upvalues of a thread and resize its stack and -** list of call-info structures. 
-*/ -static void sweepthread (lua_State *L, lua_State *L1) { - if (L1->stack == NULL) return; /* stack not completely built yet */ - sweepwholelist(L, &L1->openupval); /* sweep open upvalues */ - luaE_freeCI(L1); /* free extra CallInfo slots */ - /* should not change the stack during an emergency gc cycle */ - if (G(L)->gckind != KGC_EMERGENCY) - luaD_shrinkstack(L1); -} - - -/* -** sweep at most 'count' elements from a list of GCObjects erasing dead -** objects, where a dead (not alive) object is one marked with the "old" -** (non current) white and not fixed. -** In non-generational mode, change all non-dead objects back to white, -** preparing for next collection cycle. -** In generational mode, keep black objects black, and also mark them as -** old; stop when hitting an old object, as all objects after that -** one will be old too. -** When object is a thread, sweep its list of open upvalues too. -*/ -static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) { - global_State *g = G(L); - int ow = otherwhite(g); - int toclear, toset; /* bits to clear and to set in all live objects */ - int tostop; /* stop sweep when this is true */ - if (isgenerational(g)) { /* generational mode? */ - toclear = ~0; /* clear nothing */ - toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */ - tostop = bitmask(OLDBIT); /* do not sweep old generation */ - } - else { /* normal mode */ - toclear = maskcolors; /* clear all color bits + old bit */ - toset = luaC_white(g); /* make object white */ - tostop = 0; /* do not stop */ - } - while (*p != NULL && count-- > 0) { - GCObject *curr = *p; - int marked = gch(curr)->marked; - if (isdeadm(ow, marked)) { /* is 'curr' dead? 
*/ - *p = gch(curr)->next; /* remove 'curr' from list */ - freeobj(L, curr); /* erase 'curr' */ - } - else { - if (testbits(marked, tostop)) - return NULL; /* stop sweeping this list */ - if (gch(curr)->tt == LUA_TTHREAD) - sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */ - /* update marks */ - gch(curr)->marked = cast_byte((marked & toclear) | toset); - p = &gch(curr)->next; /* go to next element */ - } - } - return (*p == NULL) ? NULL : p; -} - - -/* -** sweep a list until a live object (or end of list) -*/ -static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) { - GCObject ** old = p; - int i = 0; - do { - i++; - p = sweeplist(L, p, 1); - } while (p == old); - if (n) *n += i; - return p; -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Finalization -** ======================================================= -*/ - -static void checkSizes (lua_State *L) { - global_State *g = G(L); - if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */ - int hs = g->strt.size / 2; /* half the size of the string table */ - if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */ - luaS_resize(L, hs); /* halve its size */ - luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */ - } -} - - -static GCObject *udata2finalize (global_State *g) { - GCObject *o = g->tobefnz; /* get first element */ - lua_assert(isfinalized(o)); - g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */ - gch(o)->next = g->allgc; /* return it to 'allgc' list */ - g->allgc = o; - resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */ - lua_assert(!isold(o)); /* see MOVE OLD rule */ - if (!keepinvariantout(g)) /* not keeping invariant? 
*/ - makewhite(g, o); /* "sweep" object */ - return o; -} - - -static void dothecall (lua_State *L, void *ud) { - UNUSED(ud); - luaD_call(L, L->top - 2, 0, 0); -} - - -static void GCTM (lua_State *L, int propagateerrors) { - global_State *g = G(L); - const TValue *tm; - TValue v; - setgcovalue(L, &v, udata2finalize(g)); - tm = luaT_gettmbyobj(L, &v, TM_GC); - if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */ - int status; - lu_byte oldah = L->allowhook; - int running = g->gcrunning; - L->allowhook = 0; /* stop debug hooks during GC metamethod */ - g->gcrunning = 0; /* avoid GC steps */ - setobj2s(L, L->top, tm); /* push finalizer... */ - setobj2s(L, L->top + 1, &v); /* ... and its argument */ - L->top += 2; /* and (next line) call the finalizer */ - status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0); - L->allowhook = oldah; /* restore hooks */ - g->gcrunning = running; /* restore state */ - if (status != LUA_OK && propagateerrors) { /* error while running __gc? */ - if (status == LUA_ERRRUN) { /* is there an error object? */ - const char *msg = (ttisstring(L->top - 1)) - ? 
svalue(L->top - 1) - : "no message"; - luaO_pushfstring(L, "error in __gc metamethod (%s)", msg); - status = LUA_ERRGCMM; /* error in __gc metamethod */ - } - luaD_throw(L, status); /* re-throw error */ - } - } -} - - -/* -** move all unreachable objects (or 'all' objects) that need -** finalization from list 'finobj' to list 'tobefnz' (to be finalized) -*/ -static void separatetobefnz (lua_State *L, int all) { - global_State *g = G(L); - GCObject **p = &g->finobj; - GCObject *curr; - GCObject **lastnext = &g->tobefnz; - /* find last 'next' field in 'tobefnz' list (to add elements in its end) */ - while (*lastnext != NULL) - lastnext = &gch(*lastnext)->next; - while ((curr = *p) != NULL) { /* traverse all finalizable objects */ - lua_assert(!isfinalized(curr)); - lua_assert(testbit(gch(curr)->marked, SEPARATED)); - if (!(iswhite(curr) || all)) /* not being collected? */ - p = &gch(curr)->next; /* don't bother with it */ - else { - l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */ - *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */ - gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */ - *lastnext = curr; - lastnext = &gch(curr)->next; - } - } -} - - -/* -** if object 'o' has a finalizer, remove it from 'allgc' list (must -** search the list to find it) and link it in 'finobj' list. -*/ -void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) { - global_State *g = G(L); - if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */ - isfinalized(o) || /* ... or is finalized... */ - gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? 
*/ - return; /* nothing to be done */ - else { /* move 'o' to 'finobj' list */ - GCObject **p; - GCheader *ho = gch(o); - if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */ - lua_assert(issweepphase(g)); - g->sweepgc = sweeptolive(L, g->sweepgc, NULL); - } - /* search for pointer pointing to 'o' */ - for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ } - *p = ho->next; /* remove 'o' from root list */ - ho->next = g->finobj; /* link it in list 'finobj' */ - g->finobj = o; - l_setbit(ho->marked, SEPARATED); /* mark it as such */ - if (!keepinvariantout(g)) /* not keeping invariant? */ - makewhite(g, o); /* "sweep" object */ - else - resetoldbit(o); /* see MOVE OLD rule */ - } -} - -/* }====================================================== */ - - -/* -** {====================================================== -** GC control -** ======================================================= -*/ - - -/* -** set a reasonable "time" to wait before starting a new GC cycle; -** cycle will start when memory use hits threshold -*/ -static void setpause (global_State *g, l_mem estimate) { - l_mem debt, threshold; - estimate = estimate / PAUSEADJ; /* adjust 'estimate' */ - threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */ - ? estimate * g->gcpause /* no overflow */ - : MAX_LMEM; /* overflow; truncate to maximum */ - debt = -cast(l_mem, threshold - gettotalbytes(g)); - luaE_setdebt(g, debt); -} - - -#define sweepphases \ - (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep)) - - -/* -** enter first sweep phase (strings) and prepare pointers for other -** sweep phases. The calls to 'sweeptolive' make pointers point to an -** object inside the list (instead of to the header), so that the real -** sweep do not need to skip objects created between "now" and the start -** of the real sweep. -** Returns how many objects it swept. 
-*/ -static int entersweep (lua_State *L) { - global_State *g = G(L); - int n = 0; - g->gcstate = GCSsweepstring; - lua_assert(g->sweepgc == NULL && g->sweepfin == NULL); - /* prepare to sweep strings, finalizable objects, and regular objects */ - g->sweepstrgc = 0; - g->sweepfin = sweeptolive(L, &g->finobj, &n); - g->sweepgc = sweeptolive(L, &g->allgc, &n); - return n; -} - - -/* -** change GC mode -*/ -void luaC_changemode (lua_State *L, int mode) { - global_State *g = G(L); - if (mode == g->gckind) return; /* nothing to change */ - if (mode == KGC_GEN) { /* change to generational mode */ - /* make sure gray lists are consistent */ - luaC_runtilstate(L, bitmask(GCSpropagate)); - g->GCestimate = gettotalbytes(g); - g->gckind = KGC_GEN; - } - else { /* change to incremental mode */ - /* sweep all objects to turn them back to white - (as white has not changed, nothing extra will be collected) */ - g->gckind = KGC_NORMAL; - entersweep(L); - luaC_runtilstate(L, ~sweepphases); - } -} - - -/* -** call all pending finalizers -*/ -static void callallpendingfinalizers (lua_State *L, int propagateerrors) { - global_State *g = G(L); - while (g->tobefnz) { - resetoldbit(g->tobefnz); - GCTM(L, propagateerrors); - } -} - - -void luaC_freeallobjects (lua_State *L) { - global_State *g = G(L); - int i; - separatetobefnz(L, 1); /* separate all objects with finalizers */ - lua_assert(g->finobj == NULL); - callallpendingfinalizers(L, 0); - g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */ - g->gckind = KGC_NORMAL; - sweepwholelist(L, &g->finobj); /* finalizers can create objs. 
in 'finobj' */ - sweepwholelist(L, &g->allgc); - for (i = 0; i < g->strt.size; i++) /* free all string lists */ - sweepwholelist(L, &g->strt.hash[i]); - lua_assert(g->strt.nuse == 0); -} - - -static l_mem atomic (lua_State *L) { - global_State *g = G(L); - l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */ - GCObject *origweak, *origall; - lua_assert(!iswhite(obj2gco(g->mainthread))); - markobject(g, L); /* mark running thread */ - /* registry and global metatables may be changed by API */ - markvalue(g, &g->l_registry); - markmt(g); /* mark basic metatables */ - /* remark occasional upvalues of (maybe) dead threads */ - remarkupvals(g); - propagateall(g); /* propagate changes */ - work += g->GCmemtrav; /* stop counting (do not (re)count grays) */ - /* traverse objects caught by write barrier and by 'remarkupvals' */ - retraversegrays(g); - work -= g->GCmemtrav; /* restart counting */ - convergeephemerons(g); - /* at this point, all strongly accessible objects are marked. */ - /* clear values from weak tables, before checking finalizers */ - clearvalues(g, g->weak, NULL); - clearvalues(g, g->allweak, NULL); - origweak = g->weak; origall = g->allweak; - work += g->GCmemtrav; /* stop counting (objects being finalized) */ - separatetobefnz(L, 0); /* separate objects to be finalized */ - markbeingfnz(g); /* mark objects that will be finalized */ - propagateall(g); /* remark, to propagate `preserveness' */ - work -= g->GCmemtrav; /* restart counting */ - convergeephemerons(g); - /* at this point, all resurrected objects are marked. 
*/ - /* remove dead objects from weak tables */ - clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */ - clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */ - /* clear values from resurrected weak tables */ - clearvalues(g, g->weak, origweak); - clearvalues(g, g->allweak, origall); - g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */ - work += g->GCmemtrav; /* complete counting */ - return work; /* estimate of memory marked by 'atomic' */ -} - - -static lu_mem singlestep (lua_State *L) { - global_State *g = G(L); - switch (g->gcstate) { - case GCSpause: { - /* start to count memory traversed */ - g->GCmemtrav = g->strt.size * sizeof(GCObject*); - lua_assert(!isgenerational(g)); - restartcollection(g); - g->gcstate = GCSpropagate; - return g->GCmemtrav; - } - case GCSpropagate: { - if (g->gray) { - lu_mem oldtrav = g->GCmemtrav; - propagatemark(g); - return g->GCmemtrav - oldtrav; /* memory traversed in this step */ - } - else { /* no more `gray' objects */ - lu_mem work; - int sw; - g->gcstate = GCSatomic; /* finish mark phase */ - g->GCestimate = g->GCmemtrav; /* save what was counted */; - work = atomic(L); /* add what was traversed by 'atomic' */ - g->GCestimate += work; /* estimate of total memory traversed */ - sw = entersweep(L); - return work + sw * GCSWEEPCOST; - } - } - case GCSsweepstring: { - int i; - for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++) - sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]); - g->sweepstrgc += i; - if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? 
*/ - g->gcstate = GCSsweepudata; - return i * GCSWEEPCOST; - } - case GCSsweepudata: { - if (g->sweepfin) { - g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX); - return GCSWEEPMAX*GCSWEEPCOST; - } - else { - g->gcstate = GCSsweep; - return 0; - } - } - case GCSsweep: { - if (g->sweepgc) { - g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX); - return GCSWEEPMAX*GCSWEEPCOST; - } - else { - /* sweep main thread */ - GCObject *mt = obj2gco(g->mainthread); - sweeplist(L, &mt, 1); - checkSizes(L); - g->gcstate = GCSpause; /* finish collection */ - return GCSWEEPCOST; - } - } - default: lua_assert(0); return 0; - } -} - - -/* -** advances the garbage collector until it reaches a state allowed -** by 'statemask' -*/ -void luaC_runtilstate (lua_State *L, int statesmask) { - global_State *g = G(L); - while (!testbit(statesmask, g->gcstate)) - singlestep(L); -} - - -static void generationalcollection (lua_State *L) { - global_State *g = G(L); - lua_assert(g->gcstate == GCSpropagate); - if (g->GCestimate == 0) { /* signal for another major collection? */ - luaC_fullgc(L, 0); /* perform a full regular collection */ - g->GCestimate = gettotalbytes(g); /* update control */ - } - else { - lu_mem estimate = g->GCestimate; - luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */ - g->gcstate = GCSpropagate; /* skip restart */ - if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc) - g->GCestimate = 0; /* signal for a major collection */ - else - g->GCestimate = estimate; /* keep estimate from last major coll. */ - - } - setpause(g, gettotalbytes(g)); - lua_assert(g->gcstate == GCSpropagate); -} - - -static void incstep (lua_State *L) { - global_State *g = G(L); - l_mem debt = g->GCdebt; - int stepmul = g->gcstepmul; - if (stepmul < 40) stepmul = 40; /* avoid ridiculous low values (and 0) */ - /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */ - debt = (debt / STEPMULADJ) + 1; - debt = (debt < MAX_LMEM / stepmul) ? 
debt * stepmul : MAX_LMEM; - do { /* always perform at least one single step */ - lu_mem work = singlestep(L); /* do some work */ - debt -= work; - } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause); - if (g->gcstate == GCSpause) - setpause(g, g->GCestimate); /* pause until next cycle */ - else { - debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */ - luaE_setdebt(g, debt); - } -} - - -/* -** performs a basic GC step -*/ -void luaC_forcestep (lua_State *L) { - global_State *g = G(L); - int i; - if (isgenerational(g)) generationalcollection(L); - else incstep(L); - /* run a few finalizers (or all of them at the end of a collect cycle) */ - for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++) - GCTM(L, 1); /* call one finalizer */ -} - - -/* -** performs a basic GC step only if collector is running -*/ -void luaC_step (lua_State *L) { - global_State *g = G(L); - if (g->gcrunning) luaC_forcestep(L); - else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */ -} - - - -/* -** performs a full GC cycle; if "isemergency", does not call -** finalizers (which could change stack positions) -*/ -void luaC_fullgc (lua_State *L, int isemergency) { - global_State *g = G(L); - int origkind = g->gckind; - lua_assert(origkind != KGC_EMERGENCY); - if (isemergency) /* do not run finalizers during emergency GC */ - g->gckind = KGC_EMERGENCY; - else { - g->gckind = KGC_NORMAL; - callallpendingfinalizers(L, 1); - } - if (keepinvariant(g)) { /* may there be some black objects? */ - /* must sweep all objects to turn them back to white - (as white has not changed, nothing will be collected) */ - entersweep(L); - } - /* finish any pending sweep phase to start a new cycle */ - luaC_runtilstate(L, bitmask(GCSpause)); - luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */ - luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */ - if (origkind == KGC_GEN) { /* generational mode? 
*/ - /* generational mode must be kept in propagate phase */ - luaC_runtilstate(L, bitmask(GCSpropagate)); - } - g->gckind = origkind; - setpause(g, gettotalbytes(g)); - if (!isemergency) /* do not run finalizers during emergency GC */ - callallpendingfinalizers(L, 1); -} - -/* }====================================================== */ - - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h deleted file mode 100644 index 84bb1cdf99fa..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h +++ /dev/null @@ -1,157 +0,0 @@ -/* -** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ -** Garbage Collector -** See Copyright Notice in lua.h -*/ - -#ifndef lgc_h -#define lgc_h - - -#include "lobject.h" -#include "lstate.h" - -/* -** Collectable objects may have one of three colors: white, which -** means the object is not marked; gray, which means the -** object is marked, but its references may be not marked; and -** black, which means that the object and all its references are marked. -** The main invariant of the garbage collector, while marking objects, -** is that a black object can never point to a white one. Moreover, -** any gray object must be in a "gray list" (gray, grayagain, weak, -** allweak, ephemeron) so that it can be visited again before finishing -** the collection cycle. These lists have no meaning when the invariant -** is not being enforced (e.g., sweep phase). 
-*/ - - - -/* how much to allocate before next GC step */ -#if !defined(GCSTEPSIZE) -/* ~100 small strings */ -#define GCSTEPSIZE (cast_int(100 * sizeof(TString))) -#endif - - -/* -** Possible states of the Garbage Collector -*/ -#define GCSpropagate 0 -#define GCSatomic 1 -#define GCSsweepstring 2 -#define GCSsweepudata 3 -#define GCSsweep 4 -#define GCSpause 5 - - -#define issweepphase(g) \ - (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep) - -#define isgenerational(g) ((g)->gckind == KGC_GEN) - -/* -** macros to tell when main invariant (white objects cannot point to black -** ones) must be kept. During a non-generational collection, the sweep -** phase may break the invariant, as objects turned white may point to -** still-black objects. The invariant is restored when sweep ends and -** all objects are white again. During a generational collection, the -** invariant must be kept all times. -*/ - -#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic) - - -/* -** Outside the collector, the state in generational mode is kept in -** 'propagate', so 'keepinvariant' is always true. 
-*/ -#define keepinvariantout(g) \ - check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \ - g->gcstate <= GCSatomic) - - -/* -** some useful bit tricks -*/ -#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m))) -#define setbits(x,m) ((x) |= (m)) -#define testbits(x,m) ((x) & (m)) -#define bitmask(b) (1<<(b)) -#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2)) -#define l_setbit(x,b) setbits(x, bitmask(b)) -#define resetbit(x,b) resetbits(x, bitmask(b)) -#define testbit(x,b) testbits(x, bitmask(b)) - - -/* Layout for bit use in `marked' field: */ -#define WHITE0BIT 0 /* object is white (type 0) */ -#define WHITE1BIT 1 /* object is white (type 1) */ -#define BLACKBIT 2 /* object is black */ -#define FINALIZEDBIT 3 /* object has been separated for finalization */ -#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */ -#define FIXEDBIT 5 /* object is fixed (should not be collected) */ -#define OLDBIT 6 /* object is old (only in generational mode) */ -/* bit 7 is currently used by tests (luaL_checkmemory) */ - -#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT) - - -#define iswhite(x) testbits((x)->gch.marked, WHITEBITS) -#define isblack(x) testbit((x)->gch.marked, BLACKBIT) -#define isgray(x) /* neither white nor black */ \ - (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT))) - -#define isold(x) testbit((x)->gch.marked, OLDBIT) - -/* MOVE OLD rule: whenever an object is moved to the beginning of - a GC list, its old bit must be cleared */ -#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT) - -#define otherwhite(g) (g->currentwhite ^ WHITEBITS) -#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow))) -#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked) - -#define changewhite(x) ((x)->gch.marked ^= WHITEBITS) -#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT) - -#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x))) - -#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS) - - -#define luaC_condGC(L,c) \ 
- {if (G(L)->GCdebt > 0) {c;}; condchangemem(L);} -#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);) - - -#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \ - luaC_barrier_(L,obj2gco(p),gcvalue(v)); } - -#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \ - luaC_barrierback_(L,p); } - -#define luaC_objbarrier(L,p,o) \ - { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \ - luaC_barrier_(L,obj2gco(p),obj2gco(o)); } - -#define luaC_objbarrierback(L,p,o) \ - { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); } - -#define luaC_barrierproto(L,p,c) \ - { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); } - -LUAI_FUNC void luaC_freeallobjects (lua_State *L); -LUAI_FUNC void luaC_step (lua_State *L); -LUAI_FUNC void luaC_forcestep (lua_State *L); -LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask); -LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency); -LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, - GCObject **list, int offset); -LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v); -LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o); -LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c); -LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt); -LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv); -LUAI_FUNC void luaC_changemode (lua_State *L, int mode); - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c deleted file mode 100644 index dfac7aef8645..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c +++ /dev/null @@ -1,529 +0,0 @@ -/* -** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $ -** Lexical Analyzer -** See Copyright Notice in lua.h -*/ - -#include - -#define llex_c -#define LUA_CORE - -#include "lua.h" - -#include "lctype.h" -#include "ldo.h" -#include 
"llex.h" -#include "lobject.h" -#include "lparser.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "lzio.h" - - - -#define next(ls) (ls->current = zgetc(ls->z)) - - - -#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') - - -/* ORDER RESERVED */ -static const char *const luaX_tokens [] = { - "and", "break", "do", "else", "elseif", - "end", "false", "for", "function", "goto", "if", - "in", "local", "nil", "not", "or", "repeat", - "return", "then", "true", "until", "while", - "..", "...", "==", ">=", "<=", "~=", "::", "", - "", "", "" -}; - - -#define save_and_next(ls) (save(ls, ls->current), next(ls)) - - -static l_noret lexerror (LexState *ls, const char *msg, int token); - - -static void save (LexState *ls, int c) { - Mbuffer *b = ls->buff; - if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { - size_t newsize; - if (luaZ_sizebuffer(b) >= MAX_SIZET/2) - lexerror(ls, "lexical element too long", 0); - newsize = luaZ_sizebuffer(b) * 2; - luaZ_resizebuffer(ls->L, b, newsize); - } - b->buffer[luaZ_bufflen(b)++] = cast(char, c); -} - - -void luaX_init (lua_State *L) { - int i; - for (i=0; itsv.extra = cast_byte(i+1); /* reserved word */ - } -} - - -const char *luaX_token2str (LexState *ls, int token) { - if (token < FIRST_RESERVED) { /* single-byte symbols? */ - lua_assert(token == cast(unsigned char, token)); - return (lisprint(token)) ? luaO_pushfstring(ls->L, LUA_QL("%c"), token) : - luaO_pushfstring(ls->L, "char(%d)", token); - } - else { - const char *s = luaX_tokens[token - FIRST_RESERVED]; - if (token < TK_EOS) /* fixed format (symbols and reserved words)? 
*/ - return luaO_pushfstring(ls->L, LUA_QS, s); - else /* names, strings, and numerals */ - return s; - } -} - - -static const char *txtToken (LexState *ls, int token) { - switch (token) { - case TK_NAME: - case TK_STRING: - case TK_NUMBER: - save(ls, '\0'); - return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff)); - default: - return luaX_token2str(ls, token); - } -} - - -static l_noret lexerror (LexState *ls, const char *msg, int token) { - char buff[LUA_IDSIZE]; - luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE); - msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg); - if (token) - luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); - luaD_throw(ls->L, LUA_ERRSYNTAX); -} - - -l_noret luaX_syntaxerror (LexState *ls, const char *msg) { - lexerror(ls, msg, ls->t.token); -} - - -/* -** creates a new string and anchors it in function's table so that -** it will not be collected until the end of the function's compilation -** (by that time it should be anchored in function's prototype) -*/ -TString *luaX_newstring (LexState *ls, const char *str, size_t l) { - lua_State *L = ls->L; - TValue *o; /* entry for `str' */ - TString *ts = luaS_newlstr(L, str, l); /* create new string */ - setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */ - o = luaH_set(L, ls->fs->h, L->top - 1); - if (ttisnil(o)) { /* not in use yet? 
(see 'addK') */ - /* boolean value does not need GC barrier; - table has no metatable, so it does not need to invalidate cache */ - setbvalue(o, 1); /* t[string] = true */ - luaC_checkGC(L); - } - else { /* string already present */ - ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */ - } - L->top--; /* remove string from stack */ - return ts; -} - - -/* -** increment line number and skips newline sequence (any of -** \n, \r, \n\r, or \r\n) -*/ -static void inclinenumber (LexState *ls) { - int old = ls->current; - lua_assert(currIsNewline(ls)); - next(ls); /* skip `\n' or `\r' */ - if (currIsNewline(ls) && ls->current != old) - next(ls); /* skip `\n\r' or `\r\n' */ - if (++ls->linenumber >= MAX_INT) - lexerror(ls, "chunk has too many lines", 0); -} - - -void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, - int firstchar) { - ls->decpoint = '.'; - ls->L = L; - ls->current = firstchar; - ls->lookahead.token = TK_EOS; /* no look-ahead token */ - ls->z = z; - ls->fs = NULL; - ls->linenumber = 1; - ls->lastline = 1; - ls->source = source; - ls->envn = luaS_new(L, LUA_ENV); /* create env name */ - luaS_fix(ls->envn); /* never collect this name */ - luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ -} - - - -/* -** ======================================================= -** LEXICAL ANALYZER -** ======================================================= -*/ - - - -static int check_next (LexState *ls, const char *set) { - if (ls->current == '\0' || !strchr(set, ls->current)) - return 0; - save_and_next(ls); - return 1; -} - - -/* -** change all characters 'from' in buffer to 'to' -*/ -static void buffreplace (LexState *ls, char from, char to) { - size_t n = luaZ_bufflen(ls->buff); - char *p = luaZ_buffer(ls->buff); - while (n--) - if (p[n] == from) p[n] = to; -} - - -#if !defined(getlocaledecpoint) -#define getlocaledecpoint() (localeconv()->decimal_point[0]) -#endif - - -#define buff2d(b,e) 
luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e) - -/* -** in case of format error, try to change decimal point separator to -** the one defined in the current locale and check again -*/ -static void trydecpoint (LexState *ls, SemInfo *seminfo) { - char old = ls->decpoint; - ls->decpoint = getlocaledecpoint(); - buffreplace(ls, old, ls->decpoint); /* try new decimal separator */ - if (!buff2d(ls->buff, &seminfo->r)) { - /* format error with correct decimal point: no more options */ - buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */ - lexerror(ls, "malformed number", TK_NUMBER); - } -} - - -/* LUA_NUMBER */ -/* -** this function is quite liberal in what it accepts, as 'luaO_str2d' -** will reject ill-formed numerals. -*/ -static void read_numeral (LexState *ls, SemInfo *seminfo) { - const char *expo = "Ee"; - int first = ls->current; - lua_assert(lisdigit(ls->current)); - save_and_next(ls); - if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */ - expo = "Pp"; - for (;;) { - if (check_next(ls, expo)) /* exponent part? */ - check_next(ls, "+-"); /* optional exponent sign */ - if (lisxdigit(ls->current) || ls->current == '.') - save_and_next(ls); - else break; - } - save(ls, '\0'); - buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */ - if (!buff2d(ls->buff, &seminfo->r)) /* format error? */ - trydecpoint(ls, seminfo); /* try to update decimal point separator */ -} - - -/* -** skip a sequence '[=*[' or ']=*]' and return its number of '='s or -** -1 if sequence is malformed -*/ -static int skip_sep (LexState *ls) { - int count = 0; - int s = ls->current; - lua_assert(s == '[' || s == ']'); - save_and_next(ls); - while (ls->current == '=') { - save_and_next(ls); - count++; - } - return (ls->current == s) ? count : (-count) - 1; -} - - -static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { - save_and_next(ls); /* skip 2nd `[' */ - if (currIsNewline(ls)) /* string starts with a newline? 
*/ - inclinenumber(ls); /* skip it */ - for (;;) { - switch (ls->current) { - case EOZ: - lexerror(ls, (seminfo) ? "unfinished long string" : - "unfinished long comment", TK_EOS); - break; /* to avoid warnings */ - case ']': { - if (skip_sep(ls) == sep) { - save_and_next(ls); /* skip 2nd `]' */ - goto endloop; - } - break; - } - case '\n': case '\r': { - save(ls, '\n'); - inclinenumber(ls); - if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ - break; - } - default: { - if (seminfo) save_and_next(ls); - else next(ls); - } - } - } endloop: - if (seminfo) - seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep), - luaZ_bufflen(ls->buff) - 2*(2 + sep)); -} - - -static void escerror (LexState *ls, int *c, int n, const char *msg) { - int i; - luaZ_resetbuffer(ls->buff); /* prepare error message */ - save(ls, '\\'); - for (i = 0; i < n && c[i] != EOZ; i++) - save(ls, c[i]); - lexerror(ls, msg, TK_STRING); -} - - -static int readhexaesc (LexState *ls) { - int c[3], i; /* keep input for error message */ - int r = 0; /* result accumulator */ - c[0] = 'x'; /* for error message */ - for (i = 1; i < 3; i++) { /* read two hexadecimal digits */ - c[i] = next(ls); - if (!lisxdigit(c[i])) - escerror(ls, c, i + 1, "hexadecimal digit expected"); - r = (r << 4) + luaO_hexavalue(c[i]); - } - return r; -} - - -static int readdecesc (LexState *ls) { - int c[3], i; - int r = 0; /* result accumulator */ - for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ - c[i] = ls->current; - r = 10*r + c[i] - '0'; - next(ls); - } - if (r > UCHAR_MAX) - escerror(ls, c, i, "decimal escape too large"); - return r; -} - - -static void read_string (LexState *ls, int del, SemInfo *seminfo) { - save_and_next(ls); /* keep delimiter (for error messages) */ - while (ls->current != del) { - switch (ls->current) { - case EOZ: - lexerror(ls, "unfinished string", TK_EOS); - break; /* to avoid warnings */ - case '\n': - case '\r': - lexerror(ls, "unfinished 
string", TK_STRING); - break; /* to avoid warnings */ - case '\\': { /* escape sequences */ - int c; /* final character to be saved */ - next(ls); /* do not save the `\' */ - switch (ls->current) { - case 'a': c = '\a'; goto read_save; - case 'b': c = '\b'; goto read_save; - case 'f': c = '\f'; goto read_save; - case 'n': c = '\n'; goto read_save; - case 'r': c = '\r'; goto read_save; - case 't': c = '\t'; goto read_save; - case 'v': c = '\v'; goto read_save; - case 'x': c = readhexaesc(ls); goto read_save; - case '\n': case '\r': - inclinenumber(ls); c = '\n'; goto only_save; - case '\\': case '\"': case '\'': - c = ls->current; goto read_save; - case EOZ: goto no_save; /* will raise an error next loop */ - case 'z': { /* zap following span of spaces */ - next(ls); /* skip the 'z' */ - while (lisspace(ls->current)) { - if (currIsNewline(ls)) inclinenumber(ls); - else next(ls); - } - goto no_save; - } - default: { - if (!lisdigit(ls->current)) - escerror(ls, &ls->current, 1, "invalid escape sequence"); - /* digital escape \ddd */ - c = readdecesc(ls); - goto only_save; - } - } - read_save: next(ls); /* read next character */ - only_save: save(ls, c); /* save 'c' */ - no_save: break; - } - default: - save_and_next(ls); - } - } - save_and_next(ls); /* skip delimiter */ - seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, - luaZ_bufflen(ls->buff) - 2); -} - - -static int llex (LexState *ls, SemInfo *seminfo) { - luaZ_resetbuffer(ls->buff); - for (;;) { - switch (ls->current) { - case '\n': case '\r': { /* line breaks */ - inclinenumber(ls); - break; - } - case ' ': case '\f': case '\t': case '\v': { /* spaces */ - next(ls); - break; - } - case '-': { /* '-' or '--' (comment) */ - next(ls); - if (ls->current != '-') return '-'; - /* else is a comment */ - next(ls); - if (ls->current == '[') { /* long comment? 
*/ - int sep = skip_sep(ls); - luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */ - if (sep >= 0) { - read_long_string(ls, NULL, sep); /* skip long comment */ - luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ - break; - } - } - /* else short comment */ - while (!currIsNewline(ls) && ls->current != EOZ) - next(ls); /* skip until end of line (or end of file) */ - break; - } - case '[': { /* long string or simply '[' */ - int sep = skip_sep(ls); - if (sep >= 0) { - read_long_string(ls, seminfo, sep); - return TK_STRING; - } - else if (sep == -1) return '['; - else lexerror(ls, "invalid long string delimiter", TK_STRING); - } - case '=': { - next(ls); - if (ls->current != '=') return '='; - else { next(ls); return TK_EQ; } - } - case '<': { - next(ls); - if (ls->current != '=') return '<'; - else { next(ls); return TK_LE; } - } - case '>': { - next(ls); - if (ls->current != '=') return '>'; - else { next(ls); return TK_GE; } - } - case '~': { - next(ls); - if (ls->current != '=') return '~'; - else { next(ls); return TK_NE; } - } - case ':': { - next(ls); - if (ls->current != ':') return ':'; - else { next(ls); return TK_DBCOLON; } - } - case '"': case '\'': { /* short literal strings */ - read_string(ls, ls->current, seminfo); - return TK_STRING; - } - case '.': { /* '.', '..', '...', or number */ - save_and_next(ls); - if (check_next(ls, ".")) { - if (check_next(ls, ".")) - return TK_DOTS; /* '...' */ - else return TK_CONCAT; /* '..' */ - } - else if (!lisdigit(ls->current)) return '.'; - /* else go through */ - } - /* FALLTHROUGH */ - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': { - read_numeral(ls, seminfo); - return TK_NUMBER; - } - case EOZ: { - return TK_EOS; - } - default: { - if (lislalpha(ls->current)) { /* identifier or reserved word? 
*/ - TString *ts; - do { - save_and_next(ls); - } while (lislalnum(ls->current)); - ts = luaX_newstring(ls, luaZ_buffer(ls->buff), - luaZ_bufflen(ls->buff)); - seminfo->ts = ts; - if (isreserved(ts)) /* reserved word? */ - return ts->tsv.extra - 1 + FIRST_RESERVED; - else { - return TK_NAME; - } - } - else { /* single-char tokens (+ - / ...) */ - int c = ls->current; - next(ls); - return c; - } - } - } - } -} - - -void luaX_next (LexState *ls) { - ls->lastline = ls->linenumber; - if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */ - ls->t = ls->lookahead; /* use this one */ - ls->lookahead.token = TK_EOS; /* and discharge it */ - } - else - ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ -} - - -int luaX_lookahead (LexState *ls) { - lua_assert(ls->lookahead.token == TK_EOS); - ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); - return ls->lookahead.token; -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h deleted file mode 100644 index a4acdd30218a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h +++ /dev/null @@ -1,78 +0,0 @@ -/* -** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lexical Analyzer -** See Copyright Notice in lua.h -*/ - -#ifndef llex_h -#define llex_h - -#include "lobject.h" -#include "lzio.h" - - -#define FIRST_RESERVED 257 - - - -/* -* WARNING: if you change the order of this enumeration, -* grep "ORDER RESERVED" -*/ -enum RESERVED { - /* terminal symbols denoted by reserved words */ - TK_AND = FIRST_RESERVED, TK_BREAK, - TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION, - TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT, - TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE, - /* other terminal symbols */ - TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS, - TK_NUMBER, TK_NAME, TK_STRING -}; - -/* number of reserved words */ -#define 
NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1)) - - -typedef union { - lua_Number r; - TString *ts; -} SemInfo; /* semantics information */ - - -typedef struct Token { - int token; - SemInfo seminfo; -} Token; - - -/* state of the lexer plus state of the parser when shared by all - functions */ -typedef struct LexState { - int current; /* current character (charint) */ - int linenumber; /* input line counter */ - int lastline; /* line of last token `consumed' */ - Token t; /* current token */ - Token lookahead; /* look ahead token */ - struct FuncState *fs; /* current function (parser) */ - struct lua_State *L; - ZIO *z; /* input stream */ - Mbuffer *buff; /* buffer for tokens */ - struct Dyndata *dyd; /* dynamic structures used by the parser */ - TString *source; /* current source name */ - TString *envn; /* environment variable name */ - char decpoint; /* locale decimal point */ -} LexState; - - -LUAI_FUNC void luaX_init (lua_State *L); -LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, - TString *source, int firstchar); -LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l); -LUAI_FUNC void luaX_next (LexState *ls); -LUAI_FUNC int luaX_lookahead (LexState *ls); -LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s); -LUAI_FUNC const char *luaX_token2str (LexState *ls, int token); - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h deleted file mode 100644 index 4277c1fd03db..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h +++ /dev/null @@ -1,308 +0,0 @@ -/* -** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $ -** Limits, basic types, and some other `installation-dependent' definitions -** See Copyright Notice in lua.h -*/ - -#ifndef llimits_h -#define llimits_h - - -#include - -#include "lua.h" - - -typedef unsigned LUA_INT32 lu_int32; - -typedef LUAI_UMEM lu_mem; - 
-typedef LUAI_MEM l_mem; - - - -/* chars used as small naturals (so that `char' is reserved for characters) */ -typedef unsigned char lu_byte; - - -#define MAX_SIZET ((size_t)(~(size_t)0)-2) - -#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2) - -#define MAX_LMEM ((l_mem) ((MAX_LUMEM >> 1) - 2)) - - -#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */ - -/* -** conversion of pointer to integer -** this is for hashing only; there is no problem if the integer -** cannot hold the whole pointer value -*/ -#define IntPoint(p) ((unsigned int)(lu_mem)(p)) - - - -/* type to ensure maximum alignment */ -#if !defined(LUAI_USER_ALIGNMENT_T) -#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; } -#endif - -typedef LUAI_USER_ALIGNMENT_T L_Umaxalign; - - -/* result of a `usual argument conversion' over lua_Number */ -typedef LUAI_UACNUMBER l_uacNumber; - - -/* internal assertions for in-house debugging */ -#if defined(lua_assert) -#define check_exp(c,e) (lua_assert(c), (e)) -/* to avoid problems with conditions too long */ -#define lua_longassert(c) { if (!(c)) lua_assert(0); } -#else -#define lua_assert(c) ((void)0) -#define check_exp(c,e) (e) -#define lua_longassert(c) ((void)0) -#endif - -/* -** assertion for checking API calls -*/ -#if !defined(luai_apicheck) - -#if defined(LUA_USE_APICHECK) -#include -#define luai_apicheck(L,e) assert(e) -#else -#define luai_apicheck(L,e) lua_assert(e) -#endif - -#endif - -#define api_check(l,e,msg) luai_apicheck(l,(e) && msg) - - -#if !defined(UNUSED) -#define UNUSED(x) ((void)(x)) /* to avoid warnings */ -#endif - - -#define cast(t, exp) ((t)(exp)) - -#define cast_byte(i) cast(lu_byte, (i)) -#define cast_num(i) cast(lua_Number, (i)) -#define cast_int(i) cast(int, (i)) -#define cast_uchar(i) cast(unsigned char, (i)) - - -/* -** non-return type -*/ -#if defined(__GNUC__) -#define l_noret void __attribute__((noreturn)) -#elif defined(_MSC_VER) -#define l_noret void __declspec(noreturn) -#else -#define l_noret void 
-#endif - - - -/* -** maximum depth for nested C calls and syntactical nested non-terminals -** in a program. (Value must fit in an unsigned short int.) -** -** Note: On amd64 platform, the limit has been measured to be 45. We set -** the maximum lower to give a margin for changing the amount of stack -** used by various functions involved in parsing and executing code. -*/ -#if !defined(LUAI_MAXCCALLS) -#define LUAI_MAXCCALLS 20 -#endif - -/* -** maximum number of upvalues in a closure (both C and Lua). (Value -** must fit in an unsigned char.) -*/ -#define MAXUPVAL UCHAR_MAX - - -/* -** type for virtual-machine instructions -** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h) -*/ -typedef lu_int32 Instruction; - - - -/* maximum stack for a Lua function */ -#define MAXSTACK 250 - - - -/* minimum size for the string table (must be power of 2) */ -#if !defined(MINSTRTABSIZE) -#define MINSTRTABSIZE 32 -#endif - - -/* minimum size for string buffer */ -#if !defined(LUA_MINBUFFER) -#define LUA_MINBUFFER 32 -#endif - - -#if !defined(lua_lock) -#define lua_lock(L) ((void) 0) -#define lua_unlock(L) ((void) 0) -#endif - -#if !defined(luai_threadyield) -#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);} -#endif - - -/* -** these macros allow user-specific actions on threads when you defined -** LUAI_EXTRASPACE and need to do something extra when a thread is -** created/deleted/resumed/yielded. 
-*/ -#if !defined(luai_userstateopen) -#define luai_userstateopen(L) ((void)L) -#endif - -#if !defined(luai_userstateclose) -#define luai_userstateclose(L) ((void)L) -#endif - -#if !defined(luai_userstatethread) -#define luai_userstatethread(L,L1) ((void)L) -#endif - -#if !defined(luai_userstatefree) -#define luai_userstatefree(L,L1) ((void)L) -#endif - -#if !defined(luai_userstateresume) -#define luai_userstateresume(L,n) ((void)L) -#endif - -#if !defined(luai_userstateyield) -#define luai_userstateyield(L,n) ((void)L) -#endif - -/* -** lua_number2int is a macro to convert lua_Number to int. -** lua_number2integer is a macro to convert lua_Number to lua_Integer. -** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned. -** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number. -** luai_hashnum is a macro to hash a lua_Number value into an integer. -** The hash must be deterministic and give reasonable values for -** both small and large values (outside the range of integers). 
-*/ - -#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */ -/* trick with Microsoft assembler for X86 */ - -#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i} -#define lua_number2integer(i,n) lua_number2int(i, n) -#define lua_number2unsigned(i,n) \ - {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;} - - -#elif defined(LUA_IEEE754TRICK) /* }{ */ -/* the next trick should work on any machine using IEEE754 with - a 32-bit int type */ - -union luai_Cast { double l_d; LUA_INT32 l_p[2]; }; - -#if !defined(LUA_IEEEENDIAN) /* { */ -#define LUAI_EXTRAIEEE \ - static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)}; -#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33) -#else -#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN -#define LUAI_EXTRAIEEE /* empty */ -#endif /* } */ - -#define lua_number2int32(i,n,t) \ - { LUAI_EXTRAIEEE \ - volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \ - (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; } - -#define luai_hashnum(i,n) \ - { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \ - (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add double bits for his hash */ - -#define lua_number2int(i,n) lua_number2int32(i, n, int) -#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned) - -/* the trick can be expanded to lua_Integer when it is a 32-bit value */ -#if defined(LUA_IEEELL) -#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer) -#endif - -#endif /* } */ - - -/* the following definitions always work, but may be slow */ - -#if !defined(lua_number2int) -#define lua_number2int(i,n) ((i)=(int)(n)) -#endif - -#if !defined(lua_number2integer) -#define lua_number2integer(i,n) ((i)=(lua_Integer)(n)) -#endif - -#if !defined(lua_number2unsigned) /* { */ -/* the following definition assures proper modulo behavior */ -#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT) -#include -#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1) -#define 
lua_number2unsigned(i,n) \ - ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED)) -#else -#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n)) -#endif -#endif /* } */ - - -#if !defined(lua_unsigned2number) -/* on several machines, coercion from unsigned to double is slow, - so it may be worth to avoid */ -#define lua_unsigned2number(u) \ - (((u) <= (lua_Unsigned)INT_MAX) ? (lua_Number)(int)(u) : (lua_Number)(u)) -#endif - - - -#if defined(ltable_c) && !defined(luai_hashnum) - -extern int lcompat_hashnum(int64_t); - -#define luai_hashnum(i,n) (i = lcompat_hashnum(n)) - -#endif - - - -/* -** macro to control inclusion of some hard tests on stack reallocation -*/ -#if !defined(HARDSTACKTESTS) -#define condmovestack(L) ((void)0) -#else -/* realloc stack keeping its size */ -#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize) -#endif - -#if !defined(HARDMEMTESTS) -#define condchangemem(L) condmovestack(L) -#else -#define condchangemem(L) \ - ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1))) -#endif - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c deleted file mode 100644 index 0d070fbde83c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c +++ /dev/null @@ -1,99 +0,0 @@ -/* -** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $ -** Interface to Memory Manager -** See Copyright Notice in lua.h -*/ - - -#include - -#define lmem_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" - - - -/* -** About the realloc function: -** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize); -** (`osize' is the old size, `nsize' is the new size) -** -** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no -** matter 'x'). 
-** -** * frealloc(ud, p, x, 0) frees the block `p' -** (in this specific case, frealloc must return NULL); -** particularly, frealloc(ud, NULL, 0, 0) does nothing -** (which is equivalent to free(NULL) in ANSI C) -** -** frealloc returns NULL if it cannot create or reallocate the area -** (any reallocation to an equal or smaller size cannot fail!) -*/ - - - -#define MINSIZEARRAY 4 - - -void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems, - int limit, const char *what) { - void *newblock; - int newsize; - if (*size >= limit/2) { /* cannot double it? */ - if (*size >= limit) /* cannot grow even a little? */ - luaG_runerror(L, "too many %s (limit is %d)", what, limit); - newsize = limit; /* still have at least one free place */ - } - else { - newsize = (*size)*2; - if (newsize < MINSIZEARRAY) - newsize = MINSIZEARRAY; /* minimum size */ - } - newblock = luaM_reallocv(L, block, *size, newsize, size_elems); - *size = newsize; /* update only when everything else is OK */ - return newblock; -} - - -l_noret luaM_toobig (lua_State *L) { - luaG_runerror(L, "memory allocation error: block too big"); -} - - - -/* -** generic allocation routine. -*/ -void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) { - void *newblock; - global_State *g = G(L); - size_t realosize = (block) ? osize : 0; - lua_assert((realosize == 0) == (block == NULL)); -#if defined(HARDMEMTESTS) - if (nsize > realosize && g->gcrunning) - luaC_fullgc(L, 1); /* force a GC whenever possible */ -#endif - newblock = (*g->frealloc)(g->ud, block, osize, nsize); - if (newblock == NULL && nsize > 0) { - api_check(L, nsize > realosize, - "realloc cannot fail when shrinking a block"); - if (g->gcrunning) { - luaC_fullgc(L, 1); /* try to free some memory... 
*/ - newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */ - } - if (newblock == NULL) - luaD_throw(L, LUA_ERRMEM); - } - lua_assert((nsize == 0) == (newblock == NULL)); - g->GCdebt = (g->GCdebt + nsize) - realosize; - return newblock; -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h deleted file mode 100644 index c75a3d50984a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $ -** Interface to Memory Manager -** See Copyright Notice in lua.h -*/ - -#ifndef lmem_h -#define lmem_h - - -#include - -#include "llimits.h" -#include "lua.h" - - -/* -** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is -** always constant. -** The macro is somewhat complex to avoid warnings: -** +1 avoids warnings of "comparison has constant result"; -** cast to 'void' avoids warnings of "value unused". -*/ -#define luaM_reallocv(L,b,on,n,e) \ - (cast(void, \ - (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? 
(luaM_toobig(L), 0) : 0), \ - luaM_realloc_(L, (b), (on)*(e), (n)*(e))) - -#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0) -#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0) -#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0])) - -#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s)) -#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t))) -#define luaM_newvector(L,n,t) \ - cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t))) - -#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s)) - -#define luaM_growvector(L,v,nelems,size,t,limit,e) \ - if ((nelems)+1 > (size)) \ - ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e))) - -#define luaM_reallocvector(L, v,oldn,n,t) \ - ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t)))) - -LUAI_FUNC l_noret luaM_toobig (lua_State *L); - -/* not to be called directly */ -LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize, - size_t size); -LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size, - size_t size_elem, int limit, - const char *what); - -#endif - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c deleted file mode 100644 index 339c84d21d79..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c +++ /dev/null @@ -1,283 +0,0 @@ -/* -** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ -** Some generic functions over Lua objects -** See Copyright Notice in lua.h -*/ - -#include - -#define lobject_c -#define LUA_CORE - -#include "lua.h" - -#include "lctype.h" -#include "ldebug.h" -#include "ldo.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "lvm.h" - - - -LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT}; - - -/* -** converts an integer to a "floating point byte", represented as -** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if -** 
eeeee != 0 and (xxx) otherwise. -*/ -int luaO_int2fb (unsigned int x) { - int e = 0; /* exponent */ - if (x < 8) return x; - while (x >= 0x10) { - x = (x+1) >> 1; - e++; - } - return ((e+1) << 3) | (cast_int(x) - 8); -} - - -/* converts back */ -int luaO_fb2int (int x) { - int e = (x >> 3) & 0x1f; - if (e == 0) return x; - else return ((x & 7) + 8) << (e - 1); -} - - -int luaO_ceillog2 (unsigned int x) { - static const lu_byte log_2[256] = { - 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - }; - int l = 0; - x--; - while (x >= 256) { l += 8; x >>= 8; } - return l + log_2[x]; -} - - -lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) { - switch (op) { - case LUA_OPADD: return luai_numadd(NULL, v1, v2); - case LUA_OPSUB: return luai_numsub(NULL, v1, v2); - case LUA_OPMUL: return luai_nummul(NULL, v1, v2); - case LUA_OPDIV: return luai_numdiv(NULL, v1, v2); - case LUA_OPMOD: return luai_nummod(NULL, v1, v2); - case LUA_OPPOW: return luai_numpow(NULL, v1, v2); - case LUA_OPUNM: return luai_numunm(NULL, v1); - default: lua_assert(0); return 0; - } -} - - -int luaO_hexavalue (int c) { - if (lisdigit(c)) return c - '0'; - else return ltolower(c) - 'a' + 10; -} - - -#if !defined(lua_strx2number) - - - -static int isneg (const char **s) { - if (**s == '-') { (*s)++; return 1; } - else if (**s == '+') (*s)++; - return 0; -} - - -static lua_Number readhexa (const char **s, lua_Number r, int *count) { - for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */ - r = (r * 
cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s))); - (*count)++; - } - return r; -} - - -/* -** convert an hexadecimal numeric string to a number, following -** C99 specification for 'strtod' -*/ -static lua_Number lua_strx2number (const char *s, char **endptr) { - lua_Number r = 0.0; - int e = 0, i = 0; - int neg = 0; /* 1 if number is negative */ - *endptr = cast(char *, s); /* nothing is valid yet */ - while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */ - neg = isneg(&s); /* check signal */ - if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */ - return 0.0; /* invalid format (no '0x') */ - s += 2; /* skip '0x' */ - r = readhexa(&s, r, &i); /* read integer part */ - if (*s == '.') { - s++; /* skip dot */ - r = readhexa(&s, r, &e); /* read fractional part */ - } - if (i == 0 && e == 0) - return 0.0; /* invalid format (no digit) */ - e *= -4; /* each fractional digit divides value by 2^-4 */ - *endptr = cast(char *, s); /* valid up to here */ - if (*s == 'p' || *s == 'P') { /* exponent part? */ - int exp1 = 0; - int neg1; - s++; /* skip 'p' */ - neg1 = isneg(&s); /* signal */ - if (!lisdigit(cast_uchar(*s))) - goto ret; /* must have at least one digit */ - while (lisdigit(cast_uchar(*s))) /* read exponent */ - exp1 = exp1 * 10 + *(s++) - '0'; - if (neg1) exp1 = -exp1; - e += exp1; - } - *endptr = cast(char *, s); /* valid up to here */ - ret: - if (neg) r = -r; - return (r * (1 << e)); -} - -#endif - - -int luaO_str2d (const char *s, size_t len, lua_Number *result) { - char *endptr; - if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */ - return 0; - else if (strpbrk(s, "xX")) /* hexa? 
*/ - *result = lua_strx2number(s, &endptr); - else - *result = lua_str2number(s, &endptr); - if (endptr == s) return 0; /* nothing recognized */ - while (lisspace(cast_uchar(*endptr))) endptr++; - return (endptr == s + len); /* OK if no trailing characters */ -} - - - -static void pushstr (lua_State *L, const char *str, size_t l) { - setsvalue2s(L, L->top++, luaS_newlstr(L, str, l)); -} - - -/* this function handles only `%d', `%c', %f, %p, and `%s' formats */ -const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) { - int n = 0; - for (;;) { - const char *e = strchr(fmt, '%'); - if (e == NULL) break; - luaD_checkstack(L, 2); /* fmt + item */ - pushstr(L, fmt, e - fmt); - switch (*(e+1)) { - case 's': { - const char *s = va_arg(argp, char *); - if (s == NULL) s = "(null)"; - pushstr(L, s, strlen(s)); - break; - } - case 'c': { - char buff; - buff = cast(char, va_arg(argp, int)); - pushstr(L, &buff, 1); - break; - } - case 'd': { - setnvalue(L->top++, cast_num(va_arg(argp, int))); - break; - } - case 'f': { - setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber))); - break; - } - case 'p': { - char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */ - int l = lcompat_sprintf(buff, "%p", va_arg(argp, void *)); - pushstr(L, buff, l); - break; - } - case '%': { - pushstr(L, "%", 1); - break; - } - default: { - luaG_runerror(L, - "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"), - *(e + 1)); - } - } - n += 2; - fmt = e+2; - } - luaD_checkstack(L, 1); - pushstr(L, fmt, strlen(fmt)); - if (n > 0) luaV_concat(L, n + 1); - return svalue(L->top - 1); -} - - -const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) { - const char *msg; - va_list argp; - va_start(argp, fmt); - msg = luaO_pushvfstring(L, fmt, argp); - va_end(argp); - return msg; -} - - -/* number of chars of a literal string without the ending \0 */ -#define LL(x) (sizeof(x)/sizeof(char) - 1) - -#define RETS "..." 
-#define PRE "[string \"" -#define POS "\"]" - -#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) ) - -void luaO_chunkid (char *out, const char *source, size_t bufflen) { - size_t l = strlen(source); - if (*source == '=') { /* 'literal' source */ - if (l <= bufflen) /* small enough? */ - memcpy(out, source + 1, l * sizeof(char)); - else { /* truncate it */ - addstr(out, source + 1, bufflen - 1); - *out = '\0'; - } - } - else if (*source == '@') { /* file name */ - if (l <= bufflen) /* small enough? */ - memcpy(out, source + 1, l * sizeof(char)); - else { /* add '...' before rest of name */ - addstr(out, RETS, LL(RETS)); - bufflen -= LL(RETS); - memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char)); - } - } - else { /* string; format as [string "source"] */ - const char *nl = strchr(source, '\n'); /* find first new line (if any) */ - addstr(out, PRE, LL(PRE)); /* add prefix */ - bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */ - if (l < bufflen && nl == NULL) { /* small one-line source? 
*/ - addstr(out, source, l); /* keep it */ - } - else { - if (nl != NULL) l = nl - source; /* stop at first newline */ - if (l > bufflen) l = bufflen; - addstr(out, source, l); - addstr(out, RETS, LL(RETS)); - } - memcpy(out, POS, (LL(POS) + 1) * sizeof(char)); - } -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h deleted file mode 100644 index 9c9f23542867..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h +++ /dev/null @@ -1,606 +0,0 @@ -/* -** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $ -** Type definitions for Lua objects -** See Copyright Notice in lua.h -*/ - - -#ifndef lobject_h -#define lobject_h - - -#include - -#include "llimits.h" -#include "lua.h" - - -/* -** Extra tags for non-values -*/ -#define LUA_TPROTO LUA_NUMTAGS -#define LUA_TUPVAL (LUA_NUMTAGS+1) -#define LUA_TDEADKEY (LUA_NUMTAGS+2) - -/* -** number of all possible tags (including LUA_TNONE but excluding DEADKEY) -*/ -#define LUA_TOTALTAGS (LUA_TUPVAL+2) - - -/* -** tags for Tagged Values have the following use of bits: -** bits 0-3: actual tag (a LUA_T* value) -** bits 4-5: variant bits -** bit 6: whether value is collectable -*/ - -#define VARBITS (3 << 4) - - -/* -** LUA_TFUNCTION variants: -** 0 - Lua function -** 1 - light C function -** 2 - regular C function (closure) -*/ - -/* Variant tags for functions */ -#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */ -#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */ -#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */ - - -/* Variant tags for strings */ -#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */ -#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */ - - -/* Bit mark for collectable types */ -#define BIT_ISCOLLECTABLE (1 << 6) - -/* mark a tag as collectable */ -#define ctb(t) ((t) | BIT_ISCOLLECTABLE) - - -/* -** Union of all collectable 
objects -*/ -typedef union GCObject GCObject; - - -/* -** Common Header for all collectable objects (in macro form, to be -** included in other objects) -*/ -#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked - - -/* -** Common header in struct form -*/ -typedef struct GCheader { - CommonHeader; -} GCheader; - - - -/* -** Union of all Lua values -*/ -typedef union Value Value; - - -#define numfield lua_Number n; /* numbers */ - - - -/* -** Tagged Values. This is the basic representation of values in Lua, -** an actual value plus a tag with its type. -*/ - -#define TValuefields Value value_; int tt_ - -typedef struct lua_TValue TValue; - - -/* macro defining a nil value */ -#define NILCONSTANT {NULL}, LUA_TNIL - - -#define val_(o) ((o)->value_) -#define num_(o) (val_(o).n) - - -/* raw type tag of a TValue */ -#define rttype(o) ((o)->tt_) - -/* tag with no variants (bits 0-3) */ -#define novariant(x) ((x) & 0x0F) - -/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */ -#define ttype(o) (rttype(o) & 0x3F) - -/* type tag of a TValue with no variants (bits 0-3) */ -#define ttypenv(o) (novariant(rttype(o))) - - -/* Macros to test type */ -#define checktag(o,t) (rttype(o) == (t)) -#define checktype(o,t) (ttypenv(o) == (t)) -#define ttisnumber(o) checktag((o), LUA_TNUMBER) -#define ttisnil(o) checktag((o), LUA_TNIL) -#define ttisboolean(o) checktag((o), LUA_TBOOLEAN) -#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA) -#define ttisstring(o) checktype((o), LUA_TSTRING) -#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR)) -#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR)) -#define ttistable(o) checktag((o), ctb(LUA_TTABLE)) -#define ttisfunction(o) checktype(o, LUA_TFUNCTION) -#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION) -#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL)) -#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL)) -#define ttislcf(o) checktag((o), LUA_TLCF) -#define ttisuserdata(o) checktag((o), 
ctb(LUA_TUSERDATA)) -#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD)) -#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY) - -#define ttisequal(o1,o2) (rttype(o1) == rttype(o2)) - -/* Macros to access values */ -#define nvalue(o) check_exp(ttisnumber(o), num_(o)) -#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc) -#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p) -#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts) -#define tsvalue(o) (&rawtsvalue(o)->tsv) -#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u) -#define uvalue(o) (&rawuvalue(o)->uv) -#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl) -#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l) -#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c) -#define fvalue(o) check_exp(ttislcf(o), val_(o).f) -#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h) -#define bvalue(o) check_exp(ttisboolean(o), val_(o).b) -#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th) -/* a dead value may get the 'gc' field, but cannot access its contents */ -#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc)) - -#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0)) - - -#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE) - - -/* Macros for internal tests */ -#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt) - -#define checkliveness(g,obj) \ - lua_longassert(!iscollectable(obj) || \ - (righttt(obj) && !isdead(g,gcvalue(obj)))) - - -/* Macros to set values */ -#define settt_(o,t) ((o)->tt_=(t)) - -#define setnvalue(obj,x) \ - { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); } - -#define setnilvalue(obj) settt_(obj, LUA_TNIL) - -#define setfvalue(obj,x) \ - { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); } - -#define setpvalue(obj,x) \ - { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); } - -#define setbvalue(obj,x) \ - { TValue *io=(obj); 
val_(io).b=(x); settt_(io, LUA_TBOOLEAN); } - -#define setgcovalue(L,obj,x) \ - { TValue *io=(obj); GCObject *i_g=(x); \ - val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); } - -#define setsvalue(L,obj,x) \ - { TValue *io=(obj); \ - TString *x_ = (x); \ - val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \ - checkliveness(G(L),io); } - -#define setuvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \ - checkliveness(G(L),io); } - -#define setthvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \ - checkliveness(G(L),io); } - -#define setclLvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \ - checkliveness(G(L),io); } - -#define setclCvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \ - checkliveness(G(L),io); } - -#define sethvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \ - checkliveness(G(L),io); } - -#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY) - - - -#define setobj(L,obj1,obj2) \ - { const TValue *io2=(obj2); TValue *io1=(obj1); \ - io1->value_ = io2->value_; io1->tt_ = io2->tt_; \ - checkliveness(G(L),io1); } - - -/* -** different types of assignments, according to destination -*/ - -/* from stack to (same) stack */ -#define setobjs2s setobj -/* to stack (not from same stack) */ -#define setobj2s setobj -#define setsvalue2s setsvalue -#define sethvalue2s sethvalue -#define setptvalue2s setptvalue -/* from table to same table */ -#define setobjt2t setobj -/* to table */ -#define setobj2t setobj -/* to new object */ -#define setobj2n setobj -#define setsvalue2n setsvalue - - -/* check whether a number is valid (useful only for NaN trick) */ -#define luai_checknum(L,o,c) { /* empty */ } - - -/* -** {====================================================== -** NaN 
Trick -** ======================================================= -*/ -#if defined(LUA_NANTRICK) - -/* -** numbers are represented in the 'd_' field. All other values have the -** value (NNMARK | tag) in 'tt__'. A number with such pattern would be -** a "signaled NaN", which is never generated by regular operations by -** the CPU (nor by 'strtod') -*/ - -/* allows for external implementation for part of the trick */ -#if !defined(NNMARK) /* { */ - - -#if !defined(LUA_IEEEENDIAN) -#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN' -#endif - - -#define NNMARK 0x7FF7A500 -#define NNMASK 0x7FFFFF00 - -#undef TValuefields -#undef NILCONSTANT - -#if (LUA_IEEEENDIAN == 0) /* { */ - -/* little endian */ -#define TValuefields \ - union { struct { Value v__; int tt__; } i; double d__; } u -#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}} -/* field-access macros */ -#define v_(o) ((o)->u.i.v__) -#define d_(o) ((o)->u.d__) -#define tt_(o) ((o)->u.i.tt__) - -#else /* }{ */ - -/* big endian */ -#define TValuefields \ - union { struct { int tt__; Value v__; } i; double d__; } u -#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}} -/* field-access macros */ -#define v_(o) ((o)->u.i.v__) -#define d_(o) ((o)->u.d__) -#define tt_(o) ((o)->u.i.tt__) - -#endif /* } */ - -#endif /* } */ - - -/* correspondence with standard representation */ -#undef val_ -#define val_(o) v_(o) -#undef num_ -#define num_(o) d_(o) - - -#undef numfield -#define numfield /* no such field; numbers are the entire struct */ - -/* basic check to distinguish numbers from non-numbers */ -#undef ttisnumber -#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK) - -#define tag2tt(t) (NNMARK | (t)) - -#undef rttype -#define rttype(o) (ttisnumber(o) ? 
LUA_TNUMBER : tt_(o) & 0xff) - -#undef settt_ -#define settt_(o,t) (tt_(o) = tag2tt(t)) - -#undef setnvalue -#define setnvalue(obj,x) \ - { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); } - -#undef setobj -#define setobj(L,obj1,obj2) \ - { const TValue *o2_=(obj2); TValue *o1_=(obj1); \ - o1_->u = o2_->u; \ - checkliveness(G(L),o1_); } - - -/* -** these redefinitions are not mandatory, but these forms are more efficient -*/ - -#undef checktag -#undef checktype -#define checktag(o,t) (tt_(o) == tag2tt(t)) -#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS)) - -#undef ttisequal -#define ttisequal(o1,o2) \ - (ttisnumber(o1) ? ttisnumber(o2) : (tt_(o1) == tt_(o2))) - - -#undef luai_checknum -#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; } - -#endif -/* }====================================================== */ - - - -/* -** {====================================================== -** types and prototypes -** ======================================================= -*/ - - -union Value { - GCObject *gc; /* collectable objects */ - void *p; /* light userdata */ - int b; /* booleans */ - lua_CFunction f; /* light C functions */ - numfield /* numbers */ -}; - - -struct lua_TValue { - TValuefields; -}; - - -typedef TValue *StkId; /* index to stack elements */ - - - - -/* -** Header for string value; string bytes follow the end of this structure -*/ -typedef union TString { - L_Umaxalign dummy; /* ensures maximum alignment for strings */ - struct { - CommonHeader; - lu_byte extra; /* reserved words for short strings; "has hash" for longs */ - unsigned int hash; - size_t len; /* number of characters in string */ - } tsv; -} TString; - - -/* get the actual string (array of bytes) from a TString */ -#define getstr(ts) cast(const char *, (ts) + 1) - -/* get the actual string (array of bytes) from a Lua value */ -#define svalue(o) getstr(rawtsvalue(o)) - - -/* -** Header for userdata; memory area follows the end of this structure -*/ 
-typedef union Udata { - L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */ - struct { - CommonHeader; - struct Table *metatable; - struct Table *env; - size_t len; /* number of bytes */ - } uv; -} Udata; - - - -/* -** Description of an upvalue for function prototypes -*/ -typedef struct Upvaldesc { - TString *name; /* upvalue name (for debug information) */ - lu_byte instack; /* whether it is in stack */ - lu_byte idx; /* index of upvalue (in stack or in outer function's list) */ -} Upvaldesc; - - -/* -** Description of a local variable for function prototypes -** (used for debug information) -*/ -typedef struct LocVar { - TString *varname; - int startpc; /* first point where variable is active */ - int endpc; /* first point where variable is dead */ -} LocVar; - - -/* -** Function Prototypes -*/ -typedef struct Proto { - CommonHeader; - TValue *k; /* constants used by the function */ - Instruction *code; - struct Proto **p; /* functions defined inside the function */ - int *lineinfo; /* map from opcodes to source lines (debug information) */ - LocVar *locvars; /* information about local variables (debug information) */ - Upvaldesc *upvalues; /* upvalue information */ - union Closure *cache; /* last created closure with this prototype */ - TString *source; /* used for debug information */ - int sizeupvalues; /* size of 'upvalues' */ - int sizek; /* size of `k' */ - int sizecode; - int sizelineinfo; - int sizep; /* size of `p' */ - int sizelocvars; - int linedefined; - int lastlinedefined; - GCObject *gclist; - lu_byte numparams; /* number of fixed parameters */ - lu_byte is_vararg; - lu_byte maxstacksize; /* maximum stack used by this function */ -} Proto; - - - -/* -** Lua Upvalues -*/ -typedef struct UpVal { - CommonHeader; - TValue *v; /* points to stack or to its own value */ - union { - TValue value; /* the value (when closed) */ - struct { /* double linked list (when open) */ - struct UpVal *prev; - struct UpVal *next; - } l; - } u; -} 
UpVal; - - -/* -** Closures -*/ - -#define ClosureHeader \ - CommonHeader; lu_byte nupvalues; GCObject *gclist - -typedef struct CClosure { - ClosureHeader; - lua_CFunction f; - TValue upvalue[1]; /* list of upvalues */ -} CClosure; - - -typedef struct LClosure { - ClosureHeader; - struct Proto *p; - UpVal *upvals[1]; /* list of upvalues */ -} LClosure; - - -typedef union Closure { - CClosure c; - LClosure l; -} Closure; - - -#define isLfunction(o) ttisLclosure(o) - -#define getproto(o) (clLvalue(o)->p) - - -/* -** Tables -*/ - -typedef union TKey { - struct { - TValuefields; - struct Node *next; /* for chaining */ - } nk; - TValue tvk; -} TKey; - - -typedef struct Node { - TValue i_val; - TKey i_key; -} Node; - - -typedef struct Table { - CommonHeader; - lu_byte flags; /* 1<

lsizenode)) - - -/* -** (address of) a fixed nil value -*/ -#define luaO_nilobject (&luaO_nilobject_) - - -LUAI_DDEC const TValue luaO_nilobject_; - - -LUAI_FUNC int luaO_int2fb (unsigned int x); -LUAI_FUNC int luaO_fb2int (int x); -LUAI_FUNC int luaO_ceillog2 (unsigned int x); -LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2); -LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result); -LUAI_FUNC int luaO_hexavalue (int c); -LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt, - va_list argp); -LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...); -LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len); - - -#endif - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c deleted file mode 100644 index 4190dc762428..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c +++ /dev/null @@ -1,107 +0,0 @@ -/* -** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $ -** Opcodes for Lua virtual machine -** See Copyright Notice in lua.h -*/ - - -#define lopcodes_c -#define LUA_CORE - - -#include "lopcodes.h" - - -/* ORDER OP */ - -LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = { - "MOVE", - "LOADK", - "LOADKX", - "LOADBOOL", - "LOADNIL", - "GETUPVAL", - "GETTABUP", - "GETTABLE", - "SETTABUP", - "SETUPVAL", - "SETTABLE", - "NEWTABLE", - "SELF", - "ADD", - "SUB", - "MUL", - "DIV", - "MOD", - "POW", - "UNM", - "NOT", - "LEN", - "CONCAT", - "JMP", - "EQ", - "LT", - "LE", - "TEST", - "TESTSET", - "CALL", - "TAILCALL", - "RETURN", - "FORLOOP", - "FORPREP", - "TFORCALL", - "TFORLOOP", - "SETLIST", - "CLOSURE", - "VARARG", - "EXTRAARG", - NULL -}; - - -#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m)) - -LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = { -/* T A B C mode opcode */ - opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */ 
- ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */ - ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */ - ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */ - ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */ - ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */ - ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */ - ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */ - ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */ - ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */ - ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */ - ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */ - ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */ - ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */ - ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */ - ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */ - ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */ - ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */ - ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */ - ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */ - ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */ - ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_RETURN */ - ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */ - ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */ - ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */ - ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */ - ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */ - ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */ - ,opmode(0, 1, 
OpArgU, OpArgN, iABC) /* OP_VARARG */ - ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */ -}; - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h deleted file mode 100644 index 8e2f80a13141..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h +++ /dev/null @@ -1,288 +0,0 @@ -/* -** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $ -** Opcodes for Lua virtual machine -** See Copyright Notice in lua.h -*/ - -#ifndef lopcodes_h -#define lopcodes_h - -#include "llimits.h" - - -/*=========================================================================== - We assume that instructions are unsigned numbers. - All instructions have an opcode in the first 6 bits. - Instructions can have the following fields: - `A' : 8 bits - `B' : 9 bits - `C' : 9 bits - 'Ax' : 26 bits ('A', 'B', and 'C' together) - `Bx' : 18 bits (`B' and `C' together) - `sBx' : signed Bx - - A signed argument is represented in excess K; that is, the number - value is the unsigned value minus K. K is exactly the maximum value - for that argument (so that -max is represented by 0, and +max is - represented by 2*max), which is half the maximum for the corresponding - unsigned argument. -===========================================================================*/ - - -enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */ - - -/* -** size and position of opcode arguments. -*/ -#define SIZE_C 9 -#define SIZE_B 9 -#define SIZE_Bx (SIZE_C + SIZE_B) -#define SIZE_A 8 -#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A) - -#define SIZE_OP 6 - -#define POS_OP 0 -#define POS_A (POS_OP + SIZE_OP) -#define POS_C (POS_A + SIZE_A) -#define POS_B (POS_C + SIZE_C) -#define POS_Bx POS_C -#define POS_Ax POS_A - - -/* -** limits for opcode arguments. 
-** we use (signed) int to manipulate most arguments, -** so they must fit in LUAI_BITSINT-1 bits (-1 for sign) -*/ -#if SIZE_Bx < LUAI_BITSINT-1 -#define MAXARG_Bx ((1<>1) /* `sBx' is signed */ -#else -#define MAXARG_Bx MAX_INT -#define MAXARG_sBx MAX_INT -#endif - -#if SIZE_Ax < LUAI_BITSINT-1 -#define MAXARG_Ax ((1<>POS_OP) & MASK1(SIZE_OP,0))) -#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \ - ((cast(Instruction, o)<>pos) & MASK1(size,0))) -#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \ - ((cast(Instruction, v)<= R(A - 1) */ -OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */ -OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */ -OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */ - -OP_TEST,/* A C if not (R(A) <=> C) then pc++ */ -OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */ - -OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */ -OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */ -OP_RETURN,/* A B return R(A), ... 
,R(A+B-2) (see note) */ - -OP_FORLOOP,/* A sBx R(A)+=R(A+2); - if R(A) > 4) & 3)) -#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3)) -#define testAMode(m) (luaP_opmodes[m] & (1 << 6)) -#define testTMode(m) (luaP_opmodes[m] & (1 << 7)) - - -LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */ - - -/* number of list items to accumulate before a SETLIST instruction */ -#define LFIELDS_PER_FLUSH 50 - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c deleted file mode 100644 index 73f1af64f834..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c +++ /dev/null @@ -1,1637 +0,0 @@ -/* -** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua Parser -** See Copyright Notice in lua.h -*/ - -#include - -#define lparser_c -#define LUA_CORE - -#include "lua.h" - -#include "lcode.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "llex.h" -#include "lmem.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" - - - -/* maximum number of local variables per function (must be smaller - than 250, due to the bytecode format) */ -#define MAXVARS 200 - - -#define hasmultret(k) ((k) == VCALL || (k) == VVARARG) - - - -/* -** nodes for block list (list of active blocks) -*/ -typedef struct BlockCnt { - struct BlockCnt *previous; /* chain */ - short firstlabel; /* index of first label in this block */ - short firstgoto; /* index of first pending goto in this block */ - lu_byte nactvar; /* # active locals outside the block */ - lu_byte upval; /* true if some variable in the block is an upvalue */ - lu_byte isloop; /* true if `block' is a loop */ -} BlockCnt; - - - -/* -** prototypes for recursive non-terminal functions -*/ -static void statement (LexState *ls); -static void expr (LexState *ls, expdesc *v); - - 
-static void anchor_token (LexState *ls) { - /* last token from outer function must be EOS */ - lua_assert(ls->fs != NULL || ls->t.token == TK_EOS); - if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) { - TString *ts = ls->t.seminfo.ts; - luaX_newstring(ls, getstr(ts), ts->tsv.len); - } -} - - -/* semantic error */ -static l_noret semerror (LexState *ls, const char *msg) { - ls->t.token = 0; /* remove 'near to' from final message */ - luaX_syntaxerror(ls, msg); -} - - -static l_noret error_expected (LexState *ls, int token) { - luaX_syntaxerror(ls, - luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token))); -} - - -static l_noret errorlimit (FuncState *fs, int limit, const char *what) { - lua_State *L = fs->ls->L; - const char *msg; - int line = fs->f->linedefined; - const char *where = (line == 0) - ? "main function" - : luaO_pushfstring(L, "function at line %d", line); - msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s", - what, limit, where); - luaX_syntaxerror(fs->ls, msg); -} - - -static void checklimit (FuncState *fs, int v, int l, const char *what) { - if (v > l) errorlimit(fs, l, what); -} - - -static int testnext (LexState *ls, int c) { - if (ls->t.token == c) { - luaX_next(ls); - return 1; - } - else return 0; -} - - -static void check (LexState *ls, int c) { - if (ls->t.token != c) - error_expected(ls, c); -} - - -static void checknext (LexState *ls, int c) { - check(ls, c); - luaX_next(ls); -} - - -#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); } - - - -static void check_match (LexState *ls, int what, int who, int where) { - if (!testnext(ls, what)) { - if (where == ls->linenumber) - error_expected(ls, what); - else { - luaX_syntaxerror(ls, luaO_pushfstring(ls->L, - "%s expected (to close %s at line %d)", - luaX_token2str(ls, what), luaX_token2str(ls, who), where)); - } - } -} - - -static TString *str_checkname (LexState *ls) { - TString *ts; - check(ls, TK_NAME); - ts = ls->t.seminfo.ts; - 
luaX_next(ls); - return ts; -} - - -static void init_exp (expdesc *e, expkind k, int i) { - e->f = e->t = NO_JUMP; - e->k = k; - e->u.info = i; -} - - -static void codestring (LexState *ls, expdesc *e, TString *s) { - init_exp(e, VK, luaK_stringK(ls->fs, s)); -} - - -static void checkname (LexState *ls, expdesc *e) { - codestring(ls, e, str_checkname(ls)); -} - - -static int registerlocalvar (LexState *ls, TString *varname) { - FuncState *fs = ls->fs; - Proto *f = fs->f; - int oldsize = f->sizelocvars; - luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars, - LocVar, SHRT_MAX, "local variables"); - while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL; - f->locvars[fs->nlocvars].varname = varname; - luaC_objbarrier(ls->L, f, varname); - return fs->nlocvars++; -} - - -static void new_localvar (LexState *ls, TString *name) { - FuncState *fs = ls->fs; - Dyndata *dyd = ls->dyd; - int reg = registerlocalvar(ls, name); - checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal, - MAXVARS, "local variables"); - luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1, - dyd->actvar.size, Vardesc, MAX_INT, "local variables"); - dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg); -} - - -static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) { - new_localvar(ls, luaX_newstring(ls, name, sz)); -} - -#define new_localvarliteral(ls,v) \ - new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1) - - -static LocVar *getlocvar (FuncState *fs, int i) { - int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx; - lua_assert(idx < fs->nlocvars); - return &fs->f->locvars[idx]; -} - - -static void adjustlocalvars (LexState *ls, int nvars) { - FuncState *fs = ls->fs; - fs->nactvar = cast_byte(fs->nactvar + nvars); - for (; nvars; nvars--) { - getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc; - } -} - - -static void removevars (FuncState *fs, int tolevel) { - fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel); - while (fs->nactvar > 
tolevel) - getlocvar(fs, --fs->nactvar)->endpc = fs->pc; -} - - -static int searchupvalue (FuncState *fs, TString *name) { - int i; - Upvaldesc *up = fs->f->upvalues; - for (i = 0; i < fs->nups; i++) { - if (luaS_eqstr(up[i].name, name)) return i; - } - return -1; /* not found */ -} - - -static int newupvalue (FuncState *fs, TString *name, expdesc *v) { - Proto *f = fs->f; - int oldsize = f->sizeupvalues; - checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues"); - luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues, - Upvaldesc, MAXUPVAL, "upvalues"); - while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL; - f->upvalues[fs->nups].instack = (v->k == VLOCAL); - f->upvalues[fs->nups].idx = cast_byte(v->u.info); - f->upvalues[fs->nups].name = name; - luaC_objbarrier(fs->ls->L, f, name); - return fs->nups++; -} - - -static int searchvar (FuncState *fs, TString *n) { - int i; - for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) { - if (luaS_eqstr(n, getlocvar(fs, i)->varname)) - return i; - } - return -1; /* not found */ -} - - -/* - Mark block where variable at given level was defined - (to emit close instructions later). -*/ -static void markupval (FuncState *fs, int level) { - BlockCnt *bl = fs->bl; - while (bl->nactvar > level) bl = bl->previous; - bl->upval = 1; -} - - -/* - Find variable with given name 'n'. If it is an upvalue, add this - upvalue into all intermediate functions. -*/ -static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) { - if (fs == NULL) /* no more levels? */ - return VVOID; /* default is global */ - else { - int v = searchvar(fs, n); /* look up locals at current level */ - if (v >= 0) { /* found? */ - init_exp(var, VLOCAL, v); /* variable is local */ - if (!base) - markupval(fs, v); /* local will be used as an upval */ - return VLOCAL; - } - else { /* not found as local at current level; try upvalues */ - int idx = searchupvalue(fs, n); /* try existing upvalues */ - if (idx < 0) { /* not found? 
*/ - if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */ - return VVOID; /* not found; is a global */ - /* else was LOCAL or UPVAL */ - idx = newupvalue(fs, n, var); /* will be a new upvalue */ - } - init_exp(var, VUPVAL, idx); - return VUPVAL; - } - } -} - - -static void singlevar (LexState *ls, expdesc *var) { - TString *varname = str_checkname(ls); - FuncState *fs = ls->fs; - if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */ - expdesc key; - singlevaraux(fs, ls->envn, var, 1); /* get environment variable */ - lua_assert(var->k == VLOCAL || var->k == VUPVAL); - codestring(ls, &key, varname); /* key is variable name */ - luaK_indexed(fs, var, &key); /* env[varname] */ - } -} - - -static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) { - FuncState *fs = ls->fs; - int extra = nvars - nexps; - if (hasmultret(e->k)) { - extra++; /* includes call itself */ - if (extra < 0) extra = 0; - luaK_setreturns(fs, e, extra); /* last exp. provides the difference */ - if (extra > 1) luaK_reserveregs(fs, extra-1); - } - else { - if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */ - if (extra > 0) { - int reg = fs->freereg; - luaK_reserveregs(fs, extra); - luaK_nil(fs, reg, extra); - } - } -} - - -static void enterlevel (LexState *ls) { - lua_State *L = ls->L; - ++L->nCcalls; - checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels"); -} - - -#define leavelevel(ls) ((ls)->L->nCcalls--) - - -static void closegoto (LexState *ls, int g, Labeldesc *label) { - int i; - FuncState *fs = ls->fs; - Labellist *gl = &ls->dyd->gt; - Labeldesc *gt = &gl->arr[g]; - lua_assert(luaS_eqstr(gt->name, label->name)); - if (gt->nactvar < label->nactvar) { - TString *vname = getlocvar(fs, gt->nactvar)->varname; - const char *msg = luaO_pushfstring(ls->L, - " at line %d jumps into the scope of local " LUA_QS, - getstr(gt->name), gt->line, getstr(vname)); - semerror(ls, msg); - } - luaK_patchlist(fs, gt->pc, label->pc); - /* 
remove goto from pending list */ - for (i = g; i < gl->n - 1; i++) - gl->arr[i] = gl->arr[i + 1]; - gl->n--; -} - - -/* -** try to close a goto with existing labels; this solves backward jumps -*/ -static int findlabel (LexState *ls, int g) { - int i; - BlockCnt *bl = ls->fs->bl; - Dyndata *dyd = ls->dyd; - Labeldesc *gt = &dyd->gt.arr[g]; - /* check labels in current block for a match */ - for (i = bl->firstlabel; i < dyd->label.n; i++) { - Labeldesc *lb = &dyd->label.arr[i]; - if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */ - if (gt->nactvar > lb->nactvar && - (bl->upval || dyd->label.n > bl->firstlabel)) - luaK_patchclose(ls->fs, gt->pc, lb->nactvar); - closegoto(ls, g, lb); /* close it */ - return 1; - } - } - return 0; /* label not found; cannot close goto */ -} - - -static int newlabelentry (LexState *ls, Labellist *l, TString *name, - int line, int pc) { - int n = l->n; - luaM_growvector(ls->L, l->arr, n, l->size, - Labeldesc, SHRT_MAX, "labels/gotos"); - l->arr[n].name = name; - l->arr[n].line = line; - l->arr[n].nactvar = ls->fs->nactvar; - l->arr[n].pc = pc; - l->n++; - return n; -} - - -/* -** check whether new label 'lb' matches any pending gotos in current -** block; solves forward jumps -*/ -static void findgotos (LexState *ls, Labeldesc *lb) { - Labellist *gl = &ls->dyd->gt; - int i = ls->fs->bl->firstgoto; - while (i < gl->n) { - if (luaS_eqstr(gl->arr[i].name, lb->name)) - closegoto(ls, i, lb); - else - i++; - } -} - - -/* -** "export" pending gotos to outer level, to check them against -** outer labels; if the block being exited has upvalues, and -** the goto exits the scope of any variable (which can be the -** upvalue), close those variables being exited. 
-*/ -static void movegotosout (FuncState *fs, BlockCnt *bl) { - int i = bl->firstgoto; - Labellist *gl = &fs->ls->dyd->gt; - /* correct pending gotos to current block and try to close it - with visible labels */ - while (i < gl->n) { - Labeldesc *gt = &gl->arr[i]; - if (gt->nactvar > bl->nactvar) { - if (bl->upval) - luaK_patchclose(fs, gt->pc, bl->nactvar); - gt->nactvar = bl->nactvar; - } - if (!findlabel(fs->ls, i)) - i++; /* move to next one */ - } -} - - -static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) { - bl->isloop = isloop; - bl->nactvar = fs->nactvar; - bl->firstlabel = fs->ls->dyd->label.n; - bl->firstgoto = fs->ls->dyd->gt.n; - bl->upval = 0; - bl->previous = fs->bl; - fs->bl = bl; - lua_assert(fs->freereg == fs->nactvar); -} - - -/* -** create a label named "break" to resolve break statements -*/ -static void breaklabel (LexState *ls) { - TString *n = luaS_new(ls->L, "break"); - int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc); - findgotos(ls, &ls->dyd->label.arr[l]); -} - -/* -** generates an error for an undefined 'goto'; choose appropriate -** message when label name is a reserved word (which can only be 'break') -*/ -static l_noret undefgoto (LexState *ls, Labeldesc *gt) { - const char *msg = isreserved(gt->name) - ? 
"<%s> at line %d not inside a loop" - : "no visible label " LUA_QS " for at line %d"; - msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line); - semerror(ls, msg); -} - - -static void leaveblock (FuncState *fs) { - BlockCnt *bl = fs->bl; - LexState *ls = fs->ls; - if (bl->previous && bl->upval) { - /* create a 'jump to here' to close upvalues */ - int j = luaK_jump(fs); - luaK_patchclose(fs, j, bl->nactvar); - luaK_patchtohere(fs, j); - } - if (bl->isloop) - breaklabel(ls); /* close pending breaks */ - fs->bl = bl->previous; - removevars(fs, bl->nactvar); - lua_assert(bl->nactvar == fs->nactvar); - fs->freereg = fs->nactvar; /* free registers */ - ls->dyd->label.n = bl->firstlabel; /* remove local labels */ - if (bl->previous) /* inner block? */ - movegotosout(fs, bl); /* update pending gotos to outer block */ - else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */ - undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */ -} - - -/* -** adds a new prototype into list of prototypes -*/ -static Proto *addprototype (LexState *ls) { - Proto *clp; - lua_State *L = ls->L; - FuncState *fs = ls->fs; - Proto *f = fs->f; /* prototype of current function */ - if (fs->np >= f->sizep) { - int oldsize = f->sizep; - luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions"); - while (oldsize < f->sizep) f->p[oldsize++] = NULL; - } - f->p[fs->np++] = clp = luaF_newproto(L); - luaC_objbarrier(L, f, clp); - return clp; -} - - -/* -** codes instruction to create new closure in parent function. -** The OP_CLOSURE instruction must use the last available register, -** so that, if it invokes the GC, the GC knows which registers -** are in use at that time. 
-*/ -static void codeclosure (LexState *ls, expdesc *v) { - FuncState *fs = ls->fs->prev; - init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1)); - luaK_exp2nextreg(fs, v); /* fix it at the last register */ -} - - -static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) { - lua_State *L = ls->L; - Proto *f; - fs->prev = ls->fs; /* linked list of funcstates */ - fs->ls = ls; - ls->fs = fs; - fs->pc = 0; - fs->lasttarget = 0; - fs->jpc = NO_JUMP; - fs->freereg = 0; - fs->nk = 0; - fs->np = 0; - fs->nups = 0; - fs->nlocvars = 0; - fs->nactvar = 0; - fs->firstlocal = ls->dyd->actvar.n; - fs->bl = NULL; - f = fs->f; - f->source = ls->source; - f->maxstacksize = 2; /* registers 0/1 are always valid */ - fs->h = luaH_new(L); - /* anchor table of constants (to avoid being collected) */ - sethvalue2s(L, L->top, fs->h); - incr_top(L); - enterblock(fs, bl, 0); -} - - -static void close_func (LexState *ls) { - lua_State *L = ls->L; - FuncState *fs = ls->fs; - Proto *f = fs->f; - luaK_ret(fs, 0, 0); /* final return */ - leaveblock(fs); - luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction); - f->sizecode = fs->pc; - luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int); - f->sizelineinfo = fs->pc; - luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue); - f->sizek = fs->nk; - luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *); - f->sizep = fs->np; - luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar); - f->sizelocvars = fs->nlocvars; - luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc); - f->sizeupvalues = fs->nups; - lua_assert(fs->bl == NULL); - ls->fs = fs->prev; - /* last token read was anchored in defunct function; must re-anchor it */ - anchor_token(ls); - L->top--; /* pop table of constants */ - luaC_checkGC(L); -} - - - -/*============================================================*/ -/* GRAMMAR RULES */ -/*============================================================*/ - - 
-/* -** check whether current token is in the follow set of a block. -** 'until' closes syntactical blocks, but do not close scope, -** so it handled in separate. -*/ -static int block_follow (LexState *ls, int withuntil) { - switch (ls->t.token) { - case TK_ELSE: case TK_ELSEIF: - case TK_END: case TK_EOS: - return 1; - case TK_UNTIL: return withuntil; - default: return 0; - } -} - - -static void statlist (LexState *ls) { - /* statlist -> { stat [`;'] } */ - while (!block_follow(ls, 1)) { - if (ls->t.token == TK_RETURN) { - statement(ls); - return; /* 'return' must be last statement */ - } - statement(ls); - } -} - - -static void fieldsel (LexState *ls, expdesc *v) { - /* fieldsel -> ['.' | ':'] NAME */ - FuncState *fs = ls->fs; - expdesc key; - luaK_exp2anyregup(fs, v); - luaX_next(ls); /* skip the dot or colon */ - checkname(ls, &key); - luaK_indexed(fs, v, &key); -} - - -static void yindex (LexState *ls, expdesc *v) { - /* index -> '[' expr ']' */ - luaX_next(ls); /* skip the '[' */ - expr(ls, v); - luaK_exp2val(ls->fs, v); - checknext(ls, ']'); -} - - -/* -** {====================================================================== -** Rules for Constructors -** ======================================================================= -*/ - - -struct ConsControl { - expdesc v; /* last list item read */ - expdesc *t; /* table descriptor */ - int nh; /* total number of `record' elements */ - int na; /* total number of array elements */ - int tostore; /* number of array elements pending to be stored */ -}; - - -static void recfield (LexState *ls, struct ConsControl *cc) { - /* recfield -> (NAME | `['exp1`]') = exp1 */ - FuncState *fs = ls->fs; - int reg = ls->fs->freereg; - expdesc key, val; - int rkkey; - if (ls->t.token == TK_NAME) { - checklimit(fs, cc->nh, MAX_INT, "items in a constructor"); - checkname(ls, &key); - } - else /* ls->t.token == '[' */ - yindex(ls, &key); - cc->nh++; - checknext(ls, '='); - rkkey = luaK_exp2RK(fs, &key); - expr(ls, &val); - 
luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val)); - fs->freereg = reg; /* free registers */ -} - - -static void closelistfield (FuncState *fs, struct ConsControl *cc) { - if (cc->v.k == VVOID) return; /* there is no list item */ - luaK_exp2nextreg(fs, &cc->v); - cc->v.k = VVOID; - if (cc->tostore == LFIELDS_PER_FLUSH) { - luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */ - cc->tostore = 0; /* no more items pending */ - } -} - - -static void lastlistfield (FuncState *fs, struct ConsControl *cc) { - if (cc->tostore == 0) return; - if (hasmultret(cc->v.k)) { - luaK_setmultret(fs, &cc->v); - luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET); - cc->na--; /* do not count last expression (unknown number of elements) */ - } - else { - if (cc->v.k != VVOID) - luaK_exp2nextreg(fs, &cc->v); - luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); - } -} - - -static void listfield (LexState *ls, struct ConsControl *cc) { - /* listfield -> exp */ - expr(ls, &cc->v); - checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor"); - cc->na++; - cc->tostore++; -} - - -static void field (LexState *ls, struct ConsControl *cc) { - /* field -> listfield | recfield */ - switch(ls->t.token) { - case TK_NAME: { /* may be 'listfield' or 'recfield' */ - if (luaX_lookahead(ls) != '=') /* expression? 
*/ - listfield(ls, cc); - else - recfield(ls, cc); - break; - } - case '[': { - recfield(ls, cc); - break; - } - default: { - listfield(ls, cc); - break; - } - } -} - - -static void constructor (LexState *ls, expdesc *t) { - /* constructor -> '{' [ field { sep field } [sep] ] '}' - sep -> ',' | ';' */ - FuncState *fs = ls->fs; - int line = ls->linenumber; - int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0); - struct ConsControl cc; - cc.na = cc.nh = cc.tostore = 0; - cc.t = t; - init_exp(t, VRELOCABLE, pc); - init_exp(&cc.v, VVOID, 0); /* no value (yet) */ - luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */ - checknext(ls, '{'); - do { - lua_assert(cc.v.k == VVOID || cc.tostore > 0); - if (ls->t.token == '}') break; - closelistfield(fs, &cc); - field(ls, &cc); - } while (testnext(ls, ',') || testnext(ls, ';')); - check_match(ls, '}', '{', line); - lastlistfield(fs, &cc); - SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */ - SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */ -} - -/* }====================================================================== */ - - - -static void parlist (LexState *ls) { - /* parlist -> [ param { `,' param } ] */ - FuncState *fs = ls->fs; - Proto *f = fs->f; - int nparams = 0; - f->is_vararg = 0; - if (ls->t.token != ')') { /* is `parlist' not empty? */ - do { - switch (ls->t.token) { - case TK_NAME: { /* param -> NAME */ - new_localvar(ls, str_checkname(ls)); - nparams++; - break; - } - case TK_DOTS: { /* param -> `...' 
*/ - luaX_next(ls); - f->is_vararg = 1; - break; - } - default: luaX_syntaxerror(ls, " or " LUA_QL("...") " expected"); - } - } while (!f->is_vararg && testnext(ls, ',')); - } - adjustlocalvars(ls, nparams); - f->numparams = cast_byte(fs->nactvar); - luaK_reserveregs(fs, fs->nactvar); /* reserve register for parameters */ -} - - -static void body (LexState *ls, expdesc *e, int ismethod, int line) { - /* body -> `(' parlist `)' block END */ - FuncState new_fs; - BlockCnt bl; - new_fs.f = addprototype(ls); - new_fs.f->linedefined = line; - open_func(ls, &new_fs, &bl); - checknext(ls, '('); - if (ismethod) { - new_localvarliteral(ls, "self"); /* create 'self' parameter */ - adjustlocalvars(ls, 1); - } - parlist(ls); - checknext(ls, ')'); - statlist(ls); - new_fs.f->lastlinedefined = ls->linenumber; - check_match(ls, TK_END, TK_FUNCTION, line); - codeclosure(ls, e); - close_func(ls); -} - - -static int explist (LexState *ls, expdesc *v) { - /* explist -> expr { `,' expr } */ - int n = 1; /* at least one expression */ - expr(ls, v); - while (testnext(ls, ',')) { - luaK_exp2nextreg(ls->fs, v); - expr(ls, v); - n++; - } - return n; -} - - -static void funcargs (LexState *ls, expdesc *f, int line) { - FuncState *fs = ls->fs; - expdesc args; - int base, nparams; - switch (ls->t.token) { - case '(': { /* funcargs -> `(' [ explist ] `)' */ - luaX_next(ls); - if (ls->t.token == ')') /* arg list is empty? 
*/ - args.k = VVOID; - else { - explist(ls, &args); - luaK_setmultret(fs, &args); - } - check_match(ls, ')', '(', line); - break; - } - case '{': { /* funcargs -> constructor */ - constructor(ls, &args); - break; - } - case TK_STRING: { /* funcargs -> STRING */ - codestring(ls, &args, ls->t.seminfo.ts); - luaX_next(ls); /* must use `seminfo' before `next' */ - break; - } - default: { - luaX_syntaxerror(ls, "function arguments expected"); - } - } - lua_assert(f->k == VNONRELOC); - base = f->u.info; /* base register for call */ - if (hasmultret(args.k)) - nparams = LUA_MULTRET; /* open call */ - else { - if (args.k != VVOID) - luaK_exp2nextreg(fs, &args); /* close last argument */ - nparams = fs->freereg - (base+1); - } - init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2)); - luaK_fixline(fs, line); - fs->freereg = base+1; /* call remove function and arguments and leaves - (unless changed) one result */ -} - - - - -/* -** {====================================================================== -** Expression parsing -** ======================================================================= -*/ - - -static void primaryexp (LexState *ls, expdesc *v) { - /* primaryexp -> NAME | '(' expr ')' */ - switch (ls->t.token) { - case '(': { - int line = ls->linenumber; - luaX_next(ls); - expr(ls, v); - check_match(ls, ')', '(', line); - luaK_dischargevars(ls->fs, v); - return; - } - case TK_NAME: { - singlevar(ls, v); - return; - } - default: { - luaX_syntaxerror(ls, "unexpected symbol"); - } - } -} - - -static void suffixedexp (LexState *ls, expdesc *v) { - /* suffixedexp -> - primaryexp { '.' 
NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */ - FuncState *fs = ls->fs; - int line = ls->linenumber; - primaryexp(ls, v); - for (;;) { - switch (ls->t.token) { - case '.': { /* fieldsel */ - fieldsel(ls, v); - break; - } - case '[': { /* `[' exp1 `]' */ - expdesc key; - luaK_exp2anyregup(fs, v); - yindex(ls, &key); - luaK_indexed(fs, v, &key); - break; - } - case ':': { /* `:' NAME funcargs */ - expdesc key; - luaX_next(ls); - checkname(ls, &key); - luaK_self(fs, v, &key); - funcargs(ls, v, line); - break; - } - case '(': case TK_STRING: case '{': { /* funcargs */ - luaK_exp2nextreg(fs, v); - funcargs(ls, v, line); - break; - } - default: return; - } - } -} - - -static void simpleexp (LexState *ls, expdesc *v) { - /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... | - constructor | FUNCTION body | suffixedexp */ - switch (ls->t.token) { - case TK_NUMBER: { - init_exp(v, VKNUM, 0); - v->u.nval = ls->t.seminfo.r; - break; - } - case TK_STRING: { - codestring(ls, v, ls->t.seminfo.ts); - break; - } - case TK_NIL: { - init_exp(v, VNIL, 0); - break; - } - case TK_TRUE: { - init_exp(v, VTRUE, 0); - break; - } - case TK_FALSE: { - init_exp(v, VFALSE, 0); - break; - } - case TK_DOTS: { /* vararg */ - FuncState *fs = ls->fs; - check_condition(ls, fs->f->is_vararg, - "cannot use " LUA_QL("...") " outside a vararg function"); - init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0)); - break; - } - case '{': { /* constructor */ - constructor(ls, v); - return; - } - case TK_FUNCTION: { - luaX_next(ls); - body(ls, v, 0, ls->linenumber); - return; - } - default: { - suffixedexp(ls, v); - return; - } - } - luaX_next(ls); -} - - -static UnOpr getunopr (int op) { - switch (op) { - case TK_NOT: return OPR_NOT; - case '-': return OPR_MINUS; - case '#': return OPR_LEN; - default: return OPR_NOUNOPR; - } -} - - -static BinOpr getbinopr (int op) { - switch (op) { - case '+': return OPR_ADD; - case '-': return OPR_SUB; - case '*': return OPR_MUL; - case '/': return 
OPR_DIV; - case '%': return OPR_MOD; - case '^': return OPR_POW; - case TK_CONCAT: return OPR_CONCAT; - case TK_NE: return OPR_NE; - case TK_EQ: return OPR_EQ; - case '<': return OPR_LT; - case TK_LE: return OPR_LE; - case '>': return OPR_GT; - case TK_GE: return OPR_GE; - case TK_AND: return OPR_AND; - case TK_OR: return OPR_OR; - default: return OPR_NOBINOPR; - } -} - - -static const struct { - lu_byte left; /* left priority for each binary operator */ - lu_byte right; /* right priority */ -} priority[] = { /* ORDER OPR */ - {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */ - {10, 9}, {5, 4}, /* ^, .. (right associative) */ - {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */ - {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */ - {2, 2}, {1, 1} /* and, or */ -}; - -#define UNARY_PRIORITY 8 /* priority for unary operators */ - - -/* -** subexpr -> (simpleexp | unop subexpr) { binop subexpr } -** where `binop' is any binary operator with a priority higher than `limit' -*/ -static BinOpr subexpr (LexState *ls, expdesc *v, int limit) { - BinOpr op; - UnOpr uop; - enterlevel(ls); - uop = getunopr(ls->t.token); - if (uop != OPR_NOUNOPR) { - int line = ls->linenumber; - luaX_next(ls); - subexpr(ls, v, UNARY_PRIORITY); - luaK_prefix(ls->fs, uop, v, line); - } - else simpleexp(ls, v); - /* expand while operators have priorities higher than `limit' */ - op = getbinopr(ls->t.token); - while (op != OPR_NOBINOPR && priority[op].left > limit) { - expdesc v2; - BinOpr nextop; - int line = ls->linenumber; - luaX_next(ls); - luaK_infix(ls->fs, op, v); - /* read sub-expression with higher priority */ - nextop = subexpr(ls, &v2, priority[op].right); - luaK_posfix(ls->fs, op, v, &v2, line); - op = nextop; - } - leavelevel(ls); - return op; /* return first untreated operator */ -} - - -static void expr (LexState *ls, expdesc *v) { - subexpr(ls, v, 0); -} - -/* }==================================================================== */ - - - -/* -** 
{====================================================================== -** Rules for Statements -** ======================================================================= -*/ - - -static void block (LexState *ls) { - /* block -> statlist */ - FuncState *fs = ls->fs; - BlockCnt bl; - enterblock(fs, &bl, 0); - statlist(ls); - leaveblock(fs); -} - - -/* -** structure to chain all variables in the left-hand side of an -** assignment -*/ -struct LHS_assign { - struct LHS_assign *prev; - expdesc v; /* variable (global, local, upvalue, or indexed) */ -}; - - -/* -** check whether, in an assignment to an upvalue/local variable, the -** upvalue/local variable is begin used in a previous assignment to a -** table. If so, save original upvalue/local value in a safe place and -** use this safe copy in the previous assignment. -*/ -static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) { - FuncState *fs = ls->fs; - int extra = fs->freereg; /* eventual position to save local variable */ - int conflict = 0; - for (; lh; lh = lh->prev) { /* check all previous assignments */ - if (lh->v.k == VINDEXED) { /* assigning to a table? */ - /* table is the upvalue/local being assigned now? */ - if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) { - conflict = 1; - lh->v.u.ind.vt = VLOCAL; - lh->v.u.ind.t = extra; /* previous assignment will use safe copy */ - } - /* index is the local being assigned? (index cannot be upvalue) */ - if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) { - conflict = 1; - lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */ - } - } - } - if (conflict) { - /* copy upvalue/local value to a temporary (in position 'extra') */ - OpCode op = (v->k == VLOCAL) ? 
OP_MOVE : OP_GETUPVAL; - luaK_codeABC(fs, op, extra, v->u.info, 0); - luaK_reserveregs(fs, 1); - } -} - - -static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) { - expdesc e; - check_condition(ls, vkisvar(lh->v.k), "syntax error"); - if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */ - struct LHS_assign nv; - nv.prev = lh; - suffixedexp(ls, &nv.v); - if (nv.v.k != VINDEXED) - check_conflict(ls, lh, &nv.v); - checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS, - "C levels"); - assignment(ls, &nv, nvars+1); - } - else { /* assignment -> `=' explist */ - int nexps; - checknext(ls, '='); - nexps = explist(ls, &e); - if (nexps != nvars) { - adjust_assign(ls, nvars, nexps, &e); - if (nexps > nvars) - ls->fs->freereg -= nexps - nvars; /* remove extra values */ - } - else { - luaK_setoneret(ls->fs, &e); /* close last expression */ - luaK_storevar(ls->fs, &lh->v, &e); - return; /* avoid default */ - } - } - init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */ - luaK_storevar(ls->fs, &lh->v, &e); -} - - -static int cond (LexState *ls) { - /* cond -> exp */ - expdesc v; - expr(ls, &v); /* read condition */ - if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */ - luaK_goiftrue(ls->fs, &v); - return v.f; -} - - -static void gotostat (LexState *ls, int pc) { - int line = ls->linenumber; - TString *label; - int g; - if (testnext(ls, TK_GOTO)) - label = str_checkname(ls); - else { - luaX_next(ls); /* skip break */ - label = luaS_new(ls->L, "break"); - } - g = newlabelentry(ls, &ls->dyd->gt, label, line, pc); - findlabel(ls, g); /* close it if label already defined */ -} - - -/* check for repeated labels on the same block */ -static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) { - int i; - for (i = fs->bl->firstlabel; i < ll->n; i++) { - if (luaS_eqstr(label, ll->arr[i].name)) { - const char *msg = luaO_pushfstring(fs->ls->L, - "label " LUA_QS " already defined on line %d", - 
getstr(label), ll->arr[i].line); - semerror(fs->ls, msg); - } - } -} - - -/* skip no-op statements */ -static void skipnoopstat (LexState *ls) { - while (ls->t.token == ';' || ls->t.token == TK_DBCOLON) - statement(ls); -} - - -static void labelstat (LexState *ls, TString *label, int line) { - /* label -> '::' NAME '::' */ - FuncState *fs = ls->fs; - Labellist *ll = &ls->dyd->label; - int l; /* index of new label being created */ - checkrepeated(fs, ll, label); /* check for repeated labels */ - checknext(ls, TK_DBCOLON); /* skip double colon */ - /* create new entry for this label */ - l = newlabelentry(ls, ll, label, line, fs->pc); - skipnoopstat(ls); /* skip other no-op statements */ - if (block_follow(ls, 0)) { /* label is last no-op statement in the block? */ - /* assume that locals are already out of scope */ - ll->arr[l].nactvar = fs->bl->nactvar; - } - findgotos(ls, &ll->arr[l]); -} - - -static void whilestat (LexState *ls, int line) { - /* whilestat -> WHILE cond DO block END */ - FuncState *fs = ls->fs; - int whileinit; - int condexit; - BlockCnt bl; - luaX_next(ls); /* skip WHILE */ - whileinit = luaK_getlabel(fs); - condexit = cond(ls); - enterblock(fs, &bl, 1); - checknext(ls, TK_DO); - block(ls); - luaK_jumpto(fs, whileinit); - check_match(ls, TK_END, TK_WHILE, line); - leaveblock(fs); - luaK_patchtohere(fs, condexit); /* false conditions finish the loop */ -} - - -static void repeatstat (LexState *ls, int line) { - /* repeatstat -> REPEAT block UNTIL cond */ - int condexit; - FuncState *fs = ls->fs; - int repeat_init = luaK_getlabel(fs); - BlockCnt bl1, bl2; - enterblock(fs, &bl1, 1); /* loop block */ - enterblock(fs, &bl2, 0); /* scope block */ - luaX_next(ls); /* skip REPEAT */ - statlist(ls); - check_match(ls, TK_UNTIL, TK_REPEAT, line); - condexit = cond(ls); /* read condition (inside scope block) */ - if (bl2.upval) /* upvalues? 
*/ - luaK_patchclose(fs, condexit, bl2.nactvar); - leaveblock(fs); /* finish scope */ - luaK_patchlist(fs, condexit, repeat_init); /* close the loop */ - leaveblock(fs); /* finish loop */ -} - - -static int exp1 (LexState *ls) { - expdesc e; - int reg; - expr(ls, &e); - luaK_exp2nextreg(ls->fs, &e); - lua_assert(e.k == VNONRELOC); - reg = e.u.info; - return reg; -} - - -static void forbody (LexState *ls, int base, int line, int nvars, int isnum) { - /* forbody -> DO block */ - BlockCnt bl; - FuncState *fs = ls->fs; - int prep, endfor; - adjustlocalvars(ls, 3); /* control variables */ - checknext(ls, TK_DO); - prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs); - enterblock(fs, &bl, 0); /* scope for declared variables */ - adjustlocalvars(ls, nvars); - luaK_reserveregs(fs, nvars); - block(ls); - leaveblock(fs); /* end of scope for declared variables */ - luaK_patchtohere(fs, prep); - if (isnum) /* numeric for? */ - endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP); - else { /* generic for */ - luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars); - luaK_fixline(fs, line); - endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP); - } - luaK_patchlist(fs, endfor, prep + 1); - luaK_fixline(fs, line); -} - - -static void fornum (LexState *ls, TString *varname, int line) { - /* fornum -> NAME = exp1,exp1[,exp1] forbody */ - FuncState *fs = ls->fs; - int base = fs->freereg; - new_localvarliteral(ls, "(for index)"); - new_localvarliteral(ls, "(for limit)"); - new_localvarliteral(ls, "(for step)"); - new_localvar(ls, varname); - checknext(ls, '='); - exp1(ls); /* initial value */ - checknext(ls, ','); - exp1(ls); /* limit */ - if (testnext(ls, ',')) - exp1(ls); /* optional step */ - else { /* default step = 1 */ - luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1)); - luaK_reserveregs(fs, 1); - } - forbody(ls, base, line, 1, 1); -} - - -static void forlist (LexState *ls, TString *indexname) { - /* forlist -> NAME {,NAME} IN explist forbody */ - 
FuncState *fs = ls->fs; - expdesc e; - int nvars = 4; /* gen, state, control, plus at least one declared var */ - int line; - int base = fs->freereg; - /* create control variables */ - new_localvarliteral(ls, "(for generator)"); - new_localvarliteral(ls, "(for state)"); - new_localvarliteral(ls, "(for control)"); - /* create declared variables */ - new_localvar(ls, indexname); - while (testnext(ls, ',')) { - new_localvar(ls, str_checkname(ls)); - nvars++; - } - checknext(ls, TK_IN); - line = ls->linenumber; - adjust_assign(ls, 3, explist(ls, &e), &e); - luaK_checkstack(fs, 3); /* extra space to call generator */ - forbody(ls, base, line, nvars - 3, 0); -} - - -static void forstat (LexState *ls, int line) { - /* forstat -> FOR (fornum | forlist) END */ - FuncState *fs = ls->fs; - TString *varname; - BlockCnt bl; - enterblock(fs, &bl, 1); /* scope for loop and control variables */ - luaX_next(ls); /* skip `for' */ - varname = str_checkname(ls); /* first variable name */ - switch (ls->t.token) { - case '=': fornum(ls, varname, line); break; - case ',': case TK_IN: forlist(ls, varname); break; - default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected"); - } - check_match(ls, TK_END, TK_FOR, line); - leaveblock(fs); /* loop scope (`break' jumps to this point) */ -} - - -static void test_then_block (LexState *ls, int *escapelist) { - /* test_then_block -> [IF | ELSEIF] cond THEN block */ - BlockCnt bl; - FuncState *fs = ls->fs; - expdesc v; - int jf; /* instruction to skip 'then' code (if condition is false) */ - luaX_next(ls); /* skip IF or ELSEIF */ - expr(ls, &v); /* read condition */ - checknext(ls, TK_THEN); - if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) { - luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */ - enterblock(fs, &bl, 0); /* must enter block before 'goto' */ - gotostat(ls, v.t); /* handle goto/break */ - skipnoopstat(ls); /* skip other no-op statements */ - if (block_follow(ls, 0)) { /* 'goto' is the 
entire block? */ - leaveblock(fs); - return; /* and that is it */ - } - else /* must skip over 'then' part if condition is false */ - jf = luaK_jump(fs); - } - else { /* regular case (not goto/break) */ - luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */ - enterblock(fs, &bl, 0); - jf = v.f; - } - statlist(ls); /* `then' part */ - leaveblock(fs); - if (ls->t.token == TK_ELSE || - ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? */ - luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */ - luaK_patchtohere(fs, jf); -} - - -static void ifstat (LexState *ls, int line) { - /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */ - FuncState *fs = ls->fs; - int escapelist = NO_JUMP; /* exit list for finished parts */ - test_then_block(ls, &escapelist); /* IF cond THEN block */ - while (ls->t.token == TK_ELSEIF) - test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */ - if (testnext(ls, TK_ELSE)) - block(ls); /* `else' part */ - check_match(ls, TK_END, TK_IF, line); - luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */ -} - - -static void localfunc (LexState *ls) { - expdesc b; - FuncState *fs = ls->fs; - new_localvar(ls, str_checkname(ls)); /* new local variable */ - adjustlocalvars(ls, 1); /* enter its scope */ - body(ls, &b, 0, ls->linenumber); /* function created in next register */ - /* debug information will only see the variable after this point! 
*/ - getlocvar(fs, b.u.info)->startpc = fs->pc; -} - - -static void localstat (LexState *ls) { - /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */ - int nvars = 0; - int nexps; - expdesc e; - do { - new_localvar(ls, str_checkname(ls)); - nvars++; - } while (testnext(ls, ',')); - if (testnext(ls, '=')) - nexps = explist(ls, &e); - else { - e.k = VVOID; - nexps = 0; - } - adjust_assign(ls, nvars, nexps, &e); - adjustlocalvars(ls, nvars); -} - - -static int funcname (LexState *ls, expdesc *v) { - /* funcname -> NAME {fieldsel} [`:' NAME] */ - int ismethod = 0; - singlevar(ls, v); - while (ls->t.token == '.') - fieldsel(ls, v); - if (ls->t.token == ':') { - ismethod = 1; - fieldsel(ls, v); - } - return ismethod; -} - - -static void funcstat (LexState *ls, int line) { - /* funcstat -> FUNCTION funcname body */ - int ismethod; - expdesc v, b; - luaX_next(ls); /* skip FUNCTION */ - ismethod = funcname(ls, &v); - body(ls, &b, ismethod, line); - luaK_storevar(ls->fs, &v, &b); - luaK_fixline(ls->fs, line); /* definition `happens' in the first line */ -} - - -static void exprstat (LexState *ls) { - /* stat -> func | assignment */ - FuncState *fs = ls->fs; - struct LHS_assign v; - suffixedexp(ls, &v.v); - if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */ - v.prev = NULL; - assignment(ls, &v, 1); - } - else { /* stat -> func */ - check_condition(ls, v.v.k == VCALL, "syntax error"); - SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */ - } -} - - -static void retstat (LexState *ls) { - /* stat -> RETURN [explist] [';'] */ - FuncState *fs = ls->fs; - expdesc e; - int first, nret; /* registers with returned values */ - if (block_follow(ls, 1) || ls->t.token == ';') - first = nret = 0; /* return no values */ - else { - nret = explist(ls, &e); /* optional return values */ - if (hasmultret(e.k)) { - luaK_setmultret(fs, &e); - if (e.k == VCALL && nret == 1) { /* tail call? 
*/ - SET_OPCODE(getcode(fs,&e), OP_TAILCALL); - lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar); - } - first = fs->nactvar; - nret = LUA_MULTRET; /* return all values */ - } - else { - if (nret == 1) /* only one single value? */ - first = luaK_exp2anyreg(fs, &e); - else { - luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */ - first = fs->nactvar; /* return all `active' values */ - lua_assert(nret == fs->freereg - first); - } - } - } - luaK_ret(fs, first, nret); - testnext(ls, ';'); /* skip optional semicolon */ -} - - -static void statement (LexState *ls) { - int line = ls->linenumber; /* may be needed for error messages */ - enterlevel(ls); - switch (ls->t.token) { - case ';': { /* stat -> ';' (empty statement) */ - luaX_next(ls); /* skip ';' */ - break; - } - case TK_IF: { /* stat -> ifstat */ - ifstat(ls, line); - break; - } - case TK_WHILE: { /* stat -> whilestat */ - whilestat(ls, line); - break; - } - case TK_DO: { /* stat -> DO block END */ - luaX_next(ls); /* skip DO */ - block(ls); - check_match(ls, TK_END, TK_DO, line); - break; - } - case TK_FOR: { /* stat -> forstat */ - forstat(ls, line); - break; - } - case TK_REPEAT: { /* stat -> repeatstat */ - repeatstat(ls, line); - break; - } - case TK_FUNCTION: { /* stat -> funcstat */ - funcstat(ls, line); - break; - } - case TK_LOCAL: { /* stat -> localstat */ - luaX_next(ls); /* skip LOCAL */ - if (testnext(ls, TK_FUNCTION)) /* local function? 
*/ - localfunc(ls); - else - localstat(ls); - break; - } - case TK_DBCOLON: { /* stat -> label */ - luaX_next(ls); /* skip double colon */ - labelstat(ls, str_checkname(ls), line); - break; - } - case TK_RETURN: { /* stat -> retstat */ - luaX_next(ls); /* skip RETURN */ - retstat(ls); - break; - } - case TK_BREAK: /* stat -> breakstat */ - case TK_GOTO: { /* stat -> 'goto' NAME */ - gotostat(ls, luaK_jump(ls->fs)); - break; - } - default: { /* stat -> func | assignment */ - exprstat(ls); - break; - } - } - lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg && - ls->fs->freereg >= ls->fs->nactvar); - ls->fs->freereg = ls->fs->nactvar; /* free registers */ - leavelevel(ls); -} - -/* }====================================================================== */ - - -/* -** compiles the main function, which is a regular vararg function with an -** upvalue named LUA_ENV -*/ -static void mainfunc (LexState *ls, FuncState *fs) { - BlockCnt bl; - expdesc v; - open_func(ls, fs, &bl); - fs->f->is_vararg = 1; /* main function is always vararg */ - init_exp(&v, VLOCAL, 0); /* create and... 
*/ - newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */ - luaX_next(ls); /* read first token */ - statlist(ls); /* parse main body */ - check(ls, TK_EOS); - close_func(ls); -} - - -Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, - Dyndata *dyd, const char *name, int firstchar) { - LexState lexstate; - FuncState funcstate; - Closure *cl = luaF_newLclosure(L, 1); /* create main closure */ - /* anchor closure (to avoid being collected) */ - setclLvalue(L, L->top, cl); - incr_top(L); - funcstate.f = cl->l.p = luaF_newproto(L); - funcstate.f->source = luaS_new(L, name); /* create and anchor TString */ - lexstate.buff = buff; - lexstate.dyd = dyd; - dyd->actvar.n = dyd->gt.n = dyd->label.n = 0; - luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar); - mainfunc(&lexstate, &funcstate); - lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs); - /* all scopes should be correctly finished */ - lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0); - return cl; /* it's on the stack too */ -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h deleted file mode 100644 index 0346e3c41a80..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h +++ /dev/null @@ -1,119 +0,0 @@ -/* -** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua Parser -** See Copyright Notice in lua.h -*/ - -#ifndef lparser_h -#define lparser_h - -#include "llimits.h" -#include "lobject.h" -#include "lzio.h" - - -/* -** Expression descriptor -*/ - -typedef enum { - VVOID, /* no value */ - VNIL, - VTRUE, - VFALSE, - VK, /* info = index of constant in `k' */ - VKNUM, /* nval = numerical value */ - VNONRELOC, /* info = result register */ - VLOCAL, /* info = local register */ - VUPVAL, /* info = index of upvalue in 'upvalues' */ - VINDEXED, /* t = table register/upvalue; idx = index R/K */ - VJMP, /* info = instruction pc */ 
- VRELOCABLE, /* info = instruction pc */ - VCALL, /* info = instruction pc */ - VVARARG /* info = instruction pc */ -} expkind; - - -#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED) -#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL) - -typedef struct expdesc { - expkind k; - union { - struct { /* for indexed variables (VINDEXED) */ - short idx; /* index (R/K) */ - lu_byte t; /* table (register or upvalue) */ - lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */ - } ind; - int info; /* for generic use */ - lua_Number nval; /* for VKNUM */ - } u; - int t; /* patch list of `exit when true' */ - int f; /* patch list of `exit when false' */ -} expdesc; - - -/* description of active local variable */ -typedef struct Vardesc { - short idx; /* variable index in stack */ -} Vardesc; - - -/* description of pending goto statements and label statements */ -typedef struct Labeldesc { - TString *name; /* label identifier */ - int pc; /* position in code */ - int line; /* line where it appeared */ - lu_byte nactvar; /* local level where it appears in current block */ -} Labeldesc; - - -/* list of labels or gotos */ -typedef struct Labellist { - Labeldesc *arr; /* array */ - int n; /* number of entries in use */ - int size; /* array size */ -} Labellist; - - -/* dynamic structures used by the parser */ -typedef struct Dyndata { - struct { /* list of active local variables */ - Vardesc *arr; - int n; - int size; - } actvar; - Labellist gt; /* list of pending gotos */ - Labellist label; /* list of active labels */ -} Dyndata; - - -/* control of blocks */ -struct BlockCnt; /* defined in lparser.c */ - - -/* state needed to generate code for a given function */ -typedef struct FuncState { - Proto *f; /* current function header */ - Table *h; /* table to find (and reuse) elements in `k' */ - struct FuncState *prev; /* enclosing function */ - struct LexState *ls; /* lexical state */ - struct BlockCnt *bl; /* chain of current blocks */ - int pc; /* next 
position to code (equivalent to `ncode') */ - int lasttarget; /* 'label' of last 'jump label' */ - int jpc; /* list of pending jumps to `pc' */ - int nk; /* number of elements in `k' */ - int np; /* number of elements in `p' */ - int firstlocal; /* index of first local var (in Dyndata array) */ - short nlocvars; /* number of elements in 'f->locvars' */ - lu_byte nactvar; /* number of active local variables */ - lu_byte nups; /* number of upvalues */ - lu_byte freereg; /* first free register */ -} FuncState; - - -LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, - Dyndata *dyd, const char *name, int firstchar); - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c deleted file mode 100644 index b98ce5c2b52b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c +++ /dev/null @@ -1,321 +0,0 @@ -/* -** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $ -** Global State -** See Copyright Notice in lua.h -*/ - - -#include - -#define lstate_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "llex.h" -#include "lmem.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" - - -#if !defined(LUAI_GCPAUSE) -#define LUAI_GCPAUSE 200 /* 200% */ -#endif - -#if !defined(LUAI_GCMAJOR) -#define LUAI_GCMAJOR 200 /* 200% */ -#endif - -#if !defined(LUAI_GCMUL) -#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */ -#endif - - -#define MEMERRMSG "not enough memory" - - -/* -** a macro to help the creation of a unique random seed when a state is -** created; the seed is used to randomize hashes. 
-*/ -#if !defined(luai_makeseed) -#define luai_makeseed() cast(unsigned int, gethrtime()) -#endif - - - -/* -** thread state + extra space -*/ -typedef struct LX { -#if defined(LUAI_EXTRASPACE) - char buff[LUAI_EXTRASPACE]; -#endif - lua_State l; -} LX; - - -/* -** Main thread combines a thread state and the global state -*/ -typedef struct LG { - LX l; - global_State g; -} LG; - - - -#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l))) - - -/* -** Compute an initial seed as random as possible. In ANSI, rely on -** Address Space Layout Randomization (if present) to increase -** randomness.. -*/ -#define addbuff(b,p,e) \ - { size_t t = cast(size_t, e); \ - memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); } - -static unsigned int makeseed (lua_State *L) { - char buff[4 * sizeof(size_t)]; - unsigned int h = luai_makeseed(); - int p = 0; - addbuff(buff, p, L); /* heap variable */ - addbuff(buff, p, &h); /* local variable */ - addbuff(buff, p, luaO_nilobject); /* global variable */ - addbuff(buff, p, &lua_newstate); /* public function */ - lua_assert(p == sizeof(buff)); - return luaS_hash(buff, p, h); -} - - -/* -** set GCdebt to a new value keeping the value (totalbytes + GCdebt) -** invariant -*/ -void luaE_setdebt (global_State *g, l_mem debt) { - g->totalbytes -= (debt - g->GCdebt); - g->GCdebt = debt; -} - - -CallInfo *luaE_extendCI (lua_State *L) { - CallInfo *ci = luaM_new(L, CallInfo); - lua_assert(L->ci->next == NULL); - L->ci->next = ci; - ci->previous = L->ci; - ci->next = NULL; - return ci; -} - - -void luaE_freeCI (lua_State *L) { - CallInfo *ci = L->ci; - CallInfo *next = ci->next; - ci->next = NULL; - while ((ci = next) != NULL) { - next = ci->next; - luaM_free(L, ci); - } -} - - -static void stack_init (lua_State *L1, lua_State *L) { - int i; CallInfo *ci; - /* initialize stack array */ - L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue); - L1->stacksize = BASIC_STACK_SIZE; - for (i = 0; i < BASIC_STACK_SIZE; i++) - 
setnilvalue(L1->stack + i); /* erase new stack */ - L1->top = L1->stack; - L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK; - /* initialize first ci */ - ci = &L1->base_ci; - ci->next = ci->previous = NULL; - ci->callstatus = 0; - ci->func = L1->top; - setnilvalue(L1->top++); /* 'function' entry for this 'ci' */ - ci->top = L1->top + LUA_MINSTACK; - L1->ci = ci; -} - - -static void freestack (lua_State *L) { - if (L->stack == NULL) - return; /* stack not completely built yet */ - L->ci = &L->base_ci; /* free the entire 'ci' list */ - luaE_freeCI(L); - luaM_freearray(L, L->stack, L->stacksize); /* free stack array */ -} - - -/* -** Create registry table and its predefined values -*/ -static void init_registry (lua_State *L, global_State *g) { - TValue mt; - /* create registry */ - Table *registry = luaH_new(L); - sethvalue(L, &g->l_registry, registry); - luaH_resize(L, registry, LUA_RIDX_LAST, 0); - /* registry[LUA_RIDX_MAINTHREAD] = L */ - setthvalue(L, &mt, L); - luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt); - /* registry[LUA_RIDX_GLOBALS] = table of globals */ - sethvalue(L, &mt, luaH_new(L)); - luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt); -} - - -/* -** open parts of the state that may cause memory-allocation errors -*/ -static void f_luaopen (lua_State *L, void *ud) { - global_State *g = G(L); - UNUSED(ud); - stack_init(L, L); /* init stack */ - init_registry(L, g); - luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */ - luaT_init(L); - luaX_init(L); - /* pre-create memory-error message */ - g->memerrmsg = luaS_newliteral(L, MEMERRMSG); - luaS_fix(g->memerrmsg); /* it should never be collected */ - g->gcrunning = 1; /* allow gc */ - g->version = lua_version(NULL); - luai_userstateopen(L); -} - - -/* -** preinitialize a state with consistent values without allocating -** any memory (to avoid errors) -*/ -static void preinit_state (lua_State *L, global_State *g) { - G(L) = g; - L->stack = NULL; - L->ci = NULL; - L->stacksize = 0; 
- L->errorJmp = NULL; - L->nCcalls = 0; - L->hook = NULL; - L->hookmask = 0; - L->basehookcount = 0; - L->allowhook = 1; - resethookcount(L); - L->openupval = NULL; - L->nny = 1; - L->status = LUA_OK; - L->errfunc = 0; -} - - -static void close_state (lua_State *L) { - global_State *g = G(L); - luaF_close(L, L->stack); /* close all upvalues for this thread */ - luaC_freeallobjects(L); /* collect all objects */ - if (g->version) /* closing a fully built state? */ - luai_userstateclose(L); - luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size); - luaZ_freebuffer(L, &g->buff); - freestack(L); - lua_assert(gettotalbytes(g) == sizeof(LG)); - (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */ -} - - -LUA_API lua_State *lua_newthread (lua_State *L) { - lua_State *L1; - lua_lock(L); - luaC_checkGC(L); - L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th; - setthvalue(L, L->top, L1); - api_incr_top(L); - preinit_state(L1, G(L)); - L1->hookmask = L->hookmask; - L1->basehookcount = L->basehookcount; - L1->hook = L->hook; - resethookcount(L1); - luai_userstatethread(L, L1); - stack_init(L1, L); /* init stack */ - lua_unlock(L); - return L1; -} - - -void luaE_freethread (lua_State *L, lua_State *L1) { - LX *l = fromstate(L1); - luaF_close(L1, L1->stack); /* close all upvalues for this thread */ - lua_assert(L1->openupval == NULL); - luai_userstatefree(L, L1); - freestack(L1); - luaM_free(L, l); -} - - -LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) { - int i; - lua_State *L; - global_State *g; - LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG))); - if (l == NULL) return NULL; - L = &l->l.l; - g = &l->g; - L->next = NULL; - L->tt = LUA_TTHREAD; - g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT); - L->marked = luaC_white(g); - g->gckind = KGC_NORMAL; - preinit_state(L, g); - g->frealloc = f; - g->ud = ud; - g->mainthread = L; - g->seed = makeseed(L); - g->uvhead.u.l.prev = &g->uvhead; - g->uvhead.u.l.next = &g->uvhead; 
- g->gcrunning = 0; /* no GC while building state */ - g->GCestimate = 0; - g->strt.size = 0; - g->strt.nuse = 0; - g->strt.hash = NULL; - setnilvalue(&g->l_registry); - luaZ_initbuffer(L, &g->buff); - g->panic = NULL; - g->version = NULL; - g->gcstate = GCSpause; - g->allgc = NULL; - g->finobj = NULL; - g->tobefnz = NULL; - g->sweepgc = g->sweepfin = NULL; - g->gray = g->grayagain = NULL; - g->weak = g->ephemeron = g->allweak = NULL; - g->totalbytes = sizeof(LG); - g->GCdebt = 0; - g->gcpause = LUAI_GCPAUSE; - g->gcmajorinc = LUAI_GCMAJOR; - g->gcstepmul = LUAI_GCMUL; - for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL; - if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) { - /* memory allocation error: free partial state */ - close_state(L); - L = NULL; - } - return L; -} - - -LUA_API void lua_close (lua_State *L) { - L = G(L)->mainthread; /* only the main thread can be closed */ - lua_lock(L); - close_state(L); -} - - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h deleted file mode 100644 index daffd9aacfbb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h +++ /dev/null @@ -1,228 +0,0 @@ -/* -** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $ -** Global State -** See Copyright Notice in lua.h -*/ - -#ifndef lstate_h -#define lstate_h - -#include "lua.h" - -#include "lobject.h" -#include "ltm.h" -#include "lzio.h" - - -/* - -** Some notes about garbage-collected objects: All objects in Lua must -** be kept somehow accessible until being freed. -** -** Lua keeps most objects linked in list g->allgc. The link uses field -** 'next' of the CommonHeader. -** -** Strings are kept in several lists headed by the array g->strt.hash. -** -** Open upvalues are not subject to independent garbage collection. They -** are collected together with their respective threads. 
Lua keeps a -** double-linked list with all open upvalues (g->uvhead) so that it can -** mark objects referred by them. (They are always gray, so they must -** be remarked in the atomic step. Usually their contents would be marked -** when traversing the respective threads, but the thread may already be -** dead, while the upvalue is still accessible through closures.) -** -** Objects with finalizers are kept in the list g->finobj. -** -** The list g->tobefnz links all objects being finalized. - -*/ - - -struct lua_longjmp; /* defined in ldo.c */ - - - -/* extra stack space to handle TM calls and some other extras */ -#define EXTRA_STACK 5 - - -#define BASIC_STACK_SIZE (2*LUA_MINSTACK) - - -/* kinds of Garbage Collection */ -#define KGC_NORMAL 0 -#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */ -#define KGC_GEN 2 /* generational collection */ - - -typedef struct stringtable { - GCObject **hash; - lu_int32 nuse; /* number of elements */ - int size; -} stringtable; - - -/* -** information about a call -*/ -typedef struct CallInfo { - StkId func; /* function index in the stack */ - StkId top; /* top for this function */ - struct CallInfo *previous, *next; /* dynamic call link */ - short nresults; /* expected number of results from this function */ - lu_byte callstatus; - ptrdiff_t extra; - union { - struct { /* only for Lua functions */ - StkId base; /* base for this function */ - const Instruction *savedpc; - } l; - struct { /* only for C functions */ - int ctx; /* context info. 
in case of yields */ - lua_CFunction k; /* continuation in case of yields */ - ptrdiff_t old_errfunc; - lu_byte old_allowhook; - lu_byte status; - } c; - } u; -} CallInfo; - - -/* -** Bits in CallInfo status -*/ -#define CIST_LUA (1<<0) /* call is running a Lua function */ -#define CIST_HOOKED (1<<1) /* call is running a debug hook */ -#define CIST_REENTRY (1<<2) /* call is running on same invocation of - luaV_execute of previous call */ -#define CIST_YIELDED (1<<3) /* call reentered after suspension */ -#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */ -#define CIST_STAT (1<<5) /* call has an error status (pcall) */ -#define CIST_TAIL (1<<6) /* call was tail called */ -#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */ - - -#define isLua(ci) ((ci)->callstatus & CIST_LUA) - - -/* -** `global state', shared by all threads of this state -*/ -typedef struct global_State { - lua_Alloc frealloc; /* function to reallocate memory */ - void *ud; /* auxiliary data to `frealloc' */ - lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */ - l_mem GCdebt; /* bytes allocated not yet compensated by the collector */ - lu_mem GCmemtrav; /* memory traversed by the GC */ - lu_mem GCestimate; /* an estimate of the non-garbage memory in use */ - stringtable strt; /* hash table for strings */ - TValue l_registry; - unsigned int seed; /* randomized seed for hashes */ - lu_byte currentwhite; - lu_byte gcstate; /* state of garbage collector */ - lu_byte gckind; /* kind of GC running */ - lu_byte gcrunning; /* true if GC is running */ - int sweepstrgc; /* position of sweep in `strt' */ - GCObject *allgc; /* list of all collectable objects */ - GCObject *finobj; /* list of collectable objects with finalizers */ - GCObject **sweepgc; /* current position of sweep in list 'allgc' */ - GCObject **sweepfin; /* current position of sweep in list 'finobj' */ - GCObject *gray; /* list of gray objects */ - GCObject *grayagain; /* list of objects to be 
traversed atomically */ - GCObject *weak; /* list of tables with weak values */ - GCObject *ephemeron; /* list of ephemeron tables (weak keys) */ - GCObject *allweak; /* list of all-weak tables */ - GCObject *tobefnz; /* list of userdata to be GC */ - UpVal uvhead; /* head of double-linked list of all open upvalues */ - Mbuffer buff; /* temporary buffer for string concatenation */ - int gcpause; /* size of pause between successive GCs */ - int gcmajorinc; /* pause between major collections (only in gen. mode) */ - int gcstepmul; /* GC `granularity' */ - lua_CFunction panic; /* to be called in unprotected errors */ - struct lua_State *mainthread; - const lua_Number *version; /* pointer to version number */ - TString *memerrmsg; /* memory-error message */ - TString *tmname[TM_N]; /* array with tag-method names */ - struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */ -} global_State; - - -/* -** `per thread' state -*/ -struct lua_State { - CommonHeader; - lu_byte status; - StkId top; /* first free slot in the stack */ - global_State *l_G; - CallInfo *ci; /* call info for current function */ - const Instruction *oldpc; /* last pc traced */ - StkId stack_last; /* last free slot in the stack */ - StkId stack; /* stack base */ - int stacksize; - unsigned short nny; /* number of non-yieldable calls in stack */ - unsigned short nCcalls; /* number of nested C calls */ - lu_byte hookmask; - lu_byte allowhook; - int basehookcount; - int hookcount; - lua_Hook hook; - GCObject *openupval; /* list of open upvalues in this stack */ - GCObject *gclist; - struct lua_longjmp *errorJmp; /* current error recover point */ - ptrdiff_t errfunc; /* current error handling function (stack index) */ - CallInfo base_ci; /* CallInfo for first level (C calling Lua) */ -}; - - -#define G(L) (L->l_G) - - -/* -** Union of all collectable objects -*/ -union GCObject { - GCheader gch; /* common header */ - union TString ts; - union Udata u; - union Closure cl; - struct Table h; - struct 
Proto p; - struct UpVal uv; - struct lua_State th; /* thread */ -}; - - -#define gch(o) (&(o)->gch) - -/* macros to convert a GCObject into a specific value */ -#define rawgco2ts(o) \ - check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts)) -#define gco2ts(o) (&rawgco2ts(o)->tsv) -#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u)) -#define gco2u(o) (&rawgco2u(o)->uv) -#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l)) -#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c)) -#define gco2cl(o) \ - check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl)) -#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h)) -#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p)) -#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv)) -#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th)) - -/* macro to convert any Lua object into a GCObject */ -#define obj2gco(v) (cast(GCObject *, (v))) - - -/* actual number of total bytes allocated */ -#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt) - -LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt); -LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1); -LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L); -LUAI_FUNC void luaE_freeCI (lua_State *L); - - -#endif - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c deleted file mode 100644 index e20ab04b12de..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c +++ /dev/null @@ -1,185 +0,0 @@ -/* -** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $ -** String table (keeps all strings handled by Lua) -** See Copyright Notice in lua.h -*/ - - -#include - -#define lstring_c -#define LUA_CORE - -#include "lua.h" - -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" - - -/* -** Lua will use at most ~(2^LUAI_HASHLIMIT) 
bytes from a string to -** compute its hash -*/ -#if !defined(LUAI_HASHLIMIT) -#define LUAI_HASHLIMIT 5 -#endif - - -/* -** equality for long strings -*/ -int luaS_eqlngstr (TString *a, TString *b) { - size_t len = a->tsv.len; - lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR); - return (a == b) || /* same instance or... */ - ((len == b->tsv.len) && /* equal length and ... */ - (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */ -} - - -/* -** equality for strings -*/ -int luaS_eqstr (TString *a, TString *b) { - return (a->tsv.tt == b->tsv.tt) && - (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b)); -} - - -unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) { - unsigned int h = seed ^ cast(unsigned int, l); - size_t l1; - size_t step = (l >> LUAI_HASHLIMIT) + 1; - for (l1 = l; l1 >= step; l1 -= step) - h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1])); - return h; -} - - -/* -** resizes the string table -*/ -void luaS_resize (lua_State *L, int newsize) { - int i; - stringtable *tb = &G(L)->strt; - /* cannot resize while GC is traversing strings */ - luaC_runtilstate(L, ~bitmask(GCSsweepstring)); - if (newsize > tb->size) { - luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *); - for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL; - } - /* rehash */ - for (i=0; isize; i++) { - GCObject *p = tb->hash[i]; - tb->hash[i] = NULL; - while (p) { /* for each node in the list */ - GCObject *next = gch(p)->next; /* save next */ - unsigned int h = lmod(gco2ts(p)->hash, newsize); /* new position */ - gch(p)->next = tb->hash[h]; /* chain it */ - tb->hash[h] = p; - resetoldbit(p); /* see MOVE OLD rule */ - p = next; - } - } - if (newsize < tb->size) { - /* shrinking slice must be empty */ - lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL); - luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *); - } - tb->size = newsize; -} - - -/* -** creates a new string object -*/ 
-static TString *createstrobj (lua_State *L, const char *str, size_t l, - int tag, unsigned int h, GCObject **list) { - TString *ts; - size_t totalsize; /* total size of TString object */ - totalsize = sizeof(TString) + ((l + 1) * sizeof(char)); - ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts; - ts->tsv.len = l; - ts->tsv.hash = h; - ts->tsv.extra = 0; - memcpy(ts+1, str, l*sizeof(char)); - ((char *)(ts+1))[l] = '\0'; /* ending 0 */ - return ts; -} - - -/* -** creates a new short string, inserting it into string table -*/ -static TString *newshrstr (lua_State *L, const char *str, size_t l, - unsigned int h) { - GCObject **list; /* (pointer to) list where it will be inserted */ - stringtable *tb = &G(L)->strt; - TString *s; - if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2) - luaS_resize(L, tb->size*2); /* too crowded */ - list = &tb->hash[lmod(h, tb->size)]; - s = createstrobj(L, str, l, LUA_TSHRSTR, h, list); - tb->nuse++; - return s; -} - - -/* -** checks whether short string exists and reuses it or creates a new one -*/ -static TString *internshrstr (lua_State *L, const char *str, size_t l) { - GCObject *o; - global_State *g = G(L); - unsigned int h = luaS_hash(str, l, g->seed); - for (o = g->strt.hash[lmod(h, g->strt.size)]; - o != NULL; - o = gch(o)->next) { - TString *ts = rawgco2ts(o); - if (h == ts->tsv.hash && - l == ts->tsv.len && - (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) { - if (isdead(G(L), o)) /* string is dead (but was not collected yet)? */ - changewhite(o); /* resurrect it */ - return ts; - } - } - return newshrstr(L, str, l, h); /* not found; create a new string */ -} - - -/* -** new string (with explicit length) -*/ -TString *luaS_newlstr (lua_State *L, const char *str, size_t l) { - if (l <= LUAI_MAXSHORTLEN) /* short string? 
*/ - return internshrstr(L, str, l); - else { - if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char)) - luaM_toobig(L); - return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL); - } -} - - -/* -** new zero-terminated string -*/ -TString *luaS_new (lua_State *L, const char *str) { - return luaS_newlstr(L, str, strlen(str)); -} - - -Udata *luaS_newudata (lua_State *L, size_t s, Table *e) { - Udata *u; - if (s > MAX_SIZET - sizeof(Udata)) - luaM_toobig(L); - u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u; - u->uv.len = s; - u->uv.metatable = NULL; - u->uv.env = e; - return u; -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h deleted file mode 100644 index 260e7f169bd0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $ -** String table (keep all strings handled by Lua) -** See Copyright Notice in lua.h -*/ - -#ifndef lstring_h -#define lstring_h - -#include "lgc.h" -#include "lobject.h" -#include "lstate.h" - - -#define sizestring(s) (sizeof(union TString)+((s)->len+1)*sizeof(char)) - -#define sizeudata(u) (sizeof(union Udata)+(u)->len) - -#define luaS_newliteral(L, s) (luaS_newlstr(L, "" s, \ - (sizeof(s)/sizeof(char))-1)) - -#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT) - - -/* -** test whether a string is a reserved word -*/ -#define isreserved(s) ((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0) - - -/* -** equality for short strings, which are always internalized -*/ -#define eqshrstr(a,b) check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b)) - - -LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed); -LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b); -LUAI_FUNC int luaS_eqstr (TString *a, TString *b); -LUAI_FUNC void luaS_resize (lua_State *L, int newsize); -LUAI_FUNC Udata 
*luaS_newudata (lua_State *L, size_t s, Table *e); -LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l); -LUAI_FUNC TString *luaS_new (lua_State *L, const char *str); - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c deleted file mode 100644 index 589752d3690e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c +++ /dev/null @@ -1,1050 +0,0 @@ -/* -** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $ -** Standard library for string operations and pattern-matching -** See Copyright Notice in lua.h -*/ - - -#include -#include - -#define lstrlib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -/* -** maximum number of captures that a pattern can do during -** pattern-matching. This limit is arbitrary. -*/ -#if !defined(LUA_MAXCAPTURES) -#define LUA_MAXCAPTURES 32 -#endif - - -/* macro to `unsign' a character */ -#define uchar(c) ((unsigned char)(c)) - -/* - * PATCHED: add missing character macros. - */ -#ifdef illumos -#define tolower(C) (((C) >= 'A' && (C) <= 'Z') ? (C) - 'A' + 'a' : (C)) -#define toupper(C) (((C) >= 'a' && (C) <= 'z') ? (C) - 'a' + 'A': (C)) -#define iscntrl(C) ((((C) >= 0) && ((C) <= 0x1f)) || ((C) == 0x7f)) -#else -#define isalnum(C) (isalpha(C) || isdigit(C)) -#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f) -#endif -#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) -#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \ - ((C) >= 0x3A && (C) <= 0x40) || \ - ((C) >= 0x5B && (C) <= 0x60) || \ - ((C) >= 0x7B && (C) <= 0x7E)) - -/* - * The provided version of sprintf returns a char *, but str_format expects - * it to return the number of characters printed. This version has the expected - * behavior. - */ -static size_t str_sprintf(char *buf, const char *fmt, ...) 
{ - va_list args; - size_t len; - - va_start(args, fmt); - len = vsnprintf(buf, INT_MAX, fmt, args); - va_end(args); - - return len; -} - - -static int str_len (lua_State *L) { - size_t l; - luaL_checklstring(L, 1, &l); - lua_pushinteger(L, (lua_Integer)l); - return 1; -} - - -/* translate a relative string position: negative means back from end */ -static size_t posrelat (ptrdiff_t pos, size_t len) { - if (pos >= 0) return (size_t)pos; - else if (0u - (size_t)pos > len) return 0; - else return len - ((size_t)-pos) + 1; -} - - -static int str_sub (lua_State *L) { - size_t l; - const char *s = luaL_checklstring(L, 1, &l); - size_t start = posrelat(luaL_checkinteger(L, 2), l); - size_t end = posrelat(luaL_optinteger(L, 3, -1), l); - if (start < 1) start = 1; - if (end > l) end = l; - if (start <= end) - lua_pushlstring(L, s + start - 1, end - start + 1); - else lua_pushliteral(L, ""); - return 1; -} - - -static int str_reverse (lua_State *L) { - size_t l, i; - luaL_Buffer b; - const char *s = luaL_checklstring(L, 1, &l); - char *p = luaL_buffinitsize(L, &b, l); - for (i = 0; i < l; i++) - p[i] = s[l - i - 1]; - luaL_pushresultsize(&b, l); - return 1; -} - - -static int str_lower (lua_State *L) { - size_t l; - size_t i; - luaL_Buffer b; - const char *s = luaL_checklstring(L, 1, &l); - char *p = luaL_buffinitsize(L, &b, l); - for (i=0; i> 1) - -static int str_rep (lua_State *L) { - size_t l, lsep; - const char *s = luaL_checklstring(L, 1, &l); - int n = luaL_checkint(L, 2); - const char *sep = luaL_optlstring(L, 3, "", &lsep); - if (n <= 0) lua_pushliteral(L, ""); - else if (l + lsep < l || l + lsep >= MAXSIZE / n) /* may overflow? 
*/ - return luaL_error(L, "resulting string too large"); - else { - size_t totallen = n * l + (n - 1) * lsep; - luaL_Buffer b; - char *p = luaL_buffinitsize(L, &b, totallen); - while (n-- > 1) { /* first n-1 copies (followed by separator) */ - memcpy(p, s, l * sizeof(char)); p += l; - if (lsep > 0) { /* avoid empty 'memcpy' (may be expensive) */ - memcpy(p, sep, lsep * sizeof(char)); p += lsep; - } - } - memcpy(p, s, l * sizeof(char)); /* last copy (not followed by separator) */ - luaL_pushresultsize(&b, totallen); - } - return 1; -} - - -static int str_byte (lua_State *L) { - size_t l; - const char *s = luaL_checklstring(L, 1, &l); - size_t posi = posrelat(luaL_optinteger(L, 2, 1), l); - size_t pose = posrelat(luaL_optinteger(L, 3, posi), l); - int n, i; - if (posi < 1) posi = 1; - if (pose > l) pose = l; - if (posi > pose) return 0; /* empty interval; return no values */ - n = (int)(pose - posi + 1); - if (posi + n <= pose) /* (size_t -> int) overflow? */ - return luaL_error(L, "string slice too long"); - luaL_checkstack(L, n, "string slice too long"); - for (i=0; i= ms->level || ms->capture[l].len == CAP_UNFINISHED) - return luaL_error(ms->L, "invalid capture index %%%d", l + 1); - return l; -} - - -static int capture_to_close (MatchState *ms) { - int level = ms->level; - for (level--; level>=0; level--) - if (ms->capture[level].len == CAP_UNFINISHED) return level; - return luaL_error(ms->L, "invalid pattern capture"); -} - - -static const char *classend (MatchState *ms, const char *p) { - switch (*p++) { - case L_ESC: { - if (p == ms->p_end) - luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")"); - return p+1; - } - case '[': { - if (*p == '^') p++; - do { /* look for a `]' */ - if (p == ms->p_end) - luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); - if (*(p++) == L_ESC && p < ms->p_end) - p++; /* skip escapes (e.g. 
`%]') */ - } while (*p != ']'); - return p+1; - } - default: { - return p; - } - } -} - - -static int match_class (int c, int cl) { - int res; - switch (tolower(cl)) { - case 'a' : res = isalpha(c); break; - case 'c' : res = iscntrl(c); break; - case 'd' : res = isdigit(c); break; - case 'g' : res = isgraph(c); break; - case 'l' : res = islower(c); break; - case 'p' : res = ispunct(c); break; - case 's' : res = isspace(c); break; - case 'u' : res = isupper(c); break; - case 'w' : res = isalnum(c); break; - case 'x' : res = isxdigit(c); break; - case 'z' : res = (c == 0); break; /* deprecated option */ - default: return (cl == c); - } - return (islower(cl) ? res : !res); -} - - -static int matchbracketclass (int c, const char *p, const char *ec) { - int sig = 1; - if (*(p+1) == '^') { - sig = 0; - p++; /* skip the `^' */ - } - while (++p < ec) { - if (*p == L_ESC) { - p++; - if (match_class(c, uchar(*p))) - return sig; - } - else if ((*(p+1) == '-') && (p+2 < ec)) { - p+=2; - if (uchar(*(p-2)) <= c && c <= uchar(*p)) - return sig; - } - else if (uchar(*p) == c) return sig; - } - return !sig; -} - - -static int singlematch (MatchState *ms, const char *s, const char *p, - const char *ep) { - if (s >= ms->src_end) - return 0; - else { - int c = uchar(*s); - switch (*p) { - case '.': return 1; /* matches any char */ - case L_ESC: return match_class(c, uchar(*(p+1))); - case '[': return matchbracketclass(c, p, ep-1); - default: return (uchar(*p) == c); - } - } -} - - -static const char *matchbalance (MatchState *ms, const char *s, - const char *p) { - if (p >= ms->p_end - 1) - luaL_error(ms->L, "malformed pattern " - "(missing arguments to " LUA_QL("%%b") ")"); - if (*s != *p) return NULL; - else { - int b = *p; - int e = *(p+1); - int cont = 1; - while (++s < ms->src_end) { - if (*s == e) { - if (--cont == 0) return s+1; - } - else if (*s == b) cont++; - } - } - return NULL; /* string ends out of balance */ -} - - -static const char *max_expand (MatchState *ms, const 
char *s, - const char *p, const char *ep) { - ptrdiff_t i = 0; /* counts maximum expand for item */ - while (singlematch(ms, s + i, p, ep)) - i++; - /* keeps trying to match with the maximum repetitions */ - while (i>=0) { - const char *res = match(ms, (s+i), ep+1); - if (res) return res; - i--; /* else didn't match; reduce 1 repetition to try again */ - } - return NULL; -} - - -static const char *min_expand (MatchState *ms, const char *s, - const char *p, const char *ep) { - for (;;) { - const char *res = match(ms, s, ep+1); - if (res != NULL) - return res; - else if (singlematch(ms, s, p, ep)) - s++; /* try with one more repetition */ - else return NULL; - } -} - - -static const char *start_capture (MatchState *ms, const char *s, - const char *p, int what) { - const char *res; - int level = ms->level; - if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); - ms->capture[level].init = s; - ms->capture[level].len = what; - ms->level = level+1; - if ((res=match(ms, s, p)) == NULL) /* match failed? */ - ms->level--; /* undo capture */ - return res; -} - - -static const char *end_capture (MatchState *ms, const char *s, - const char *p) { - int l = capture_to_close(ms); - const char *res; - ms->capture[l].len = s - ms->capture[l].init; /* close capture */ - if ((res = match(ms, s, p)) == NULL) /* match failed? */ - ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ - return res; -} - - -static const char *match_capture (MatchState *ms, const char *s, int l) { - size_t len; - l = check_capture(ms, l); - len = ms->capture[l].len; - if ((size_t)(ms->src_end-s) >= len && - memcmp(ms->capture[l].init, s, len) == 0) - return s+len; - else return NULL; -} - - -static const char *match (MatchState *ms, const char *s, const char *p) { - if (ms->matchdepth-- == 0) - luaL_error(ms->L, "pattern too complex"); - init: /* using goto's to optimize tail recursion */ - if (p != ms->p_end) { /* end of pattern? 
*/ - switch (*p) { - case '(': { /* start capture */ - if (*(p + 1) == ')') /* position capture? */ - s = start_capture(ms, s, p + 2, CAP_POSITION); - else - s = start_capture(ms, s, p + 1, CAP_UNFINISHED); - break; - } - case ')': { /* end capture */ - s = end_capture(ms, s, p + 1); - break; - } - case '$': { - if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ - goto dflt; /* no; go to default */ - s = (s == ms->src_end) ? s : NULL; /* check end of string */ - break; - } - case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ - switch (*(p + 1)) { - case 'b': { /* balanced string? */ - s = matchbalance(ms, s, p + 2); - if (s != NULL) { - p += 4; goto init; /* return match(ms, s, p + 4); */ - } /* else fail (s == NULL) */ - break; - } - case 'f': { /* frontier? */ - const char *ep; char previous; - p += 2; - if (*p != '[') - luaL_error(ms->L, "missing " LUA_QL("[") " after " - LUA_QL("%%f") " in pattern"); - ep = classend(ms, p); /* points to what is next */ - previous = (s == ms->src_init) ? '\0' : *(s - 1); - if (!matchbracketclass(uchar(previous), p, ep - 1) && - matchbracketclass(uchar(*s), p, ep - 1)) { - p = ep; goto init; /* return match(ms, s, ep); */ - } - s = NULL; /* match failed */ - break; - } - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': { /* capture results (%0-%9)? */ - s = match_capture(ms, s, uchar(*(p + 1))); - if (s != NULL) { - p += 2; goto init; /* return match(ms, s, p + 2) */ - } - break; - } - default: goto dflt; - } - break; - } - default: dflt: { /* pattern class plus optional suffix */ - const char *ep = classend(ms, p); /* points to optional suffix */ - /* does not match at least once? */ - if (!singlematch(ms, s, p, ep)) { - if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? 
*/ - p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ - } - else /* '+' or no suffix */ - s = NULL; /* fail */ - } - else { /* matched once */ - switch (*ep) { /* handle optional suffix */ - case '?': { /* optional */ - const char *res; - if ((res = match(ms, s + 1, ep + 1)) != NULL) - s = res; - else { - p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ - } - break; - } - case '+': /* 1 or more repetitions */ - s++; /* 1 match already done */ - /* FALLTHROUGH */ - case '*': /* 0 or more repetitions */ - s = max_expand(ms, s, p, ep); - break; - case '-': /* 0 or more repetitions (minimum) */ - s = min_expand(ms, s, p, ep); - break; - default: /* no suffix */ - s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ - } - } - break; - } - } - } - ms->matchdepth++; - return s; -} - - - -static const char *lmemfind (const char *s1, size_t l1, - const char *s2, size_t l2) { - if (l2 == 0) return s1; /* empty strings are everywhere */ - else if (l2 > l1) return NULL; /* avoids a negative `l1' */ - else { - const char *init; /* to search for a `*s2' inside `s1' */ - l2--; /* 1st char will be checked by `memchr' */ - l1 = l1-l2; /* `s2' cannot be found after that */ - while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { - init++; /* 1st char is already checked */ - if (memcmp(init, s2+1, l2) == 0) - return init-1; - else { /* correct `l1' and `s1' to try again */ - l1 -= init-s1; - s1 = init; - } - } - return NULL; /* not found */ - } -} - - -static void push_onecapture (MatchState *ms, int i, const char *s, - const char *e) { - if (i >= ms->level) { - if (i == 0) /* ms->level == 0, too */ - lua_pushlstring(ms->L, s, e - s); /* add whole match */ - else - luaL_error(ms->L, "invalid capture index"); - } - else { - ptrdiff_t l = ms->capture[i].len; - if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); - if (l == CAP_POSITION) - lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1); - else - 
lua_pushlstring(ms->L, ms->capture[i].init, l); - } -} - - -static int push_captures (MatchState *ms, const char *s, const char *e) { - int i; - int nlevels = (ms->level == 0 && s) ? 1 : ms->level; - luaL_checkstack(ms->L, nlevels, "too many captures"); - for (i = 0; i < nlevels; i++) - push_onecapture(ms, i, s, e); - return nlevels; /* number of strings pushed */ -} - - -/* check whether pattern has no special characters */ -static int nospecials (const char *p, size_t l) { - size_t upto = 0; - do { - if (strpbrk(p + upto, SPECIALS)) - return 0; /* pattern has a special character */ - upto += strlen(p + upto) + 1; /* may have more after \0 */ - } while (upto <= l); - return 1; /* no special chars found */ -} - - -static int str_find_aux (lua_State *L, int find) { - size_t ls, lp; - const char *s = luaL_checklstring(L, 1, &ls); - const char *p = luaL_checklstring(L, 2, &lp); - size_t init = posrelat(luaL_optinteger(L, 3, 1), ls); - if (init < 1) init = 1; - else if (init > ls + 1) { /* start after string's end? */ - lua_pushnil(L); /* cannot find anything */ - return 1; - } - /* explicit request or no special characters? 
*/ - if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) { - /* do a plain search */ - const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp); - if (s2) { - lua_pushinteger(L, s2 - s + 1); - lua_pushinteger(L, s2 - s + lp); - return 2; - } - } - else { - MatchState ms; - const char *s1 = s + init - 1; - int anchor = (*p == '^'); - if (anchor) { - p++; lp--; /* skip anchor character */ - } - ms.L = L; - ms.matchdepth = MAXCCALLS; - ms.src_init = s; - ms.src_end = s + ls; - ms.p_end = p + lp; - do { - const char *res; - ms.level = 0; - lua_assert(ms.matchdepth == MAXCCALLS); - if ((res=match(&ms, s1, p)) != NULL) { - if (find) { - lua_pushinteger(L, s1 - s + 1); /* start */ - lua_pushinteger(L, res - s); /* end */ - return push_captures(&ms, NULL, 0) + 2; - } - else - return push_captures(&ms, s1, res); - } - } while (s1++ < ms.src_end && !anchor); - } - lua_pushnil(L); /* not found */ - return 1; -} - - -static int str_find (lua_State *L) { - return str_find_aux(L, 1); -} - - -static int str_match (lua_State *L) { - return str_find_aux(L, 0); -} - - -static int gmatch_aux (lua_State *L) { - MatchState ms; - size_t ls, lp; - const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls); - const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp); - const char *src; - ms.L = L; - ms.matchdepth = MAXCCALLS; - ms.src_init = s; - ms.src_end = s+ls; - ms.p_end = p + lp; - for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); - src <= ms.src_end; - src++) { - const char *e; - ms.level = 0; - lua_assert(ms.matchdepth == MAXCCALLS); - if ((e = match(&ms, src, p)) != NULL) { - lua_Integer newstart = e-s; - if (e == src) newstart++; /* empty match? 
go at least one position */ - lua_pushinteger(L, newstart); - lua_replace(L, lua_upvalueindex(3)); - return push_captures(&ms, src, e); - } - } - return 0; /* not found */ -} - - -static int str_gmatch (lua_State *L) { - luaL_checkstring(L, 1); - luaL_checkstring(L, 2); - lua_settop(L, 2); - lua_pushinteger(L, 0); - lua_pushcclosure(L, gmatch_aux, 3); - return 1; -} - - -static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, - const char *e) { - size_t l, i; - const char *news = lua_tolstring(ms->L, 3, &l); - for (i = 0; i < l; i++) { - if (news[i] != L_ESC) - luaL_addchar(b, news[i]); - else { - i++; /* skip ESC */ - if (!isdigit(uchar(news[i]))) { - if (news[i] != L_ESC) - luaL_error(ms->L, "invalid use of " LUA_QL("%c") - " in replacement string", L_ESC); - luaL_addchar(b, news[i]); - } - else if (news[i] == '0') - luaL_addlstring(b, s, e - s); - else { - push_onecapture(ms, news[i] - '1', s, e); - luaL_addvalue(b); /* add capture to accumulated result */ - } - } - } -} - - -static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, - const char *e, int tr) { - lua_State *L = ms->L; - switch (tr) { - case LUA_TFUNCTION: { - int n; - lua_pushvalue(L, 3); - n = push_captures(ms, s, e); - lua_call(L, n, 1); - break; - } - case LUA_TTABLE: { - push_onecapture(ms, 0, s, e); - lua_gettable(L, 3); - break; - } - default: { /* LUA_TNUMBER or LUA_TSTRING */ - add_s(ms, b, s, e); - return; - } - } - if (!lua_toboolean(L, -1)) { /* nil or false? 
*/ - lua_pop(L, 1); - lua_pushlstring(L, s, e - s); /* keep original text */ - } - else if (!lua_isstring(L, -1)) - luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); - luaL_addvalue(b); /* add result to accumulator */ -} - - -static int str_gsub (lua_State *L) { - size_t srcl, lp; - const char *src = luaL_checklstring(L, 1, &srcl); - const char *p = luaL_checklstring(L, 2, &lp); - int tr = lua_type(L, 3); - size_t max_s = luaL_optinteger(L, 4, srcl+1); - int anchor = (*p == '^'); - size_t n = 0; - MatchState ms; - luaL_Buffer b; - luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || - tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, - "string/function/table expected"); - luaL_buffinit(L, &b); - if (anchor) { - p++; lp--; /* skip anchor character */ - } - ms.L = L; - ms.matchdepth = MAXCCALLS; - ms.src_init = src; - ms.src_end = src+srcl; - ms.p_end = p + lp; - while (n < max_s) { - const char *e; - ms.level = 0; - lua_assert(ms.matchdepth == MAXCCALLS); - e = match(&ms, src, p); - if (e) { - n++; - add_value(&ms, &b, src, e, tr); - } - if (e && e>src) /* non empty match? 
*/ - src = e; /* skip it */ - else if (src < ms.src_end) - luaL_addchar(&b, *src++); - else break; - if (anchor) break; - } - luaL_addlstring(&b, src, ms.src_end-src); - luaL_pushresult(&b); - lua_pushinteger(L, n); /* number of substitutions */ - return 2; -} - -/* }====================================================== */ - - - -/* -** {====================================================== -** STRING FORMAT -** ======================================================= -*/ - -/* -** LUA_INTFRMLEN is the length modifier for integer conversions in -** 'string.format'; LUA_INTFRM_T is the integer type corresponding to -** the previous length -*/ -#if !defined(LUA_INTFRMLEN) /* { */ -#if defined(LUA_USE_LONGLONG) - -#define LUA_INTFRMLEN "ll" -#define LUA_INTFRM_T long long - -#else - -#define LUA_INTFRMLEN "l" -#define LUA_INTFRM_T long - -#endif -#endif /* } */ - - -/* -** LUA_FLTFRMLEN is the length modifier for float conversions in -** 'string.format'; LUA_FLTFRM_T is the float type corresponding to -** the previous length -*/ -#if !defined(LUA_FLTFRMLEN) - -#define LUA_FLTFRMLEN "" -#define LUA_FLTFRM_T double - -#endif - - -/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */ -#define MAX_ITEM 512 -/* valid flags in a format specification */ -#define FLAGS "-+ #0" -/* -** maximum size of each format specification (such as '%-099.99d') -** (+10 accounts for %99.99x plus margin of error) -*/ -#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10) - - -static void addquoted (lua_State *L, luaL_Buffer *b, int arg) { - size_t l; - const char *s = luaL_checklstring(L, arg, &l); - luaL_addchar(b, '"'); - while (l--) { - if (*s == '"' || *s == '\\' || *s == '\n') { - luaL_addchar(b, '\\'); - luaL_addchar(b, *s); - } - else if (*s == '\0' || iscntrl(uchar(*s))) { - char buff[10]; - if (!isdigit(uchar(*(s+1)))) - sprintf(buff, "\\%d", (int)uchar(*s)); - else - sprintf(buff, "\\%03d", (int)uchar(*s)); - luaL_addstring(b, buff); - } - else - 
luaL_addchar(b, *s); - s++; - } - luaL_addchar(b, '"'); -} - -static const char *scanformat (lua_State *L, const char *strfrmt, char *form) { - const char *p = strfrmt; - while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */ - if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char)) - luaL_error(L, "invalid format (repeated flags)"); - if (isdigit(uchar(*p))) p++; /* skip width */ - if (isdigit(uchar(*p))) p++; /* (2 digits at most) */ - if (*p == '.') { - p++; - if (isdigit(uchar(*p))) p++; /* skip precision */ - if (isdigit(uchar(*p))) p++; /* (2 digits at most) */ - } - if (isdigit(uchar(*p))) - luaL_error(L, "invalid format (width or precision too long)"); - *(form++) = '%'; - memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char)); - form += p - strfrmt + 1; - *form = '\0'; - return p; -} - - -/* -** add length modifier into formats -*/ -static void addlenmod (char *form, const char *lenmod) { - size_t l = strlen(form); - size_t lm = strlen(lenmod); - char spec = form[l - 1]; - strcpy(form + l - 1, lenmod); - form[l + lm - 1] = spec; - form[l + lm] = '\0'; -} - - -static int str_format (lua_State *L) { - int top = lua_gettop(L); - int arg = 1; - size_t sfl; - const char *strfrmt = luaL_checklstring(L, arg, &sfl); - const char *strfrmt_end = strfrmt+sfl; - luaL_Buffer b; - luaL_buffinit(L, &b); - while (strfrmt < strfrmt_end) { - if (*strfrmt != L_ESC) - luaL_addchar(&b, *strfrmt++); - else if (*++strfrmt == L_ESC) - luaL_addchar(&b, *strfrmt++); /* %% */ - else { /* format item */ - char form[MAX_FORMAT]; /* to store the format (`%...') */ - char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */ - int nb = 0; /* number of bytes in added item */ - if (++arg > top) - luaL_argerror(L, arg, "no value"); - strfrmt = scanformat(L, strfrmt, form); - switch (*strfrmt++) { - case 'c': { - nb = str_sprintf(buff, form, luaL_checkint(L, arg)); - break; - } - case 'd': case 'i': { - lua_Number n = luaL_checknumber(L, arg); - LUA_INTFRM_T 
ni = (LUA_INTFRM_T)n; - lua_Number diff = n - (lua_Number)ni; - luaL_argcheck(L, -1 < diff && diff < 1, arg, - "not a number in proper range"); - addlenmod(form, LUA_INTFRMLEN); - nb = str_sprintf(buff, form, ni); - break; - } - case 'o': case 'u': case 'x': case 'X': { - lua_Number n = luaL_checknumber(L, arg); - unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n; - lua_Number diff = n - (lua_Number)ni; - luaL_argcheck(L, -1 < diff && diff < 1, arg, - "not a non-negative number in proper range"); - addlenmod(form, LUA_INTFRMLEN); - nb = str_sprintf(buff, form, ni); - break; - } -#if defined(LUA_USE_FLOAT_FORMATS) - case 'e': case 'E': case 'f': -#if defined(LUA_USE_AFORMAT) - case 'a': case 'A': -#endif - case 'g': case 'G': { - addlenmod(form, LUA_FLTFRMLEN); - nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg)); - break; - } -#endif - case 'q': { - addquoted(L, &b, arg); - break; - } - case 's': { - size_t l; - const char *s = luaL_tolstring(L, arg, &l); - if (!strchr(form, '.') && l >= 100) { - /* no precision and string is too long to be formatted; - keep original string */ - luaL_addvalue(&b); - break; - } - else { - nb = str_sprintf(buff, form, s); - lua_pop(L, 1); /* remove result from 'luaL_tolstring' */ - break; - } - } - default: { /* also treat cases `pnLlh' */ - return luaL_error(L, "invalid option " LUA_QL("%%%c") " to " - LUA_QL("format"), *(strfrmt - 1)); - } - } - luaL_addsize(&b, nb); - } - } - luaL_pushresult(&b); - return 1; -} - -/* }====================================================== */ - - -static const luaL_Reg strlib[] = { - {"byte", str_byte}, - {"char", str_char}, - {"dump", str_dump}, - {"find", str_find}, - {"format", str_format}, - {"gmatch", str_gmatch}, - {"gsub", str_gsub}, - {"len", str_len}, - {"lower", str_lower}, - {"match", str_match}, - {"rep", str_rep}, - {"reverse", str_reverse}, - {"sub", str_sub}, - {"upper", str_upper}, - {NULL, NULL} -}; - - -static void createmetatable (lua_State *L) { - 
lua_createtable(L, 0, 1); /* table to be metatable for strings */ - lua_pushliteral(L, ""); /* dummy string */ - lua_pushvalue(L, -2); /* copy table */ - lua_setmetatable(L, -2); /* set table as metatable for strings */ - lua_pop(L, 1); /* pop dummy string */ - lua_pushvalue(L, -2); /* get string library */ - lua_setfield(L, -2, "__index"); /* metatable.__index = string */ - lua_pop(L, 1); /* pop metatable */ -} - - -/* -** Open string library -*/ -LUAMOD_API int luaopen_string (lua_State *L) { - luaL_newlib(L, strlib); - createmetatable(L); - return 1; -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c deleted file mode 100644 index 4f8ab1b16733..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c +++ /dev/null @@ -1,589 +0,0 @@ -/* -** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua tables (hash) -** See Copyright Notice in lua.h -*/ - - -/* -** Implementation of tables (aka arrays, objects, or hash tables). -** Tables keep its elements in two parts: an array part and a hash part. -** Non-negative integer keys are all candidates to be kept in the array -** part. The actual size of the array is the largest `n' such that at -** least half the slots between 0 and n are in use. -** Hash uses a mix of chained scatter table with Brent's variation. -** A main invariant of these tables is that, if an element is not -** in its main position (i.e. the `original' position that its hash gives -** to it), then the colliding element is in its own main position. -** Hence even when the load factor reaches 100%, performance remains good. 
-*/ - -#include - -#define ltable_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "lvm.h" - - -/* -** max size of array part is 2^MAXBITS -*/ -#if LUAI_BITSINT >= 32 -#define MAXBITS 30 -#else -#define MAXBITS (LUAI_BITSINT-2) -#endif - -#define MAXASIZE (1 << MAXBITS) - - -#define hashpow2(t,n) (gnode(t, lmod((n), sizenode(t)))) - -#define hashstr(t,str) hashpow2(t, (str)->tsv.hash) -#define hashboolean(t,p) hashpow2(t, p) - - -/* -** for some types, it is better to avoid modulus by power of 2, as -** they tend to have many 2 factors. -*/ -#define hashmod(t,n) (gnode(t, ((n) % ((sizenode(t)-1)|1)))) - - -#define hashpointer(t,p) hashmod(t, IntPoint(p)) - - -#define dummynode (&dummynode_) - -#define isdummy(n) ((n) == dummynode) - -static const Node dummynode_ = { - {NILCONSTANT}, /* value */ - {{NILCONSTANT, NULL}} /* key */ -}; - - -/* -** hash for lua_Numbers -*/ -static Node *hashnum (const Table *t, lua_Number n) { - int i; - luai_hashnum(i, n); - if (i < 0) { - if (cast(unsigned int, i) == 0u - i) /* use unsigned to avoid overflows */ - i = 0; /* handle INT_MIN */ - i = -i; /* must be a positive value */ - } - return hashmod(t, i); -} - - - -/* -** returns the `main' position of an element in a table (that is, the index -** of its hash value) -*/ -static Node *mainposition (const Table *t, const TValue *key) { - switch (ttype(key)) { - case LUA_TNUMBER: - return hashnum(t, nvalue(key)); - case LUA_TLNGSTR: { - TString *s = rawtsvalue(key); - if (s->tsv.extra == 0) { /* no hash? 
*/ - s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash); - s->tsv.extra = 1; /* now it has its hash */ - } - return hashstr(t, rawtsvalue(key)); - } - case LUA_TSHRSTR: - return hashstr(t, rawtsvalue(key)); - case LUA_TBOOLEAN: - return hashboolean(t, bvalue(key)); - case LUA_TLIGHTUSERDATA: - return hashpointer(t, pvalue(key)); - case LUA_TLCF: - return hashpointer(t, fvalue(key)); - default: - return hashpointer(t, gcvalue(key)); - } -} - - -/* -** returns the index for `key' if `key' is an appropriate key to live in -** the array part of the table, -1 otherwise. -*/ -static int arrayindex (const TValue *key) { - if (ttisnumber(key)) { - lua_Number n = nvalue(key); - int k; - lua_number2int(k, n); - if (luai_numeq(cast_num(k), n)) - return k; - } - return -1; /* `key' did not match some condition */ -} - - -/* -** returns the index of a `key' for table traversals. First goes all -** elements in the array part, then elements in the hash part. The -** beginning of a traversal is signaled by -1. -*/ -static int findindex (lua_State *L, Table *t, StkId key) { - int i; - if (ttisnil(key)) return -1; /* first iteration */ - i = arrayindex(key); - if (0 < i && i <= t->sizearray) /* is `key' inside array part? 
*/ - return i-1; /* yes; that's the index (corrected to C) */ - else { - Node *n = mainposition(t, key); - for (;;) { /* check whether `key' is somewhere in the chain */ - /* key may be dead already, but it is ok to use it in `next' */ - if (luaV_rawequalobj(gkey(n), key) || - (ttisdeadkey(gkey(n)) && iscollectable(key) && - deadvalue(gkey(n)) == gcvalue(key))) { - i = cast_int(n - gnode(t, 0)); /* key index in hash table */ - /* hash elements are numbered after array ones */ - return i + t->sizearray; - } - else n = gnext(n); - if (n == NULL) - luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */ - } - } -} - - -int luaH_next (lua_State *L, Table *t, StkId key) { - int i = findindex(L, t, key); /* find original element */ - for (i++; i < t->sizearray; i++) { /* try first array part */ - if (!ttisnil(&t->array[i])) { /* a non-nil value? */ - setnvalue(key, cast_num(i+1)); - setobj2s(L, key+1, &t->array[i]); - return 1; - } - } - for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */ - if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */ - setobj2s(L, key, gkey(gnode(t, i))); - setobj2s(L, key+1, gval(gnode(t, i))); - return 1; - } - } - return 0; /* no more elements */ -} - - -/* -** {============================================================= -** Rehash -** ============================================================== -*/ - - -static int computesizes (int nums[], int *narray) { - int i; - int twotoi; /* 2^i */ - int a = 0; /* number of elements smaller than 2^i */ - int na = 0; /* number of elements to go to array part */ - int n = 0; /* optimal size for array part */ - for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) { - if (nums[i] > 0) { - a += nums[i]; - if (a > twotoi/2) { /* more than half elements present? 
*/ - n = twotoi; /* optimal size (till now) */ - na = a; /* all elements smaller than n will go to array part */ - } - } - if (a == *narray) break; /* all elements already counted */ - } - *narray = n; - lua_assert(*narray/2 <= na && na <= *narray); - return na; -} - - -static int countint (const TValue *key, int *nums) { - int k = arrayindex(key); - if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? */ - nums[luaO_ceillog2(k)]++; /* count as such */ - return 1; - } - else - return 0; -} - - -static int numusearray (const Table *t, int *nums) { - int lg; - int ttlg; /* 2^lg */ - int ause = 0; /* summation of `nums' */ - int i = 1; /* count to traverse all array keys */ - for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) { /* for each slice */ - int lc = 0; /* counter */ - int lim = ttlg; - if (lim > t->sizearray) { - lim = t->sizearray; /* adjust upper limit */ - if (i > lim) - break; /* no more elements to count */ - } - /* count elements in range (2^(lg-1), 2^lg] */ - for (; i <= lim; i++) { - if (!ttisnil(&t->array[i-1])) - lc++; - } - nums[lg] += lc; - ause += lc; - } - return ause; -} - - -static int numusehash (const Table *t, int *nums, int *pnasize) { - int totaluse = 0; /* total number of elements */ - int ause = 0; /* summation of `nums' */ - int i = sizenode(t); - while (i--) { - Node *n = &t->node[i]; - if (!ttisnil(gval(n))) { - ause += countint(gkey(n), nums); - totaluse++; - } - } - *pnasize += ause; - return totaluse; -} - - -static void setarrayvector (lua_State *L, Table *t, int size) { - int i; - luaM_reallocvector(L, t->array, t->sizearray, size, TValue); - for (i=t->sizearray; iarray[i]); - t->sizearray = size; -} - - -static void setnodevector (lua_State *L, Table *t, int size) { - int lsize; - if (size == 0) { /* no elements to hash part? 
*/ - t->node = cast(Node *, dummynode); /* use common `dummynode' */ - lsize = 0; - } - else { - int i; - lsize = luaO_ceillog2(size); - if (lsize > MAXBITS) - luaG_runerror(L, "table overflow"); - size = twoto(lsize); - t->node = luaM_newvector(L, size, Node); - for (i=0; ilsizenode = cast_byte(lsize); - t->lastfree = gnode(t, size); /* all positions are free */ -} - - -void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) { - int i; - int oldasize = t->sizearray; - int oldhsize = t->lsizenode; - Node *nold = t->node; /* save old hash ... */ - if (nasize > oldasize) /* array part must grow? */ - setarrayvector(L, t, nasize); - /* create new hash part with appropriate size */ - setnodevector(L, t, nhsize); - if (nasize < oldasize) { /* array part must shrink? */ - t->sizearray = nasize; - /* re-insert elements from vanishing slice */ - for (i=nasize; iarray[i])) - luaH_setint(L, t, i + 1, &t->array[i]); - } - /* shrink array */ - luaM_reallocvector(L, t->array, oldasize, nasize, TValue); - } - /* re-insert elements from hash part */ - for (i = twoto(oldhsize) - 1; i >= 0; i--) { - Node *old = nold+i; - if (!ttisnil(gval(old))) { - /* doesn't need barrier/invalidate cache, as entry was - already present in the table */ - setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old)); - } - } - if (!isdummy(nold)) - luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */ -} - - -void luaH_resizearray (lua_State *L, Table *t, int nasize) { - int nsize = isdummy(t->node) ? 
0 : sizenode(t); - luaH_resize(L, t, nasize, nsize); -} - - -static void rehash (lua_State *L, Table *t, const TValue *ek) { - int nasize, na; - int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */ - int i; - int totaluse; - for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */ - nasize = numusearray(t, nums); /* count keys in array part */ - totaluse = nasize; /* all those keys are integer keys */ - totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */ - /* count extra key */ - nasize += countint(ek, nums); - totaluse++; - /* compute new size for array part */ - na = computesizes(nums, &nasize); - /* resize the table to new computed sizes */ - luaH_resize(L, t, nasize, totaluse - na); -} - - - -/* -** }============================================================= -*/ - - -Table *luaH_new (lua_State *L) { - Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h; - t->metatable = NULL; - t->flags = cast_byte(~0); - t->array = NULL; - t->sizearray = 0; - setnodevector(L, t, 0); - return t; -} - - -void luaH_free (lua_State *L, Table *t) { - if (!isdummy(t->node)) - luaM_freearray(L, t->node, cast(size_t, sizenode(t))); - luaM_freearray(L, t->array, t->sizearray); - luaM_free(L, t); -} - - -static Node *getfreepos (Table *t) { - while (t->lastfree > t->node) { - t->lastfree--; - if (ttisnil(gkey(t->lastfree))) - return t->lastfree; - } - return NULL; /* could not find a free place */ -} - - - -/* -** inserts a new key into a hash table; first, check whether key's main -** position is free. If not, check whether colliding node is in its main -** position or not: if it is not, move colliding node to an empty place and -** put new key in its main position; otherwise (colliding node is in its main -** position), new key goes to an empty position. 
-*/ -TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) { - Node *mp; - if (ttisnil(key)) luaG_runerror(L, "table index is nil"); - else if (ttisnumber(key) && luai_numisnan(L, nvalue(key))) - luaG_runerror(L, "table index is NaN"); - mp = mainposition(t, key); - if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */ - Node *othern; - Node *n = getfreepos(t); /* get a free place */ - if (n == NULL) { /* cannot find a free place? */ - rehash(L, t, key); /* grow table */ - /* whatever called 'newkey' take care of TM cache and GC barrier */ - return luaH_set(L, t, key); /* insert key into grown table */ - } - lua_assert(!isdummy(n)); - othern = mainposition(t, gkey(mp)); - if (othern != mp) { /* is colliding node out of its main position? */ - /* yes; move colliding node into free position */ - while (gnext(othern) != mp) othern = gnext(othern); /* find previous */ - gnext(othern) = n; /* redo the chain with `n' in place of `mp' */ - *n = *mp; /* copy colliding node into free pos. 
(mp->next also goes) */ - gnext(mp) = NULL; /* now `mp' is free */ - setnilvalue(gval(mp)); - } - else { /* colliding node is in its own main position */ - /* new node will go into free position */ - gnext(n) = gnext(mp); /* chain new position */ - gnext(mp) = n; - mp = n; - } - } - setobj2t(L, gkey(mp), key); - luaC_barrierback(L, obj2gco(t), key); - lua_assert(ttisnil(gval(mp))); - return gval(mp); -} - - -/* -** search function for integers -*/ -const TValue *luaH_getint (Table *t, int key) { - /* (1 <= key && key <= t->sizearray) */ - if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray)) - return &t->array[key-1]; - else { - lua_Number nk = cast_num(key); - Node *n = hashnum(t, nk); - do { /* check whether `key' is somewhere in the chain */ - if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk)) - return gval(n); /* that's it */ - else n = gnext(n); - } while (n); - return luaO_nilobject; - } -} - - -/* -** search function for short strings -*/ -const TValue *luaH_getstr (Table *t, TString *key) { - Node *n = hashstr(t, key); - lua_assert(key->tsv.tt == LUA_TSHRSTR); - do { /* check whether `key' is somewhere in the chain */ - if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key)) - return gval(n); /* that's it */ - else n = gnext(n); - } while (n); - return luaO_nilobject; -} - - -/* -** main search function -*/ -const TValue *luaH_get (Table *t, const TValue *key) { - switch (ttype(key)) { - case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key)); - case LUA_TNIL: return luaO_nilobject; - case LUA_TNUMBER: { - int k; - lua_Number n = nvalue(key); - lua_number2int(k, n); - if (luai_numeq(cast_num(k), n)) /* index is int? 
*/ - return luaH_getint(t, k); /* use specialized version */ - /* else go through */ - } - /* FALLTHROUGH */ - default: { - Node *n = mainposition(t, key); - do { /* check whether `key' is somewhere in the chain */ - if (luaV_rawequalobj(gkey(n), key)) - return gval(n); /* that's it */ - else n = gnext(n); - } while (n); - return luaO_nilobject; - } - } -} - - -/* -** beware: when using this function you probably need to check a GC -** barrier and invalidate the TM cache. -*/ -TValue *luaH_set (lua_State *L, Table *t, const TValue *key) { - const TValue *p = luaH_get(t, key); - if (p != luaO_nilobject) - return cast(TValue *, p); - else return luaH_newkey(L, t, key); -} - - -void luaH_setint (lua_State *L, Table *t, int key, TValue *value) { - const TValue *p = luaH_getint(t, key); - TValue *cell; - if (p != luaO_nilobject) - cell = cast(TValue *, p); - else { - TValue k; - setnvalue(&k, cast_num(key)); - cell = luaH_newkey(L, t, &k); - } - setobj2t(L, cell, value); -} - - -static int unbound_search (Table *t, unsigned int j) { - unsigned int i = j; /* i is zero or a present index */ - j++; - /* find `i' and `j' such that i is present and j is not */ - while (!ttisnil(luaH_getint(t, j))) { - i = j; - j *= 2; - if (j > cast(unsigned int, MAX_INT)) { /* overflow? */ - /* table was built with bad purposes: resort to linear search */ - i = 1; - while (!ttisnil(luaH_getint(t, i))) i++; - return i - 1; - } - } - /* now do a binary search between them */ - while (j - i > 1) { - unsigned int m = (i+j)/2; - if (ttisnil(luaH_getint(t, m))) j = m; - else i = m; - } - return i; -} - - -/* -** Try to find a boundary in table `t'. A `boundary' is an integer index -** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil). 
-*/ -int luaH_getn (Table *t) { - unsigned int j = t->sizearray; - if (j > 0 && ttisnil(&t->array[j - 1])) { - /* there is a boundary in the array part: (binary) search for it */ - unsigned int i = 0; - while (j - i > 1) { - unsigned int m = (i+j)/2; - if (ttisnil(&t->array[m - 1])) j = m; - else i = m; - } - return i; - } - /* else must find a boundary in hash part */ - else if (isdummy(t->node)) /* hash part is empty? */ - return j; /* that is easy... */ - else return unbound_search(t, j); -} - - - -#if defined(LUA_DEBUG) - -Node *luaH_mainposition (const Table *t, const TValue *key) { - return mainposition(t, key); -} - -int luaH_isdummy (Node *n) { return isdummy(n); } - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h deleted file mode 100644 index d69449b2b863..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h +++ /dev/null @@ -1,45 +0,0 @@ -/* -** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $ -** Lua tables (hash) -** See Copyright Notice in lua.h -*/ - -#ifndef ltable_h -#define ltable_h - -#include "lobject.h" - - -#define gnode(t,i) (&(t)->node[i]) -#define gkey(n) (&(n)->i_key.tvk) -#define gval(n) (&(n)->i_val) -#define gnext(n) ((n)->i_key.nk.next) - -#define invalidateTMcache(t) ((t)->flags = 0) - -/* returns the key, given the value of a table entry */ -#define keyfromval(v) \ - (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val)))) - - -LUAI_FUNC const TValue *luaH_getint (Table *t, int key); -LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value); -LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key); -LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key); -LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key); -LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key); -LUAI_FUNC Table *luaH_new (lua_State *L); -LUAI_FUNC void 
luaH_resize (lua_State *L, Table *t, int nasize, int nhsize); -LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize); -LUAI_FUNC void luaH_free (lua_State *L, Table *t); -LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key); -LUAI_FUNC int luaH_getn (Table *t); - - -#if defined(LUA_DEBUG) -LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key); -LUAI_FUNC int luaH_isdummy (Node *n); -#endif - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c deleted file mode 100644 index ac9a662448fa..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c +++ /dev/null @@ -1,284 +0,0 @@ -/* -** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $ -** Library for Table Manipulation -** See Copyright Notice in lua.h -*/ - - -#include - -#define ltablib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -#define aux_getn(L,n) (luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n)) - - - -#if defined(LUA_COMPAT_MAXN) -static int maxn (lua_State *L) { - lua_Number max = 0; - luaL_checktype(L, 1, LUA_TTABLE); - lua_pushnil(L); /* first key */ - while (lua_next(L, 1)) { - lua_pop(L, 1); /* remove value */ - if (lua_type(L, -1) == LUA_TNUMBER) { - lua_Number v = lua_tonumber(L, -1); - if (v > max) max = v; - } - } - lua_pushnumber(L, max); - return 1; -} -#endif - - -static int tinsert (lua_State *L) { - int e = aux_getn(L, 1) + 1; /* first empty element */ - int pos; /* where to insert new element */ - switch (lua_gettop(L)) { - case 2: { /* called with only 2 arguments */ - pos = e; /* insert new element at the end */ - break; - } - case 3: { - int i; - pos = luaL_checkint(L, 2); /* 2nd argument is the position */ - luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds"); - for (i = e; i > pos; i--) { /* move up elements */ - lua_rawgeti(L, 1, i-1); - lua_rawseti(L, 1, i); /* t[i] 
= t[i-1] */ - } - break; - } - default: { - return luaL_error(L, "wrong number of arguments to " LUA_QL("insert")); - } - } - lua_rawseti(L, 1, pos); /* t[pos] = v */ - return 0; -} - - -static int tremove (lua_State *L) { - int size = aux_getn(L, 1); - int pos = luaL_optint(L, 2, size); - if (pos != size) /* validate 'pos' if given */ - luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds"); - lua_rawgeti(L, 1, pos); /* result = t[pos] */ - for ( ; pos < size; pos++) { - lua_rawgeti(L, 1, pos+1); - lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */ - } - lua_pushnil(L); - lua_rawseti(L, 1, pos); /* t[pos] = nil */ - return 1; -} - - -static void addfield (lua_State *L, luaL_Buffer *b, int i) { - lua_rawgeti(L, 1, i); - if (!lua_isstring(L, -1)) - luaL_error(L, "invalid value (%s) at index %d in table for " - LUA_QL("concat"), luaL_typename(L, -1), i); - luaL_addvalue(b); -} - - -static int tconcat (lua_State *L) { - luaL_Buffer b; - size_t lsep; - int i, last; - const char *sep = luaL_optlstring(L, 2, "", &lsep); - luaL_checktype(L, 1, LUA_TTABLE); - i = luaL_optint(L, 3, 1); - last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1)); - luaL_buffinit(L, &b); - for (; i < last; i++) { - addfield(L, &b, i); - luaL_addlstring(&b, sep, lsep); - } - if (i == last) /* add last value (if interval was not empty) */ - addfield(L, &b, i); - luaL_pushresult(&b); - return 1; -} - - -/* -** {====================================================== -** Pack/unpack -** ======================================================= -*/ - -static int pack (lua_State *L) { - int n = lua_gettop(L); /* number of elements to pack */ - lua_createtable(L, n, 1); /* create result table */ - lua_pushinteger(L, n); - lua_setfield(L, -2, "n"); /* t.n = number of elements */ - if (n > 0) { /* at least one element? 
*/ - int i; - lua_pushvalue(L, 1); - lua_rawseti(L, -2, 1); /* insert first element */ - lua_replace(L, 1); /* move table into index 1 */ - for (i = n; i >= 2; i--) /* assign other elements */ - lua_rawseti(L, 1, i); - } - return 1; /* return table */ -} - - -static int unpack (lua_State *L) { - int i, e; - unsigned int n; - luaL_checktype(L, 1, LUA_TTABLE); - i = luaL_optint(L, 2, 1); - e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1)); - if (i > e) return 0; /* empty range */ - n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */ - if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n)) - return luaL_error(L, "too many results to unpack"); - lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */ - while (i++ < e) /* push arg[i + 1...e] */ - lua_rawgeti(L, 1, i); - return n; -} - -/* }====================================================== */ - - - -/* -** {====================================================== -** Quicksort -** (based on `Algorithms in MODULA-3', Robert Sedgewick; -** Addison-Wesley, 1993.) -** ======================================================= -*/ - - -static void set2 (lua_State *L, int i, int j) { - lua_rawseti(L, 1, i); - lua_rawseti(L, 1, j); -} - -static int sort_comp (lua_State *L, int a, int b) { - if (!lua_isnil(L, 2)) { /* function? */ - int res; - lua_pushvalue(L, 2); - lua_pushvalue(L, a-1); /* -1 to compensate function */ - lua_pushvalue(L, b-2); /* -2 to compensate function and `a' */ - lua_call(L, 2, 1); - res = lua_toboolean(L, -1); - lua_pop(L, 1); - return res; - } - else /* a < b? */ - return lua_compare(L, a, b, LUA_OPLT); -} - -static void auxsort (lua_State *L, int l, int u) { - while (l < u) { /* for tail recursion */ - int i, j; - /* sort elements a[l], a[(l+u)/2] and a[u] */ - lua_rawgeti(L, 1, l); - lua_rawgeti(L, 1, u); - if (sort_comp(L, -1, -2)) /* a[u] < a[l]? 
*/ - set2(L, l, u); /* swap a[l] - a[u] */ - else - lua_pop(L, 2); - if (u-l == 1) break; /* only 2 elements */ - i = (l+u)/2; - lua_rawgeti(L, 1, i); - lua_rawgeti(L, 1, l); - if (sort_comp(L, -2, -1)) /* a[i]= P */ - while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) { - if (i>=u) luaL_error(L, "invalid order function for sorting"); - lua_pop(L, 1); /* remove a[i] */ - } - /* repeat --j until a[j] <= P */ - while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) { - if (j<=l) luaL_error(L, "invalid order function for sorting"); - lua_pop(L, 1); /* remove a[j] */ - } - if (j - -#define ltm_c -#define LUA_CORE - -#include "lua.h" - -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" - - -static const char udatatypename[] = "userdata"; - -LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = { - "no value", - "nil", "boolean", udatatypename, "number", - "string", "table", "function", udatatypename, "thread", - "proto", "upval" /* these last two cases are used for tests only */ -}; - - -void luaT_init (lua_State *L) { - static const char *const luaT_eventname[] = { /* ORDER TM */ - "__index", "__newindex", - "__gc", "__mode", "__len", "__eq", - "__add", "__sub", "__mul", "__div", "__mod", - "__pow", "__unm", "__lt", "__le", - "__concat", "__call" - }; - int i; - for (i=0; itmname[i] = luaS_new(L, luaT_eventname[i]); - luaS_fix(G(L)->tmname[i]); /* never collect these names */ - } -} - - -/* -** function to be used with macro "fasttm": optimized for absence of -** tag methods -*/ -const TValue *luaT_gettm (Table *events, TMS event, TString *ename) { - const TValue *tm = luaH_getstr(events, ename); - lua_assert(event <= TM_EQ); - if (ttisnil(tm)) { /* no tag method? */ - events->flags |= cast_byte(1u<metatable; - break; - case LUA_TUSERDATA: - mt = uvalue(o)->metatable; - break; - default: - mt = G(L)->mt[ttypenv(o)]; - } - return (mt ? 
luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject); -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h deleted file mode 100644 index 7f89c841f9c0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -** $Id: ltm.h,v 2.11.1.1 2013/04/12 18:48:47 roberto Exp $ -** Tag methods -** See Copyright Notice in lua.h -*/ - -#ifndef ltm_h -#define ltm_h - - -#include "lobject.h" - - -/* -* WARNING: if you change the order of this enumeration, -* grep "ORDER TM" -*/ -typedef enum { - TM_INDEX, - TM_NEWINDEX, - TM_GC, - TM_MODE, - TM_LEN, - TM_EQ, /* last tag method with `fast' access */ - TM_ADD, - TM_SUB, - TM_MUL, - TM_DIV, - TM_MOD, - TM_POW, - TM_UNM, - TM_LT, - TM_LE, - TM_CONCAT, - TM_CALL, - TM_N /* number of elements in the enum */ -} TMS; - - - -#define gfasttm(g,et,e) ((et) == NULL ? NULL : \ - ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e])) - -#define fasttm(l,et,e) gfasttm(G(l), et, e) - -#define ttypename(x) luaT_typenames_[(x) + 1] -#define objtypename(x) ttypename(ttypenv(x)) - -LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS]; - - -LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename); -LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, - TMS event); -LUAI_FUNC void luaT_init (lua_State *L); - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h deleted file mode 100644 index 4610dad45ed8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h +++ /dev/null @@ -1,443 +0,0 @@ -/* -** $Id: lua.h,v 1.285.1.4 2015/02/21 14:04:50 roberto Exp $ -** Lua - A Scripting Language -** Lua.org, PUC-Rio, Brazil (http://www.lua.org) -** See Copyright Notice at the end of this file -*/ - - -#ifndef lua_h -#define lua_h - -#include - -#include "luaconf.h" 
- - -#define LUA_VERSION_MAJOR "5" -#define LUA_VERSION_MINOR "2" -#define LUA_VERSION_NUM 502 -#define LUA_VERSION_RELEASE "4" - -#define LUA_VERSION "Lua " LUA_VERSION_MAJOR "." LUA_VERSION_MINOR -#define LUA_RELEASE LUA_VERSION "." LUA_VERSION_RELEASE -#define LUA_COPYRIGHT LUA_RELEASE " Copyright (C) 1994-2015 Lua.org, PUC-Rio" -#define LUA_AUTHORS "R. Ierusalimschy, L. H. de Figueiredo, W. Celes" - - -/* mark for precompiled code ('Lua') */ -#define LUA_SIGNATURE "\033Lua" - -/* option for multiple returns in 'lua_pcall' and 'lua_call' */ -#define LUA_MULTRET (-1) - - -/* -** pseudo-indices -*/ -#define LUA_REGISTRYINDEX LUAI_FIRSTPSEUDOIDX -#define lua_upvalueindex(i) (LUA_REGISTRYINDEX - (i)) - - -/* thread status */ -#define LUA_OK 0 -#define LUA_YIELD 1 -#define LUA_ERRRUN 2 -#define LUA_ERRSYNTAX 3 -#define LUA_ERRMEM 4 -#define LUA_ERRGCMM 5 -#define LUA_ERRERR 6 - - -typedef struct lua_State lua_State; - -typedef int (*lua_CFunction) (lua_State *L); - - -/* -** functions that read/write blocks when loading/dumping Lua chunks -*/ -typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz); - -typedef int (*lua_Writer) (lua_State *L, const void* p, size_t sz, void* ud); - - -/* -** prototype for memory-allocation functions -*/ -typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); - - -/* -** basic types -*/ -#define LUA_TNONE (-1) - -#define LUA_TNIL 0 -#define LUA_TBOOLEAN 1 -#define LUA_TLIGHTUSERDATA 2 -#define LUA_TNUMBER 3 -#define LUA_TSTRING 4 -#define LUA_TTABLE 5 -#define LUA_TFUNCTION 6 -#define LUA_TUSERDATA 7 -#define LUA_TTHREAD 8 - -#define LUA_NUMTAGS 9 - - - -/* minimum Lua stack available to a C function */ -#define LUA_MINSTACK 20 - - -/* predefined values in the registry */ -#define LUA_RIDX_MAINTHREAD 1 -#define LUA_RIDX_GLOBALS 2 -#define LUA_RIDX_LAST LUA_RIDX_GLOBALS - - -/* type of numbers in Lua */ -typedef LUA_NUMBER lua_Number; - - -/* type for integer functions */ -typedef LUA_INTEGER 
lua_Integer; - -/* unsigned integer type */ -typedef LUA_UNSIGNED lua_Unsigned; - - - - -/* -** generic extra include file -*/ -#if defined(LUA_USER_H) -#include LUA_USER_H -#endif - - -/* -** RCS ident string -*/ -extern const char lua_ident[]; - - -/* -** state manipulation -*/ -LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud); -LUA_API void (lua_close) (lua_State *L); -LUA_API lua_State *(lua_newthread) (lua_State *L); - -LUA_API lua_CFunction (lua_atpanic) (lua_State *L, lua_CFunction panicf); - - -LUA_API const lua_Number *(lua_version) (lua_State *L); - - -/* -** basic stack manipulation -*/ -LUA_API int (lua_absindex) (lua_State *L, int idx); -LUA_API int (lua_gettop) (lua_State *L); -LUA_API void (lua_settop) (lua_State *L, int idx); -LUA_API void (lua_pushvalue) (lua_State *L, int idx); -LUA_API void (lua_remove) (lua_State *L, int idx); -LUA_API void (lua_insert) (lua_State *L, int idx); -LUA_API void (lua_replace) (lua_State *L, int idx); -LUA_API void (lua_copy) (lua_State *L, int fromidx, int toidx); -LUA_API int (lua_checkstack) (lua_State *L, int sz); - -LUA_API void (lua_xmove) (lua_State *from, lua_State *to, int n); - - -/* -** access functions (stack -> C) -*/ - -LUA_API int (lua_isnumber) (lua_State *L, int idx); -LUA_API int (lua_isstring) (lua_State *L, int idx); -LUA_API int (lua_iscfunction) (lua_State *L, int idx); -LUA_API int (lua_isuserdata) (lua_State *L, int idx); -LUA_API int (lua_type) (lua_State *L, int idx); -LUA_API const char *(lua_typename) (lua_State *L, int tp); - -LUA_API lua_Number (lua_tonumberx) (lua_State *L, int idx, int *isnum); -LUA_API lua_Integer (lua_tointegerx) (lua_State *L, int idx, int *isnum); -LUA_API lua_Unsigned (lua_tounsignedx) (lua_State *L, int idx, int *isnum); -LUA_API int (lua_toboolean) (lua_State *L, int idx); -LUA_API const char *(lua_tolstring) (lua_State *L, int idx, size_t *len); -LUA_API size_t (lua_rawlen) (lua_State *L, int idx); -LUA_API lua_CFunction (lua_tocfunction) (lua_State 
*L, int idx); -LUA_API void *(lua_touserdata) (lua_State *L, int idx); -LUA_API lua_State *(lua_tothread) (lua_State *L, int idx); -LUA_API const void *(lua_topointer) (lua_State *L, int idx); - - -/* -** Comparison and arithmetic functions -*/ - -#define LUA_OPADD 0 /* ORDER TM */ -#define LUA_OPSUB 1 -#define LUA_OPMUL 2 -#define LUA_OPDIV 3 -#define LUA_OPMOD 4 -#define LUA_OPPOW 5 -#define LUA_OPUNM 6 - -LUA_API void (lua_arith) (lua_State *L, int op); - -#define LUA_OPEQ 0 -#define LUA_OPLT 1 -#define LUA_OPLE 2 - -LUA_API int (lua_rawequal) (lua_State *L, int idx1, int idx2); -LUA_API int (lua_compare) (lua_State *L, int idx1, int idx2, int op); - - -/* -** push functions (C -> stack) -*/ -LUA_API void (lua_pushnil) (lua_State *L); -LUA_API void (lua_pushnumber) (lua_State *L, lua_Number n); -LUA_API void (lua_pushinteger) (lua_State *L, lua_Integer n); -LUA_API void (lua_pushunsigned) (lua_State *L, lua_Unsigned n); -LUA_API const char *(lua_pushlstring) (lua_State *L, const char *s, size_t l); -LUA_API const char *(lua_pushstring) (lua_State *L, const char *s); -LUA_API const char *(lua_pushvfstring) (lua_State *L, const char *fmt, - va_list argp); -LUA_API const char *(lua_pushfstring) (lua_State *L, const char *fmt, ...); -LUA_API void (lua_pushcclosure) (lua_State *L, lua_CFunction fn, int n); -LUA_API void (lua_pushboolean) (lua_State *L, int b); -LUA_API void (lua_pushlightuserdata) (lua_State *L, void *p); -LUA_API int (lua_pushthread) (lua_State *L); - - -/* -** get functions (Lua -> stack) -*/ -LUA_API void (lua_getglobal) (lua_State *L, const char *var); -LUA_API void (lua_gettable) (lua_State *L, int idx); -LUA_API void (lua_getfield) (lua_State *L, int idx, const char *k); -LUA_API void (lua_rawget) (lua_State *L, int idx); -LUA_API void (lua_rawgeti) (lua_State *L, int idx, int n); -LUA_API void (lua_rawgetp) (lua_State *L, int idx, const void *p); -LUA_API void (lua_createtable) (lua_State *L, int narr, int nrec); -LUA_API void 
*(lua_newuserdata) (lua_State *L, size_t sz); -LUA_API int (lua_getmetatable) (lua_State *L, int objindex); -LUA_API void (lua_getuservalue) (lua_State *L, int idx); - - -/* -** set functions (stack -> Lua) -*/ -LUA_API void (lua_setglobal) (lua_State *L, const char *var); -LUA_API void (lua_settable) (lua_State *L, int idx); -LUA_API void (lua_setfield) (lua_State *L, int idx, const char *k); -LUA_API void (lua_rawset) (lua_State *L, int idx); -LUA_API void (lua_rawseti) (lua_State *L, int idx, int n); -LUA_API void (lua_rawsetp) (lua_State *L, int idx, const void *p); -LUA_API int (lua_setmetatable) (lua_State *L, int objindex); -LUA_API void (lua_setuservalue) (lua_State *L, int idx); - - -/* -** 'load' and 'call' functions (load and run Lua code) -*/ -LUA_API void (lua_callk) (lua_State *L, int nargs, int nresults, int ctx, - lua_CFunction k); -#define lua_call(L,n,r) lua_callk(L, (n), (r), 0, NULL) - -LUA_API int (lua_getctx) (lua_State *L, int *ctx); - -LUA_API int (lua_pcallk) (lua_State *L, int nargs, int nresults, int errfunc, - int ctx, lua_CFunction k); -#define lua_pcall(L,n,r,f) lua_pcallk(L, (n), (r), (f), 0, NULL) - -LUA_API int (lua_load) (lua_State *L, lua_Reader reader, void *dt, - const char *chunkname, - const char *mode); - -LUA_API int (lua_dump) (lua_State *L, lua_Writer writer, void *data); - - -/* -** coroutine functions -*/ -LUA_API int (lua_yieldk) (lua_State *L, int nresults, int ctx, - lua_CFunction k); -#define lua_yield(L,n) lua_yieldk(L, (n), 0, NULL) -LUA_API int (lua_resume) (lua_State *L, lua_State *from, int narg); -LUA_API int (lua_status) (lua_State *L); - -/* -** garbage-collection function and options -*/ - -#define LUA_GCSTOP 0 -#define LUA_GCRESTART 1 -#define LUA_GCCOLLECT 2 -#define LUA_GCCOUNT 3 -#define LUA_GCCOUNTB 4 -#define LUA_GCSTEP 5 -#define LUA_GCSETPAUSE 6 -#define LUA_GCSETSTEPMUL 7 -#define LUA_GCSETMAJORINC 8 -#define LUA_GCISRUNNING 9 -#define LUA_GCGEN 10 -#define LUA_GCINC 11 - -LUA_API int (lua_gc) 
(lua_State *L, int what, int data); - - -/* -** miscellaneous functions -*/ - -LUA_API int (lua_error) (lua_State *L); - -LUA_API int (lua_next) (lua_State *L, int idx); - -LUA_API void (lua_concat) (lua_State *L, int n); -LUA_API void (lua_len) (lua_State *L, int idx); - -LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud); -LUA_API void (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud); - - - -/* -** =============================================================== -** some useful macros -** =============================================================== -*/ - -#define lua_tonumber(L,i) lua_tonumberx(L,i,NULL) -#define lua_tointeger(L,i) lua_tointegerx(L,i,NULL) -#define lua_tounsigned(L,i) lua_tounsignedx(L,i,NULL) - -#define lua_pop(L,n) lua_settop(L, -(n)-1) - -#define lua_newtable(L) lua_createtable(L, 0, 0) - -#define lua_register(L,n,f) (lua_pushcfunction(L, (f)), lua_setglobal(L, (n))) - -#define lua_pushcfunction(L,f) lua_pushcclosure(L, (f), 0) - -#define lua_isfunction(L,n) (lua_type(L, (n)) == LUA_TFUNCTION) -#define lua_istable(L,n) (lua_type(L, (n)) == LUA_TTABLE) -#define lua_islightuserdata(L,n) (lua_type(L, (n)) == LUA_TLIGHTUSERDATA) -#define lua_isnil(L,n) (lua_type(L, (n)) == LUA_TNIL) -#define lua_isboolean(L,n) (lua_type(L, (n)) == LUA_TBOOLEAN) -#define lua_isthread(L,n) (lua_type(L, (n)) == LUA_TTHREAD) -#define lua_isnone(L,n) (lua_type(L, (n)) == LUA_TNONE) -#define lua_isnoneornil(L, n) (lua_type(L, (n)) <= 0) - -#define lua_pushliteral(L, s) \ - lua_pushlstring(L, "" s, (sizeof(s)/sizeof(char))-1) - -#define lua_pushglobaltable(L) \ - lua_rawgeti(L, LUA_REGISTRYINDEX, LUA_RIDX_GLOBALS) - -#define lua_tostring(L,i) lua_tolstring(L, (i), NULL) - - - -/* -** {====================================================================== -** Debug API -** ======================================================================= -*/ - - -/* -** Event codes -*/ -#define LUA_HOOKCALL 0 -#define LUA_HOOKRET 1 -#define LUA_HOOKLINE 2 -#define 
LUA_HOOKCOUNT 3 -#define LUA_HOOKTAILCALL 4 - - -/* -** Event masks -*/ -#define LUA_MASKCALL (1 << LUA_HOOKCALL) -#define LUA_MASKRET (1 << LUA_HOOKRET) -#define LUA_MASKLINE (1 << LUA_HOOKLINE) -#define LUA_MASKCOUNT (1 << LUA_HOOKCOUNT) - -typedef struct lua_Debug lua_Debug; /* activation record */ - - -/* Functions to be called by the debugger in specific events */ -typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar); - - -LUA_API int (lua_getstack) (lua_State *L, int level, lua_Debug *ar); -LUA_API int (lua_getinfo) (lua_State *L, const char *what, lua_Debug *ar); -LUA_API const char *(lua_getlocal) (lua_State *L, const lua_Debug *ar, int n); -LUA_API const char *(lua_setlocal) (lua_State *L, const lua_Debug *ar, int n); -LUA_API const char *(lua_getupvalue) (lua_State *L, int funcindex, int n); -LUA_API const char *(lua_setupvalue) (lua_State *L, int funcindex, int n); - -LUA_API void *(lua_upvalueid) (lua_State *L, int fidx, int n); -LUA_API void (lua_upvaluejoin) (lua_State *L, int fidx1, int n1, - int fidx2, int n2); - -LUA_API int (lua_sethook) (lua_State *L, lua_Hook func, int mask, int count); -LUA_API lua_Hook (lua_gethook) (lua_State *L); -LUA_API int (lua_gethookmask) (lua_State *L); -LUA_API int (lua_gethookcount) (lua_State *L); - - -struct lua_Debug { - int event; - const char *name; /* (n) */ - const char *namewhat; /* (n) 'global', 'local', 'field', 'method' */ - const char *what; /* (S) 'Lua', 'C', 'main', 'tail' */ - const char *source; /* (S) */ - int currentline; /* (l) */ - int linedefined; /* (S) */ - int lastlinedefined; /* (S) */ - unsigned char nups; /* (u) number of upvalues */ - unsigned char nparams;/* (u) number of parameters */ - char isvararg; /* (u) */ - char istailcall; /* (t) */ - char short_src[LUA_IDSIZE]; /* (S) */ - /* private part */ - struct CallInfo *i_ci; /* active function */ -}; - -/* }====================================================================== */ - - 
-/****************************************************************************** -* Copyright (C) 1994-2015 Lua.org, PUC-Rio. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to deal in the Software without restriction, including -* without limitation the rights to use, copy, modify, merge, publish, -* distribute, sublicense, and/or sell copies of the Software, and to -* permit persons to whom the Software is furnished to do so, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-******************************************************************************/ - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h deleted file mode 100644 index e856eee264ff..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h +++ /dev/null @@ -1,555 +0,0 @@ -/* -** $Id: luaconf.h,v 1.176.1.2 2013/11/21 17:26:16 roberto Exp $ -** Configuration file for Lua -** See Copyright Notice in lua.h -*/ - - -#ifndef lconfig_h -#define lconfig_h - -#include -#ifdef illumos -#include -#else -#include -#endif - -extern ssize_t lcompat_sprintf(char *, const char *, ...); -extern int64_t lcompat_strtoll(const char *, char **); -extern int64_t lcompat_pow(int64_t, int64_t); - -/* -** ================================================================== -** Search for "@@" to find all configurable definitions. -** =================================================================== -*/ - - -/* -@@ LUA_ANSI controls the use of non-ansi features. -** CHANGE it (define it) if you want Lua to avoid the use of any -** non-ansi feature or library. 
-*/ -#if !defined(LUA_ANSI) && defined(__STRICT_ANSI__) -#define LUA_ANSI -#endif - - -#if !defined(LUA_ANSI) && defined(_WIN32) && !defined(_WIN32_WCE) -#define LUA_WIN /* enable goodies for regular Windows platforms */ -#endif - -#if defined(LUA_WIN) -#define LUA_DL_DLL -#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ -#endif - - - -#if defined(LUA_USE_LINUX) -#define LUA_USE_POSIX -#define LUA_USE_DLOPEN /* needs an extra library: -ldl */ -#define LUA_USE_READLINE /* needs some extra libraries */ -#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ -#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ -#define LUA_USE_LONGLONG /* assume support for long long */ -#endif - -#if defined(LUA_USE_MACOSX) -#define LUA_USE_POSIX -#define LUA_USE_DLOPEN /* does not need -ldl */ -#define LUA_USE_READLINE /* needs an extra library: -lreadline */ -#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ -#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ -#define LUA_USE_LONGLONG /* assume support for long long */ -#endif - - - -/* -@@ LUA_USE_POSIX includes all functionality listed as X/Open System -@* Interfaces Extension (XSI). -** CHANGE it (define it) if your system is XSI compatible. -*/ -#if defined(LUA_USE_POSIX) -#define LUA_USE_MKSTEMP -#define LUA_USE_ISATTY -#define LUA_USE_POPEN -#define LUA_USE_ULONGJMP -#define LUA_USE_GMTIME_R -#endif - - - -/* -@@ LUA_PATH_DEFAULT is the default path that Lua uses to look for -@* Lua libraries. -@@ LUA_CPATH_DEFAULT is the default path that Lua uses to look for -@* C libraries. -** CHANGE them if your machine has a non-conventional directory -** hierarchy or if you want to install your libraries in -** non-conventional directories. -*/ -#if defined(_WIN32) /* { */ -/* -** In Windows, any exclamation mark ('!') in the path is replaced by the -** path of the directory of the executable file of the current process. 
-*/ -#define LUA_LDIR "!\\lua\\" -#define LUA_CDIR "!\\" -#define LUA_PATH_DEFAULT \ - LUA_LDIR"?.lua;" LUA_LDIR"?\\init.lua;" \ - LUA_CDIR"?.lua;" LUA_CDIR"?\\init.lua;" ".\\?.lua" -#define LUA_CPATH_DEFAULT \ - LUA_CDIR"?.dll;" LUA_CDIR"loadall.dll;" ".\\?.dll" - -#else /* }{ */ - -#define LUA_VDIR LUA_VERSION_MAJOR "." LUA_VERSION_MINOR "/" -#define LUA_ROOT "/usr/local/" -#define LUA_LDIR LUA_ROOT "share/lua/" LUA_VDIR -#define LUA_CDIR LUA_ROOT "lib/lua/" LUA_VDIR -#define LUA_PATH_DEFAULT \ - LUA_LDIR"?.lua;" LUA_LDIR"?/init.lua;" \ - LUA_CDIR"?.lua;" LUA_CDIR"?/init.lua;" "./?.lua" -#define LUA_CPATH_DEFAULT \ - LUA_CDIR"?.so;" LUA_CDIR"loadall.so;" "./?.so" -#endif /* } */ - - -/* -@@ LUA_DIRSEP is the directory separator (for submodules). -** CHANGE it if your machine does not use "/" as the directory separator -** and is not Windows. (On Windows Lua automatically uses "\".) -*/ -#if defined(_WIN32) -#define LUA_DIRSEP "\\" -#else -#define LUA_DIRSEP "/" -#endif - - -/* -@@ LUA_ENV is the name of the variable that holds the current -@@ environment, used to access global names. -** CHANGE it if you do not like this name. -*/ -#define LUA_ENV "_ENV" - - -/* -@@ LUA_API is a mark for all core API functions. -@@ LUALIB_API is a mark for all auxiliary library functions. -@@ LUAMOD_API is a mark for all standard library opening functions. -** CHANGE them if you need to define those functions in some special way. -** For instance, if you want to create one Windows DLL with the core and -** the libraries, you may want to use the following definition (define -** LUA_BUILD_AS_DLL to get it). 
-*/ -#if defined(LUA_BUILD_AS_DLL) /* { */ - -#if defined(LUA_CORE) || defined(LUA_LIB) /* { */ -#define LUA_API __declspec(dllexport) -#else /* }{ */ -#define LUA_API __declspec(dllimport) -#endif /* } */ - -#else /* }{ */ - -#define LUA_API extern - -#endif /* } */ - - -/* more often than not the libs go together with the core */ -#define LUALIB_API LUA_API -#define LUAMOD_API LUALIB_API - - -/* -@@ LUAI_FUNC is a mark for all extern functions that are not to be -@* exported to outside modules. -@@ LUAI_DDEF and LUAI_DDEC are marks for all extern (const) variables -@* that are not to be exported to outside modules (LUAI_DDEF for -@* definitions and LUAI_DDEC for declarations). -** CHANGE them if you need to mark them in some special way. Elf/gcc -** (versions 3.2 and later) mark them as "hidden" to optimize access -** when Lua is compiled as a shared library. Not all elf targets support -** this attribute. Unfortunately, gcc does not offer a way to check -** whether the target offers that support, and those without support -** give a warning about it. To avoid these warnings, change to the -** default definition. -*/ -#if defined(__GNUC__) && ((__GNUC__*100 + __GNUC_MINOR__) >= 302) && \ - defined(__ELF__) /* { */ -#define LUAI_FUNC __attribute__((visibility("hidden"))) extern -#define LUAI_DDEC LUAI_FUNC -#define LUAI_DDEF /* empty */ - -#else /* }{ */ -#define LUAI_FUNC extern -#define LUAI_DDEC extern -#define LUAI_DDEF /* empty */ -#endif /* } */ - - - -/* -@@ LUA_QL describes how error messages quote program elements. -** CHANGE it if you want a different appearance. -*/ -#define LUA_QL(x) "'" x "'" -#define LUA_QS LUA_QL("%s") - - -/* -@@ LUA_IDSIZE gives the maximum size for the description of the source -@* of a function in debug information. -** CHANGE it if you want a different size. -*/ -#define LUA_IDSIZE 60 - - -/* -@@ luai_writestringerror defines how to print error messages. -** (A format string with one argument is enough for Lua...) 
-*/ -#ifdef _KERNEL -#define luai_writestringerror(s,p) \ - (zfs_dbgmsg((s), (p))) -#else -#define luai_writestringerror(s,p) \ - (fprintf(stderr, (s), (p)), fflush(stderr)) -#endif - - -/* -@@ LUAI_MAXSHORTLEN is the maximum length for short strings, that is, -** strings that are internalized. (Cannot be smaller than reserved words -** or tags for metamethods, as these strings must be internalized; -** #("function") = 8, #("__newindex") = 10.) -*/ -#define LUAI_MAXSHORTLEN 40 - - - -/* -** {================================================================== -** Compatibility with previous versions -** =================================================================== -*/ - -/* -@@ LUA_COMPAT_ALL controls all compatibility options. -** You can define it to get all options, or change specific options -** to fit your specific needs. -*/ -#if defined(LUA_COMPAT_ALL) /* { */ - -/* -@@ LUA_COMPAT_UNPACK controls the presence of global 'unpack'. -** You can replace it with 'table.unpack'. -*/ -#define LUA_COMPAT_UNPACK - -/* -@@ LUA_COMPAT_LOADERS controls the presence of table 'package.loaders'. -** You can replace it with 'package.searchers'. -*/ -#define LUA_COMPAT_LOADERS - -/* -@@ macro 'lua_cpcall' emulates deprecated function lua_cpcall. -** You can call your C function directly (with light C functions). -*/ -#define lua_cpcall(L,f,u) \ - (lua_pushcfunction(L, (f)), \ - lua_pushlightuserdata(L,(u)), \ - lua_pcall(L,1,0,0)) - - -/* -@@ LUA_COMPAT_LOG10 defines the function 'log10' in the math library. -** You can rewrite 'log10(x)' as 'log(x, 10)'. -*/ -#define LUA_COMPAT_LOG10 - -/* -@@ LUA_COMPAT_LOADSTRING defines the function 'loadstring' in the base -** library. You can rewrite 'loadstring(s)' as 'load(s)'. -*/ -#define LUA_COMPAT_LOADSTRING - -/* -@@ LUA_COMPAT_MAXN defines the function 'maxn' in the table library. -*/ -#define LUA_COMPAT_MAXN - -/* -@@ The following macros supply trivial compatibility for some -** changes in the API. 
The macros themselves document how to -** change your code to avoid using them. -*/ -#define lua_strlen(L,i) lua_rawlen(L, (i)) - -#define lua_objlen(L,i) lua_rawlen(L, (i)) - -#define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) -#define lua_lessthan(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPLT) - -/* -@@ LUA_COMPAT_MODULE controls compatibility with previous -** module functions 'module' (Lua) and 'luaL_register' (C). -*/ -#define LUA_COMPAT_MODULE - -#endif /* } */ - -/* }================================================================== */ - - - -/* -@@ LUAI_BITSINT defines the number of bits in an int. -** CHANGE here if Lua cannot automatically detect the number of bits of -** your machine. Probably you do not need to change this. -*/ -/* avoid overflows in comparison */ -#if INT_MAX-20 < 32760 /* { */ -#define LUAI_BITSINT 16 -#elif INT_MAX > 2147483640L /* }{ */ -/* int has at least 32 bits */ -#define LUAI_BITSINT 32 -#else /* }{ */ -#error "you must define LUA_BITSINT with number of bits in an integer" -#endif /* } */ - - -/* -@@ LUA_INT32 is a signed integer with exactly 32 bits. -@@ LUAI_UMEM is an unsigned integer big enough to count the total -@* memory used by Lua. -@@ LUAI_MEM is a signed integer big enough to count the total memory -@* used by Lua. -** CHANGE here if for some weird reason the default definitions are not -** good enough for your machine. Probably you do not need to change -** this. -*/ -#if LUAI_BITSINT >= 32 /* { */ -#define LUA_INT32 int -#define LUAI_UMEM size_t -#define LUAI_MEM ptrdiff_t -#else /* }{ */ -/* 16-bit ints */ -#define LUA_INT32 long -#define LUAI_UMEM unsigned long -#define LUAI_MEM long -#endif /* } */ - - -/* -@@ LUAI_MAXSTACK limits the size of the Lua stack. -** CHANGE it if you need a different limit. This limit is arbitrary; -** its only purpose is to stop Lua from consuming unlimited stack -** space (and to reserve some numbers for pseudo-indices). 
-*/ -#if LUAI_BITSINT >= 32 -#define LUAI_MAXSTACK 1000000 -#else -#define LUAI_MAXSTACK 15000 -#endif - -/* reserve some space for error handling */ -#define LUAI_FIRSTPSEUDOIDX (-LUAI_MAXSTACK - 1000) - - - - -/* -@@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system. -** CHANGE it if it uses too much C-stack space. -*/ -#define LUAL_BUFFERSIZE 1024 - - - - -/* -** {================================================================== -@@ LUA_NUMBER is the type of numbers in Lua. -** CHANGE the following definitions only if you want to build Lua -** with a number type different from double. You may also need to -** change lua_number2int & lua_number2integer. -** =================================================================== -*/ - -#define LUA_NUMBER int64_t - -/* -@@ LUAI_UACNUMBER is the result of an 'usual argument conversion' -@* over a number. -*/ -#define LUAI_UACNUMBER int64_t - - -/* -@@ LUA_NUMBER_SCAN is the format for reading numbers. -@@ LUA_NUMBER_FMT is the format for writing numbers. -@@ lua_number2str converts a number to a string. -@@ LUAI_MAXNUMBER2STR is maximum size of previous conversion. -*/ -#define LUA_NUMBER_FMT "%" PRId64 -#define lua_number2str(s,n) lcompat_sprintf((s), LUA_NUMBER_FMT, (n)) -#define LUAI_MAXNUMBER2STR 32 /* 16 digits, sign, point, and \0 */ - - -/* -@@ l_mathop allows the addition of an 'l' or 'f' to all math operations -*/ -#define l_mathop(x) (x ## l) - - -/* -@@ lua_str2number converts a decimal numeric string to a number. -@@ lua_strx2number converts an hexadecimal numeric string to a number. -** In C99, 'strtod' does both conversions. C89, however, has no function -** to convert floating hexadecimal strings to numbers. For these -** systems, you can leave 'lua_strx2number' undefined and Lua will -** provide its own implementation. 
-*/ -#define lua_str2number(s,p) lcompat_strtoll((s), (p)) - -#if defined(LUA_USE_STRTODHEX) -#define lua_strx2number(s,p) lcompat_strtoll((s), (p)) -#endif - - -/* -@@ The luai_num* macros define the primitive operations over numbers. -*/ - -/* the following operations need the math library */ -#if defined(lobject_c) || defined(lvm_c) -#define luai_nummod(L,a,b) ((a) % (b)) -#define luai_numpow(L,a,b) (lcompat_pow((a),(b))) -#endif - -/* these are quite standard operations */ -#if defined(LUA_CORE) -#define luai_numadd(L,a,b) ((a)+(b)) -#define luai_numsub(L,a,b) ((a)-(b)) -#define luai_nummul(L,a,b) ((a)*(b)) -#define luai_numdiv(L,a,b) ((a)/(b)) -#define luai_numunm(L,a) (-(a)) -#define luai_numeq(a,b) ((a)==(b)) -#define luai_numlt(L,a,b) ((a)<(b)) -#define luai_numle(L,a,b) ((a)<=(b)) -#define luai_numisnan(L,a) (!luai_numeq((a), (a))) -#endif - - - -/* -@@ LUA_INTEGER is the integral type used by lua_pushinteger/lua_tointeger. -** CHANGE that if ptrdiff_t is not adequate on your machine. (On most -** machines, ptrdiff_t gives a good choice between int or long.) -*/ -#define LUA_INTEGER ptrdiff_t - -/* -@@ LUA_UNSIGNED is the integral type used by lua_pushunsigned/lua_tounsigned. -** It must have at least 32 bits. -*/ -#define LUA_UNSIGNED uint64_t - - - -/* -** Some tricks with doubles -*/ - -#if defined(LUA_NUMBER_DOUBLE) && !defined(LUA_ANSI) /* { */ -/* -** The next definitions activate some tricks to speed up the -** conversion from doubles to integer types, mainly to LUA_UNSIGNED. -** -@@ LUA_MSASMTRICK uses Microsoft assembler to avoid clashes with a -** DirectX idiosyncrasy. -** -@@ LUA_IEEE754TRICK uses a trick that should work on any machine -** using IEEE754 with a 32-bit integer type. -** -@@ LUA_IEEELL extends the trick to LUA_INTEGER; should only be -** defined when LUA_INTEGER is a 32-bit integer. 
-** -@@ LUA_IEEEENDIAN is the endianness of doubles in your machine -** (0 for little endian, 1 for big endian); if not defined, Lua will -** check it dynamically for LUA_IEEE754TRICK (but not for LUA_NANTRICK). -** -@@ LUA_NANTRICK controls the use of a trick to pack all types into -** a single double value, using NaN values to represent non-number -** values. The trick only works on 32-bit machines (ints and pointers -** are 32-bit values) with numbers represented as IEEE 754-2008 doubles -** with conventional endianess (12345678 or 87654321), in CPUs that do -** not produce signaling NaN values (all NaNs are quiet). -*/ - -/* Microsoft compiler on a Pentium (32 bit) ? */ -#if defined(LUA_WIN) && defined(_MSC_VER) && defined(_M_IX86) /* { */ - -#define LUA_MSASMTRICK -#define LUA_IEEEENDIAN 0 -#define LUA_NANTRICK - - -/* pentium 32 bits? */ -#elif defined(__i386__) || defined(__i386) || defined(__X86__) /* }{ */ - -#define LUA_IEEE754TRICK -#define LUA_IEEELL -#define LUA_IEEEENDIAN 0 -#define LUA_NANTRICK - -/* pentium 64 bits? */ -#elif defined(__x86_64) /* }{ */ - -#define LUA_IEEE754TRICK -#define LUA_IEEEENDIAN 0 - -#elif defined(__POWERPC__) || defined(__ppc__) /* }{ */ - -#define LUA_IEEE754TRICK -#define LUA_IEEEENDIAN 1 - -#else /* }{ */ - -/* assume IEEE754 and a 32-bit integer type */ -#define LUA_IEEE754TRICK - -#endif /* } */ - -#endif /* } */ - -/* }================================================================== */ - - - - -/* =================================================================== */ - -/* -** Local configuration. You can use this space to add your redefinitions -** without modifying the main part of the file. -*/ - -#define getlocaledecpoint() ('.') - -#define abs(x) (((x) < 0) ? 
-(x) : (x)) - -#if !defined(UCHAR_MAX) -#define UCHAR_MAX (0xff) -#endif - -#endif - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h deleted file mode 100644 index da82005c9de2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h +++ /dev/null @@ -1,55 +0,0 @@ -/* -** $Id: lualib.h,v 1.43.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua standard libraries -** See Copyright Notice in lua.h -*/ - - -#ifndef lualib_h -#define lualib_h - -#include "lua.h" - - - -LUAMOD_API int (luaopen_base) (lua_State *L); - -#define LUA_COLIBNAME "coroutine" -LUAMOD_API int (luaopen_coroutine) (lua_State *L); - -#define LUA_TABLIBNAME "table" -LUAMOD_API int (luaopen_table) (lua_State *L); - -#define LUA_IOLIBNAME "io" -LUAMOD_API int (luaopen_io) (lua_State *L); - -#define LUA_OSLIBNAME "os" -LUAMOD_API int (luaopen_os) (lua_State *L); - -#define LUA_STRLIBNAME "string" -LUAMOD_API int (luaopen_string) (lua_State *L); - -#define LUA_BITLIBNAME "bit32" -LUAMOD_API int (luaopen_bit32) (lua_State *L); - -#define LUA_MATHLIBNAME "math" -LUAMOD_API int (luaopen_math) (lua_State *L); - -#define LUA_DBLIBNAME "debug" -LUAMOD_API int (luaopen_debug) (lua_State *L); - -#define LUA_LOADLIBNAME "package" -LUAMOD_API int (luaopen_package) (lua_State *L); - - -/* open all previous libraries */ -LUALIB_API void (luaL_openlibs) (lua_State *L); - - - -#if !defined(lua_assert) -#define lua_assert(x) ((void)0) -#endif - - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c deleted file mode 100644 index 4d53749a0273..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c +++ /dev/null @@ -1,258 +0,0 @@ -/* -** $Id: lundump.c,v 2.22.1.1 2013/04/12 18:48:47 roberto Exp $ -** load precompiled Lua chunks -** See Copyright Notice in lua.h -*/ - -#include - -#define lundump_c 
-#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstring.h" -#include "lundump.h" -#include "lzio.h" - -typedef struct { - lua_State* L; - ZIO* Z; - Mbuffer* b; - const char* name; -} LoadState; - -static l_noret error(LoadState* S, const char* why) -{ - luaO_pushfstring(S->L,"%s: %s precompiled chunk",S->name,why); - luaD_throw(S->L,LUA_ERRSYNTAX); -} - -#define LoadMem(S,b,n,size) LoadBlock(S,b,(n)*(size)) -#define LoadByte(S) (lu_byte)LoadChar(S) -#define LoadVar(S,x) LoadMem(S,&x,1,sizeof(x)) -#define LoadVector(S,b,n,size) LoadMem(S,b,n,size) - -#if !defined(luai_verifycode) -#define luai_verifycode(L,b,f) /* empty */ -#endif - -static void LoadBlock(LoadState* S, void* b, size_t size) -{ - if (luaZ_read(S->Z,b,size)!=0) error(S,"truncated"); -} - -static int LoadChar(LoadState* S) -{ - char x; - LoadVar(S,x); - return x; -} - -static int LoadInt(LoadState* S) -{ - int x; - LoadVar(S,x); - if (x<0) error(S,"corrupted"); - return x; -} - -static lua_Number LoadNumber(LoadState* S) -{ - lua_Number x; - LoadVar(S,x); - return x; -} - -static TString* LoadString(LoadState* S) -{ - size_t size; - LoadVar(S,size); - if (size==0) - return NULL; - else - { - char* s=luaZ_openspace(S->L,S->b,size); - LoadBlock(S,s,size*sizeof(char)); - return luaS_newlstr(S->L,s,size-1); /* remove trailing '\0' */ - } -} - -static void LoadCode(LoadState* S, Proto* f) -{ - int n=LoadInt(S); - f->code=luaM_newvector(S->L,n,Instruction); - f->sizecode=n; - LoadVector(S,f->code,n,sizeof(Instruction)); -} - -static void LoadFunction(LoadState* S, Proto* f); - -static void LoadConstants(LoadState* S, Proto* f) -{ - int i,n; - n=LoadInt(S); - f->k=luaM_newvector(S->L,n,TValue); - f->sizek=n; - for (i=0; ik[i]); - for (i=0; ik[i]; - int t=LoadChar(S); - switch (t) - { - case LUA_TNIL: - setnilvalue(o); - break; - case LUA_TBOOLEAN: - setbvalue(o,LoadChar(S)); - break; - case LUA_TNUMBER: - 
setnvalue(o,LoadNumber(S)); - break; - case LUA_TSTRING: - setsvalue2n(S->L,o,LoadString(S)); - break; - default: lua_assert(0); - } - } - n=LoadInt(S); - f->p=luaM_newvector(S->L,n,Proto*); - f->sizep=n; - for (i=0; ip[i]=NULL; - for (i=0; ip[i]=luaF_newproto(S->L); - LoadFunction(S,f->p[i]); - } -} - -static void LoadUpvalues(LoadState* S, Proto* f) -{ - int i,n; - n=LoadInt(S); - f->upvalues=luaM_newvector(S->L,n,Upvaldesc); - f->sizeupvalues=n; - for (i=0; iupvalues[i].name=NULL; - for (i=0; iupvalues[i].instack=LoadByte(S); - f->upvalues[i].idx=LoadByte(S); - } -} - -static void LoadDebug(LoadState* S, Proto* f) -{ - int i,n; - f->source=LoadString(S); - n=LoadInt(S); - f->lineinfo=luaM_newvector(S->L,n,int); - f->sizelineinfo=n; - LoadVector(S,f->lineinfo,n,sizeof(int)); - n=LoadInt(S); - f->locvars=luaM_newvector(S->L,n,LocVar); - f->sizelocvars=n; - for (i=0; ilocvars[i].varname=NULL; - for (i=0; ilocvars[i].varname=LoadString(S); - f->locvars[i].startpc=LoadInt(S); - f->locvars[i].endpc=LoadInt(S); - } - n=LoadInt(S); - for (i=0; iupvalues[i].name=LoadString(S); -} - -static void LoadFunction(LoadState* S, Proto* f) -{ - f->linedefined=LoadInt(S); - f->lastlinedefined=LoadInt(S); - f->numparams=LoadByte(S); - f->is_vararg=LoadByte(S); - f->maxstacksize=LoadByte(S); - LoadCode(S,f); - LoadConstants(S,f); - LoadUpvalues(S,f); - LoadDebug(S,f); -} - -/* the code below must be consistent with the code in luaU_header */ -#define N0 LUAC_HEADERSIZE -#define N1 (sizeof(LUA_SIGNATURE)-sizeof(char)) -#define N2 N1+2 -#define N3 N2+6 - -static void LoadHeader(LoadState* S) -{ - lu_byte h[LUAC_HEADERSIZE]; - lu_byte s[LUAC_HEADERSIZE]; - luaU_header(h); - memcpy(s,h,sizeof(char)); /* first char already read */ - LoadBlock(S,s+sizeof(char),LUAC_HEADERSIZE-sizeof(char)); - if (memcmp(h,s,N0)==0) return; - if (memcmp(h,s,N1)!=0) error(S,"not a"); - if (memcmp(h,s,N2)!=0) error(S,"version mismatch in"); - if (memcmp(h,s,N3)!=0) error(S,"incompatible"); else 
error(S,"corrupted"); -} - -/* -** load precompiled chunk -*/ -Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name) -{ - LoadState S; - Closure* cl; - if (*name=='@' || *name=='=') - S.name=name+1; - else if (*name==LUA_SIGNATURE[0]) - S.name="binary string"; - else - S.name=name; - S.L=L; - S.Z=Z; - S.b=buff; - LoadHeader(&S); - cl=luaF_newLclosure(L,1); - setclLvalue(L,L->top,cl); incr_top(L); - cl->l.p=luaF_newproto(L); - LoadFunction(&S,cl->l.p); - if (cl->l.p->sizeupvalues != 1) - { - Proto* p=cl->l.p; - cl=luaF_newLclosure(L,cl->l.p->sizeupvalues); - cl->l.p=p; - setclLvalue(L,L->top-1,cl); - } - luai_verifycode(L,buff,cl->l.p); - return cl; -} - -#define MYINT(s) (s[0]-'0') -#define VERSION MYINT(LUA_VERSION_MAJOR)*16+MYINT(LUA_VERSION_MINOR) -#define FORMAT 0 /* this is the official format */ - -/* -* make header for precompiled chunks -* if you change the code below be sure to update LoadHeader and FORMAT above -* and LUAC_HEADERSIZE in lundump.h -*/ -void luaU_header (lu_byte* h) -{ - int x=1; - memcpy(h,LUA_SIGNATURE,sizeof(LUA_SIGNATURE)-sizeof(char)); - h+=sizeof(LUA_SIGNATURE)-sizeof(char); - *h++=cast_byte(VERSION); - *h++=cast_byte(FORMAT); - *h++=cast_byte(*(char*)&x); /* endianness */ - *h++=cast_byte(sizeof(int)); - *h++=cast_byte(sizeof(size_t)); - *h++=cast_byte(sizeof(Instruction)); - *h++=cast_byte(sizeof(lua_Number)); - *h++=cast_byte(((lua_Number)0.5)==0); /* is lua_Number integral? 
*/ - memcpy(h,LUAC_TAIL,sizeof(LUAC_TAIL)-sizeof(char)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h deleted file mode 100644 index 5255db259dfe..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h +++ /dev/null @@ -1,28 +0,0 @@ -/* -** $Id: lundump.h,v 1.39.1.1 2013/04/12 18:48:47 roberto Exp $ -** load precompiled Lua chunks -** See Copyright Notice in lua.h -*/ - -#ifndef lundump_h -#define lundump_h - -#include "lobject.h" -#include "lzio.h" - -/* load one chunk; from lundump.c */ -LUAI_FUNC Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name); - -/* make header; from lundump.c */ -LUAI_FUNC void luaU_header (lu_byte* h); - -/* dump one chunk; from ldump.c */ -LUAI_FUNC int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip); - -/* data to catch conversion errors */ -#define LUAC_TAIL "\x19\x93\r\n\x1a\n" - -/* size in bytes of header of binary files */ -#define LUAC_HEADERSIZE (sizeof(LUA_SIGNATURE)-sizeof(char)+2+6+sizeof(LUAC_TAIL)-sizeof(char)) - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c deleted file mode 100644 index a06e36e5ceae..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c +++ /dev/null @@ -1,930 +0,0 @@ -/* -** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua virtual machine -** See Copyright Notice in lua.h -*/ - - -#include - -#define strcoll(l,r) (strcmp((l),(r))) - -#define lvm_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lvm.h" - - - -/* limit for table tag-method chains (to avoid loops) */ -#define MAXTAGLOOP 100 - - -const 
TValue *luaV_tonumber (const TValue *obj, TValue *n) { - lua_Number num; - if (ttisnumber(obj)) return obj; - if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) { - setnvalue(n, num); - return n; - } - else - return NULL; -} - - -int luaV_tostring (lua_State *L, StkId obj) { - if (!ttisnumber(obj)) - return 0; - else { - char s[LUAI_MAXNUMBER2STR]; - lua_Number n = nvalue(obj); - int l = lua_number2str(s, n); - setsvalue2s(L, obj, luaS_newlstr(L, s, l)); - return 1; - } -} - - -static void traceexec (lua_State *L) { - CallInfo *ci = L->ci; - lu_byte mask = L->hookmask; - int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0); - if (counthook) - resethookcount(L); /* reset count */ - if (ci->callstatus & CIST_HOOKYIELD) { /* called hook last time? */ - ci->callstatus &= ~CIST_HOOKYIELD; /* erase mark */ - return; /* do not call hook again (VM yielded, so it did not move) */ - } - if (counthook) - luaD_hook(L, LUA_HOOKCOUNT, -1); /* call count hook */ - if (mask & LUA_MASKLINE) { - Proto *p = ci_func(ci)->p; - int npc = pcRel(ci->u.l.savedpc, p); - int newline = getfuncline(p, npc); - if (npc == 0 || /* call linehook when enter a new function, */ - ci->u.l.savedpc <= L->oldpc || /* when jump back (loop), or when */ - newline != getfuncline(p, pcRel(L->oldpc, p))) /* enter a new line */ - luaD_hook(L, LUA_HOOKLINE, newline); /* call line hook */ - } - L->oldpc = ci->u.l.savedpc; - if (L->status == LUA_YIELD) { /* did hook yield? 
*/ - if (counthook) - L->hookcount = 1; /* undo decrement to zero */ - ci->u.l.savedpc--; /* undo increment (resume will increment it again) */ - ci->callstatus |= CIST_HOOKYIELD; /* mark that it yielded */ - ci->func = L->top - 1; /* protect stack below results */ - luaD_throw(L, LUA_YIELD); - } -} - - -static void callTM (lua_State *L, const TValue *f, const TValue *p1, - const TValue *p2, TValue *p3, int hasres) { - ptrdiff_t result = savestack(L, p3); - setobj2s(L, L->top++, f); /* push function */ - setobj2s(L, L->top++, p1); /* 1st argument */ - setobj2s(L, L->top++, p2); /* 2nd argument */ - if (!hasres) /* no result? 'p3' is third argument */ - setobj2s(L, L->top++, p3); /* 3rd argument */ - /* metamethod may yield only when called from Lua code */ - luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci)); - if (hasres) { /* if has result, move it to its place */ - p3 = restorestack(L, result); - setobjs2s(L, p3, --L->top); - } -} - - -void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) { - int loop; - for (loop = 0; loop < MAXTAGLOOP; loop++) { - const TValue *tm; - if (ttistable(t)) { /* `t' is a table? */ - Table *h = hvalue(t); - const TValue *res = luaH_get(h, key); /* do a primitive get */ - if (!ttisnil(res) || /* result is not nil? */ - (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */ - setobj2s(L, val, res); - return; - } - /* else will try the tag method */ - } - else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX))) - luaG_typeerror(L, t, "index"); - if (ttisfunction(tm)) { - callTM(L, tm, t, key, val, 1); - return; - } - t = tm; /* else repeat with 'tm' */ - } - luaG_runerror(L, "loop in gettable"); -} - - -void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) { - int loop; - for (loop = 0; loop < MAXTAGLOOP; loop++) { - const TValue *tm; - if (ttistable(t)) { /* `t' is a table? 
*/ - Table *h = hvalue(t); - TValue *oldval = cast(TValue *, luaH_get(h, key)); - /* if previous value is not nil, there must be a previous entry - in the table; moreover, a metamethod has no relevance */ - if (!ttisnil(oldval) || - /* previous value is nil; must check the metamethod */ - ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL && - /* no metamethod; is there a previous entry in the table? */ - (oldval != luaO_nilobject || - /* no previous entry; must create one. (The next test is - always true; we only need the assignment.) */ - (oldval = luaH_newkey(L, h, key), 1)))) { - /* no metamethod and (now) there is an entry with given key */ - setobj2t(L, oldval, val); /* assign new value to that entry */ - invalidateTMcache(h); - luaC_barrierback(L, obj2gco(h), val); - return; - } - /* else will try the metamethod */ - } - else /* not a table; check metamethod */ - if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX))) - luaG_typeerror(L, t, "index"); - /* there is a metamethod */ - if (ttisfunction(tm)) { - callTM(L, tm, t, key, val, 0); - return; - } - t = tm; /* else repeat with 'tm' */ - } - luaG_runerror(L, "loop in settable"); -} - - -static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2, - StkId res, TMS event) { - const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */ - if (ttisnil(tm)) - tm = luaT_gettmbyobj(L, p2, event); /* try second operand */ - if (ttisnil(tm)) return 0; - callTM(L, tm, p1, p2, res, 1); - return 1; -} - - -static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2, - TMS event) { - const TValue *tm1 = fasttm(L, mt1, event); - const TValue *tm2; - if (tm1 == NULL) return NULL; /* no metamethod */ - if (mt1 == mt2) return tm1; /* same metatables => same metamethods */ - tm2 = fasttm(L, mt2, event); - if (tm2 == NULL) return NULL; /* no metamethod */ - if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? 
*/ - return tm1; - return NULL; -} - - -static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2, - TMS event) { - if (!call_binTM(L, p1, p2, L->top, event)) - return -1; /* no metamethod */ - else - return !l_isfalse(L->top); -} - - -static int l_strcmp (const TString *ls, const TString *rs) { - const char *l = getstr(ls); - size_t ll = ls->tsv.len; - const char *r = getstr(rs); - size_t lr = rs->tsv.len; - for (;;) { - int temp = strcoll(l, r); - if (temp != 0) return temp; - else { /* strings are equal up to a `\0' */ - size_t len = strlen(l); /* index of first `\0' in both strings */ - if (len == lr) /* r is finished? */ - return (len == ll) ? 0 : 1; - else if (len == ll) /* l is finished? */ - return -1; /* l is smaller than r (because r is not finished) */ - /* both strings longer than `len'; go on comparing (after the `\0') */ - len++; - l += len; ll -= len; r += len; lr -= len; - } - } -} - - -int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) { - int res; - if (ttisnumber(l) && ttisnumber(r)) - return luai_numlt(L, nvalue(l), nvalue(r)); - else if (ttisstring(l) && ttisstring(r)) - return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0; - else if ((res = call_orderTM(L, l, r, TM_LT)) < 0) - luaG_ordererror(L, l, r); - return res; -} - - -int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) { - int res; - if (ttisnumber(l) && ttisnumber(r)) - return luai_numle(L, nvalue(l), nvalue(r)); - else if (ttisstring(l) && ttisstring(r)) - return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0; - else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */ - return res; - else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */ - luaG_ordererror(L, l, r); - return !res; -} - - -/* -** equality of Lua values. 
L == NULL means raw equality (no metamethods) -*/ -int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) { - const TValue *tm; - lua_assert(ttisequal(t1, t2)); - switch (ttype(t1)) { - case LUA_TNIL: return 1; - case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2)); - case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! */ - case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2); - case LUA_TLCF: return fvalue(t1) == fvalue(t2); - case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2)); - case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2)); - case LUA_TUSERDATA: { - if (uvalue(t1) == uvalue(t2)) return 1; - else if (L == NULL) return 0; - tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ); - break; /* will try TM */ - } - case LUA_TTABLE: { - if (hvalue(t1) == hvalue(t2)) return 1; - else if (L == NULL) return 0; - tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ); - break; /* will try TM */ - } - default: - lua_assert(iscollectable(t1)); - return gcvalue(t1) == gcvalue(t2); - } - if (tm == NULL) return 0; /* no TM? */ - callTM(L, tm, t1, t2, L->top, 1); /* call TM */ - return !l_isfalse(L->top); -} - - -void luaV_concat (lua_State *L, int total) { - lua_assert(total >= 2); - do { - StkId top = L->top; - int n = 2; /* number of elements handled in this pass (at least 2) */ - if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) { - if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT)) - luaG_concaterror(L, top-2, top-1); - } - else if (tsvalue(top-1)->len == 0) /* second operand is empty? */ - (void)tostring(L, top - 2); /* result is first operand */ - else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) { - setobjs2s(L, top - 2, top - 1); /* result is second op. 
*/ - } - else { - /* at least two non-empty string values; get as many as possible */ - size_t tl = tsvalue(top-1)->len; - char *buffer; - int i; - /* collect total length */ - for (i = 1; i < total && tostring(L, top-i-1); i++) { - size_t l = tsvalue(top-i-1)->len; - if (l >= (MAX_SIZET/sizeof(char)) - tl) - luaG_runerror(L, "string length overflow"); - tl += l; - } - buffer = luaZ_openspace(L, &G(L)->buff, tl); - tl = 0; - n = i; - do { /* concat all strings */ - size_t l = tsvalue(top-i)->len; - memcpy(buffer+tl, svalue(top-i), l * sizeof(char)); - tl += l; - } while (--i > 0); - setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl)); - } - total -= n-1; /* got 'n' strings to create 1 new */ - L->top -= n-1; /* popped 'n' strings and pushed one */ - } while (total > 1); /* repeat until only 1 result left */ -} - - -void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) { - const TValue *tm; - switch (ttypenv(rb)) { - case LUA_TTABLE: { - Table *h = hvalue(rb); - tm = fasttm(L, h->metatable, TM_LEN); - if (tm) break; /* metamethod? break switch to call it */ - setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */ - return; - } - case LUA_TSTRING: { - setnvalue(ra, cast_num(tsvalue(rb)->len)); - return; - } - default: { /* try metamethod */ - tm = luaT_gettmbyobj(L, rb, TM_LEN); - if (ttisnil(tm)) /* no metamethod? */ - luaG_typeerror(L, rb, "get length of"); - break; - } - } - callTM(L, tm, rb, rb, ra, 1); -} - -/* - * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle - * div/mod by zero (instead of crashing, which is the default behavior in - * Lua 5.2) - */ - -/* -** Integer division; return 'm // n', that is, floor(m/n). -** C division truncates its result (rounds towards zero). -** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer, -** otherwise 'floor(q) == trunc(q) - 1'. 
-*/ -static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) { - if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ - if (n == 0) - luaG_runerror(L, "attempt to divide by zero"); - return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */ - } - else { - lua_Number q = m / n; /* perform C division */ - if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */ - q -= 1; /* correct result for different rounding */ - return q; - } -} - - -/* -** Integer modulus; return 'm % n'. (Assume that C '%' with -** negative operands follows C99 behavior. See previous comment -** about luaV_div.) -*/ -static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) { - if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ - if (n == 0) - luaG_runerror(L, "attempt to perform 'n%%0'"); - return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */ - } - else { - lua_Number r = m % n; - if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? 
*/ - r += n; /* correct result for different rounding */ - return r; - } -} - -/* - * End patch from 5.3.2 - */ - -void luaV_arith (lua_State *L, StkId ra, const TValue *rb, - const TValue *rc, TMS op) { - TValue tempb, tempc; - const TValue *b, *c; - if ((b = luaV_tonumber(rb, &tempb)) != NULL && - (c = luaV_tonumber(rc, &tempc)) != NULL) { - /* - * Patched: if dividing or modding, use patched functions from 5.3 - */ - lua_Number res; - int lop = op - TM_ADD + LUA_OPADD; - if (lop == LUA_OPDIV) { - res = luaV_div(L, nvalue(b), nvalue(c)); - } else if (lop == LUA_OPMOD) { - res = luaV_mod(L, nvalue(b), nvalue(c)); - } else { - res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c)); - } - setnvalue(ra, res); - } - else if (!call_binTM(L, rb, rc, ra, op)) - luaG_aritherror(L, rb, rc); -} - - -/* -** check whether cached closure in prototype 'p' may be reused, that is, -** whether there is a cached closure with the same upvalues needed by -** new closure to be created. -*/ -static Closure *getcached (Proto *p, UpVal **encup, StkId base) { - Closure *c = p->cache; - if (c != NULL) { /* is there a cached closure? */ - int nup = p->sizeupvalues; - Upvaldesc *uv = p->upvalues; - int i; - for (i = 0; i < nup; i++) { /* check whether it has right upvalues */ - TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v; - if (c->l.upvals[i]->v != v) - return NULL; /* wrong upvalue; cannot reuse closure */ - } - } - return c; /* return cached closure (or NULL if no cached closure) */ -} - - -/* -** create a new Lua closure, push it in the stack, and initialize -** its upvalues. Note that the call to 'luaC_barrierproto' must come -** before the assignment to 'p->cache', as the function needs the -** original value of that field. 
-*/ -static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base, - StkId ra) { - int nup = p->sizeupvalues; - Upvaldesc *uv = p->upvalues; - int i; - Closure *ncl = luaF_newLclosure(L, nup); - ncl->l.p = p; - setclLvalue(L, ra, ncl); /* anchor new closure in stack */ - for (i = 0; i < nup; i++) { /* fill in its upvalues */ - if (uv[i].instack) /* upvalue refers to local variable? */ - ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx); - else /* get upvalue from enclosing function */ - ncl->l.upvals[i] = encup[uv[i].idx]; - } - luaC_barrierproto(L, p, ncl); - p->cache = ncl; /* save it on cache for reuse */ -} - - -/* -** finish execution of an opcode interrupted by an yield -*/ -void luaV_finishOp (lua_State *L) { - CallInfo *ci = L->ci; - StkId base = ci->u.l.base; - Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */ - OpCode op = GET_OPCODE(inst); - switch (op) { /* finish its execution */ - case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV: - case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN: - case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: { - setobjs2s(L, base + GETARG_A(inst), --L->top); - break; - } - case OP_LE: case OP_LT: case OP_EQ: { - int res = !l_isfalse(L->top - 1); - L->top--; - /* metamethod should not be called when operand is K */ - lua_assert(!ISK(GETARG_B(inst))); - if (op == OP_LE && /* "<=" using "<" instead? */ - ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE))) - res = !res; /* invert result */ - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP); - if (res != GETARG_A(inst)) /* condition failed? 
*/ - ci->u.l.savedpc++; /* skip jump instruction */ - break; - } - case OP_CONCAT: { - StkId top = L->top - 1; /* top when 'call_binTM' was called */ - int b = GETARG_B(inst); /* first element to concatenate */ - int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */ - setobj2s(L, top - 2, top); /* put TM result in proper position */ - if (total > 1) { /* are there elements to concat? */ - L->top = top - 1; /* top is one after last element (at top-2) */ - luaV_concat(L, total); /* concat them (may yield again) */ - } - /* move final result to final position */ - setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1); - L->top = ci->top; /* restore top */ - break; - } - case OP_TFORCALL: { - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP); - L->top = ci->top; /* correct top */ - break; - } - case OP_CALL: { - if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */ - L->top = ci->top; /* adjust results */ - break; - } - case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE: - break; - default: lua_assert(0); - } -} - - - -/* -** some macros for common tasks in `luaV_execute' -*/ - -#if !defined luai_runtimecheck -#define luai_runtimecheck(L, c) /* void */ -#endif - - -#define RA(i) (base+GETARG_A(i)) -/* to be used after possible stack reallocation */ -#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i)) -#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i)) -#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \ - ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i)) -#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \ - ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i)) -#define KBx(i) \ - (k + (GETARG_Bx(i) != 0 ? 
GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++))) - - -/* execute a jump instruction */ -#define dojump(ci,i,e) \ - { int a = GETARG_A(i); \ - if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \ - ci->u.l.savedpc += GETARG_sBx(i) + e; } - -/* for test instructions, execute the jump instruction that follows it */ -#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); } - - -#define Protect(x) { {x;}; base = ci->u.l.base; } - -#define checkGC(L,c) \ - Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \ - luaC_step(L); \ - L->top = ci->top;}) /* restore top */ \ - luai_threadyield(L); ) - - -#define arith_op(op,tm) { \ - TValue *rb = RKB(i); \ - TValue *rc = RKC(i); \ - if (ttisnumber(rb) && ttisnumber(rc)) { \ - lua_Number nb = nvalue(rb), nc = nvalue(rc); \ - setnvalue(ra, op(L, nb, nc)); \ - } \ - else { Protect(luaV_arith(L, ra, rb, rc, tm)); } } - - -#define vmdispatch(o) switch(o) -#define vmcase(l,b) case l: {b} break; -#define vmcasenb(l,b) case l: {b} /* nb = no break */ - -void luaV_execute (lua_State *L) { - CallInfo *ci = L->ci; - LClosure *cl; - TValue *k; - StkId base; - newframe: /* reentry point when frame changes (call/return) */ - lua_assert(ci == L->ci); - cl = clLvalue(ci->func); - k = cl->p->k; - base = ci->u.l.base; - /* main loop of interpreter */ - for (;;) { - Instruction i = *(ci->u.l.savedpc++); - StkId ra; - if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) && - (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) { - Protect(traceexec(L)); - } - /* WARNING: several calls may realloc the stack and invalidate `ra' */ - ra = RA(i); - lua_assert(base == ci->u.l.base); - lua_assert(base <= L->top && L->top < L->stack + L->stacksize); - vmdispatch (GET_OPCODE(i)) { - vmcase(OP_MOVE, - setobjs2s(L, ra, RB(i)); - ) - vmcase(OP_LOADK, - TValue *rb = k + GETARG_Bx(i); - setobj2s(L, ra, rb); - ) - vmcase(OP_LOADKX, - TValue *rb; - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); - rb = k + 
GETARG_Ax(*ci->u.l.savedpc++); - setobj2s(L, ra, rb); - ) - vmcase(OP_LOADBOOL, - setbvalue(ra, GETARG_B(i)); - if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */ - ) - vmcase(OP_LOADNIL, - int b = GETARG_B(i); - do { - setnilvalue(ra++); - } while (b--); - ) - vmcase(OP_GETUPVAL, - int b = GETARG_B(i); - setobj2s(L, ra, cl->upvals[b]->v); - ) - vmcase(OP_GETTABUP, - int b = GETARG_B(i); - Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra)); - ) - vmcase(OP_GETTABLE, - Protect(luaV_gettable(L, RB(i), RKC(i), ra)); - ) - vmcase(OP_SETTABUP, - int a = GETARG_A(i); - Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i))); - ) - vmcase(OP_SETUPVAL, - UpVal *uv = cl->upvals[GETARG_B(i)]; - setobj(L, uv->v, ra); - luaC_barrier(L, uv, ra); - ) - vmcase(OP_SETTABLE, - Protect(luaV_settable(L, ra, RKB(i), RKC(i))); - ) - vmcase(OP_NEWTABLE, - int b = GETARG_B(i); - int c = GETARG_C(i); - Table *t = luaH_new(L); - sethvalue(L, ra, t); - if (b != 0 || c != 0) - luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c)); - checkGC(L, ra + 1); - ) - vmcase(OP_SELF, - StkId rb = RB(i); - setobjs2s(L, ra+1, rb); - Protect(luaV_gettable(L, rb, RKC(i), ra)); - ) - vmcase(OP_ADD, - arith_op(luai_numadd, TM_ADD); - ) - vmcase(OP_SUB, - arith_op(luai_numsub, TM_SUB); - ) - vmcase(OP_MUL, - arith_op(luai_nummul, TM_MUL); - ) - /* - * Patched: use luaV_* instead of luai_* to handle div/mod by 0 - */ - vmcase(OP_DIV, - arith_op(luaV_div, TM_DIV); - ) - vmcase(OP_MOD, - arith_op(luaV_mod, TM_MOD); - ) - vmcase(OP_POW, - arith_op(luai_numpow, TM_POW); - ) - vmcase(OP_UNM, - TValue *rb = RB(i); - if (ttisnumber(rb)) { - lua_Number nb = nvalue(rb); - setnvalue(ra, luai_numunm(L, nb)); - } - else { - Protect(luaV_arith(L, ra, rb, rb, TM_UNM)); - } - ) - vmcase(OP_NOT, - TValue *rb = RB(i); - int res = l_isfalse(rb); /* next assignment may change this value */ - setbvalue(ra, res); - ) - vmcase(OP_LEN, - Protect(luaV_objlen(L, ra, RB(i))); - ) - vmcase(OP_CONCAT, - int b = 
GETARG_B(i); - int c = GETARG_C(i); - StkId rb; - L->top = base + c + 1; /* mark the end of concat operands */ - Protect(luaV_concat(L, c - b + 1)); - ra = RA(i); /* 'luav_concat' may invoke TMs and move the stack */ - rb = b + base; - setobjs2s(L, ra, rb); - checkGC(L, (ra >= rb ? ra + 1 : rb)); - L->top = ci->top; /* restore top */ - ) - vmcase(OP_JMP, - dojump(ci, i, 0); - ) - vmcase(OP_EQ, - TValue *rb = RKB(i); - TValue *rc = RKC(i); - Protect( - if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - ) - vmcase(OP_LT, - Protect( - if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - ) - vmcase(OP_LE, - Protect( - if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - ) - vmcase(OP_TEST, - if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - vmcase(OP_TESTSET, - TValue *rb = RB(i); - if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb)) - ci->u.l.savedpc++; - else { - setobjs2s(L, ra, rb); - donextjump(ci); - } - ) - vmcase(OP_CALL, - int b = GETARG_B(i); - int nresults = GETARG_C(i) - 1; - if (b != 0) L->top = ra+b; /* else previous instruction set top */ - if (luaD_precall(L, ra, nresults)) { /* C function? */ - if (nresults >= 0) L->top = ci->top; /* adjust results */ - base = ci->u.l.base; - } - else { /* Lua function */ - ci = L->ci; - ci->callstatus |= CIST_REENTRY; - goto newframe; /* restart luaV_execute over new Lua function */ - } - ) - vmcase(OP_TAILCALL, - int b = GETARG_B(i); - if (b != 0) L->top = ra+b; /* else previous instruction set top */ - lua_assert(GETARG_C(i) - 1 == LUA_MULTRET); - if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? 
*/ - base = ci->u.l.base; - else { - /* tail call: put called frame (n) in place of caller one (o) */ - CallInfo *nci = L->ci; /* called frame */ - CallInfo *oci = nci->previous; /* caller frame */ - StkId nfunc = nci->func; /* called function */ - StkId ofunc = oci->func; /* caller function */ - /* last stack slot filled by 'precall' */ - StkId lim = nci->u.l.base + getproto(nfunc)->numparams; - int aux; - /* close all upvalues from previous call */ - if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base); - /* move new frame into old one */ - for (aux = 0; nfunc + aux < lim; aux++) - setobjs2s(L, ofunc + aux, nfunc + aux); - oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */ - oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */ - oci->u.l.savedpc = nci->u.l.savedpc; - oci->callstatus |= CIST_TAIL; /* function was tail called */ - ci = L->ci = oci; /* remove new frame */ - lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize); - goto newframe; /* restart luaV_execute over new Lua function */ - } - ) - vmcasenb(OP_RETURN, - int b = GETARG_B(i); - if (b != 0) L->top = ra+b-1; - if (cl->p->sizep > 0) luaF_close(L, base); - b = luaD_poscall(L, ra); - if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */ - return; /* external invocation: return */ - else { /* invocation via reentry: continue execution */ - ci = L->ci; - if (b) L->top = ci->top; - lua_assert(isLua(ci)); - lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL); - goto newframe; /* restart luaV_execute over new Lua function */ - } - ) - vmcase(OP_FORLOOP, - lua_Number step = nvalue(ra+2); - lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */ - lua_Number limit = nvalue(ra+1); - if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit) - : luai_numle(L, limit, idx)) { - ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ - setnvalue(ra, idx); /* update internal index... 
*/ - setnvalue(ra+3, idx); /* ...and external index */ - } - ) - vmcase(OP_FORPREP, - const TValue *init = ra; - const TValue *plimit = ra+1; - const TValue *pstep = ra+2; - if (!tonumber(init, ra)) - luaG_runerror(L, LUA_QL("for") " initial value must be a number"); - else if (!tonumber(plimit, ra+1)) - luaG_runerror(L, LUA_QL("for") " limit must be a number"); - else if (!tonumber(pstep, ra+2)) - luaG_runerror(L, LUA_QL("for") " step must be a number"); - setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep))); - ci->u.l.savedpc += GETARG_sBx(i); - ) - vmcasenb(OP_TFORCALL, - StkId cb = ra + 3; /* call base */ - setobjs2s(L, cb+2, ra+2); - setobjs2s(L, cb+1, ra+1); - setobjs2s(L, cb, ra); - L->top = cb + 3; /* func. + 2 args (state and index) */ - Protect(luaD_call(L, cb, GETARG_C(i), 1)); - L->top = ci->top; - i = *(ci->u.l.savedpc++); /* go to next instruction */ - ra = RA(i); - lua_assert(GET_OPCODE(i) == OP_TFORLOOP); - goto l_tforloop; - ) - vmcase(OP_TFORLOOP, - l_tforloop: - if (!ttisnil(ra + 1)) { /* continue loop? */ - setobjs2s(L, ra, ra + 1); /* save control variable */ - ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ - } - ) - vmcase(OP_SETLIST, - int n = GETARG_B(i); - int c = GETARG_C(i); - int last; - Table *h; - if (n == 0) n = cast_int(L->top - ra) - 1; - if (c == 0) { - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); - c = GETARG_Ax(*ci->u.l.savedpc++); - } - luai_runtimecheck(L, ttistable(ra)); - h = hvalue(ra); - last = ((c-1)*LFIELDS_PER_FLUSH) + n; - if (last > h->sizearray) /* needs more space? */ - luaH_resizearray(L, h, last); /* pre-allocate it at once */ - for (; n > 0; n--) { - TValue *val = ra+n; - luaH_setint(L, h, last--, val); - luaC_barrierback(L, obj2gco(h), val); - } - L->top = ci->top; /* correct top (in case of previous open call) */ - ) - vmcase(OP_CLOSURE, - Proto *p = cl->p->p[GETARG_Bx(i)]; - Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */ - if (ncl == NULL) /* no match? 
*/ - pushclosure(L, p, cl->upvals, base, ra); /* create a new one */ - else - setclLvalue(L, ra, ncl); /* push cashed closure */ - checkGC(L, ra + 1); - ) - vmcase(OP_VARARG, - int b = GETARG_B(i) - 1; - int j; - int n = cast_int(base - ci->func) - cl->p->numparams - 1; - if (b < 0) { /* B == 0? */ - b = n; /* get all var. arguments */ - Protect(luaD_checkstack(L, n)); - ra = RA(i); /* previous call may change the stack */ - L->top = ra + n; - } - for (j = 0; j < b; j++) { - if (j < n) { - setobjs2s(L, ra + j, base - n + j); - } - else { - setnilvalue(ra + j); - } - } - ) - vmcase(OP_EXTRAARG, - lua_assert(0); - ) - } - } -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h deleted file mode 100644 index 5380270da63d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h +++ /dev/null @@ -1,44 +0,0 @@ -/* -** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua virtual machine -** See Copyright Notice in lua.h -*/ - -#ifndef lvm_h -#define lvm_h - - -#include "ldo.h" -#include "lobject.h" -#include "ltm.h" - - -#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o))) - -#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL)) - -#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2)) - -#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2) - - -/* not to called directly */ -LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2); - - -LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r); -LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r); -LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n); -LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj); -LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key, - StkId val); -LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key, - StkId 
val); -LUAI_FUNC void luaV_finishOp (lua_State *L); -LUAI_FUNC void luaV_execute (lua_State *L); -LUAI_FUNC void luaV_concat (lua_State *L, int total); -LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb, - const TValue *rc, TMS op); -LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb); - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c deleted file mode 100644 index 53e6a3daeb5a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c +++ /dev/null @@ -1,76 +0,0 @@ -/* -** $Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $ -** Buffered streams -** See Copyright Notice in lua.h -*/ - - -#include - -#define lzio_c -#define LUA_CORE - -#include "lua.h" - -#include "llimits.h" -#include "lmem.h" -#include "lstate.h" -#include "lzio.h" - - -int luaZ_fill (ZIO *z) { - size_t size; - lua_State *L = z->L; - const char *buff; - lua_unlock(L); - buff = z->reader(L, z->data, &size); - lua_lock(L); - if (buff == NULL || size == 0) - return EOZ; - z->n = size - 1; /* discount char being returned */ - z->p = buff; - return cast_uchar(*(z->p++)); -} - - -void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) { - z->L = L; - z->reader = reader; - z->data = data; - z->n = 0; - z->p = NULL; -} - - -/* --------------------------------------------------------------- read --- */ -size_t luaZ_read (ZIO *z, void *b, size_t n) { - while (n) { - size_t m; - if (z->n == 0) { /* no bytes in buffer? */ - if (luaZ_fill(z) == EOZ) /* try to read more */ - return n; /* no more input; return number of missing bytes */ - else { - z->n++; /* luaZ_fill consumed first byte; put it back */ - z->p--; - } - } - m = (n <= z->n) ? n : z->n; /* min. 
between n and z->n */ - memcpy(b, z->p, m); - z->n -= m; - z->p += m; - b = (char *)b + m; - n -= m; - } - return 0; -} - -/* ------------------------------------------------------------------------ */ -char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) { - if (n > buff->buffsize) { - if (n < LUA_MINBUFFER) n = LUA_MINBUFFER; - luaZ_resizebuffer(L, buff, n); - } - return buff->buffer; -} - - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h deleted file mode 100644 index 441f7479cb14..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h +++ /dev/null @@ -1,65 +0,0 @@ -/* -** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $ -** Buffered streams -** See Copyright Notice in lua.h -*/ - - -#ifndef lzio_h -#define lzio_h - -#include "lua.h" - -#include "lmem.h" - - -#define EOZ (-1) /* end of stream */ - -typedef struct Zio ZIO; - -#define zgetc(z) (((z)->n--)>0 ? cast_uchar(*(z)->p++) : luaZ_fill(z)) - - -typedef struct Mbuffer { - char *buffer; - size_t n; - size_t buffsize; -} Mbuffer; - -#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0) - -#define luaZ_buffer(buff) ((buff)->buffer) -#define luaZ_sizebuffer(buff) ((buff)->buffsize) -#define luaZ_bufflen(buff) ((buff)->n) - -#define luaZ_resetbuffer(buff) ((buff)->n = 0) - - -#define luaZ_resizebuffer(L, buff, size) \ - (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \ - (buff)->buffsize = size) - -#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0) - - -LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n); -LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, - void *data); -LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */ - - - -/* --------- Private Part ------------------ */ - -struct Zio { - size_t n; /* bytes still unread */ - const char *p; /* current position in 
buffer */ - lua_Reader reader; /* reader function */ - void* data; /* additional data */ - lua_State *L; /* Lua state (for reader) */ -}; - - -LUAI_FUNC int luaZ_fill (ZIO *z); - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c deleted file mode 100644 index 699373ad4d43..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -/* - * We keep our own copy of this algorithm for 3 main reasons: - * 1. If we didn't, anyone modifying common/os/compress.c would - * directly break our on disk format - * 2. Our version of lzjb does not have a number of checks that the - * common/os version needs and uses - * 3. We initialize the lempel to ensure deterministic results, - * so that identical blocks can always be deduplicated. 
- * In particular, we are adding the "feature" that compress() can - * take a destination buffer size and returns the compressed length, or the - * source length if compression would overflow the destination buffer. - */ - -#include -#include -#include - -#define MATCH_BITS 6 -#define MATCH_MIN 3 -#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) -#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) -#define LEMPEL_SIZE 1024 - -/*ARGSUSED*/ -size_t -lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *cpy; - uchar_t *copymap = NULL; - int copymask = 1 << (NBBY - 1); - int mlen, offset, hash; - uint16_t *hp; - uint16_t lempel[LEMPEL_SIZE] = { 0 }; - - while (src < (uchar_t *)s_start + s_len) { - if ((copymask <<= 1) == (1 << NBBY)) { - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) - return (s_len); - copymask = 1; - copymap = dst; - *dst++ = 0; - } - if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { - *dst++ = *src++; - continue; - } - hash = (src[0] << 16) + (src[1] << 8) + src[2]; - hash += hash >> 9; - hash += hash >> 5; - hp = &lempel[hash & (LEMPEL_SIZE - 1)]; - offset = (intptr_t)(src - *hp) & OFFSET_MASK; - *hp = (uint16_t)(uintptr_t)src; - cpy = src - offset; - if (cpy >= (uchar_t *)s_start && cpy != src && - src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { - *copymap |= copymask; - for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) - if (src[mlen] != cpy[mlen]) - break; - *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | - (offset >> NBBY); - *dst++ = (uchar_t)offset; - src += mlen; - } else { - *dst++ = *src++; - } - } - return (dst - (uchar_t *)d_start); -} - -/*ARGSUSED*/ -int -lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *d_end = (uchar_t *)d_start + d_len; - uchar_t *cpy; - uchar_t copymap = 0; - int copymask = 1 << (NBBY - 1); - - 
while (dst < d_end) { - if ((copymask <<= 1) == (1 << NBBY)) { - copymask = 1; - copymap = *src++; - } - if (copymap & copymask) { - int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; - int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; - src += 2; - if ((cpy = dst - offset) < (uchar_t *)d_start) - return (-1); - if (mlen > (d_end - dst)) - mlen = d_end - dst; - while (--mlen >= 0) - *dst++ = *cpy++; - } else { - *dst++ = *src++; - } - } - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c deleted file mode 100644 index 6cd862baff30..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ /dev/null @@ -1,4624 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS metaslab"); - -#define GANG_ALLOCATION(flags) \ - ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) - -uint64_t metaslab_aliquot = 512ULL << 10; -uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ -SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN, - &metaslab_force_ganging, 0, - "Force gang block allocation for blocks larger than or equal to this value"); - -/* - * Since we can touch multiple metaslabs (and their respective space maps) - * with each transaction group, we benefit from having a smaller space map - * block size since it allows us to issue more I/O operations scattered - * around the disk. - */ -int zfs_metaslab_sm_blksz = (1 << 12); -SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN, - &zfs_metaslab_sm_blksz, 0, - "Block size for metaslab DTL space map. Power of 2 and greater than 4096."); - -/* - * The in-core space map representation is more compact than its on-disk form. - * The zfs_condense_pct determines how much more compact the in-core - * space map representation must be before we compact it on-disk. - * Values should be greater than or equal to 100. - */ -int zfs_condense_pct = 200; -SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, - &zfs_condense_pct, 0, - "Condense on-disk spacemap when it is more than this many percents" - " of in-memory counterpart"); - -/* - * Condensing a metaslab is not guaranteed to actually reduce the amount of - * space used on disk. In particular, a space map uses data in increments of - * MAX(1 << ashift, space_map_blksize), so a metaslab might use the - * same number of blocks after condensing. 
Since the goal of condensing is to - * reduce the number of IOPs required to read the space map, we only want to - * condense when we can be sure we will reduce the number of blocks used by the - * space map. Unfortunately, we cannot precisely compute whether or not this is - * the case in metaslab_should_condense since we are holding ms_lock. Instead, - * we apply the following heuristic: do not condense a spacemap unless the - * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold - * blocks. - */ -int zfs_metaslab_condense_block_threshold = 4; - -/* - * The zfs_mg_noalloc_threshold defines which metaslab groups should - * be eligible for allocation. The value is defined as a percentage of - * free space. Metaslab groups that have more free space than - * zfs_mg_noalloc_threshold are always eligible for allocations. Once - * a metaslab group's free space is less than or equal to the - * zfs_mg_noalloc_threshold the allocator will avoid allocating to that - * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. - * Once all groups in the pool reach zfs_mg_noalloc_threshold then all - * groups are allowed to accept allocations. Gang blocks are always - * eligible to allocate on any metaslab group. The default value of 0 means - * no metaslab group will be excluded based on this criterion. - */ -int zfs_mg_noalloc_threshold = 0; -SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, - &zfs_mg_noalloc_threshold, 0, - "Percentage of metaslab group size that should be free" - " to make it eligible for allocation"); - -/* - * Metaslab groups are considered eligible for allocations if their - * fragmenation metric (measured as a percentage) is less than or equal to - * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold - * then it will be skipped unless all metaslab groups within the metaslab - * class have also crossed this threshold. 
- */ -int zfs_mg_fragmentation_threshold = 85; -SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, - &zfs_mg_fragmentation_threshold, 0, - "Percentage of metaslab group size that should be considered " - "eligible for allocations unless all metaslab groups within the metaslab class " - "have also crossed this threshold"); - -/* - * Allow metaslabs to keep their active state as long as their fragmentation - * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An - * active metaslab that exceeds this threshold will no longer keep its active - * status allowing better metaslabs to be selected. - */ -int zfs_metaslab_fragmentation_threshold = 70; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, - &zfs_metaslab_fragmentation_threshold, 0, - "Maximum percentage of metaslab fragmentation level to keep their active state"); - -/* - * When set will load all metaslabs when pool is first opened. - */ -int metaslab_debug_load = 0; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, - &metaslab_debug_load, 0, - "Load all metaslabs when pool is first opened"); - -/* - * When set will prevent metaslabs from being unloaded. - */ -int metaslab_debug_unload = 0; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, - &metaslab_debug_unload, 0, - "Prevent metaslabs from being unloaded"); - -/* - * Minimum size which forces the dynamic allocator to change - * it's allocation strategy. Once the space map cannot satisfy - * an allocation of this size then it switches to using more - * aggressive strategy (i.e search by size rather than offset). 
- */ -uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; -SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, - &metaslab_df_alloc_threshold, 0, - "Minimum size which forces the dynamic allocator to change it's allocation strategy"); - -/* - * The minimum free space, in percent, which must be available - * in a space map to continue allocations in a first-fit fashion. - * Once the space map's free space drops below this level we dynamically - * switch to using best-fit allocations. - */ -int metaslab_df_free_pct = 4; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, - &metaslab_df_free_pct, 0, - "The minimum free space, in percent, which must be available in a " - "space map to continue allocations in a first-fit fashion"); - -/* - * A metaslab is considered "free" if it contains a contiguous - * segment which is greater than metaslab_min_alloc_size. - */ -uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; -SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, - &metaslab_min_alloc_size, 0, - "A metaslab is considered \"free\" if it contains a contiguous " - "segment which is greater than vfs.zfs.metaslab.min_alloc_size"); - -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -int metaslab_load_pct = 50; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, - &metaslab_load_pct, 0, - "Percentage of cpus that can be used by the metaslab taskq"); - -/* - * Determines how many txgs a metaslab may remain loaded without having any - * allocations from it. As long as a metaslab continues to be used we will - * keep it loaded. - */ -int metaslab_unload_delay = TXG_SIZE * 2; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, - &metaslab_unload_delay, 0, - "Number of TXGs that an unused metaslab can be kept in memory"); - -/* - * Max number of metaslabs per group to preload. 
- */ -int metaslab_preload_limit = SPA_DVAS_PER_BP; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, - &metaslab_preload_limit, 0, - "Max number of metaslabs per group to preload"); - -/* - * Enable/disable preloading of metaslab. - */ -boolean_t metaslab_preload_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, - &metaslab_preload_enabled, 0, - "Max number of metaslabs per group to preload"); - -/* - * Enable/disable fragmentation weighting on metaslabs. - */ -boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, - &metaslab_fragmentation_factor_enabled, 0, - "Enable fragmentation weighting on metaslabs"); - -/* - * Enable/disable lba weighting (i.e. outer tracks are given preference). - */ -boolean_t metaslab_lba_weighting_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, - &metaslab_lba_weighting_enabled, 0, - "Enable LBA weighting (i.e. outer tracks are given preference)"); - -/* - * Enable/disable metaslab group biasing. - */ -boolean_t metaslab_bias_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, - &metaslab_bias_enabled, 0, - "Enable metaslab group biasing"); - -/* - * Enable/disable remapping of indirect DVAs to their concrete vdevs. - */ -boolean_t zfs_remap_blkptr_enable = B_TRUE; - -/* - * Enable/disable segment-based metaslab selection. - */ -boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; - -/* - * When using segment-based metaslab selection, we will continue - * allocating from the active metaslab until we have exhausted - * zfs_metaslab_switch_threshold of its buckets. - */ -int zfs_metaslab_switch_threshold = 2; - -/* - * Internal switch to enable/disable the metaslab allocation tracing - * facility. 
- */ -#ifdef _METASLAB_TRACING -boolean_t metaslab_trace_enabled = B_TRUE; -#endif - -/* - * Maximum entries that the metaslab allocation tracing facility will keep - * in a given list when running in non-debug mode. We limit the number - * of entries in non-debug mode to prevent us from using up too much memory. - * The limit should be sufficiently large that we don't expect any allocation - * to every exceed this value. In debug mode, the system will panic if this - * limit is ever reached allowing for further investigation. - */ -#ifdef _METASLAB_TRACING -uint64_t metaslab_trace_max_entries = 5000; -#endif - -static uint64_t metaslab_weight(metaslab_t *); -static void metaslab_set_fragmentation(metaslab_t *); -static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); -static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); -static void metaslab_passivate(metaslab_t *msp, uint64_t weight); -static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); -#ifdef _METASLAB_TRACING -kmem_cache_t *metaslab_alloc_trace_cache; -#endif - -/* - * ========================================================================== - * Metaslab classes - * ========================================================================== - */ -metaslab_class_t * -metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) -{ - metaslab_class_t *mc; - - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); - - mc->mc_spa = spa; - mc->mc_rotor = NULL; - mc->mc_ops = ops; - mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); - mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (zfs_refcount_t), KM_SLEEP); - mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (uint64_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) - zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); - - return (mc); -} - -void -metaslab_class_destroy(metaslab_class_t *mc) -{ - ASSERT(mc->mc_rotor == NULL); - ASSERT(mc->mc_alloc == 0); - 
ASSERT(mc->mc_deferred == 0); - ASSERT(mc->mc_space == 0); - ASSERT(mc->mc_dspace == 0); - - for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) - zfs_refcount_destroy(&mc->mc_alloc_slots[i]); - kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * - sizeof (zfs_refcount_t)); - kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * - sizeof (uint64_t)); - mutex_destroy(&mc->mc_lock); - kmem_free(mc, sizeof (metaslab_class_t)); -} - -int -metaslab_class_validate(metaslab_class_t *mc) -{ - metaslab_group_t *mg; - vdev_t *vd; - - /* - * Must hold one of the spa_config locks. - */ - ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || - spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - - if ((mg = mc->mc_rotor) == NULL) - return (0); - - do { - vd = mg->mg_vd; - ASSERT(vd->vdev_mg != NULL); - ASSERT3P(vd->vdev_top, ==, vd); - ASSERT3P(mg->mg_class, ==, mc); - ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); - } while ((mg = mg->mg_next) != mc->mc_rotor); - - return (0); -} - -static void -metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, - int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) -{ - atomic_add_64(&mc->mc_alloc, alloc_delta); - atomic_add_64(&mc->mc_deferred, defer_delta); - atomic_add_64(&mc->mc_space, space_delta); - atomic_add_64(&mc->mc_dspace, dspace_delta); -} - -void -metaslab_class_minblocksize_update(metaslab_class_t *mc) -{ - metaslab_group_t *mg; - vdev_t *vd; - uint64_t minashift = UINT64_MAX; - - if ((mg = mc->mc_rotor) == NULL) { - mc->mc_minblocksize = SPA_MINBLOCKSIZE; - return; - } - - do { - vd = mg->mg_vd; - if (vd->vdev_ashift < minashift) - minashift = vd->vdev_ashift; - } while ((mg = mg->mg_next) != mc->mc_rotor); - - mc->mc_minblocksize = 1ULL << minashift; -} - -uint64_t -metaslab_class_get_alloc(metaslab_class_t *mc) -{ - return (mc->mc_alloc); -} - -uint64_t -metaslab_class_get_deferred(metaslab_class_t *mc) -{ - return (mc->mc_deferred); -} - -uint64_t 
-metaslab_class_get_space(metaslab_class_t *mc) -{ - return (mc->mc_space); -} - -uint64_t -metaslab_class_get_dspace(metaslab_class_t *mc) -{ - return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); -} - -uint64_t -metaslab_class_get_minblocksize(metaslab_class_t *mc) -{ - return (mc->mc_minblocksize); -} - -void -metaslab_class_histogram_verify(metaslab_class_t *mc) -{ - spa_t *spa = mc->mc_spa; - vdev_t *rvd = spa->spa_root_vdev; - uint64_t *mc_hist; - int i; - - if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) - return; - - mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, - KM_SLEEP); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - /* - * Skip any holes, uninitialized top-levels, or - * vdevs that are not in this metalab class. - */ - if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || - mg->mg_class != mc) { - continue; - } - - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - mc_hist[i] += mg->mg_histogram[i]; - } - - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); - - kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); -} - -/* - * Calculate the metaslab class's fragmentation metric. The metric - * is weighted based on the space contribution of each metaslab group. - * The return value will be a number between 0 and 100 (inclusive), or - * ZFS_FRAG_INVALID if the metric has not been set. See comment above the - * zfs_frag_table for more information about the metric. 
- */ -uint64_t -metaslab_class_fragmentation(metaslab_class_t *mc) -{ - vdev_t *rvd = mc->mc_spa->spa_root_vdev; - uint64_t fragmentation = 0; - - spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - /* - * Skip any holes, uninitialized top-levels, - * or vdevs that are not in this metalab class. - */ - if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || - mg->mg_class != mc) { - continue; - } - - /* - * If a metaslab group does not contain a fragmentation - * metric then just bail out. - */ - if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { - spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); - return (ZFS_FRAG_INVALID); - } - - /* - * Determine how much this metaslab_group is contributing - * to the overall pool fragmentation metric. - */ - fragmentation += mg->mg_fragmentation * - metaslab_group_get_space(mg); - } - fragmentation /= metaslab_class_get_space(mc); - - ASSERT3U(fragmentation, <=, 100); - spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); - return (fragmentation); -} - -/* - * Calculate the amount of expandable space that is available in - * this metaslab class. If a device is expanded then its expandable - * space will be the amount of allocatable space that is currently not - * part of this metaslab class. - */ -uint64_t -metaslab_class_expandable_space(metaslab_class_t *mc) -{ - vdev_t *rvd = mc->mc_spa->spa_root_vdev; - uint64_t space = 0; - - spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); - for (int c = 0; c < rvd->vdev_children; c++) { - uint64_t tspace; - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || - mg->mg_class != mc) { - continue; - } - - /* - * Calculate if we have enough space to add additional - * metaslabs. We report the expandable space in terms - * of the metaslab size since that's the unit of expansion. 
- * Adjust by efi system partition size. - */ - tspace = tvd->vdev_max_asize - tvd->vdev_asize; - if (tspace > mc->mc_spa->spa_bootsize) { - tspace -= mc->mc_spa->spa_bootsize; - } - space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); - } - spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); - return (space); -} - -static int -metaslab_compare(const void *x1, const void *x2) -{ - const metaslab_t *m1 = (const metaslab_t *)x1; - const metaslab_t *m2 = (const metaslab_t *)x2; - - int sort1 = 0; - int sort2 = 0; - if (m1->ms_allocator != -1 && m1->ms_primary) - sort1 = 1; - else if (m1->ms_allocator != -1 && !m1->ms_primary) - sort1 = 2; - if (m2->ms_allocator != -1 && m2->ms_primary) - sort2 = 1; - else if (m2->ms_allocator != -1 && !m2->ms_primary) - sort2 = 2; - - /* - * Sort inactive metaslabs first, then primaries, then secondaries. When - * selecting a metaslab to allocate from, an allocator first tries its - * primary, then secondary active metaslab. If it doesn't have active - * metaslabs, or can't allocate from them, it searches for an inactive - * metaslab to activate. If it can't find a suitable one, it will steal - * a primary or secondary metaslab from another allocator. - */ - if (sort1 < sort2) - return (-1); - if (sort1 > sort2) - return (1); - - int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); - if (likely(cmp)) - return (cmp); - - IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); - - return (AVL_CMP(m1->ms_start, m2->ms_start)); -} - -uint64_t -metaslab_allocated_space(metaslab_t *msp) -{ - return (msp->ms_allocated_space); -} - -/* - * Verify that the space accounting on disk matches the in-core range_trees. 
- */ -static void -metaslab_verify_space(metaslab_t *msp, uint64_t txg) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocating = 0; - uint64_t sm_free_space, msp_free_space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!msp->ms_condensing); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* - * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an - * allocated space map. Calling this in non-syncing context - * does not provide a consistent view of the metaslab since - * we're performing allocations in the future. - */ - if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || - !msp->ms_loaded) - return; - - /* - * Even though the smp_alloc field can get negative (e.g. - * see vdev_checkpoint_sm), that should never be the case - * when it come's to a metaslab's space map. - */ - ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); - - sm_free_space = msp->ms_size - metaslab_allocated_space(msp); - - /* - * Account for future allocations since we would have - * already deducted that space from the ms_allocatable. - */ - for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocating += - range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); - } - - ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); - - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); - - VERIFY3U(sm_free_space, ==, msp_free_space); -} - -/* - * ========================================================================== - * Metaslab groups - * ========================================================================== - */ -/* - * Update the allocatable flag and the metaslab group's capacity. 
- * The allocatable flag is set to true if the capacity is below - * the zfs_mg_noalloc_threshold or has a fragmentation value that is - * greater than zfs_mg_fragmentation_threshold. If a metaslab group - * transitions from allocatable to non-allocatable or vice versa then the - * metaslab group's class is updated to reflect the transition. - */ -static void -metaslab_group_alloc_update(metaslab_group_t *mg) -{ - vdev_t *vd = mg->mg_vd; - metaslab_class_t *mc = mg->mg_class; - vdev_stat_t *vs = &vd->vdev_stat; - boolean_t was_allocatable; - boolean_t was_initialized; - - ASSERT(vd == vd->vdev_top); - ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, - SCL_ALLOC); - - mutex_enter(&mg->mg_lock); - was_allocatable = mg->mg_allocatable; - was_initialized = mg->mg_initialized; - - mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / - (vs->vs_space + 1); - - mutex_enter(&mc->mc_lock); - - /* - * If the metaslab group was just added then it won't - * have any space until we finish syncing out this txg. - * At that point we will consider it initialized and available - * for allocations. We also don't consider non-activated - * metaslab groups (e.g. vdevs that are in the middle of being removed) - * to be initialized, because they can't be used for allocation. - */ - mg->mg_initialized = metaslab_group_initialized(mg); - if (!was_initialized && mg->mg_initialized) { - mc->mc_groups++; - } else if (was_initialized && !mg->mg_initialized) { - ASSERT3U(mc->mc_groups, >, 0); - mc->mc_groups--; - } - if (mg->mg_initialized) - mg->mg_no_free_space = B_FALSE; - - /* - * A metaslab group is considered allocatable if it has plenty - * of free space or is not heavily fragmented. We only take - * fragmentation into account if the metaslab group has a valid - * fragmentation metric (i.e. a value between 0 and 100). 
- */ - mg->mg_allocatable = (mg->mg_activation_count > 0 && - mg->mg_free_capacity > zfs_mg_noalloc_threshold && - (mg->mg_fragmentation == ZFS_FRAG_INVALID || - mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); - - /* - * The mc_alloc_groups maintains a count of the number of - * groups in this metaslab class that are still above the - * zfs_mg_noalloc_threshold. This is used by the allocating - * threads to determine if they should avoid allocations to - * a given group. The allocator will avoid allocations to a group - * if that group has reached or is below the zfs_mg_noalloc_threshold - * and there are still other groups that are above the threshold. - * When a group transitions from allocatable to non-allocatable or - * vice versa we update the metaslab class to reflect that change. - * When the mc_alloc_groups value drops to 0 that means that all - * groups have reached the zfs_mg_noalloc_threshold making all groups - * eligible for allocations. This effectively means that all devices - * are balanced again. 
- */ - if (was_allocatable && !mg->mg_allocatable) - mc->mc_alloc_groups--; - else if (!was_allocatable && mg->mg_allocatable) - mc->mc_alloc_groups++; - mutex_exit(&mc->mc_lock); - - mutex_exit(&mg->mg_lock); -} - -metaslab_group_t * -metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) -{ - metaslab_group_t *mg; - - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); - mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); - mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), - KM_SLEEP); - mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), - KM_SLEEP); - avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_vd = vd; - mg->mg_class = mc; - mg->mg_activation_count = 0; - mg->mg_initialized = B_FALSE; - mg->mg_no_free_space = B_TRUE; - mg->mg_allocators = allocators; - - mg->mg_alloc_queue_depth = kmem_zalloc(allocators * - sizeof (zfs_refcount_t), KM_SLEEP); - mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * - sizeof (uint64_t), KM_SLEEP); - for (int i = 0; i < allocators; i++) { - zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); - mg->mg_cur_max_alloc_queue_depth[i] = 0; - } - - mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); - - return (mg); -} - -void -metaslab_group_destroy(metaslab_group_t *mg) -{ - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); - /* - * We may have gone below zero with the activation count - * either because we never activated in the first place or - * because we're done, and possibly removing the vdev. 
- */ - ASSERT(mg->mg_activation_count <= 0); - - taskq_destroy(mg->mg_taskq); - avl_destroy(&mg->mg_metaslab_tree); - kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); - kmem_free(mg->mg_secondaries, mg->mg_allocators * - sizeof (metaslab_t *)); - mutex_destroy(&mg->mg_lock); - mutex_destroy(&mg->mg_ms_initialize_lock); - cv_destroy(&mg->mg_ms_initialize_cv); - - for (int i = 0; i < mg->mg_allocators; i++) { - zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); - mg->mg_cur_max_alloc_queue_depth[i] = 0; - } - kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * - sizeof (zfs_refcount_t)); - kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * - sizeof (uint64_t)); - - kmem_free(mg, sizeof (metaslab_group_t)); -} - -void -metaslab_group_activate(metaslab_group_t *mg) -{ - metaslab_class_t *mc = mg->mg_class; - metaslab_group_t *mgprev, *mgnext; - - ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); - - ASSERT(mc->mc_rotor != mg); - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); - ASSERT(mg->mg_activation_count <= 0); - - if (++mg->mg_activation_count <= 0) - return; - - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); - metaslab_group_alloc_update(mg); - - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - metaslab_class_minblocksize_update(mc); -} - -/* - * Passivate a metaslab group and remove it from the allocation rotor. - * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating - * a metaslab group. This function will momentarily drop spa_config_locks - * that are lower than the SCL_ALLOC lock (see comment below). 
- */ -void -metaslab_group_passivate(metaslab_group_t *mg) -{ - metaslab_class_t *mc = mg->mg_class; - spa_t *spa = mc->mc_spa; - metaslab_group_t *mgprev, *mgnext; - int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); - - ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, - (SCL_ALLOC | SCL_ZIO)); - - if (--mg->mg_activation_count != 0) { - ASSERT(mc->mc_rotor != mg); - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); - ASSERT(mg->mg_activation_count < 0); - return; - } - - /* - * The spa_config_lock is an array of rwlocks, ordered as - * follows (from highest to lowest): - * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > - * SCL_ZIO > SCL_FREE > SCL_VDEV - * (For more information about the spa_config_lock see spa_misc.c) - * The higher the lock, the broader its coverage. When we passivate - * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO - * config locks. However, the metaslab group's taskq might be trying - * to preload metaslabs so we must drop the SCL_ZIO lock and any - * lower locks to allow the I/O to complete. At a minimum, - * we continue to hold the SCL_ALLOC lock, which prevents any future - * allocations from taking place and any changes to the vdev tree. 
- */ - spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); - taskq_wait(mg->mg_taskq); - spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); - metaslab_group_alloc_update(mg); - for (int i = 0; i < mg->mg_allocators; i++) { - metaslab_t *msp = mg->mg_primaries[i]; - if (msp != NULL) { - mutex_enter(&msp->ms_lock); - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); - mutex_exit(&msp->ms_lock); - } - msp = mg->mg_secondaries[i]; - if (msp != NULL) { - mutex_enter(&msp->ms_lock); - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); - mutex_exit(&msp->ms_lock); - } - } - - mgprev = mg->mg_prev; - mgnext = mg->mg_next; - - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } - - mg->mg_prev = NULL; - mg->mg_next = NULL; - metaslab_class_minblocksize_update(mc); -} - -boolean_t -metaslab_group_initialized(metaslab_group_t *mg) -{ - vdev_t *vd = mg->mg_vd; - vdev_stat_t *vs = &vd->vdev_stat; - - return (vs->vs_space != 0 && mg->mg_activation_count > 0); -} - -uint64_t -metaslab_group_get_space(metaslab_group_t *mg) -{ - return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); -} - -void -metaslab_group_histogram_verify(metaslab_group_t *mg) -{ - uint64_t *mg_hist; - vdev_t *vd = mg->mg_vd; - uint64_t ashift = vd->vdev_ashift; - int i; - - if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) - return; - - mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, - KM_SLEEP); - - ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, - SPACE_MAP_HISTOGRAM_SIZE + ashift); - - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT(msp != NULL); - - /* skip if not active or not a member */ - if (msp->ms_sm == NULL || msp->ms_group != mg) - continue; - - for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) - mg_hist[i + ashift] += - msp->ms_sm->sm_phys->smp_histogram[i]; - } - - for (i = 0; i < 
RANGE_TREE_HISTOGRAM_SIZE; i ++) - VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); - - kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); -} - -static void -metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) -{ - metaslab_class_t *mc = mg->mg_class; - uint64_t ashift = mg->mg_vd->vdev_ashift; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (msp->ms_sm == NULL) - return; - - mutex_enter(&mg->mg_lock); - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - mg->mg_histogram[i + ashift] += - msp->ms_sm->sm_phys->smp_histogram[i]; - mc->mc_histogram[i + ashift] += - msp->ms_sm->sm_phys->smp_histogram[i]; - } - mutex_exit(&mg->mg_lock); -} - -void -metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) -{ - metaslab_class_t *mc = mg->mg_class; - uint64_t ashift = mg->mg_vd->vdev_ashift; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (msp->ms_sm == NULL) - return; - - mutex_enter(&mg->mg_lock); - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - ASSERT3U(mg->mg_histogram[i + ashift], >=, - msp->ms_sm->sm_phys->smp_histogram[i]); - ASSERT3U(mc->mc_histogram[i + ashift], >=, - msp->ms_sm->sm_phys->smp_histogram[i]); - - mg->mg_histogram[i + ashift] -= - msp->ms_sm->sm_phys->smp_histogram[i]; - mc->mc_histogram[i + ashift] -= - msp->ms_sm->sm_phys->smp_histogram[i]; - } - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) -{ - ASSERT(msp->ms_group == NULL); - mutex_enter(&mg->mg_lock); - msp->ms_group = mg; - msp->ms_weight = 0; - avl_add(&mg->mg_metaslab_tree, msp); - mutex_exit(&mg->mg_lock); - - mutex_enter(&msp->ms_lock); - metaslab_group_histogram_add(mg, msp); - mutex_exit(&msp->ms_lock); -} - -static void -metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) -{ - mutex_enter(&msp->ms_lock); - metaslab_group_histogram_remove(mg, msp); - mutex_exit(&msp->ms_lock); - - mutex_enter(&mg->mg_lock); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, 
msp); - msp->ms_group = NULL; - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) -{ - ASSERT(MUTEX_HELD(&mg->mg_lock)); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_weight = weight; - avl_add(&mg->mg_metaslab_tree, msp); - -} - -static void -metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) -{ - /* - * Although in principle the weight can be any value, in - * practice we do not use values in the range [1, 511]. - */ - ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - mutex_enter(&mg->mg_lock); - metaslab_group_sort_impl(mg, msp, weight); - mutex_exit(&mg->mg_lock); -} - -/* - * Calculate the fragmentation for a given metaslab group. We can use - * a simple average here since all metaslabs within the group must have - * the same size. The return value will be a value between 0 and 100 - * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this - * group have a fragmentation metric. - */ -uint64_t -metaslab_group_fragmentation(metaslab_group_t *mg) -{ - vdev_t *vd = mg->mg_vd; - uint64_t fragmentation = 0; - uint64_t valid_ms = 0; - - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - if (msp->ms_fragmentation == ZFS_FRAG_INVALID) - continue; - if (msp->ms_group != mg) - continue; - - valid_ms++; - fragmentation += msp->ms_fragmentation; - } - - if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) - return (ZFS_FRAG_INVALID); - - fragmentation /= valid_ms; - ASSERT3U(fragmentation, <=, 100); - return (fragmentation); -} - -/* - * Determine if a given metaslab group should skip allocations. 
A metaslab - * group should avoid allocations if its free capacity is less than the - * zfs_mg_noalloc_threshold or its fragmentation metric is greater than - * zfs_mg_fragmentation_threshold and there is at least one metaslab group - * that can still handle allocations. If the allocation throttle is enabled - * then we skip allocations to devices that have reached their maximum - * allocation queue depth unless the selected metaslab group is the only - * eligible group remaining. - */ -static boolean_t -metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize, int allocator, int d) -{ - spa_t *spa = mg->mg_vd->vdev_spa; - metaslab_class_t *mc = mg->mg_class; - - /* - * We can only consider skipping this metaslab group if it's - * in the normal metaslab class and there are other metaslab - * groups to select from. Otherwise, we always consider it eligible - * for allocations. - */ - if ((mc != spa_normal_class(spa) && - mc != spa_special_class(spa) && - mc != spa_dedup_class(spa)) || - mc->mc_groups <= 1) - return (B_TRUE); - - /* - * If the metaslab group's mg_allocatable flag is set (see comments - * in metaslab_group_alloc_update() for more information) and - * the allocation throttle is disabled then allow allocations to this - * device. However, if the allocation throttle is enabled then - * check if we have reached our allocation limit (mg_alloc_queue_depth) - * to determine if we should allow allocations to this metaslab group. - * If all metaslab groups are no longer considered allocatable - * (mc_alloc_groups == 0) or we're trying to allocate the smallest - * gang block size then we allow allocations on this metaslab group - * regardless of the mg_allocatable or throttle settings. 
- */ - if (mg->mg_allocatable) { - metaslab_group_t *mgp; - int64_t qdepth; - uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; - - if (!mc->mc_alloc_throttle_enabled) - return (B_TRUE); - - /* - * If this metaslab group does not have any free space, then - * there is no point in looking further. - */ - if (mg->mg_no_free_space) - return (B_FALSE); - - /* - * Relax allocation throttling for ditto blocks. Due to - * random imbalances in allocation it tends to push copies - * to one vdev, that looks a bit better at the moment. - */ - qmax = qmax * (4 + d) / 4; - - qdepth = zfs_refcount_count( - &mg->mg_alloc_queue_depth[allocator]); - - /* - * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt - * to allocate from it. - */ - if (qdepth < qmax || mc->mc_alloc_groups == 1) - return (B_TRUE); - ASSERT3U(mc->mc_alloc_groups, >, 1); - - /* - * Since this metaslab group is at or over its qmax, we - * need to determine if there are metaslab groups after this - * one that might be able to handle this allocation. This is - * racy since we can't hold the locks for all metaslab - * groups at the same time when we make this check. - */ - for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { - qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; - qmax = qmax * (4 + d) / 4; - qdepth = zfs_refcount_count( - &mgp->mg_alloc_queue_depth[allocator]); - - /* - * If there is another metaslab group that - * might be able to handle the allocation, then - * we return false so that we skip this group. - */ - if (qdepth < qmax && !mgp->mg_no_free_space) - return (B_FALSE); - } - - /* - * We didn't find another group to handle the allocation - * so we can't skip this metaslab group even though - * we are at or over our qmax. 
- */ - return (B_TRUE); - - } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * ========================================================================== - * Range tree callbacks - * ========================================================================== - */ - -/* - * Comparison function for the private size-ordered tree. Tree is sorted - * by size, larger sizes at the end of the tree. - */ -static int -metaslab_rangesize_compare(const void *x1, const void *x2) -{ - const range_seg_t *r1 = x1; - const range_seg_t *r2 = x2; - uint64_t rs_size1 = r1->rs_end - r1->rs_start; - uint64_t rs_size2 = r2->rs_end - r2->rs_start; - - int cmp = AVL_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(r1->rs_start, r2->rs_start)); -} - -/* - * ========================================================================== - * Common allocator routines - * ========================================================================== - */ - -/* - * Return the maximum contiguous segment within the metaslab. - */ -uint64_t -metaslab_block_maxsize(metaslab_t *msp) -{ - avl_tree_t *t = &msp->ms_allocatable_by_size; - range_seg_t *rs; - - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); - - return (rs->rs_end - rs->rs_start); -} - -static range_seg_t * -metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) -{ - range_seg_t *rs, rsearch; - avl_index_t where; - - rsearch.rs_start = start; - rsearch.rs_end = start + size; - - rs = avl_find(t, &rsearch, &where); - if (rs == NULL) { - rs = avl_nearest(t, where, AVL_AFTER); - } - - return (rs); -} - -/* - * This is a helper function that can be used by the allocator to find - * a suitable block to allocate. This will search the specified AVL - * tree looking for a block that matches the specified criteria. 
- */ -static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) -{ - range_seg_t *rs = metaslab_block_find(t, *cursor, size); - - while (rs != NULL) { - uint64_t offset = P2ROUNDUP(rs->rs_start, align); - - if (offset + size <= rs->rs_end) { - *cursor = offset + size; - return (offset); - } - rs = AVL_NEXT(t, rs); - } - - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. - */ - if (*cursor == 0) - return (-1ULL); - - *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); -} - -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. - */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_allocatable->rt_root; - - return (metaslab_block_picker(t, cursor, size, align)); -} - -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - -/* - * ========================================================================== - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. 
- * ========================================================================== - */ -static uint64_t -metaslab_df_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. - */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &rt->rt_root; - uint64_t max_size = metaslab_block_maxsize(msp); - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); - - if (max_size < size) - return (-1ULL); - - /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). - */ - if (max_size < metaslab_df_alloc_threshold || - free_pct < metaslab_df_free_pct) { - t = &msp->ms_allocatable_by_size; - *cursor = 0; - } - - return (metaslab_block_picker(t, cursor, size, 1ULL)); -} - -static metaslab_ops_t metaslab_df_ops = { - metaslab_df_alloc -}; - -/* - * ========================================================================== - * Cursor fit block allocator - - * Select the largest region in the metaslab, set the cursor to the beginning - * of the range and the cursor_end to the end of the range. As allocations - * are made advance the cursor. Continue allocating from the cursor until - * the range is exhausted and then find a new range. 
- * ========================================================================== - */ -static uint64_t -metaslab_cf_alloc(metaslab_t *msp, uint64_t size) -{ - range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; - uint64_t *cursor = &msp->ms_lbas[0]; - uint64_t *cursor_end = &msp->ms_lbas[1]; - uint64_t offset = 0; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); - - ASSERT3U(*cursor_end, >=, *cursor); - - if ((*cursor + size) > *cursor_end) { - range_seg_t *rs; - - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) - return (-1ULL); - - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; - } - - offset = *cursor; - *cursor += size; - - return (offset); -} - -static metaslab_ops_t metaslab_cf_ops = { - metaslab_cf_alloc -}; - -/* - * ========================================================================== - * New dynamic fit allocator - - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift - * contiguous blocks. If no region is found then just use the largest segment - * that remains. - * ========================================================================== - */ - -/* - * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) - * to request from the allocator. 
- */ -uint64_t metaslab_ndf_clump_shift = 4; - -static uint64_t -metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) -{ - avl_tree_t *t = &msp->ms_allocatable->rt_root; - avl_index_t where; - range_seg_t *rs, rsearch; - uint64_t hbit = highbit64(size); - uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); - - if (max_size < size) - return (-1ULL); - - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; - - rs = avl_find(t, &rsearch, &where); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { - t = &msp->ms_allocatable_by_size; - - rsearch.rs_start = 0; - rsearch.rs_end = MIN(max_size, - 1ULL << (hbit + metaslab_ndf_clump_shift)); - rs = avl_find(t, &rsearch, &where); - if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); - ASSERT(rs != NULL); - } - - if ((rs->rs_end - rs->rs_start) >= size) { - *cursor = rs->rs_start + size; - return (rs->rs_start); - } - return (-1ULL); -} - -static metaslab_ops_t metaslab_ndf_ops = { - metaslab_ndf_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; - -/* - * ========================================================================== - * Metaslabs - * ========================================================================== - */ - -static void -metaslab_aux_histograms_clear(metaslab_t *msp) -{ - /* - * Auxiliary histograms are only cleared when resetting them, - * which can only happen while the metaslab is loaded. - */ - ASSERT(msp->ms_loaded); - - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); - for (int t = 0; t < TXG_DEFER_SIZE; t++) - bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); -} - -static void -metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, - range_tree_t *rt) -{ - /* - * This is modeled after space_map_histogram_add(), so refer to that - * function for implementation details. 
We want this to work like - * the space map histogram, and not the range tree histogram, as we - * are essentially constructing a delta that will be later subtracted - * from the space map histogram. - */ - int idx = 0; - for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - ASSERT3U(i, >=, idx + shift); - histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); - - if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { - ASSERT3U(idx + shift, ==, i); - idx++; - ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); - } - } -} - -/* - * Called at every sync pass that the metaslab gets synced. - * - * The reason is that we want our auxiliary histograms to be updated - * wherever the metaslab's space map histogram is updated. This way - * we stay consistent on which parts of the metaslab space map's - * histogram are currently not available for allocations (e.g because - * they are in the defer, freed, and freeing trees). - */ -static void -metaslab_aux_histograms_update(metaslab_t *msp) -{ - space_map_t *sm = msp->ms_sm; - ASSERT(sm != NULL); - - /* - * This is similar to the metaslab's space map histogram updates - * that take place in metaslab_sync(). The only difference is that - * we only care about segments that haven't made it into the - * ms_allocatable tree yet. - */ - if (msp->ms_loaded) { - metaslab_aux_histograms_clear(msp); - - metaslab_aux_histogram_add(msp->ms_synchist, - sm->sm_shift, msp->ms_freed); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - metaslab_aux_histogram_add(msp->ms_deferhist[t], - sm->sm_shift, msp->ms_defer[t]); - } - } - - metaslab_aux_histogram_add(msp->ms_synchist, - sm->sm_shift, msp->ms_freeing); -} - -/* - * Called every time we are done syncing (writing to) the metaslab, - * i.e. at the end of each sync pass. 
- * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] - */ -static void -metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - space_map_t *sm = msp->ms_sm; - - if (sm == NULL) { - /* - * We came here from metaslab_init() when creating/opening a - * pool, looking at a metaslab that hasn't had any allocations - * yet. - */ - return; - } - - /* - * This is similar to the actions that we take for the ms_freed - * and ms_defer trees in metaslab_sync_done(). - */ - uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; - if (defer_allowed) { - bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], - sizeof (msp->ms_synchist)); - } else { - bzero(msp->ms_deferhist[hist_index], - sizeof (msp->ms_deferhist[hist_index])); - } - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); -} - -/* - * Ensure that the metaslab's weight and fragmentation are consistent - * with the contents of the histogram (either the range tree's histogram - * or the space map's depending whether the metaslab is loaded). - */ -static void -metaslab_verify_weight_and_frag(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* see comment in metaslab_verify_unflushed_changes() */ - if (msp->ms_group == NULL) - return; - - /* - * Devices being removed always return a weight of 0 and leave - * fragmentation and ms_max_size as is - there is nothing for - * us to verify here. - */ - vdev_t *vd = msp->ms_group->mg_vd; - if (vd->vdev_removing) - return; - - /* - * If the metaslab is dirty it probably means that we've done - * some allocations or frees that have changed our histograms - * and thus the weight. - */ - for (int t = 0; t < TXG_SIZE; t++) { - if (txg_list_member(&vd->vdev_ms_list, msp, t)) - return; - } - - /* - * This verification checks that our in-memory state is consistent - * with what's on disk. 
If the pool is read-only then there aren't - * any changes and we just have the initially-loaded state. - */ - if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) - return; - - /* some extra verification for in-core tree if you can */ - if (msp->ms_loaded) { - range_tree_stat_verify(msp->ms_allocatable); - VERIFY(space_map_histogram_verify(msp->ms_sm, - msp->ms_allocatable)); - } - - uint64_t weight = msp->ms_weight; - uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); - uint64_t frag = msp->ms_fragmentation; - uint64_t max_segsize = msp->ms_max_size; - - msp->ms_weight = 0; - msp->ms_fragmentation = 0; - msp->ms_max_size = 0; - - /* - * This function is used for verification purposes. Regardless of - * whether metaslab_weight() thinks this metaslab should be active or - * not, we want to ensure that the actual weight (and therefore the - * value of ms_weight) would be the same if it was to be recalculated - * at this point. - */ - msp->ms_weight = metaslab_weight(msp) | was_active; - - VERIFY3U(max_segsize, ==, msp->ms_max_size); - - /* - * If the weight type changed then there is no point in doing - * verification. Revert fields to their original values. - */ - if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || - (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { - msp->ms_fragmentation = frag; - msp->ms_weight = weight; - return; - } - - VERIFY3U(msp->ms_fragmentation, ==, frag); - VERIFY3U(msp->ms_weight, ==, weight); -} - -/* - * Wait for any in-progress metaslab loads to complete. 
- */ -static void -metaslab_load_wait(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - while (msp->ms_loading) { - ASSERT(!msp->ms_loaded); - cv_wait(&msp->ms_load_cv, &msp->ms_lock); - } -} - -static int -metaslab_load_impl(metaslab_t *msp) -{ - int error = 0; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(msp->ms_loading); - ASSERT(!msp->ms_condensing); - - /* - * We temporarily drop the lock to unblock other operations while we - * are reading the space map. Therefore, metaslab_sync() and - * metaslab_sync_done() can run at the same time as we do. - * - * metaslab_sync() can append to the space map while we are loading. - * Therefore we load only entries that existed when we started the - * load. Additionally, metaslab_sync_done() has to wait for the load - * to complete because there are potential races like metaslab_load() - * loading parts of the space map that are currently being appended - * by metaslab_sync(). If we didn't, the ms_allocatable would have - * entries that metaslab_sync_done() would try to re-add later. - * - * That's why before dropping the lock we remember the synced length - * of the metaslab and read up to that point of the space map, - * ignoring entries appended by metaslab_sync() that happen after we - * drop the lock. - */ - uint64_t length = msp->ms_synced_length; - mutex_exit(&msp->ms_lock); - - if (msp->ms_sm != NULL) { - error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, - SM_FREE, length); - } else { - /* - * The space map has not been allocated yet, so treat - * all the space in the metaslab as free and add it to the - * ms_allocatable tree. - */ - range_tree_add(msp->ms_allocatable, - msp->ms_start, msp->ms_size); - } - - /* - * We need to grab the ms_sync_lock to prevent metaslab_sync() from - * changing the ms_sm and the metaslab's range trees while we are - * about to use them and populate the ms_allocatable. 
The ms_lock - * is insufficient for this because metaslab_sync() doesn't hold - * the ms_lock while writing the ms_checkpointing tree to disk. - */ - mutex_enter(&msp->ms_sync_lock); - mutex_enter(&msp->ms_lock); - ASSERT(!msp->ms_condensing); - - if (error != 0) { - mutex_exit(&msp->ms_sync_lock); - return (error); - } - - ASSERT3P(msp->ms_group, !=, NULL); - msp->ms_loaded = B_TRUE; - - /* - * The ms_allocatable contains the segments that exist in the - * ms_defer trees [see ms_synced_length]. Thus we need to remove - * them from ms_allocatable as they will be added again in - * metaslab_sync_done(). - */ - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, msp->ms_allocatable); - } - - /* - * Call metaslab_recalculate_weight_and_sort() now that the - * metaslab is loaded so we get the metaslab's real weight. - * - * Unless this metaslab was created with older software and - * has not yet been converted to use segment-based weight, we - * expect the new weight to be better or equal to the weight - * that the metaslab had while it was not loaded. This is - * because the old weight does not take into account the - * consolidation of adjacent segments between TXGs. [see - * comment for ms_synchist and ms_deferhist[] for more info] - */ - uint64_t weight = msp->ms_weight; - metaslab_recalculate_weight_and_sort(msp); - if (!WEIGHT_IS_SPACEBASED(weight)) - ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); - - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - metaslab_verify_space(msp, spa_syncing_txg(spa)); - mutex_exit(&msp->ms_sync_lock); - - return (0); -} - -int -metaslab_load(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * There may be another thread loading the same metaslab, if that's - * the case just wait until the other thread is done and return. 
- */ - metaslab_load_wait(msp); - if (msp->ms_loaded) - return (0); - VERIFY(!msp->ms_loading); - ASSERT(!msp->ms_condensing); - - msp->ms_loading = B_TRUE; - int error = metaslab_load_impl(msp); - msp->ms_loading = B_FALSE; - cv_broadcast(&msp->ms_load_cv); - - return (error); -} - -void -metaslab_unload(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - metaslab_verify_weight_and_frag(msp); - - range_tree_vacate(msp->ms_allocatable, NULL, NULL); - msp->ms_loaded = B_FALSE; - - msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; - - /* - * We explicitly recalculate the metaslab's weight based on its space - * map (as it is now not loaded). We want unload metaslabs to always - * have their weights calculated from the space map histograms, while - * loaded ones have it calculated from their in-core range tree - * [see metaslab_load()]. This way, the weight reflects the information - * available in-core, whether it is loaded or not - * - * If ms_group == NULL means that we came here from metaslab_fini(), - * at which point it doesn't make sense for us to do the recalculation - * and the sorting. 
- */ - if (msp->ms_group != NULL) - metaslab_recalculate_weight_and_sort(msp); -} - -static void -metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, - int64_t defer_delta, int64_t space_delta) -{ - vdev_space_update(vd, alloc_delta, defer_delta, space_delta); - - ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, - vdev_deflated_space(vd, space_delta)); -} - -int -metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, - metaslab_t **msp) -{ - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - metaslab_t *ms; - int error; - - ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); - mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); - - ms->ms_id = id; - ms->ms_start = id << vd->vdev_ms_shift; - ms->ms_size = 1ULL << vd->vdev_ms_shift; - ms->ms_allocator = -1; - ms->ms_new = B_TRUE; - - /* - * We only open space map objects that already exist. All others - * will be opened when we finally allocate an object for it. - * - * Note: - * When called from vdev_expand(), we can't call into the DMU as - * we are holding the spa_config_lock as a writer and we would - * deadlock [see relevant comment in vdev_metaslab_init()]. in - * that case, the object parameter is zero though, so we won't - * call into the DMU. 
- */ - if (object != 0) { - error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, - ms->ms_size, vd->vdev_ashift); - - if (error != 0) { - kmem_free(ms, sizeof (metaslab_t)); - return (error); - } - - ASSERT(ms->ms_sm != NULL); - ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); - ms->ms_allocated_space = space_map_allocated(ms->ms_sm); - } - - /* - * We create the ms_allocatable here, but we don't create the - * other range trees until metaslab_sync_done(). This serves - * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that - * we'd data fault on any attempt to use this metaslab before - * it's ready. - */ - ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size, - metaslab_rangesize_compare, 0); - metaslab_group_add(mg, ms); - - metaslab_set_fragmentation(ms); - - /* - * If we're opening an existing pool (txg == 0) or creating - * a new one (txg == TXG_INITIAL), all space is available now. - * If we're adding space to an existing pool, the new space - * does not become available until after this txg has synced. - * The metaslab's weight will also be initialized when we sync - * out this txg. This ensures that we don't attempt to allocate - * from it before we have initialized it completely. - */ - if (txg <= TXG_INITIAL) { - metaslab_sync_done(ms, 0); - metaslab_space_update(vd, mg->mg_class, - metaslab_allocated_space(ms), 0, 0); - } - - /* - * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the space map - * so that we can verify frees. 
- */ - if (metaslab_debug_load && ms->ms_sm != NULL) { - mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms)); - mutex_exit(&ms->ms_lock); - } - - if (txg != 0) { - vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, ms, txg); - } - - *msp = ms; - - return (0); -} - -void -metaslab_fini(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - - metaslab_group_remove(mg, msp); - - mutex_enter(&msp->ms_lock); - VERIFY(msp->ms_group == NULL); - metaslab_space_update(vd, mg->mg_class, - -metaslab_allocated_space(msp), 0, -msp->ms_size); - - space_map_close(msp->ms_sm); - - metaslab_unload(msp); - - range_tree_destroy(msp->ms_allocatable); - range_tree_destroy(msp->ms_freeing); - range_tree_destroy(msp->ms_freed); - - for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_allocating[t]); - } - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defer[t]); - } - ASSERT0(msp->ms_deferspace); - - range_tree_destroy(msp->ms_checkpointing); - - for (int t = 0; t < TXG_SIZE; t++) - ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); - - mutex_exit(&msp->ms_lock); - cv_destroy(&msp->ms_load_cv); - mutex_destroy(&msp->ms_lock); - mutex_destroy(&msp->ms_sync_lock); - ASSERT3U(msp->ms_allocator, ==, -1); - - kmem_free(msp, sizeof (metaslab_t)); -} - -#define FRAGMENTATION_TABLE_SIZE 17 - -/* - * This table defines a segment size based fragmentation metric that will - * allow each metaslab to derive its own fragmentation value. This is done - * by calculating the space in each bucket of the spacemap histogram and - * multiplying that by the fragmentation metric in this table. Doing - * this for all buckets and dividing it by the total amount of free - * space in this metaslab (i.e. the total free space in all buckets) gives - * us the fragmentation metric. This means that a high fragmentation metric - * equates to most of the free space being comprised of small segments. 
- * Conversely, if the metric is low, then most of the free space is in - * large segments. A 10% change in fragmentation equates to approximately - * double the number of segments. - * - * This table defines 0% fragmented space using 16MB segments. Testing has - * shown that segments that are greater than or equal to 16MB do not suffer - * from drastic performance problems. Using this value, we derive the rest - * of the table. Since the fragmentation value is never stored on disk, it - * is possible to change these calculations in the future. - */ -int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { - 100, /* 512B */ - 100, /* 1K */ - 98, /* 2K */ - 95, /* 4K */ - 90, /* 8K */ - 80, /* 16K */ - 70, /* 32K */ - 60, /* 64K */ - 50, /* 128K */ - 40, /* 256K */ - 30, /* 512K */ - 20, /* 1M */ - 15, /* 2M */ - 10, /* 4M */ - 5, /* 8M */ - 0 /* 16M */ -}; - -/* - * Calculate the metaslab's fragmentation metric and set ms_fragmentation. - * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not - * been upgraded and does not support this metric. Otherwise, the return - * value should be in the range [0, 100]. - */ -static void -metaslab_set_fragmentation(metaslab_t *msp) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t fragmentation = 0; - uint64_t total = 0; - boolean_t feature_enabled = spa_feature_is_enabled(spa, - SPA_FEATURE_SPACEMAP_HISTOGRAM); - - if (!feature_enabled) { - msp->ms_fragmentation = ZFS_FRAG_INVALID; - return; - } - - /* - * A null space map means that the entire metaslab is free - * and thus is not fragmented. - */ - if (msp->ms_sm == NULL) { - msp->ms_fragmentation = 0; - return; - } - - /* - * If this metaslab's space map has not been upgraded, flag it - * so that we upgrade next time we encounter it. 
- */ - if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { - uint64_t txg = spa_syncing_txg(spa); - vdev_t *vd = msp->ms_group->mg_vd; - - /* - * If we've reached the final dirty txg, then we must - * be shutting down the pool. We don't want to dirty - * any data past this point so skip setting the condense - * flag. We can retry this action the next time the pool - * is imported. - */ - if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { - msp->ms_condense_wanted = B_TRUE; - vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); - zfs_dbgmsg("txg %llu, requesting force condense: " - "ms_id %llu, vdev_id %llu", txg, msp->ms_id, - vd->vdev_id); - } - msp->ms_fragmentation = ZFS_FRAG_INVALID; - return; - } - - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - uint64_t space = 0; - uint8_t shift = msp->ms_sm->sm_shift; - - int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, - FRAGMENTATION_TABLE_SIZE - 1); - - if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) - continue; - - space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); - total += space; - - ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); - fragmentation += space * zfs_frag_table[idx]; - } - - if (total > 0) - fragmentation /= total; - ASSERT3U(fragmentation, <=, 100); - - msp->ms_fragmentation = fragmentation; -} - -/* - * Compute a weight -- a selection preference value -- for the given metaslab. - * This is based on the amount of free space, the level of fragmentation, - * the LBA range, and whether the metaslab is loaded. - */ -static uint64_t -metaslab_space_weight(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - uint64_t weight, space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!vd->vdev_removing); - - /* - * The baseline weight is the metaslab's free space. 
- */ - space = msp->ms_size - metaslab_allocated_space(msp); - - if (metaslab_fragmentation_factor_enabled && - msp->ms_fragmentation != ZFS_FRAG_INVALID) { - /* - * Use the fragmentation information to inversely scale - * down the baseline weight. We need to ensure that we - * don't exclude this metaslab completely when it's 100% - * fragmented. To avoid this we reduce the fragmented value - * by 1. - */ - space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; - - /* - * If space < SPA_MINBLOCKSIZE, then we will not allocate from - * this metaslab again. The fragmentation metric may have - * decreased the space to something smaller than - * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE - * so that we can consume any remaining space. - */ - if (space > 0 && space < SPA_MINBLOCKSIZE) - space = SPA_MINBLOCKSIZE; - } - weight = space; - - /* - * Modern disks have uniform bit density and constant angular velocity. - * Therefore, the outer recording zones are faster (higher bandwidth) - * than the inner zones by the ratio of outer to inner track diameter, - * which is typically around 2:1. We account for this by assigning - * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). - * In effect, this means that we'll select the metaslab with the most - * free bandwidth rather than simply the one with the most free space. - */ - if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { - weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; - ASSERT(weight >= space && weight <= 2 * space); - } - - /* - * If this metaslab is one we're actively using, adjust its - * weight to make it preferable to any inactive metaslab so - * we'll polish it off. If the fragmentation on this metaslab - * has exceed our threshold, then don't mark it active. 
- */ - if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && - msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); - } - - WEIGHT_SET_SPACEBASED(weight); - return (weight); -} - -/* - * Return the weight of the specified metaslab, according to the segment-based - * weighting algorithm. The metaslab must be loaded. This function can - * be called within a sync pass since it relies only on the metaslab's - * range tree which is always accurate when the metaslab is loaded. - */ -static uint64_t -metaslab_weight_from_range_tree(metaslab_t *msp) -{ - uint64_t weight = 0; - uint32_t segments = 0; - - ASSERT(msp->ms_loaded); - - for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; - i--) { - uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; - int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; - - segments <<= 1; - segments += msp->ms_allocatable->rt_histogram[i]; - - /* - * The range tree provides more precision than the space map - * and must be downgraded so that all values fit within the - * space map's histogram. This allows us to compare loaded - * vs. unloaded metaslabs to determine which metaslab is - * considered "best". - */ - if (i > max_idx) - continue; - - if (segments != 0) { - WEIGHT_SET_COUNT(weight, segments); - WEIGHT_SET_INDEX(weight, i); - WEIGHT_SET_ACTIVE(weight, 0); - break; - } - } - return (weight); -} - -/* - * Calculate the weight based on the on-disk histogram. This should only - * be called after a sync pass has completely finished since the on-disk - * information is updated in metaslab_sync(). 
- */ -static uint64_t -metaslab_weight_from_spacemap(metaslab_t *msp) -{ - space_map_t *sm = msp->ms_sm; - ASSERT(!msp->ms_loaded); - ASSERT(sm != NULL); - ASSERT3U(space_map_object(sm), !=, 0); - ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); - - /* - * Create a joint histogram from all the segments that have made - * it to the metaslab's space map histogram, that are not yet - * available for allocation because they are still in the freeing - * pipeline (e.g. freeing, freed, and defer trees). Then subtract - * these segments from the space map's histogram to get a more - * accurate weight. - */ - uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) - deferspace_histogram[i] += msp->ms_synchist[i]; - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - deferspace_histogram[i] += msp->ms_deferhist[t][i]; - } - } - - uint64_t weight = 0; - for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { - ASSERT3U(sm->sm_phys->smp_histogram[i], >=, - deferspace_histogram[i]); - uint64_t count = - sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; - if (count != 0) { - WEIGHT_SET_COUNT(weight, count); - WEIGHT_SET_INDEX(weight, i + sm->sm_shift); - WEIGHT_SET_ACTIVE(weight, 0); - break; - } - } - return (weight); -} - -/* - * Compute a segment-based weight for the specified metaslab. The weight - * is determined by highest bucket in the histogram. The information - * for the highest bucket is encoded into the weight value. - */ -static uint64_t -metaslab_segment_weight(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - uint64_t weight = 0; - uint8_t shift = mg->mg_vd->vdev_ashift; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * The metaslab is completely free. 
- */ - if (metaslab_allocated_space(msp) == 0) { - int idx = highbit64(msp->ms_size) - 1; - int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; - - if (idx < max_idx) { - WEIGHT_SET_COUNT(weight, 1ULL); - WEIGHT_SET_INDEX(weight, idx); - } else { - WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); - WEIGHT_SET_INDEX(weight, max_idx); - } - WEIGHT_SET_ACTIVE(weight, 0); - ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - - return (weight); - } - - ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); - - /* - * If the metaslab is fully allocated then just make the weight 0. - */ - if (metaslab_allocated_space(msp) == msp->ms_size) - return (0); - /* - * If the metaslab is already loaded, then use the range tree to - * determine the weight. Otherwise, we rely on the space map information - * to generate the weight. - */ - if (msp->ms_loaded) { - weight = metaslab_weight_from_range_tree(msp); - } else { - weight = metaslab_weight_from_spacemap(msp); - } - - /* - * If the metaslab was active the last time we calculated its weight - * then keep it active. We want to consume the entire region that - * is associated with this weight. - */ - if (msp->ms_activation_weight != 0 && weight != 0) - WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); - return (weight); -} - -/* - * Determine if we should attempt to allocate from this metaslab. If the - * metaslab has a maximum size then we can quickly determine if the desired - * allocation size can be satisfied. Otherwise, if we're using segment-based - * weighting then we can determine the maximum allocation that this metaslab - * can accommodate based on the index encoded in the weight. If we're using - * space-based weights then rely on the entire weight (excluding the weight - * type bit). 
- */ -boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) -{ - boolean_t should_allocate; - - if (msp->ms_max_size != 0) - return (msp->ms_max_size >= asize); - - if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - /* - * The metaslab segment weight indicates segments in the - * range [2^i, 2^(i+1)), where i is the index in the weight. - * Since the asize might be in the middle of the range, we - * should attempt the allocation if asize < 2^(i+1). - */ - should_allocate = (asize < - 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); - } else { - should_allocate = (asize <= - (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); - } - return (should_allocate); -} - -static uint64_t -metaslab_weight(metaslab_t *msp) -{ - vdev_t *vd = msp->ms_group->mg_vd; - spa_t *spa = vd->vdev_spa; - uint64_t weight; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * If this vdev is in the process of being removed, there is nothing - * for us to do here. - */ - if (vd->vdev_removing) - return (0); - - metaslab_set_fragmentation(msp); - - /* - * Update the maximum size if the metaslab is loaded. This will - * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. - */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); - - /* - * Segment-based weighting requires space map histogram support. - */ - if (zfs_metaslab_segment_weight_enabled && - spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && - (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == - sizeof (space_map_phys_t))) { - weight = metaslab_segment_weight(msp); - } else { - weight = metaslab_space_weight(msp); - } - return (weight); -} - -void -metaslab_recalculate_weight_and_sort(metaslab_t *msp) -{ - /* note: we preserve the mask (e.g. indication of primary, etc..) 
*/ - uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - metaslab_group_sort(msp->ms_group, msp, - metaslab_weight(msp) | was_active); -} - -static int -metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, - int allocator, uint64_t activation_weight) -{ - /* - * If we're activating for the claim code, we don't want to actually - * set the metaslab up for a specific allocator. - */ - if (activation_weight == METASLAB_WEIGHT_CLAIM) - return (0); - metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? - mg->mg_primaries : mg->mg_secondaries); - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - mutex_enter(&mg->mg_lock); - if (arr[allocator] != NULL) { - mutex_exit(&mg->mg_lock); - return (EEXIST); - } - - arr[allocator] = msp; - ASSERT3S(msp->ms_allocator, ==, -1); - msp->ms_allocator = allocator; - msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); - mutex_exit(&mg->mg_lock); - - return (0); -} - -static int -metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = metaslab_load(msp); - if (error != 0) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); - } - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { - /* - * The metaslab was activated for another allocator - * while we were waiting, we should reselect. 
- */ - return (EBUSY); - } - if ((error = metaslab_activate_allocator(msp->ms_group, msp, - allocator, activation_weight)) != 0) { - return (error); - } - - msp->ms_activation_weight = msp->ms_weight; - metaslab_group_sort(msp->ms_group, msp, - msp->ms_weight | activation_weight); - } - ASSERT(msp->ms_loaded); - ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); - - return (0); -} - -static void -metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, - uint64_t weight) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { - metaslab_group_sort(mg, msp, weight); - return; - } - - mutex_enter(&mg->mg_lock); - ASSERT3P(msp->ms_group, ==, mg); - if (msp->ms_primary) { - ASSERT3U(0, <=, msp->ms_allocator); - ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); - ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); - ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); - mg->mg_primaries[msp->ms_allocator] = NULL; - } else { - ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); - ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); - mg->mg_secondaries[msp->ms_allocator] = NULL; - } - msp->ms_allocator = -1; - metaslab_group_sort_impl(mg, msp, weight); - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_passivate(metaslab_t *msp, uint64_t weight) -{ - uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; - - /* - * If size < SPA_MINBLOCKSIZE, then we will not allocate from - * this metaslab again. In that case, it had better be empty, - * or we would be leaving space on the table. 
- */ - ASSERT(size >= SPA_MINBLOCKSIZE || - range_tree_is_empty(msp->ms_allocatable)); - ASSERT0(weight & METASLAB_ACTIVE_MASK); - - msp->ms_activation_weight = 0; - metaslab_passivate_allocator(msp->ms_group, msp, weight); - ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); -} - -/* - * Segment-based metaslabs are activated once and remain active until - * we either fail an allocation attempt (similar to space-based metaslabs) - * or have exhausted the free space in zfs_metaslab_switch_threshold - * buckets since the metaslab was activated. This function checks to see - * if we've exhaused the zfs_metaslab_switch_threshold buckets in the - * metaslab and passivates it proactively. This will allow us to select a - * metaslabs with larger contiguous region if any remaining within this - * metaslab group. If we're in sync pass > 1, then we continue using this - * metaslab so that we don't dirty more block and cause more sync passes. - */ -void -metaslab_segment_may_passivate(metaslab_t *msp) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - - if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) - return; - - /* - * Since we are in the middle of a sync pass, the most accurate - * information that is accessible to us is the in-core range tree - * histogram; calculate the new weight based on that information. 
- */ - uint64_t weight = metaslab_weight_from_range_tree(msp); - int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); - int current_idx = WEIGHT_GET_INDEX(weight); - - if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) - metaslab_passivate(msp, weight); -} - -static void -metaslab_preload(void *arg) -{ - metaslab_t *msp = arg; - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - - ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); - - mutex_enter(&msp->ms_lock); - (void) metaslab_load(msp); - msp->ms_selected_txg = spa_syncing_txg(spa); - mutex_exit(&msp->ms_lock); -} - -static void -metaslab_group_preload(metaslab_group_t *mg) -{ - spa_t *spa = mg->mg_vd->vdev_spa; - metaslab_t *msp; - avl_tree_t *t = &mg->mg_metaslab_tree; - int m = 0; - - if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait(mg->mg_taskq); - return; - } - - mutex_enter(&mg->mg_lock); - - /* - * Load the next potential metaslabs - */ - for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { - ASSERT3P(msp->ms_group, ==, mg); - - /* - * We preload only the maximum number of metaslabs specified - * by metaslab_preload_limit. If a metaslab is being forced - * to condense then we preload it too. This will ensure - * that force condensing happens in the next txg. - */ - if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { - continue; - } - - VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, - msp, TQ_SLEEP) != 0); - } - mutex_exit(&mg->mg_lock); -} - -/* - * Determine if the space map's on-disk footprint is past our tolerance - * for inefficiency. We would like to use the following criteria to make - * our decision: - * - * 1. The size of the space map object should not dramatically increase as a - * result of writing out the free space range tree. - * - * 2. The minimal on-disk space map representation is zfs_condense_pct/100 - * times the size than the free space range tree representation - * (i.e. 
zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). - * - * 3. The on-disk size of the space map should actually decrease. - * - * Unfortunately, we cannot compute the on-disk size of the space map in this - * context because we cannot accurately compute the effects of compression, etc. - * Instead, we apply the heuristic described in the block comment for - * zfs_metaslab_condense_block_threshold - we only condense if the space used - * is greater than a threshold number of blocks. - */ -static boolean_t -metaslab_should_condense(metaslab_t *msp) -{ - space_map_t *sm = msp->ms_sm; - vdev_t *vd = msp->ms_group->mg_vd; - uint64_t vdev_blocksize = 1 << vd->vdev_ashift; - uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(msp->ms_loaded); - - /* - * Allocations and frees in early passes are generally more space - * efficient (in terms of blocks described in space map entries) - * than the ones in later passes (e.g. we don't compress after - * sync pass 5) and condensing a metaslab multiple times in a txg - * could degrade performance. - * - * Thus we prefer condensing each metaslab at most once every txg at - * the earliest sync pass possible. If a metaslab is eligible for - * condensing again after being considered for condensing within the - * same txg, it will hopefully be dirty in the next txg where it will - * be condensed at an earlier pass. - */ - if (msp->ms_condense_checked_txg == current_txg) - return (B_FALSE); - msp->ms_condense_checked_txg = current_txg; - - /* - * We always condense metaslabs that are empty and metaslabs for - * which a condense request has been made. 
- */ - if (avl_is_empty(&msp->ms_allocatable_by_size) || - msp->ms_condense_wanted) - return (B_TRUE); - - uint64_t object_size = space_map_length(msp->ms_sm); - uint64_t optimal_size = space_map_estimate_optimal_size(sm, - msp->ms_allocatable, SM_NO_VDEVID); - - dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); - uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - - return (object_size >= (optimal_size * zfs_condense_pct / 100) && - object_size > zfs_metaslab_condense_block_threshold * record_size); -} - -/* - * Condense the on-disk space map representation to its minimized form. - * The minimized form consists of a small number of allocations followed by - * the entries of the free range tree. - */ -static void -metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) -{ - range_tree_t *condense_tree; - space_map_t *sm = msp->ms_sm; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(msp->ms_loaded); - - zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " - "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, - msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, - msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_allocatable->rt_root), - msp->ms_condense_wanted ? "TRUE" : "FALSE"); - - msp->ms_condense_wanted = B_FALSE; - - /* - * Create an range tree that is 100% allocated. We remove segments - * that have been freed in this txg, any deferred frees that exist, - * and any allocation in the future. Removing segments should be - * a relatively inexpensive operation since we expect these trees to - * have a small number of nodes. 
- */ - condense_tree = range_tree_create(NULL, NULL); - range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - - range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); - range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, condense_tree); - } - - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], - range_tree_remove, condense_tree); - } - - /* - * We're about to drop the metaslab's lock thus allowing - * other consumers to change it's content. Set the - * metaslab's ms_condensing flag to ensure that - * allocations on this metaslab do not occur while we're - * in the middle of committing it to disk. This is only critical - * for ms_allocatable as all other range trees use per txg - * views of their content. - */ - msp->ms_condensing = B_TRUE; - - mutex_exit(&msp->ms_lock); - space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); - - /* - * While we would ideally like to create a space map representation - * that consists only of allocation records, doing so can be - * prohibitively expensive because the in-core free tree can be - * large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we sync out two trees, a cheap - * allocation only tree followed by the in-core free tree. While not - * optimal, this is typically close to optimal, and much cheaper to - * compute. - */ - space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(condense_tree, NULL, NULL); - range_tree_destroy(condense_tree); - - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); - mutex_enter(&msp->ms_lock); - msp->ms_condensing = B_FALSE; -} - -/* - * Write a metaslab to disk in the context of the specified transaction group. 
- */ -void -metaslab_sync(metaslab_t *msp, uint64_t txg) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa_meta_objset(spa); - range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; - dmu_tx_t *tx; - uint64_t object = space_map_object(msp->ms_sm); - - ASSERT(!vd->vdev_ishole); - - /* - * This metaslab has just been added so there's no work to do now. - */ - if (msp->ms_freeing == NULL) { - ASSERT3P(alloctree, ==, NULL); - return; - } - - ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeing, !=, NULL); - ASSERT3P(msp->ms_freed, !=, NULL); - ASSERT3P(msp->ms_checkpointing, !=, NULL); - - /* - * Normally, we don't want to process a metaslab if there are no - * allocations or frees to perform. However, if the metaslab is being - * forced to condense and it's loaded, we need to let it through. - */ - if (range_tree_is_empty(alloctree) && - range_tree_is_empty(msp->ms_freeing) && - range_tree_is_empty(msp->ms_checkpointing) && - !(msp->ms_loaded && msp->ms_condense_wanted)) - return; - - - VERIFY(txg <= spa_final_dirty_txg(spa)); - - /* - * The only state that can actually be changing concurrently - * with metaslab_sync() is the metaslab's ms_allocatable. No - * other thread can be modifying this txg's alloc, freeing, - * freed, or space_map_phys_t. We drop ms_lock whenever we - * could call into the DMU, because the DMU can call down to - * us (e.g. via zio_free()) at any time. - * - * The spa_vdev_remove_thread() can be reading metaslab state - * concurrently, and it is locked out by the ms_sync_lock. - * Note that the ms_lock is insufficient for this, because it - * is dropped by space_map_write(). 
- */ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - - if (msp->ms_sm == NULL) { - uint64_t new_object; - - new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); - VERIFY3U(new_object, !=, 0); - - VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, - msp->ms_start, msp->ms_size, vd->vdev_ashift)); - - ASSERT(msp->ms_sm != NULL); - ASSERT0(metaslab_allocated_space(msp)); - } - - if (!range_tree_is_empty(msp->ms_checkpointing) && - vd->vdev_checkpoint_sm == NULL) { - ASSERT(spa_has_checkpoint(spa)); - - uint64_t new_object = space_map_alloc(mos, - vdev_standard_sm_blksz, tx); - VERIFY3U(new_object, !=, 0); - - VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, - mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); - ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - - /* - * We save the space map object as an entry in vdev_top_zap - * so it can be retrieved when the pool is reopened after an - * export or through zdb. - */ - VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, - vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, - sizeof (new_object), 1, &new_object, tx)); - } - - mutex_enter(&msp->ms_sync_lock); - mutex_enter(&msp->ms_lock); - - /* - * Note: metaslab_condense() clears the space map's histogram. - * Therefore we must verify and remove this histogram before - * condensing. 
- */ - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - metaslab_group_histogram_remove(mg, msp); - - if (msp->ms_loaded && metaslab_should_condense(msp)) { - metaslab_condense(msp, txg, tx); - } else { - mutex_exit(&msp->ms_lock); - space_map_write(msp->ms_sm, alloctree, SM_ALLOC, - SM_NO_VDEVID, tx); - space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, - SM_NO_VDEVID, tx); - mutex_enter(&msp->ms_lock); - } - - msp->ms_allocated_space += range_tree_space(alloctree); - ASSERT3U(msp->ms_allocated_space, >=, - range_tree_space(msp->ms_freeing)); - msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); - - if (!range_tree_is_empty(msp->ms_checkpointing)) { - ASSERT(spa_has_checkpoint(spa)); - ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - - /* - * Since we are doing writes to disk and the ms_checkpointing - * tree won't be changing during that time, we drop the - * ms_lock while writing to the checkpoint space map. - */ - mutex_exit(&msp->ms_lock); - space_map_write(vd->vdev_checkpoint_sm, - msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); - mutex_enter(&msp->ms_lock); - - spa->spa_checkpoint_info.sci_dspace += - range_tree_space(msp->ms_checkpointing); - vd->vdev_stat.vs_checkpoint_space += - range_tree_space(msp->ms_checkpointing); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, - -space_map_allocated(vd->vdev_checkpoint_sm)); - - range_tree_vacate(msp->ms_checkpointing, NULL, NULL); - } - - if (msp->ms_loaded) { - /* - * When the space map is loaded, we have an accurate - * histogram in the range tree. This gives us an opportunity - * to bring the space map's histogram up-to-date so we clear - * it first before updating it. - */ - space_map_histogram_clear(msp->ms_sm); - space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); - - /* - * Since we've cleared the histogram we need to add back - * any free space that has already been processed, plus - * any deferred space. 
This allows the on-disk histogram - * to accurately reflect all free space even if some space - * is not yet available for allocation (i.e. deferred). - */ - space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); - - /* - * Add back any deferred free space that has not been - * added back into the in-core free tree yet. This will - * ensure that we don't end up with a space map histogram - * that is completely empty unless the metaslab is fully - * allocated. - */ - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - space_map_histogram_add(msp->ms_sm, - msp->ms_defer[t], tx); - } - } - - /* - * Always add the free space from this sync pass to the space - * map histogram. We want to make sure that the on-disk histogram - * accounts for all free space. If the space map is not loaded, - * then we will lose some accuracy but will correct it the next - * time we load the space map. - */ - space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); - metaslab_aux_histograms_update(msp); - - metaslab_group_histogram_add(mg, msp); - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - - /* - * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeing and freed. - * We can safely do this since the freed_tree is guaranteed to be - * empty on the initial pass. 
- */ - if (spa_sync_pass(spa) == 1) { - range_tree_swap(&msp->ms_freeing, &msp->ms_freed); - ASSERT0(msp->ms_allocated_this_txg); - } else { - range_tree_vacate(msp->ms_freeing, - range_tree_add, msp->ms_freed); - } - msp->ms_allocated_this_txg += range_tree_space(alloctree); - range_tree_vacate(alloctree, NULL, NULL); - - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) - & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); - - mutex_exit(&msp->ms_lock); - - if (object != space_map_object(msp->ms_sm)) { - object = space_map_object(msp->ms_sm); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); - } - mutex_exit(&msp->ms_sync_lock); - dmu_tx_commit(tx); -} - -/* - * Called after a transaction group has completely synced to mark - * all of the metaslab's free space as usable. - */ -void -metaslab_sync_done(metaslab_t *msp, uint64_t txg) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; - range_tree_t **defer_tree; - int64_t alloc_delta, defer_delta; - boolean_t defer_allowed = B_TRUE; - - ASSERT(!vd->vdev_ishole); - - mutex_enter(&msp->ms_lock); - - /* - * If this metaslab is just becoming available, initialize its - * range trees and add its capacity to the vdev. 
- */ - if (msp->ms_freed == NULL) { - for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_allocating[t] == NULL); - - msp->ms_allocating[t] = range_tree_create(NULL, NULL); - } - - ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, NULL); - - ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, NULL); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defer[t] == NULL); - - msp->ms_defer[t] = range_tree_create(NULL, NULL); - } - - ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, NULL); - - metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); - } - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); - - defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; - - uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { - defer_allowed = B_FALSE; - } - - defer_delta = 0; - alloc_delta = msp->ms_allocated_this_txg - - range_tree_space(msp->ms_freed); - if (defer_allowed) { - defer_delta = range_tree_space(msp->ms_freed) - - range_tree_space(*defer_tree); - } else { - defer_delta -= range_tree_space(*defer_tree); - } - - metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, - defer_delta, 0); - - /* - * If there's a metaslab_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - */ - metaslab_load_wait(msp); - - /* - * Move the frees from the defer_tree back to the free - * range tree (if it's loaded). Swap the freed_tree and - * the defer_tree -- this is safe to do because we've - * just emptied out the defer_tree. - */ - range_tree_vacate(*defer_tree, - msp->ms_loaded ? 
range_tree_add : NULL, msp->ms_allocatable); - if (defer_allowed) { - range_tree_swap(&msp->ms_freed, defer_tree); - } else { - range_tree_vacate(msp->ms_freed, - msp->ms_loaded ? range_tree_add : NULL, - msp->ms_allocatable); - } - - msp->ms_synced_length = space_map_length(msp->ms_sm); - - msp->ms_deferspace += defer_delta; - ASSERT3S(msp->ms_deferspace, >=, 0); - ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); - if (msp->ms_deferspace != 0) { - /* - * Keep syncing this metaslab until all deferred frees - * are back in circulation. - */ - vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); - } - metaslab_aux_histograms_update_done(msp, defer_allowed); - - if (msp->ms_new) { - msp->ms_new = B_FALSE; - mutex_enter(&mg->mg_lock); - mg->mg_ms_ready++; - mutex_exit(&mg->mg_lock); - } - - /* - * Re-sort metaslab within its group now that we've adjusted - * its allocatable space. - */ - metaslab_recalculate_weight_and_sort(msp); - - /* - * If the metaslab is loaded and we've not tried to load or allocate - * from it in 'metaslab_unload_delay' txgs, then unload it. 
- */ - if (msp->ms_loaded && - msp->ms_initializing == 0 && - msp->ms_selected_txg + metaslab_unload_delay < txg) { - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( - msp->ms_allocating[(txg + t) & TXG_MASK])); - } - if (msp->ms_allocator != -1) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - } - - if (!metaslab_debug_unload) - metaslab_unload(msp); - } - - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_freed)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); - - msp->ms_allocated_this_txg = 0; - mutex_exit(&msp->ms_lock); -} - -void -metaslab_sync_reassess(metaslab_group_t *mg) -{ - spa_t *spa = mg->mg_class->mc_spa; - - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - metaslab_group_alloc_update(mg); - mg->mg_fragmentation = metaslab_group_fragmentation(mg); - - /* - * Preload the next potential metaslabs but only on active - * metaslab groups. We can get into a state where the metaslab - * is no longer active since we dirty metaslabs as we remove a - * a device, thus potentially making the metaslab group eligible - * for preloading. - */ - if (mg->mg_activation_count > 0) { - metaslab_group_preload(mg); - } - spa_config_exit(spa, SCL_ALLOC, FTAG); -} - -/* - * When writing a ditto block (i.e. more than one DVA for a given BP) on - * the same vdev as an existing DVA of this BP, then try to allocate it - * on a different metaslab than existing DVAs (i.e. a unique metaslab). 
- */ -static boolean_t -metaslab_is_unique(metaslab_t *msp, dva_t *dva) -{ - uint64_t dva_ms_id; - - if (DVA_GET_ASIZE(dva) == 0) - return (B_TRUE); - - if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) - return (B_TRUE); - - dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; - - return (msp->ms_id != dva_ms_id); -} - -/* - * ========================================================================== - * Metaslab allocation tracing facility - * ========================================================================== - */ -#ifdef _METASLAB_TRACING -kstat_t *metaslab_trace_ksp; -kstat_named_t metaslab_trace_over_limit; - -void -metaslab_alloc_trace_init(void) -{ - ASSERT(metaslab_alloc_trace_cache == NULL); - metaslab_alloc_trace_cache = kmem_cache_create( - "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", - "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); - if (metaslab_trace_ksp != NULL) { - metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; - kstat_named_init(&metaslab_trace_over_limit, - "metaslab_trace_over_limit", KSTAT_DATA_UINT64); - kstat_install(metaslab_trace_ksp); - } -} - -void -metaslab_alloc_trace_fini(void) -{ - if (metaslab_trace_ksp != NULL) { - kstat_delete(metaslab_trace_ksp); - metaslab_trace_ksp = NULL; - } - kmem_cache_destroy(metaslab_alloc_trace_cache); - metaslab_alloc_trace_cache = NULL; -} - -/* - * Add an allocation trace element to the allocation tracing list. - */ -static void -metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, - metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, - int allocator) -{ - if (!metaslab_trace_enabled) - return; - - /* - * When the tracing list reaches its maximum we remove - * the second element in the list before adding a new one. 
- * By removing the second element we preserve the original - * entry as a clue to what allocations steps have already been - * performed. - */ - if (zal->zal_size == metaslab_trace_max_entries) { - metaslab_alloc_trace_t *mat_next; -#ifdef DEBUG - panic("too many entries in allocation list"); -#endif - atomic_inc_64(&metaslab_trace_over_limit.value.ui64); - zal->zal_size--; - mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); - list_remove(&zal->zal_list, mat_next); - kmem_cache_free(metaslab_alloc_trace_cache, mat_next); - } - - metaslab_alloc_trace_t *mat = - kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); - list_link_init(&mat->mat_list_node); - mat->mat_mg = mg; - mat->mat_msp = msp; - mat->mat_size = psize; - mat->mat_dva_id = dva_id; - mat->mat_offset = offset; - mat->mat_weight = 0; - mat->mat_allocator = allocator; - - if (msp != NULL) - mat->mat_weight = msp->ms_weight; - - /* - * The list is part of the zio so locking is not required. Only - * a single thread will perform allocations for a given zio. 
- */ - list_insert_tail(&zal->zal_list, mat); - zal->zal_size++; - - ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); -} - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ - list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), - offsetof(metaslab_alloc_trace_t, mat_list_node)); - zal->zal_size = 0; -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ - metaslab_alloc_trace_t *mat; - - while ((mat = list_remove_head(&zal->zal_list)) != NULL) - kmem_cache_free(metaslab_alloc_trace_cache, mat); - list_destroy(&zal->zal_list); - zal->zal_size = 0; -} - -#else - -#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) - -void -metaslab_alloc_trace_init(void) -{ -} - -void -metaslab_alloc_trace_fini(void) -{ -} - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ -} - -#endif /* _METASLAB_TRACING */ - -/* - * ========================================================================== - * Metaslab block operations - * ========================================================================== - */ - -static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, - int allocator) -{ - if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) - return; - - metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - if (!mg->mg_class->mc_alloc_throttle_enabled) - return; - - (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); -} - -static void -metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) -{ - uint64_t max = mg->mg_max_alloc_queue_depth; - uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; - while (cur < max) { - if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], - cur, cur + 1) == cur) { - atomic_inc_64( - &mg->mg_class->mc_alloc_max_slots[allocator]); - return; - } - cur = mg->mg_cur_max_alloc_queue_depth[allocator]; - } -} - -void -metaslab_group_alloc_decrement(spa_t *spa, 
uint64_t vdev, void *tag, int flags, - int allocator, boolean_t io_complete) -{ - if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) - return; - - metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - if (!mg->mg_class->mc_alloc_throttle_enabled) - return; - - (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); - if (io_complete) - metaslab_group_increment_qdepth(mg, allocator); -} - -void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, - int allocator) -{ -#ifdef ZFS_DEBUG - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - - for (int d = 0; d < ndvas; d++) { - uint64_t vdev = DVA_GET_VDEV(&dva[d]); - metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - VERIFY(zfs_refcount_not_held( - &mg->mg_alloc_queue_depth[allocator], tag)); - } -#endif -} - -static uint64_t -metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) -{ - uint64_t start; - range_tree_t *rt = msp->ms_allocatable; - metaslab_class_t *mc = msp->ms_group->mg_class; - - VERIFY(!msp->ms_condensing); - VERIFY0(msp->ms_initializing); - - start = mc->mc_ops->msop_alloc(msp, size); - if (start != -1ULL) { - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - - VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); - range_tree_remove(rt, start, size); - - if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - - range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); - - /* Track the last successful allocation */ - msp->ms_alloc_txg = txg; - metaslab_verify_space(msp, txg); - } - - /* - * Now that we've attempted the allocation we need to update the - * metaslab's maximum block size since it may have changed. 
- */ - msp->ms_max_size = metaslab_block_maxsize(msp); - return (start); -} - -/* - * Find the metaslab with the highest weight that is less than what we've - * already tried. In the common case, this means that we will examine each - * metaslab at most once. Note that concurrent callers could reorder metaslabs - * by activation/passivation once we have dropped the mg_lock. If a metaslab is - * activated by another thread, and we fail to allocate from the metaslab we - * have selected, we may not try the newly-activated metaslab, and instead - * activate another metaslab. This is not optimal, but generally does not cause - * any problems (a possible exception being if every metaslab is completely full - * except for the the newly-activated metaslab which we fail to examine). - */ -static metaslab_t * -find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, - dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) -{ - avl_index_t idx; - avl_tree_t *t = &mg->mg_metaslab_tree; - metaslab_t *msp = avl_find(t, search, &idx); - if (msp == NULL) - msp = avl_nearest(t, idx, AVL_AFTER); - - for (; msp != NULL; msp = AVL_NEXT(t, msp)) { - int i; - if (!metaslab_should_allocate(msp, asize)) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL, allocator); - continue; - } - - /* - * If the selected metaslab is condensing or being - * initialized, skip it. - */ - if (msp->ms_condensing || msp->ms_initializing > 0) - continue; - - *was_active = msp->ms_allocator != -1; - /* - * If we're activating as primary, this is our first allocation - * from this disk, so we don't need to check how close we are. - * If the metaslab under consideration was already active, - * we're getting desperate enough to steal another allocator's - * metaslab, so we still don't care about distances. 
- */ - if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) - break; - - for (i = 0; i < d; i++) { - if (want_unique && - !metaslab_is_unique(msp, &dva[i])) - break; /* try another metaslab */ - } - if (i == d) - break; - } - - if (msp != NULL) { - search->ms_weight = msp->ms_weight; - search->ms_start = msp->ms_start + 1; - search->ms_allocator = msp->ms_allocator; - search->ms_primary = msp->ms_primary; - } - return (msp); -} - -/* ARGSUSED */ -static uint64_t -metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) -{ - metaslab_t *msp = NULL; - uint64_t offset = -1ULL; - uint64_t activation_weight; - - activation_weight = METASLAB_WEIGHT_PRIMARY; - for (int i = 0; i < d; i++) { - if (activation_weight == METASLAB_WEIGHT_PRIMARY && - DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { - activation_weight = METASLAB_WEIGHT_SECONDARY; - } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && - DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { - activation_weight = METASLAB_WEIGHT_CLAIM; - break; - } - } - - /* - * If we don't have enough metaslabs active to fill the entire array, we - * just use the 0th slot. - */ - if (mg->mg_ms_ready < mg->mg_allocators * 3) - allocator = 0; - - ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); - - metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); - search->ms_weight = UINT64_MAX; - search->ms_start = 0; - /* - * At the end of the metaslab tree are the already-active metaslabs, - * first the primaries, then the secondaries. When we resume searching - * through the tree, we need to consider ms_allocator and ms_primary so - * we start in the location right after where we left off, and don't - * accidentally loop forever considering the same metaslabs. 
- */ - search->ms_allocator = -1; - search->ms_primary = B_TRUE; - for (;;) { - boolean_t was_active = B_FALSE; - - mutex_enter(&mg->mg_lock); - - if (activation_weight == METASLAB_WEIGHT_PRIMARY && - mg->mg_primaries[allocator] != NULL) { - msp = mg->mg_primaries[allocator]; - was_active = B_TRUE; - } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && - mg->mg_secondaries[allocator] != NULL) { - msp = mg->mg_secondaries[allocator]; - was_active = B_TRUE; - } else { - msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); - } - - mutex_exit(&mg->mg_lock); - if (msp == NULL) { - kmem_free(search, sizeof (*search)); - return (-1ULL); - } - - mutex_enter(&msp->ms_lock); - /* - * Ensure that the metaslab we have selected is still - * capable of handling our request. It's possible that - * another thread may have changed the weight while we - * were blocked on the metaslab lock. We check the - * active status first to see if we need to reselect - * a new metaslab. - */ - if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { - mutex_exit(&msp->ms_lock); - continue; - } - - /* - * If the metaslab is freshly activated for an allocator that - * isn't the one we're allocating from, or if it's a primary and - * we're seeking a secondary (or vice versa), we go back and - * select a new metaslab. 
- */ - if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && - (msp->ms_allocator != -1) && - (msp->ms_allocator != allocator || ((activation_weight == - METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { - mutex_exit(&msp->ms_lock); - continue; - } - - if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && - activation_weight != METASLAB_WEIGHT_CLAIM) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_WEIGHT_CLAIM); - mutex_exit(&msp->ms_lock); - continue; - } - - if (metaslab_activate(msp, allocator, activation_weight) != 0) { - mutex_exit(&msp->ms_lock); - continue; - } - - msp->ms_selected_txg = txg; - - /* - * Now that we have the lock, recheck to see if we should - * continue to use this metaslab for this allocation. The - * the metaslab is now loaded so metaslab_should_allocate() can - * accurately determine if the allocation attempt should - * proceed. - */ - if (!metaslab_should_allocate(msp, asize)) { - /* Passivate this metaslab and select a new one. */ - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL, allocator); - goto next; - } - - /* - * If this metaslab is currently condensing then pick again as - * we can't manipulate this metaslab until it's committed - * to disk. If this metaslab is being initialized, we shouldn't - * allocate from it since the allocated region might be - * overwritten after allocation. 
- */ - if (msp->ms_condensing) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_CONDENSING, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - mutex_exit(&msp->ms_lock); - continue; - } else if (msp->ms_initializing > 0) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_INITIALIZING, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - mutex_exit(&msp->ms_lock); - continue; - } - - offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); - - if (offset != -1ULL) { - /* Proactively passivate the metaslab, if needed */ - metaslab_segment_may_passivate(msp); - break; - } -next: - ASSERT(msp->ms_loaded); - - /* - * We were unable to allocate from this metaslab so determine - * a new weight for this metaslab. Now that we have loaded - * the metaslab we can provide a better hint to the metaslab - * selector. - * - * For space-based metaslabs, we use the maximum block size. - * This information is only available when the metaslab - * is loaded and is more accurate than the generic free - * space weight that was calculated by metaslab_weight(). - * This information allows us to quickly compare the maximum - * available allocation in the metaslab to the allocation - * size being requested. - * - * For segment-based metaslabs, determine the new weight - * based on the highest bucket in the range tree. We - * explicitly use the loaded segment weight (i.e. the range - * tree histogram) since it contains the space that is - * currently available for allocation and is accurate - * even within a sync pass. - */ - if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - uint64_t weight = metaslab_block_maxsize(msp); - WEIGHT_SET_SPACEBASED(weight); - metaslab_passivate(msp, weight); - } else { - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); - } - - /* - * We have just failed an allocation attempt, check - * that metaslab_should_allocate() agrees. 
Otherwise, - * we may end up in an infinite loop retrying the same - * metaslab. - */ - ASSERT(!metaslab_should_allocate(msp, asize)); - - mutex_exit(&msp->ms_lock); - } - mutex_exit(&msp->ms_lock); - kmem_free(search, sizeof (*search)); - return (offset); -} - -static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) -{ - uint64_t offset; - ASSERT(mg->mg_initialized); - - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); - - mutex_enter(&mg->mg_lock); - if (offset == -1ULL) { - mg->mg_failed_allocations++; - metaslab_trace_add(zal, mg, NULL, asize, d, - TRACE_GROUP_FAILURE, allocator); - if (asize == SPA_GANGBLOCKSIZE) { - /* - * This metaslab group was unable to allocate - * the minimum gang block size so it must be out of - * space. We must notify the allocation throttle - * to start skipping allocation attempts to this - * metaslab group until more space becomes available. - * Note: this failure cannot be caused by the - * allocation throttle since the allocation throttle - * is only responsible for skipping devices and - * not failing block allocations. - */ - mg->mg_no_free_space = B_TRUE; - } - } - mg->mg_allocations++; - mutex_exit(&mg->mg_lock); - return (offset); -} - -/* - * Allocate a block for the specified i/o. - */ -int -metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal, int allocator) -{ - metaslab_group_t *mg, *rotor; - vdev_t *vd; - boolean_t try_hard = B_FALSE; - - ASSERT(!DVA_IS_VALID(&dva[d])); - - /* - * For testing, make some blocks above a certain size be gang blocks. - * This will also test spilling from special to normal. 
- */ - if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { - metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, - allocator); - return (SET_ERROR(ENOSPC)); - } - - /* - * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_aliquot because - * nothing actually breaks if we miss a few updates -- we just won't - * allocate quite as evenly. It all balances out over time. - * - * If we are doing ditto or log blocks, try to spread them across - * consecutive vdevs. If we're forced to reuse a vdev before we've - * allocated all of our ditto blocks, then try and spread them out on - * that vdev as much as possible. If it turns out to not be possible, - * gradually lower our standards until anything becomes acceptable. - * Also, allocating on consecutive vdevs (as opposed to random vdevs) - * gives us hope of containing our fault domains to something we're - * able to reason about. Otherwise, any two top-level vdev failures - * will guarantee the loss of data. With consecutive allocation, - * only two adjacent top-level vdev failures will result in data loss. - * - * If we are doing gang blocks (hintdva is non-NULL), try to keep - * ourselves on the same vdev as our gang block header. That - * way, we can hope for locality in vdev_cache, plus it makes our - * fault domains something tractable. - */ - if (hintdva) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - - /* - * It's possible the vdev we're using as the hint no - * longer exists or its mg has been closed (e.g. by - * device removal). Consult the rotor when - * all else fails. 
- */ - if (vd != NULL && vd->vdev_mg != NULL) { - mg = vd->vdev_mg; - - if (flags & METASLAB_HINTBP_AVOID && - mg->mg_next != NULL) - mg = mg->mg_next; - } else { - mg = mc->mc_rotor; - } - } else if (d != 0) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); - mg = vd->vdev_mg->mg_next; - } else { - ASSERT(mc->mc_rotor != NULL); - mg = mc->mc_rotor; - } - - /* - * If the hint put us into the wrong metaslab class, or into a - * metaslab group that has been passivated, just follow the rotor. - */ - if (mg->mg_class != mc || mg->mg_activation_count <= 0) - mg = mc->mc_rotor; - - rotor = mg; -top: - do { - boolean_t allocatable; - - ASSERT(mg->mg_activation_count == 1); - vd = mg->mg_vd; - - /* - * Don't allocate from faulted devices. - */ - if (try_hard) { - spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); - allocatable = vdev_allocatable(vd); - spa_config_exit(spa, SCL_ZIO, FTAG); - } else { - allocatable = vdev_allocatable(vd); - } - - /* - * Determine if the selected metaslab group is eligible - * for allocations. If we're ganging then don't allow - * this metaslab group to skip allocations since that would - * inadvertently return ENOSPC and suspend the pool - * even though space is still available. - */ - if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { - allocatable = metaslab_group_allocatable(mg, rotor, - psize, allocator, d); - } - - if (!allocatable) { - metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_NOT_ALLOCATABLE, allocator); - goto next; - } - - ASSERT(mg->mg_initialized); - - /* - * Avoid writing single-copy data to a failing, - * non-redundant vdev, unless we've already tried all - * other vdevs. 
- */ - if ((vd->vdev_stat.vs_write_errors > 0 || - vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && !try_hard && vd->vdev_children == 0) { - metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_VDEV_ERROR, allocator); - goto next; - } - - ASSERT(mg->mg_class == mc); - - uint64_t asize = vdev_psize_to_asize(vd, psize); - ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - - /* - * If we don't need to try hard, then require that the - * block be on an different metaslab from any other DVAs - * in this BP (unique=true). If we are trying hard, then - * allow any metaslab to be used (unique=false). - */ - uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); - - if (offset != -1ULL) { - /* - * If we've just selected this metaslab group, - * figure out whether the corresponding vdev is - * over- or under-used relative to the pool, - * and set an allocation bias to even it out. - */ - if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { - vdev_stat_t *vs = &vd->vdev_stat; - int64_t vu, cu; - - vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); - cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); - - /* - * Calculate how much more or less we should - * try to allocate from this device during - * this iteration around the rotor. - * For example, if a device is 80% full - * and the pool is 20% full then we should - * reduce allocations by 60% on this device. - * - * mg_bias = (20 - 80) * 512K / 100 = -307K - * - * This reduces allocations by 307K for this - * iteration. 
- */ - mg->mg_bias = ((cu - vu) * - (int64_t)mg->mg_aliquot) / 100; - } else if (!metaslab_bias_enabled) { - mg->mg_bias = 0; - } - - if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= - mg->mg_aliquot + mg->mg_bias) { - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; - } - - DVA_SET_VDEV(&dva[d], vd->vdev_id); - DVA_SET_OFFSET(&dva[d], offset); - DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); - DVA_SET_ASIZE(&dva[d], asize); - - return (0); - } -next: - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; - } while ((mg = mg->mg_next) != rotor); - - /* - * If we haven't tried hard, do so now. - */ - if (!try_hard) { - try_hard = B_TRUE; - goto top; - } - - bzero(&dva[d], sizeof (dva_t)); - - metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); - return (SET_ERROR(ENOSPC)); -} - -void -metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, - boolean_t checkpoint) -{ - metaslab_t *msp; - spa_t *spa = vd->vdev_spa; - - ASSERT(vdev_is_concrete(vd)); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - VERIFY(!msp->ms_condensing); - VERIFY3U(offset, >=, msp->ms_start); - VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); - - metaslab_check_free_impl(vd, offset, asize); - - mutex_enter(&msp->ms_lock); - if (range_tree_is_empty(msp->ms_freeing) && - range_tree_is_empty(msp->ms_checkpointing)) { - vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); - } - - if (checkpoint) { - ASSERT(spa_has_checkpoint(spa)); - range_tree_add(msp->ms_checkpointing, offset, asize); - } else { - range_tree_add(msp->ms_freeing, offset, asize); - } - mutex_exit(&msp->ms_lock); -} - -/* ARGSUSED */ -void -metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - 
boolean_t *checkpoint = arg; - - ASSERT3P(checkpoint, !=, NULL); - - if (vd->vdev_ops->vdev_op_remap != NULL) - vdev_indirect_mark_obsolete(vd, offset, size); - else - metaslab_free_impl(vd, offset, size, *checkpoint); -} - -static void -metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, - boolean_t checkpoint) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) - return; - - if (spa->spa_vdev_removal != NULL && - spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && - vdev_is_concrete(vd)) { - /* - * Note: we check if the vdev is concrete because when - * we complete the removal, we first change the vdev to be - * an indirect vdev (in open context), and then (in syncing - * context) clear spa_vdev_removal. - */ - free_from_removing_vdev(vd, offset, size); - } else if (vd->vdev_ops->vdev_op_remap != NULL) { - vdev_indirect_mark_obsolete(vd, offset, size); - vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &checkpoint); - } else { - metaslab_free_concrete(vd, offset, size, checkpoint); - } -} - -typedef struct remap_blkptr_cb_arg { - blkptr_t *rbca_bp; - spa_remap_cb_t rbca_cb; - vdev_t *rbca_remap_vd; - uint64_t rbca_remap_offset; - void *rbca_cb_arg; -} remap_blkptr_cb_arg_t; - -void -remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - remap_blkptr_cb_arg_t *rbca = arg; - blkptr_t *bp = rbca->rbca_bp; - - /* We can not remap split blocks. */ - if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) - return; - ASSERT0(inner_offset); - - if (rbca->rbca_cb != NULL) { - /* - * At this point we know that we are not handling split - * blocks and we invoke the callback on the previous - * vdev which must be indirect. 
- */ - ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); - - rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, - rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); - - /* set up remap_blkptr_cb_arg for the next call */ - rbca->rbca_remap_vd = vd; - rbca->rbca_remap_offset = offset; - } - - /* - * The phys birth time is that of dva[0]. This ensures that we know - * when each dva was written, so that resilver can determine which - * blocks need to be scrubbed (i.e. those written during the time - * the vdev was offline). It also ensures that the key used in - * the ARC hash table is unique (i.e. dva[0] + phys_birth). If - * we didn't change the phys_birth, a lookup in the ARC for a - * remapped BP could find the data that was previously stored at - * this vdev + offset. - */ - vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, - DVA_GET_VDEV(&bp->blk_dva[0])); - vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; - bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, - DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); - - DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], offset); -} - -/* - * If the block pointer contains any indirect DVAs, modify them to refer to - * concrete DVAs. Note that this will sometimes not be possible, leaving - * the indirect DVA in place. This happens if the indirect DVA spans multiple - * segments in the mapping (i.e. it is a "split block"). - * - * If the BP was remapped, calls the callback on the original dva (note the - * callback can be called multiple times if the original indirect DVA refers - * to another indirect DVA, etc). - * - * Returns TRUE if the BP was remapped. 
- */ -boolean_t -spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) -{ - remap_blkptr_cb_arg_t rbca; - - if (!zfs_remap_blkptr_enable) - return (B_FALSE); - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) - return (B_FALSE); - - /* - * Dedup BP's can not be remapped, because ddt_phys_select() depends - * on DVA[0] being the same in the BP as in the DDT (dedup table). - */ - if (BP_GET_DEDUP(bp)) - return (B_FALSE); - - /* - * Gang blocks can not be remapped, because - * zio_checksum_gang_verifier() depends on the DVA[0] that's in - * the BP used to read the gang block header (GBH) being the same - * as the DVA[0] that we allocated for the GBH. - */ - if (BP_IS_GANG(bp)) - return (B_FALSE); - - /* - * Embedded BP's have no DVA to remap. - */ - if (BP_GET_NDVAS(bp) < 1) - return (B_FALSE); - - /* - * Note: we only remap dva[0]. If we remapped other dvas, we - * would no longer know what their phys birth txg is. - */ - dva_t *dva = &bp->blk_dva[0]; - - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); - - if (vd->vdev_ops->vdev_op_remap == NULL) - return (B_FALSE); - - rbca.rbca_bp = bp; - rbca.rbca_cb = callback; - rbca.rbca_remap_vd = vd; - rbca.rbca_remap_offset = offset; - rbca.rbca_cb_arg = arg; - - /* - * remap_blkptr_cb() will be called in order for each level of - * indirection, until a concrete vdev is reached or a split block is - * encountered. old_vd and old_offset are updated within the callback - * as we go from the one indirect vdev to the next one (either concrete - * or indirect again) in that order. 
- */ - vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); - - /* Check if the DVA wasn't remapped because it is a split block */ - if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Undo the allocation of a DVA which happened in the given transaction group. - */ -void -metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) -{ - metaslab_t *msp; - vdev_t *vd; - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - - ASSERT(DVA_IS_VALID(dva)); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - if (txg > spa_freeze_txg(spa)) - return; - - if ((vd = vdev_lookup_top(spa, vdev)) == NULL || - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { - cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", - (u_longlong_t)vdev, (u_longlong_t)offset); - ASSERT(0); - return; - } - - ASSERT(!vd->vdev_removing); - ASSERT(vdev_is_concrete(vd)); - ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); - - if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - mutex_enter(&msp->ms_lock); - range_tree_remove(msp->ms_allocating[txg & TXG_MASK], - offset, size); - - VERIFY(!msp->ms_condensing); - VERIFY3U(offset, >=, msp->ms_start); - VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, - msp->ms_size); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_allocatable, offset, size); - mutex_exit(&msp->ms_lock); -} - -/* - * Free the block represented by the given DVA. 
- */ -void -metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) -{ - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd = vdev_lookup_top(spa, vdev); - - ASSERT(DVA_IS_VALID(dva)); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - if (DVA_GET_GANG(dva)) { - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - } - - metaslab_free_impl(vd, offset, size, checkpoint); -} - -/* - * Reserve some allocation slots. The reservation system must be called - * before we call into the allocator. If there aren't any available slots - * then the I/O will be throttled until an I/O completes and its slots are - * freed up. The function returns true if it was successful in placing - * the reservation. - */ -boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, - zio_t *zio, int flags) -{ - uint64_t available_slots = 0; - boolean_t slot_reserved = B_FALSE; - uint64_t max = mc->mc_alloc_max_slots[allocator]; - - ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - - uint64_t reserved_slots = - zfs_refcount_count(&mc->mc_alloc_slots[allocator]); - if (reserved_slots < max) - available_slots = max - reserved_slots; - - if (slots <= available_slots || GANG_ALLOCATION(flags) || - flags & METASLAB_MUST_RESERVE) { - /* - * We reserve the slots individually so that we can unreserve - * them individually when an I/O completes. 
- */ - for (int d = 0; d < slots; d++) { - reserved_slots = - zfs_refcount_add(&mc->mc_alloc_slots[allocator], - zio); - } - zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; - slot_reserved = B_TRUE; - } - - mutex_exit(&mc->mc_lock); - return (slot_reserved); -} - -void -metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, - int allocator, zio_t *zio) -{ - ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - for (int d = 0; d < slots; d++) { - (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], - zio); - } - mutex_exit(&mc->mc_lock); -} - -static int -metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) -{ - metaslab_t *msp; - spa_t *spa = vd->vdev_spa; - int error = 0; - - if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) - return (ENXIO); - - ASSERT3P(vd->vdev_ms, !=, NULL); - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - mutex_enter(&msp->ms_lock); - - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); - /* - * No need to fail in that case; someone else has activated the - * metaslab, but that doesn't preclude us from using it. 
- */ - if (error == EBUSY) - error = 0; - - if (error == 0 && - !range_tree_contains(msp->ms_allocatable, offset, size)) - error = SET_ERROR(ENOENT); - - if (error || txg == 0) { /* txg == 0 indicates dry run */ - mutex_exit(&msp->ms_lock); - return (error); - } - - VERIFY(!msp->ms_condensing); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, - msp->ms_size); - range_tree_remove(msp->ms_allocatable, offset, size); - - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_allocating[txg & TXG_MASK], - offset, size); - } - - mutex_exit(&msp->ms_lock); - - return (0); -} - -typedef struct metaslab_claim_cb_arg_t { - uint64_t mcca_txg; - int mcca_error; -} metaslab_claim_cb_arg_t; - -/* ARGSUSED */ -static void -metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - metaslab_claim_cb_arg_t *mcca_arg = arg; - - if (mcca_arg->mcca_error == 0) { - mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, - size, mcca_arg->mcca_txg); - } -} - -int -metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) -{ - if (vd->vdev_ops->vdev_op_remap != NULL) { - metaslab_claim_cb_arg_t arg; - - /* - * Only zdb(1M) can claim on indirect vdevs. This is used - * to detect leaks of mapped space (that are not accounted - * for in the obsolete counts, spacemap, or bpobj). 
- */ - ASSERT(!spa_writeable(vd->vdev_spa)); - arg.mcca_error = 0; - arg.mcca_txg = txg; - - vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_claim_impl_cb, &arg); - - if (arg.mcca_error == 0) { - arg.mcca_error = metaslab_claim_concrete(vd, - offset, size, txg); - } - return (arg.mcca_error); - } else { - return (metaslab_claim_concrete(vd, offset, size, txg)); - } -} - -/* - * Intent log support: upon opening the pool after a crash, notify the SPA - * of blocks that the intent log has allocated for immediate write, but - * which are still considered free by the SPA because the last transaction - * group didn't commit yet. - */ -static int -metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) -{ - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - - if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { - return (SET_ERROR(ENXIO)); - } - - ASSERT(DVA_IS_VALID(dva)); - - if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - return (metaslab_claim_impl(vd, offset, size, txg)); -} - -int -metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, - zio_alloc_list_t *zal, zio_t *zio, int allocator) -{ - dva_t *dva = bp->blk_dva; - dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; - int error = 0; - - ASSERT(bp->blk_birth == 0); - ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); - - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - - if (mc->mc_rotor == NULL) { /* no vdevs in this class */ - spa_config_exit(spa, SCL_ALLOC, FTAG); - return (SET_ERROR(ENOSPC)); - } - - ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); - ASSERT(BP_GET_NDVAS(bp) == 0); - ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - ASSERT3P(zal, !=, NULL); - - for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal, allocator); - if (error != 0) { - for (d--; d >= 0; d--) { - metaslab_unalloc_dva(spa, &dva[d], txg); - metaslab_group_alloc_decrement(spa, - DVA_GET_VDEV(&dva[d]), zio, flags, - allocator, B_FALSE); - bzero(&dva[d], sizeof (dva_t)); - } - spa_config_exit(spa, SCL_ALLOC, FTAG); - return (error); - } else { - /* - * Update the metaslab group's queue depth - * based on the newly allocated dva. - */ - metaslab_group_alloc_increment(spa, - DVA_GET_VDEV(&dva[d]), zio, flags, allocator); - } - - } - ASSERT(error == 0); - ASSERT(BP_GET_NDVAS(bp) == ndvas); - - spa_config_exit(spa, SCL_ALLOC, FTAG); - - BP_SET_BIRTH(bp, txg, txg); - - return (0); -} - -void -metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); - - /* - * If we have a checkpoint for the pool we need to make sure that - * the blocks that we free that are part of the checkpoint won't be - * reused until the checkpoint is discarded or we revert to it. - * - * The checkpoint flag is passed down the metaslab_free code path - * and is set whenever we want to add a block to the checkpoint's - * accounting. 
That is, we "checkpoint" blocks that existed at the - * time the checkpoint was created and are therefore referenced by - * the checkpointed uberblock. - * - * Note that, we don't checkpoint any blocks if the current - * syncing txg <= spa_checkpoint_txg. We want these frees to sync - * normally as they will be referenced by the checkpointed uberblock. - */ - boolean_t checkpoint = B_FALSE; - if (bp->blk_birth <= spa->spa_checkpoint_txg && - spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { - /* - * At this point, if the block is part of the checkpoint - * there is no way it was created in the current txg. - */ - ASSERT(!now); - ASSERT3U(spa_syncing_txg(spa), ==, txg); - checkpoint = B_TRUE; - } - - spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - - for (int d = 0; d < ndvas; d++) { - if (now) { - metaslab_unalloc_dva(spa, &dva[d], txg); - } else { - ASSERT3U(txg, ==, spa_syncing_txg(spa)); - metaslab_free_dva(spa, &dva[d], checkpoint); - } - } - - spa_config_exit(spa, SCL_FREE, FTAG); -} - -int -metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - int error = 0; - - ASSERT(!BP_IS_HOLE(bp)); - - if (txg != 0) { - /* - * First do a dry run to make sure all DVAs are claimable, - * so we don't have to unwind from partial failures below. 
- */ - if ((error = metaslab_claim(spa, bp, 0)) != 0) - return (error); - } - - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - - for (int d = 0; d < ndvas; d++) { - error = metaslab_claim_dva(spa, &dva[d], txg); - if (error != 0) - break; - } - - spa_config_exit(spa, SCL_ALLOC, FTAG); - - ASSERT(error == 0 || txg == 0); - - return (error); -} - -/* ARGSUSED */ -static void -metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - if (vd->vdev_ops == &vdev_indirect_ops) - return; - - metaslab_check_free_impl(vd, offset, size); -} - -static void -metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) -{ - metaslab_t *msp; - spa_t *spa = vd->vdev_spa; - - if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) - return; - - if (vd->vdev_ops->vdev_op_remap != NULL) { - vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_check_free_impl_cb, NULL); - return; - } - - ASSERT(vdev_is_concrete(vd)); - ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - mutex_enter(&msp->ms_lock); - if (msp->ms_loaded) { - range_tree_verify_not_present(msp->ms_allocatable, - offset, size); - } - - range_tree_verify_not_present(msp->ms_freeing, offset, size); - range_tree_verify_not_present(msp->ms_checkpointing, offset, size); - range_tree_verify_not_present(msp->ms_freed, offset, size); - for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify_not_present(msp->ms_defer[j], offset, size); - mutex_exit(&msp->ms_lock); -} - -void -metaslab_check_free(spa_t *spa, const blkptr_t *bp) -{ - if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) - return; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int i = 0; i < BP_GET_NDVAS(bp); i++) { - uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); - vdev_t *vd = vdev_lookup_top(spa, vdev); - uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); - uint64_t size = 
DVA_GET_ASIZE(&bp->blk_dva[i]); - - if (DVA_GET_GANG(&bp->blk_dva[i])) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - ASSERT3P(vd, !=, NULL); - - metaslab_check_free_impl(vd, offset, size); - } - spa_config_exit(spa, SCL_VDEV, FTAG); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c deleted file mode 100644 index f22af0b40146..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c +++ /dev/null @@ -1,750 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. - * Copyright 2019 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Multi-Modifier Protection (MMP) attempts to prevent a user from importing - * or opening a pool on more than one host at a time. In particular, it - * prevents "zpool import -f" on a host from succeeding while the pool is - * already imported on another host. There are many other ways in which a - * device could be used by two hosts for different purposes at the same time - * resulting in pool damage. 
This implementation does not attempt to detect - * those cases. - * - * MMP operates by ensuring there are frequent visible changes on disk (a - * "heartbeat") at all times. And by altering the import process to check - * for these changes and failing the import when they are detected. This - * functionality is enabled by setting the 'multihost' pool property to on. - * - * Uberblocks written by the txg_sync thread always go into the first - * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP. - * They are used to hold uberblocks which are exactly the same as the last - * synced uberblock except that the ub_timestamp and mmp_config are frequently - * updated. Like all other uberblocks, the slot is written with an embedded - * checksum, and slots with invalid checksums are ignored. This provides the - * "heartbeat", with no risk of overwriting good uberblocks that must be - * preserved, e.g. previous txgs and associated block pointers. - * - * Three optional fields are added to uberblock structure; ub_mmp_magic, - * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell - * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells - * the importing host the settings of zfs_multihost_interval and - * zfs_multihost_fail_intervals on the host which last had (or currently has) - * the pool imported. These determine how long a host must wait to detect - * activity in the pool, before concluding the pool is not in use. The - * mmp_delay field is a decaying average of the amount of time between - * completion of successive MMP writes, in nanoseconds. It indicates whether - * MMP is enabled. - * - * During import an activity test may now be performed to determine if - * the pool is in use. The activity test is typically required if the - * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is - * POOL_STATE_ACTIVE, and the pool is not a root pool. 
- * - * The activity test finds the "best" uberblock (highest txg, timestamp, and, if - * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits - * some time, and finds the "best" uberblock again. If any of the mentioned - * fields have different values in the newly read uberblock, the pool is in use - * by another host and the import fails. In order to assure the accuracy of the - * activity test, the default values result in an activity test duration of 20x - * the mmp write interval. - * - * The duration of the "zpool import" activity test depends on the information - * available in the "best" uberblock: - * - * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0: - * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2 - * - * In this case, a weak guarantee is provided. Since the host which last had - * the pool imported will suspend the pool if no mmp writes land within - * fail_intervals * multihost_interval ms, the absence of writes during that - * time means either the pool is not imported, or it is imported but the pool - * is suspended and no further writes will occur. - * - * Note that resuming the suspended pool on the remote host would invalidate - * this guarantee, and so it is not allowed. - * - * The factor of 2 provides a conservative safety factor and derives from - * MMP_IMPORT_SAFETY_FACTOR; - * - * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0: - * (ub_mmp_config.multihost_interval + ub_mmp_delay) * - * zfs_multihost_import_intervals - * - * In this case no guarantee can provided. However, as long as some devices - * are healthy and connected, it is likely that at least one write will land - * within (multihost_interval + mmp_delay) because multihost_interval is - * enough time for a write to be attempted to each leaf vdev, and mmp_delay - * is enough for one to land, based on past delays. 
Multiplying by - * zfs_multihost_import_intervals provides a conservative safety factor. - * - * 3) If uberblock was written by zfs-0.7: - * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals - * - * The same logic as case #2 applies, but we do not know remote tunables. - * - * We use the local value for zfs_multihost_interval because the original MMP - * did not record this value in the uberblock. - * - * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host - * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect - * that. We will have waited enough time for zfs_multihost_import_intervals - * writes to be issued and all but one to land. - * - * single device pool example delays - * - * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay - * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay - * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval, - * no I/O delay - * 100 device pool example delays - * - * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay - * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay - * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval, - * no I/O delay - * - * 4) Otherwise, this uberblock was written by a pre-MMP zfs: - * zfs_multihost_import_intervals * zfs_multihost_interval - * - * In this case local tunables are used. By default this product = 10s, long - * enough for a pool with any activity at all to write at least one - * uberblock. No guarantee can be provided. - * - * Additionally, the duration is then extended by a random 25% to attempt to to - * detect simultaneous imports. For example, if both partner hosts are rebooted - * at the same time and automatically attempt to import the pool. - */ - -/* - * Used to control the frequency of mmp writes which are performed when the - * 'multihost' pool property is on. This is one factor used to determine the - * length of the activity check during import. 
- * - * On average an mmp write will be issued for each leaf vdev every - * zfs_multihost_interval milliseconds. In practice, the observed period can - * vary with the I/O load and this observed value is the ub_mmp_delay which is - * stored in the uberblock. The minimum allowed value is 100 ms. - */ -ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; -#ifdef __FreeBSD__ -SYSCTL_DECL(_vfs_zfs); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, multihost_interval, CTLFLAG_RWTUN, - &zfs_multihost_interval, 0, "Interval between MMP writes, milliseconds"); -#endif - -/* - * Used to control the duration of the activity test on import. Smaller values - * of zfs_multihost_import_intervals will reduce the import time but increase - * the risk of failing to detect an active pool. The total activity check time - * is never allowed to drop below one second. A value of 0 is ignored and - * treated as if it was set to 1. - */ -uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; -#ifdef __FreeBSD__ -SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_import_intervals, CTLFLAG_RWTUN, - &zfs_multihost_import_intervals, 0, - "MMP activity check period for pool import, " - "in units of multihost_interval"); -#endif - -/* - * Controls the behavior of the pool when mmp write failures or delays are - * detected. - * - * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are - * ignored. The failures will still be reported to the ZED which depending on - * its configuration may take action such as suspending the pool or taking a - * device offline. - * - * When zfs_multihost_fail_intervals > 0, the pool will be suspended if - * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass - * without a successful mmp write. This guarantees the activity test will see - * mmp writes if the pool is imported. 
A value of 1 is ignored and treated as - * if it was set to 2, because a single leaf vdev pool will issue a write once - * per multihost_interval and thus any variation in latency would cause the - * pool to be suspended. - */ -uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; -#ifdef __FreeBSD__ -SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_fail_intervals, CTLFLAG_RWTUN, - &zfs_multihost_fail_intervals, 0, - "How long to tolerate MMP write failures before suspending a pool, " - "in units of multihost_interval"); -#endif - -char *mmp_tag = "mmp_write_uberblock"; -static void mmp_thread(void *arg); - -void -mmp_init(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); - mmp->mmp_kstat_id = 1; - - /* - * mmp_write_done() calculates mmp_delay based on prior mmp_delay and - * the elapsed time since the last write. For the first mmp write, - * there is no "last write", so we start with fake non-zero values. 
- */ - mmp->mmp_last_write = gethrtime(); - mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); -} - -void -mmp_fini(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_destroy(&mmp->mmp_thread_lock); - cv_destroy(&mmp->mmp_thread_cv); - mutex_destroy(&mmp->mmp_io_lock); -} - -static void -mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr) -{ - CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG); - mutex_enter(&mmp->mmp_thread_lock); -} - -static void -mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) -{ - ASSERT(*mpp != NULL); - *mpp = NULL; - cv_broadcast(&mmp->mmp_thread_cv); - CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ - thread_exit(); -} - -void -mmp_thread_start(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - if (spa_writeable(spa)) { - mutex_enter(&mmp->mmp_thread_lock); - if (!mmp->mmp_thread) { - mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, - spa, 0, &p0, TS_RUN, minclsyspri); - zfs_dbgmsg("MMP thread started pool '%s' " - "gethrtime %llu", spa_name(spa), gethrtime()); - } - mutex_exit(&mmp->mmp_thread_lock); - } -} - -void -mmp_thread_stop(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_enter(&mmp->mmp_thread_lock); - mmp->mmp_thread_exiting = 1; - cv_broadcast(&mmp->mmp_thread_cv); - - while (mmp->mmp_thread) { - cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock); - } - mutex_exit(&mmp->mmp_thread_lock); - zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", - spa_name(spa), gethrtime()); - - ASSERT(mmp->mmp_thread == NULL); - mmp->mmp_thread_exiting = 0; -} - -typedef enum mmp_vdev_state_flag { - MMP_FAIL_NOT_WRITABLE = (1 << 0), - MMP_FAIL_WRITE_PENDING = (1 << 1), -} mmp_vdev_state_flag_t; - -/* - * Find a leaf vdev to write an MMP block to. It must not have an outstanding - * mmp write (if so a new write will also likely block). If there is no usable - * leaf, a nonzero error value is returned. The error value returned is a bit - * field. 
- * - * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an - * outstanding MMP write. - * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable. - */ - -static int -mmp_next_leaf(spa_t *spa) -{ - vdev_t *leaf; - vdev_t *starting_leaf; - int fail_mask = 0; - - ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock)); - ASSERT(spa_config_held(spa, SCL_STATE, RW_READER)); - ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE); - ASSERT(!list_is_empty(&spa->spa_leaf_list)); - - if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) { - spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list); - spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen; - } - - leaf = spa->spa_mmp.mmp_last_leaf; - if (leaf == NULL) - leaf = list_head(&spa->spa_leaf_list); - starting_leaf = leaf; - - do { - leaf = list_next(&spa->spa_leaf_list, leaf); - if (leaf == NULL) - leaf = list_head(&spa->spa_leaf_list); - - if (!vdev_writeable(leaf)) { - fail_mask |= MMP_FAIL_NOT_WRITABLE; - } else if (leaf->vdev_mmp_pending != 0) { - fail_mask |= MMP_FAIL_WRITE_PENDING; - } else { - spa->spa_mmp.mmp_last_leaf = leaf; - return (0); - } - } while (leaf != starting_leaf); - - ASSERT(fail_mask); - - return (fail_mask); -} - -/* - * MMP writes are issued on a fixed schedule, but may complete at variable, - * much longer, intervals. The mmp_delay captures long periods between - * successful writes for any reason, including disk latency, scheduling delays, - * etc. - * - * The mmp_delay is usually calculated as a decaying average, but if the latest - * delay is higher we do not average it, so that we do not hide sudden spikes - * which the importing host must wait for. - * - * If writes are occurring frequently, such as due to a high rate of txg syncs, - * the mmp_delay could become very small. Since those short delays depend on - * activity we cannot count on, we never allow mmp_delay to get lower than rate - * expected if only mmp_thread writes occur. 
- * - * If an mmp write was skipped or fails, and we have already waited longer than - * mmp_delay, we need to update it so the next write reflects the longer delay. - * - * Do not set mmp_delay if the multihost property is not on, so as not to - * trigger an activity check on import. - */ -static void -mmp_delay_update(spa_t *spa, boolean_t write_completed) -{ - mmp_thread_t *mts = &spa->spa_mmp; - hrtime_t delay = gethrtime() - mts->mmp_last_write; - - ASSERT(MUTEX_HELD(&mts->mmp_io_lock)); - - if (spa_multihost(spa) == B_FALSE) { - mts->mmp_delay = 0; - return; - } - - if (delay > mts->mmp_delay) - mts->mmp_delay = delay; - - if (write_completed == B_FALSE) - return; - - mts->mmp_last_write = gethrtime(); - - /* - * strictly less than, in case delay was changed above. - */ - if (delay < mts->mmp_delay) { - hrtime_t min_delay = - MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) / - MAX(1, vdev_count_leaves(spa)); - mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128), - min_delay); - } -} - -static void -mmp_write_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - vdev_t *vd = zio->io_vd; - mmp_thread_t *mts = zio->io_private; - - mutex_enter(&mts->mmp_io_lock); - uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; - hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; - - mmp_delay_update(spa, (zio->io_error == 0)); - - vd->vdev_mmp_pending = 0; - vd->vdev_mmp_kstat_id = 0; - - mutex_exit(&mts->mmp_io_lock); - spa_config_exit(spa, SCL_STATE, mmp_tag); - - abd_free(zio->io_abd); -} - -/* - * When the uberblock on-disk is updated by a spa_sync, - * creating a new "best" uberblock, update the one stored - * in the mmp thread state, used for mmp writes. 
- */ -void -mmp_update_uberblock(spa_t *spa, uberblock_t *ub) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_enter(&mmp->mmp_io_lock); - mmp->mmp_ub = *ub; - mmp->mmp_seq = 1; - mmp->mmp_ub.ub_timestamp = gethrestime_sec(); - mmp_delay_update(spa, B_TRUE); - mutex_exit(&mmp->mmp_io_lock); -} - -/* - * Choose a random vdev, label, and MMP block, and write over it - * with a copy of the last-synced uberblock, whose timestamp - * has been updated to reflect that the pool is in use. - */ -static void -mmp_write_uberblock(spa_t *spa) -{ - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - mmp_thread_t *mmp = &spa->spa_mmp; - uberblock_t *ub; - vdev_t *vd = NULL; - int label, error; - uint64_t offset; - - hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); - lock_acquire_time = gethrtime() - lock_acquire_time; - if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) - zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " - "gethrtime %llu", spa_name(spa), lock_acquire_time, - gethrtime()); - - mutex_enter(&mmp->mmp_io_lock); - - error = mmp_next_leaf(spa); - - /* - * spa_mmp_history has two types of entries: - * Issued MMP write: records time issued, error status, etc. - * Skipped MMP write: an MMP write could not be issued because no - * suitable leaf vdev was available. See comment above struct - * spa_mmp_history for details. 
- */ - - if (error) { - mmp_delay_update(spa, B_FALSE); - if (mmp->mmp_skip_error == error) { - /* - * ZoL porting note: the following is TBD - * spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); - */ - } else { - mmp->mmp_skip_error = error; - /* - * ZoL porting note: the following is TBD - * spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg, - * gethrestime_sec(), mmp->mmp_delay, NULL, 0, - * mmp->mmp_kstat_id++, error); - */ - zfs_dbgmsg("MMP error choosing leaf pool '%s' " - "gethrtime %llu fail_mask %#x", spa_name(spa), - gethrtime(), error); - } - mutex_exit(&mmp->mmp_io_lock); - spa_config_exit(spa, SCL_STATE, mmp_tag); - return; - } - - vd = spa->spa_mmp.mmp_last_leaf; - if (mmp->mmp_skip_error != 0) { - mmp->mmp_skip_error = 0; - zfs_dbgmsg("MMP write after skipping due to unavailable " - "leaves, pool '%s' gethrtime %llu leaf %#llu", - spa_name(spa), gethrtime(), vd->vdev_guid); - } - - if (mmp->mmp_zio_root == NULL) - mmp->mmp_zio_root = zio_root(spa, NULL, NULL, - flags | ZIO_FLAG_GODFATHER); - - if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) { - /* - * Want to reset mmp_seq when timestamp advances because after - * an mmp_seq wrap new values will not be chosen by - * uberblock_compare() as the "best". 
- */ - mmp->mmp_ub.ub_timestamp = gethrestime_sec(); - mmp->mmp_seq = 1; - } - - ub = &mmp->mmp_ub; - ub->ub_mmp_magic = MMP_MAGIC; - ub->ub_mmp_delay = mmp->mmp_delay; - ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) | - MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) | - MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK( - zfs_multihost_fail_intervals)); - vd->vdev_mmp_pending = gethrtime(); - vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id; - - zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); - abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); - abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); - - mmp->mmp_seq++; - mmp->mmp_kstat_id++; - mutex_exit(&mmp->mmp_io_lock); - - offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - - MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL)); - - label = spa_get_random(VDEV_LABELS); - vdev_label_write(zio, vd, label, ub_abd, offset, - VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, - flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * ZoL porting note: the following is TBD - * (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp, - * ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0); - */ - - zio_nowait(zio); -} - -static void -mmp_thread(void *arg) -{ - spa_t *spa = (spa_t *)arg; - mmp_thread_t *mmp = &spa->spa_mmp; - boolean_t suspended = spa_suspended(spa); - boolean_t multihost = spa_multihost(spa); - uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( - zfs_multihost_interval)); - uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( - zfs_multihost_fail_intervals); - hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; - boolean_t last_spa_suspended = suspended; - boolean_t last_spa_multihost = multihost; - uint64_t last_mmp_interval = mmp_interval; - uint32_t last_mmp_fail_intervals = mmp_fail_intervals; - hrtime_t last_mmp_fail_ns = mmp_fail_ns; - callb_cpr_t cpr; - int skip_wait = 0; - - mmp_thread_enter(mmp, &cpr); - - 
while (!mmp->mmp_thread_exiting) { - hrtime_t next_time = gethrtime() + - MSEC2NSEC(MMP_DEFAULT_INTERVAL); - int leaves = MAX(vdev_count_leaves(spa), 1); - - /* Detect changes in tunables or state */ - - last_spa_suspended = suspended; - last_spa_multihost = multihost; - suspended = spa_suspended(spa); - multihost = spa_multihost(spa); - - last_mmp_interval = mmp_interval; - last_mmp_fail_intervals = mmp_fail_intervals; - last_mmp_fail_ns = mmp_fail_ns; - mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( - zfs_multihost_interval)); - mmp_fail_intervals = MMP_FAIL_INTVS_OK( - zfs_multihost_fail_intervals); - - /* Smooth so pool is not suspended when reducing tunables */ - if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) { - mmp_fail_ns = (mmp_fail_ns * 31 + - mmp_fail_intervals * mmp_interval) / 32; - } else { - mmp_fail_ns = mmp_fail_intervals * - mmp_interval; - } - - if (mmp_interval != last_mmp_interval || - mmp_fail_intervals != last_mmp_fail_intervals) { - /* - * We want other hosts to see new tunables as quickly as - * possible. Write out at higher frequency than usual. - */ - skip_wait += leaves; - } - - if (multihost) - next_time = gethrtime() + mmp_interval / leaves; - - if (mmp_fail_ns != last_mmp_fail_ns) { - zfs_dbgmsg("MMP interval change pool '%s' " - "gethrtime %llu last_mmp_interval %llu " - "mmp_interval %llu last_mmp_fail_intervals %u " - "mmp_fail_intervals %u mmp_fail_ns %llu " - "skip_wait %d leaves %d next_time %llu", - spa_name(spa), gethrtime(), last_mmp_interval, - mmp_interval, last_mmp_fail_intervals, - mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves, - next_time); - } - - /* - * MMP off => on, or suspended => !suspended: - * No writes occurred recently. Update mmp_last_write to give - * us some time to try. 
- */ - if ((!last_spa_multihost && multihost) || - (last_spa_suspended && !suspended)) { - zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " - "last_spa_multihost %u multihost %u " - "last_spa_suspended %u suspended %u", - spa_name(spa), last_spa_multihost, multihost, - last_spa_suspended, suspended); - mutex_enter(&mmp->mmp_io_lock); - mmp->mmp_last_write = gethrtime(); - mmp->mmp_delay = mmp_interval; - mutex_exit(&mmp->mmp_io_lock); - } - - /* - * MMP on => off: - * mmp_delay == 0 tells importing node to skip activity check. - */ - if (last_spa_multihost && !multihost) { - mutex_enter(&mmp->mmp_io_lock); - mmp->mmp_delay = 0; - mutex_exit(&mmp->mmp_io_lock); - } - - /* - * Suspend the pool if no MMP write has succeeded in over - * mmp_interval * mmp_fail_intervals nanoseconds. - */ - if (multihost && !suspended && mmp_fail_intervals && - (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { - zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " - "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", - spa_name(spa), (u_longlong_t)gethrtime(), - (u_longlong_t)mmp->mmp_last_write, - (u_longlong_t)mmp_interval, - (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); - cmn_err(CE_WARN, "MMP writes to pool '%s' have not " - "succeeded in over %llu ms; suspending pool. 
" - "Hrtime %llu", - spa_name(spa), - NSEC2MSEC(gethrtime() - mmp->mmp_last_write), - gethrtime()); - zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); - } - - if (multihost && !suspended) - mmp_write_uberblock(spa); - - if (skip_wait > 0) { - next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) / - leaves; - skip_wait--; - } - - CALLB_CPR_SAFE_BEGIN(&cpr); -#if defined(illumos) - (void) cv_timedwait_sig_hrtime(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, next_time); -#elif defined(_KERNEL) - (void) cv_timedwait_sig_sbt(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, nstosbt(next_time), - 100 * SBT_1US, C_ABSOLUTE); -#else - (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), - CALLOUT_FLAG_ABSOLUTE); -#endif - CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); - } - - /* Outstanding writes are allowed to complete. */ - if (mmp->mmp_zio_root) - zio_wait(mmp->mmp_zio_root); - - mmp->mmp_zio_root = NULL; - mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); -} - -/* - * Signal the MMP thread to wake it, when it is sleeping on - * its cv. Used when some module parameter has changed and - * we want the thread to know about it. - * Only signal if the pool is active and mmp thread is - * running, otherwise there is no thread to wake. 
- */ -static void -mmp_signal_thread(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_enter(&mmp->mmp_thread_lock); - if (mmp->mmp_thread) - cv_broadcast(&mmp->mmp_thread_cv); - mutex_exit(&mmp->mmp_thread_lock); -} - -void -mmp_signal_all_threads(void) -{ - spa_t *spa = NULL; - - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa))) { - if (spa->spa_state == POOL_STATE_ACTIVE) - mmp_signal_thread(spa); - } - mutex_exit(&spa_namespace_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c deleted file mode 100644 index f517454d3d6d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include - -/* needed for spa_get_random() */ -#include - -/* - * This overrides the number of sublists in each multilist_t, which defaults - * to the number of CPUs in the system (see multilist_create()). - */ -int zfs_multilist_num_sublists = 0; - -/* - * Given the object contained on the list, return a pointer to the - * object's multilist_node_t structure it contains. - */ -static multilist_node_t * -multilist_d2l(multilist_t *ml, void *obj) -{ - return ((multilist_node_t *)((char *)obj + ml->ml_offset)); -} - -/* - * Initialize a new mutlilist using the parameters specified. - * - * - 'size' denotes the size of the structure containing the - * multilist_node_t. 
- * - 'offset' denotes the byte offset of the mutlilist_node_t within - * the structure that contains it. - * - 'num' specifies the number of internal sublists to create. - * - 'index_func' is used to determine which sublist to insert into - * when the multilist_insert() function is called; as well as which - * sublist to remove from when multilist_remove() is called. The - * requirements this function must meet, are the following: - * - * - It must always return the same value when called on the same - * object (to ensure the object is removed from the list it was - * inserted into). - * - * - It must return a value in the range [0, number of sublists). - * The multilist_get_num_sublists() function may be used to - * determine the number of sublists in the multilist. - * - * Also, in order to reduce internal contention between the sublists - * during insertion and removal, this function should choose evenly - * between all available sublists when inserting. This isn't a hard - * requirement, but a general rule of thumb in order to garner the - * best multi-threaded performance out of the data structure. 
- */ -static multilist_t * -multilist_create_impl(size_t size, size_t offset, - unsigned int num, multilist_sublist_index_func_t *index_func) -{ - ASSERT3U(size, >, 0); - ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); - ASSERT3U(num, >, 0); - ASSERT3P(index_func, !=, NULL); - - multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP); - ml->ml_offset = offset; - ml->ml_num_sublists = num; - ml->ml_index_func = index_func; - - ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * - ml->ml_num_sublists, KM_SLEEP); - - ASSERT3P(ml->ml_sublists, !=, NULL); - - for (int i = 0; i < ml->ml_num_sublists; i++) { - multilist_sublist_t *mls = &ml->ml_sublists[i]; - mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&mls->mls_list, size, offset); - } - return (ml); -} - -/* - * Allocate a new multilist, using the default number of sublists - * (the number of CPUs, or at least 4, or the tunable - * zfs_multilist_num_sublists). - */ -multilist_t * -multilist_create(size_t size, size_t offset, - multilist_sublist_index_func_t *index_func) -{ - int num_sublists; - - if (zfs_multilist_num_sublists > 0) { - num_sublists = zfs_multilist_num_sublists; - } else { - num_sublists = MAX(max_ncpus, 4); - } - - return (multilist_create_impl(size, offset, num_sublists, index_func)); -} - -/* - * Destroy the given multilist object, and free up any memory it holds. - */ -void -multilist_destroy(multilist_t *ml) -{ - ASSERT(multilist_is_empty(ml)); - - for (int i = 0; i < ml->ml_num_sublists; i++) { - multilist_sublist_t *mls = &ml->ml_sublists[i]; - - ASSERT(list_is_empty(&mls->mls_list)); - - list_destroy(&mls->mls_list); - mutex_destroy(&mls->mls_lock); - } - - ASSERT3P(ml->ml_sublists, !=, NULL); - kmem_free(ml->ml_sublists, - sizeof (multilist_sublist_t) * ml->ml_num_sublists); - - ml->ml_num_sublists = 0; - ml->ml_offset = 0; - kmem_free(ml, sizeof (multilist_t)); -} - -/* - * Insert the given object into the multilist. 
- * - * This function will insert the object specified into the sublist - * determined using the function given at multilist creation time. - * - * The sublist locks are automatically acquired if not already held, to - * ensure consistency when inserting and removing from multiple threads. - */ -void -multilist_insert(multilist_t *ml, void *obj) -{ - unsigned int sublist_idx = ml->ml_index_func(ml, obj); - multilist_sublist_t *mls; - boolean_t need_lock; - - DTRACE_PROBE3(multilist__insert, multilist_t *, ml, - unsigned int, sublist_idx, void *, obj); - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - - mls = &ml->ml_sublists[sublist_idx]; - - /* - * Note: Callers may already hold the sublist lock by calling - * multilist_sublist_lock(). Here we rely on MUTEX_HELD() - * returning TRUE if and only if the current thread holds the - * lock. While it's a little ugly to make the lock recursive in - * this way, it works and allows the calling code to be much - * simpler -- otherwise it would have to pass around a flag - * indicating that it already has the lock. - */ - need_lock = !MUTEX_HELD(&mls->mls_lock); - - if (need_lock) - mutex_enter(&mls->mls_lock); - - ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); - - multilist_sublist_insert_head(mls, obj); - - if (need_lock) - mutex_exit(&mls->mls_lock); -} - -/* - * Remove the given object from the multilist. - * - * This function will remove the object specified from the sublist - * determined using the function given at multilist creation time. - * - * The necessary sublist locks are automatically acquired, to ensure - * consistency when inserting and removing from multiple threads. 
- */ -void -multilist_remove(multilist_t *ml, void *obj) -{ - unsigned int sublist_idx = ml->ml_index_func(ml, obj); - multilist_sublist_t *mls; - boolean_t need_lock; - - DTRACE_PROBE3(multilist__remove, multilist_t *, ml, - unsigned int, sublist_idx, void *, obj); - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - - mls = &ml->ml_sublists[sublist_idx]; - /* See comment in multilist_insert(). */ - need_lock = !MUTEX_HELD(&mls->mls_lock); - - if (need_lock) - mutex_enter(&mls->mls_lock); - - ASSERT(multilist_link_active(multilist_d2l(ml, obj))); - - multilist_sublist_remove(mls, obj); - - if (need_lock) - mutex_exit(&mls->mls_lock); -} - -/* - * Check to see if this multilist object is empty. - * - * This will return TRUE if it finds all of the sublists of this - * multilist to be empty, and FALSE otherwise. Each sublist lock will be - * automatically acquired as necessary. - * - * If concurrent insertions and removals are occurring, the semantics - * of this function become a little fuzzy. Instead of locking all - * sublists for the entire call time of the function, each sublist is - * only locked as it is individually checked for emptiness. Thus, it's - * possible for this function to return TRUE with non-empty sublists at - * the time the function returns. This would be due to another thread - * inserting into a given sublist, after that specific sublist was check - * and deemed empty, but before all sublists have been checked. - */ -int -multilist_is_empty(multilist_t *ml) -{ - for (int i = 0; i < ml->ml_num_sublists; i++) { - multilist_sublist_t *mls = &ml->ml_sublists[i]; - /* See comment in multilist_insert(). 
*/ - boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); - - if (need_lock) - mutex_enter(&mls->mls_lock); - - if (!list_is_empty(&mls->mls_list)) { - if (need_lock) - mutex_exit(&mls->mls_lock); - - return (FALSE); - } - - if (need_lock) - mutex_exit(&mls->mls_lock); - } - - return (TRUE); -} - -/* Return the number of sublists composing this multilist */ -unsigned int -multilist_get_num_sublists(multilist_t *ml) -{ - return (ml->ml_num_sublists); -} - -/* Return a randomly selected, valid sublist index for this multilist */ -unsigned int -multilist_get_random_index(multilist_t *ml) -{ - return (spa_get_random(ml->ml_num_sublists)); -} - -/* Lock and return the sublist specified at the given index */ -multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) -{ - multilist_sublist_t *mls; - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - mls = &ml->ml_sublists[sublist_idx]; - mutex_enter(&mls->mls_lock); - - return (mls); -} - -/* Lock and return the sublist that would be used to store the specified obj */ -multilist_sublist_t * -multilist_sublist_lock_obj(multilist_t *ml, void *obj) -{ - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); -} - -void -multilist_sublist_unlock(multilist_sublist_t *mls) -{ - mutex_exit(&mls->mls_lock); -} - -/* - * We're allowing any object to be inserted into this specific sublist, - * but this can lead to trouble if multilist_remove() is called to - * remove this object. Specifically, if calling ml_index_func on this - * object returns an index for sublist different than what is passed as - * a parameter here, any call to multilist_remove() with this newly - * inserted object is undefined! 
(the call to multilist_remove() will - * remove the object from a list that it isn't contained in) - */ -void -multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - list_insert_head(&mls->mls_list, obj); -} - -/* please see comment above multilist_sublist_insert_head */ -void -multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - list_insert_tail(&mls->mls_list, obj); -} - -/* - * Move the object one element forward in the list. - * - * This function will move the given object forward in the list (towards - * the head) by one object. So, in essence, it will swap its position in - * the list with its "prev" pointer. If the given object is already at the - * head of the list, it cannot be moved forward any more than it already - * is, so no action is taken. - * - * NOTE: This function **must not** remove any object from the list other - * than the object given as the parameter. This is relied upon in - * arc_evict_state_impl(). 
- */ -void -multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) -{ - void *prev = list_prev(&mls->mls_list, obj); - - ASSERT(MUTEX_HELD(&mls->mls_lock)); - ASSERT(!list_is_empty(&mls->mls_list)); - - /* 'obj' must be at the head of the list, nothing to do */ - if (prev == NULL) - return; - - list_remove(&mls->mls_list, obj); - list_insert_before(&mls->mls_list, prev, obj); -} - -void -multilist_sublist_remove(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - list_remove(&mls->mls_list, obj); -} - -int -multilist_sublist_is_empty(multilist_sublist_t *mls) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_is_empty(&mls->mls_list)); -} - -int -multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx) -{ - multilist_sublist_t *mls; - int empty; - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - mls = &ml->ml_sublists[sublist_idx]; - ASSERT(!MUTEX_HELD(&mls->mls_lock)); - mutex_enter(&mls->mls_lock); - empty = list_is_empty(&mls->mls_list); - mutex_exit(&mls->mls_lock); - return (empty); -} - -void * -multilist_sublist_head(multilist_sublist_t *mls) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_head(&mls->mls_list)); -} - -void * -multilist_sublist_tail(multilist_sublist_t *mls) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_tail(&mls->mls_list)); -} - -void * -multilist_sublist_next(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_next(&mls->mls_list, obj)); -} - -void * -multilist_sublist_prev(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_prev(&mls->mls_list, obj)); -} - -void -multilist_link_init(multilist_node_t *link) -{ - list_link_init(link); -} - -int -multilist_link_active(multilist_node_t *link) -{ - return (list_link_active(link)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c deleted file 
mode 100644 index fc705e37964d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c +++ /dev/null @@ -1,670 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Range trees are tree-based data structures that can be used to - * track free space or generally any space allocation information. - * A range tree keeps track of individual segments and automatically - * provides facilities such as adjacent extent merging and extent - * splitting in response to range add/remove requests. - * - * A range tree starts out completely empty, with no segments in it. 
- * Adding an allocation via range_tree_add to the range tree can either: - * 1) create a new extent - * 2) extend an adjacent extent - * 3) merge two adjacent extents - * Conversely, removing an allocation via range_tree_remove can: - * 1) completely remove an extent - * 2) shorten an extent (if the allocation was near one of its ends) - * 3) split an extent into two extents, in effect punching a hole - * - * A range tree is also capable of 'bridging' gaps when adding - * allocations. This is useful for cases when close proximity of - * allocations is an important detail that needs to be represented - * in the range tree. See range_tree_set_gap(). The default behavior - * is not to bridge gaps (i.e. the maximum allowed gap size is 0). - * - * In order to traverse a range tree, use either the range_tree_walk() - * or range_tree_vacate() functions. - * - * To obtain more accurate information on individual segment - * operations that the range tree performs "under the hood", you can - * specify a set of callbacks by passing a range_tree_ops_t structure - * to the range_tree_create function. Any callbacks that are non-NULL - * are then called at the appropriate times. - * - * The range tree code also supports a special variant of range trees - * that can bridge small gaps between segments. This kind of tree is used - * by the dsl scanning code to group I/Os into mostly sequential chunks to - * optimize disk performance. The code here attempts to do this with as - * little memory and computational overhead as possible. One limitation of - * this implementation is that segments of range trees with gaps can only - * support removing complete segments. 
- */ - -kmem_cache_t *range_seg_cache; - -/* Generic ops for managing an AVL tree alongside a range tree */ -struct range_tree_ops rt_avl_ops = { - .rtop_create = rt_avl_create, - .rtop_destroy = rt_avl_destroy, - .rtop_add = rt_avl_add, - .rtop_remove = rt_avl_remove, - .rtop_vacate = rt_avl_vacate, -}; - -void -range_tree_init(void) -{ - ASSERT(range_seg_cache == NULL); - range_seg_cache = kmem_cache_create("range_seg_cache", - sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -range_tree_fini(void) -{ - kmem_cache_destroy(range_seg_cache); - range_seg_cache = NULL; -} - -void -range_tree_stat_verify(range_tree_t *rt) -{ - range_seg_t *rs; - uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; - int i; - - for (rs = avl_first(&rt->rt_root); rs != NULL; - rs = AVL_NEXT(&rt->rt_root, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; - int idx = highbit64(size) - 1; - - hist[idx]++; - ASSERT3U(hist[idx], !=, 0); - } - - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - if (hist[i] != rt->rt_histogram[i]) { - zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu", - i, hist, hist[i], rt->rt_histogram[i]); - } - VERIFY3U(hist[i], ==, rt->rt_histogram[i]); - } -} - -static void -range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) -{ - uint64_t size = rs->rs_end - rs->rs_start; - int idx = highbit64(size) - 1; - - ASSERT(size != 0); - ASSERT3U(idx, <, - sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - - rt->rt_histogram[idx]++; - ASSERT3U(rt->rt_histogram[idx], !=, 0); -} - -static void -range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) -{ - uint64_t size = rs->rs_end - rs->rs_start; - int idx = highbit64(size) - 1; - - ASSERT(size != 0); - ASSERT3U(idx, <, - sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - - ASSERT3U(rt->rt_histogram[idx], !=, 0); - rt->rt_histogram[idx]--; -} - -/* - * NOTE: caller is responsible for all locking. 
- */ -static int -range_tree_seg_compare(const void *x1, const void *x2) -{ - const range_seg_t *r1 = (const range_seg_t *)x1; - const range_seg_t *r2 = (const range_seg_t *)x2; - - ASSERT3U(r1->rs_start, <=, r1->rs_end); - ASSERT3U(r2->rs_start, <=, r2->rs_end); - - return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); -} - -range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap) -{ - range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - - avl_create(&rt->rt_root, range_tree_seg_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); - - rt->rt_ops = ops; - rt->rt_arg = arg; - rt->rt_gap = gap; - rt->rt_avl_compare = avl_compare; - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) - rt->rt_ops->rtop_create(rt, rt->rt_arg); - - return (rt); -} - -range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) -{ - return (range_tree_create_impl(ops, arg, NULL, 0)); -} - -void -range_tree_destroy(range_tree_t *rt) -{ - VERIFY0(rt->rt_space); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) - rt->rt_ops->rtop_destroy(rt, rt->rt_arg); - - avl_destroy(&rt->rt_root); - kmem_free(rt, sizeof (*rt)); -} - -void -range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) -{ - ASSERT3U(rs->rs_fill + delta, !=, 0); - ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_fill += delta; - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); -} - -static void -range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) -{ - range_tree_t *rt = arg; - avl_index_t where; - range_seg_t rsearch, *rs_before, *rs_after, *rs; - uint64_t end = start + size, gap = rt->rt_gap; - uint64_t bridge_size = 0; - boolean_t merge_before, 
merge_after; - - ASSERT3U(size, !=, 0); - ASSERT3U(fill, <=, size); - - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); - - if (gap == 0 && rs != NULL && - rs->rs_start <= start && rs->rs_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } - - /* - * If this is a gap-supporting range tree, it is possible that we - * are inserting into an existing segment. In this case simply - * bump the fill count and call the remove / add callbacks. If the - * new range will extend an existing segment, we remove the - * existing one, apply the new extent to it and re-insert it using - * the normal code paths. - */ - if (rs != NULL) { - ASSERT3U(gap, !=, 0); - if (rs->rs_start <= start && rs->rs_end >= end) { - range_tree_adjust_fill(rt, rs, fill); - return; - } - - avl_remove(&rt->rt_root, rs); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - - range_tree_stat_decr(rt, rs); - rt->rt_space -= rs->rs_end - rs->rs_start; - - fill += rs->rs_fill; - start = MIN(start, rs->rs_start); - end = MAX(end, rs->rs_end); - size = end - start; - - range_tree_add_impl(rt, start, size, fill); - - kmem_cache_free(range_seg_cache, rs); - return; - } - - ASSERT3P(rs, ==, NULL); - - /* - * Determine whether or not we will have to merge with our neighbors. - * If gap != 0, we might need to merge with our neighbors even if we - * aren't directly touching. 
- */ - rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); - rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); - - merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); - merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); - - if (merge_before && gap != 0) - bridge_size += start - rs_before->rs_end; - if (merge_after && gap != 0) - bridge_size += rs_after->rs_start - end; - - if (merge_before && merge_after) { - avl_remove(&rt->rt_root, rs_before); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { - rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); - rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); - } - - range_tree_stat_decr(rt, rs_before); - range_tree_stat_decr(rt, rs_after); - - rs_after->rs_fill += rs_before->rs_fill + fill; - rs_after->rs_start = rs_before->rs_start; - kmem_cache_free(range_seg_cache, rs_before); - rs = rs_after; - } else if (merge_before) { - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); - - range_tree_stat_decr(rt, rs_before); - - rs_before->rs_fill += fill; - rs_before->rs_end = end; - rs = rs_before; - } else if (merge_after) { - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); - - range_tree_stat_decr(rt, rs_after); - - rs_after->rs_fill += fill; - rs_after->rs_start = start; - rs = rs_after; - } else { - rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - - rs->rs_fill = fill; - rs->rs_start = start; - rs->rs_end = end; - avl_insert(&rt->rt_root, rs, where); - } - - if (gap != 0) - ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); - else - ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); - - range_tree_stat_incr(rt, rs); - rt->rt_space += size + bridge_size; -} - -void -range_tree_add(void *arg, uint64_t start, uint64_t size) -{ - 
range_tree_add_impl(arg, start, size, size); -} - -static void -range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, - boolean_t do_fill) -{ - avl_index_t where; - range_seg_t rsearch, *rs, *newseg; - uint64_t end = start + size; - boolean_t left_over, right_over; - - VERIFY3U(size, !=, 0); - VERIFY3U(size, <=, rt->rt_space); - - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); - - /* Make sure we completely overlap with someone */ - if (rs == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", - (longlong_t)start, (longlong_t)size); - return; - } - - /* - * Range trees with gap support must only remove complete segments - * from the tree. This allows us to maintain accurate fill accounting - * and to ensure that bridged sections are not leaked. If we need to - * remove less than the full segment, we can only adjust the fill count. - */ - if (rt->rt_gap != 0) { - if (do_fill) { - if (rs->rs_fill == size) { - start = rs->rs_start; - end = rs->rs_end; - size = end - start; - } else { - range_tree_adjust_fill(rt, rs, -size); - return; - } - } else if (rs->rs_start != start || rs->rs_end != end) { - zfs_panic_recover("zfs: freeing partial segment of " - "gap tree (offset=%llu size=%llu) of " - "(offset=%llu size=%llu)", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } - } - - VERIFY3U(rs->rs_start, <=, start); - VERIFY3U(rs->rs_end, >=, end); - - left_over = (rs->rs_start != start); - right_over = (rs->rs_end != end); - - range_tree_stat_decr(rt, rs); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - - if (left_over && right_over) { - newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - newseg->rs_start = end; - newseg->rs_end = rs->rs_end; - newseg->rs_fill = newseg->rs_end - newseg->rs_start; - range_tree_stat_incr(rt, newseg); - 
- rs->rs_end = start; - - avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); - } else if (left_over) { - rs->rs_end = start; - } else if (right_over) { - rs->rs_start = end; - } else { - avl_remove(&rt->rt_root, rs); - kmem_cache_free(range_seg_cache, rs); - rs = NULL; - } - - if (rs != NULL) { - /* - * The fill of the leftover segment will always be equal to - * the size, since we do not support removing partial segments - * of range trees with gaps. - */ - rs->rs_fill = rs->rs_end - rs->rs_start; - range_tree_stat_incr(rt, rs); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); - } - - rt->rt_space -= size; -} - -void -range_tree_remove(void *arg, uint64_t start, uint64_t size) -{ - range_tree_remove_impl(arg, start, size, B_FALSE); -} - -void -range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_tree_remove_impl(rt, start, size, B_TRUE); -} - -void -range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, - uint64_t newstart, uint64_t newsize) -{ - int64_t delta = newsize - (rs->rs_end - rs->rs_start); - - range_tree_stat_decr(rt, rs); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - - rs->rs_start = newstart; - rs->rs_end = newstart + newsize; - - range_tree_stat_incr(rt, rs); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); - - rt->rt_space += delta; -} - -static range_seg_t * -range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t rsearch; - uint64_t end = start + size; - - VERIFY(size != 0); - - rsearch.rs_start = start; - rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, NULL)); -} - -range_seg_t * -range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t *rs = 
range_tree_find_impl(rt, start, size); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) - return (rs); - return (NULL); -} - -void -range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) -{ - range_seg_t *rs = range_tree_find(rt, off, size); - if (rs != NULL) - panic("segment already in tree; rs=%p", (void *)rs); -} - -boolean_t -range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) -{ - return (range_tree_find(rt, start, size) != NULL); -} - -/* - * Ensure that this range is not in the tree, regardless of whether - * it is currently in the tree. - */ -void -range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t *rs; - - if (size == 0) - return; - - while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { - uint64_t free_start = MAX(rs->rs_start, start); - uint64_t free_end = MIN(rs->rs_end, start + size); - range_tree_remove(rt, free_start, free_end - free_start); - } -} - -void -range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) -{ - range_tree_t *rt; - - ASSERT0(range_tree_space(*rtdst)); - ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); - - rt = *rtsrc; - *rtsrc = *rtdst; - *rtdst = rt; -} - -void -range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) -{ - range_seg_t *rs; - void *cookie = NULL; - - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) - rt->rt_ops->rtop_vacate(rt, rt->rt_arg); - - while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { - if (func != NULL) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); - kmem_cache_free(range_seg_cache, rs); - } - - bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); - rt->rt_space = 0; -} - -void -range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) -{ - range_seg_t *rs; - - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); -} - -range_seg_t * -range_tree_first(range_tree_t *rt) 
-{ - return (avl_first(&rt->rt_root)); -} - -uint64_t -range_tree_space(range_tree_t *rt) -{ - return (rt->rt_space); -} - -/* Generic range tree functions for maintaining segments in an AVL tree. */ -void -rt_avl_create(range_tree_t *rt, void *arg) -{ - avl_tree_t *tree = arg; - - avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), - offsetof(range_seg_t, rs_pp_node)); -} - -void -rt_avl_destroy(range_tree_t *rt, void *arg) -{ - avl_tree_t *tree = arg; - - ASSERT0(avl_numnodes(tree)); - avl_destroy(tree); -} - -void -rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - avl_tree_t *tree = arg; - avl_add(tree, rs); -} - -void -rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - avl_tree_t *tree = arg; - avl_remove(tree, rs); -} - -void -rt_avl_vacate(range_tree_t *rt, void *arg) -{ - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - rt_avl_create(rt, arg); -} - -boolean_t -range_tree_is_empty(range_tree_t *rt) -{ - ASSERT(rt != NULL); - return (range_tree_space(rt) == 0); -} - -uint64_t -range_tree_min(range_tree_t *rt) -{ - range_seg_t *rs = avl_first(&rt->rt_root); - return (rs != NULL ? rs->rs_start : 0); -} - -uint64_t -range_tree_max(range_tree_t *rt) -{ - range_seg_t *rs = avl_last(&rt->rt_root); - return (rs != NULL ? 
rs->rs_end : 0); -} - -uint64_t -range_tree_span(range_tree_t *rt) -{ - return (range_tree_max(rt) - range_tree_min(rt)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c deleted file mode 100644 index b03a3c4abd45..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ /dev/null @@ -1,321 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
- */ - -#include -#include - -#ifdef ZFS_DEBUG - -#ifdef _KERNEL -int reference_tracking_enable = FALSE; /* runs out of memory too easily */ -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, - &reference_tracking_enable, 0, - "Track reference holders to refcount_t objects, used mostly by ZFS"); -#else -int reference_tracking_enable = TRUE; -#endif -int reference_history = 3; /* tunable */ - -static kmem_cache_t *reference_cache; -static kmem_cache_t *reference_history_cache; - -void -zfs_refcount_init(void) -{ - reference_cache = kmem_cache_create("reference_cache", - sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - reference_history_cache = kmem_cache_create("reference_history_cache", - sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -zfs_refcount_fini(void) -{ - kmem_cache_destroy(reference_cache); - kmem_cache_destroy(reference_history_cache); -} - -void -zfs_refcount_create(zfs_refcount_t *rc) -{ - mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&rc->rc_list, sizeof (reference_t), - offsetof(reference_t, ref_link)); - list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); - rc->rc_count = 0; - rc->rc_removed_count = 0; - rc->rc_tracked = reference_tracking_enable; -} - -void -zfs_refcount_create_tracked(zfs_refcount_t *rc) -{ - zfs_refcount_create(rc); - rc->rc_tracked = B_TRUE; -} - -void -zfs_refcount_create_untracked(zfs_refcount_t *rc) -{ - zfs_refcount_create(rc); - rc->rc_tracked = B_FALSE; -} - -void -zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) -{ - reference_t *ref; - - ASSERT(rc->rc_count == number); - while (ref = list_head(&rc->rc_list)) { - list_remove(&rc->rc_list, ref); - kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_list); - - while (ref = list_head(&rc->rc_removed)) { - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, ref->ref_removed); - 
kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_removed); - mutex_destroy(&rc->rc_mtx); -} - -void -zfs_refcount_destroy(zfs_refcount_t *rc) -{ - zfs_refcount_destroy_many(rc, 0); -} - -int -zfs_refcount_is_zero(zfs_refcount_t *rc) -{ - return (rc->rc_count == 0); -} - -int64_t -zfs_refcount_count(zfs_refcount_t *rc) -{ - return (rc->rc_count); -} - -int64_t -zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, void *holder) -{ - reference_t *ref = NULL; - int64_t count; - - if (rc->rc_tracked) { - ref = kmem_cache_alloc(reference_cache, KM_SLEEP); - ref->ref_holder = holder; - ref->ref_number = number; - } - mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= 0); - if (rc->rc_tracked) - list_insert_head(&rc->rc_list, ref); - rc->rc_count += number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - - return (count); -} - -int64_t -zfs_refcount_add(zfs_refcount_t *rc, void *holder) -{ - return (zfs_refcount_add_many(rc, 1, holder)); -} - -int64_t -zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, void *holder) -{ - reference_t *ref; - int64_t count; - - mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= number); - - if (!rc->rc_tracked) { - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder && ref->ref_number == number) { - list_remove(&rc->rc_list, ref); - if (reference_history > 0) { - ref->ref_removed = - kmem_cache_alloc(reference_history_cache, - KM_SLEEP); - list_insert_head(&rc->rc_removed, ref); - rc->rc_removed_count++; - if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, - ref->ref_removed); - kmem_cache_free(reference_cache, ref); - rc->rc_removed_count--; - } - } else { - kmem_cache_free(reference_cache, ref); - } - rc->rc_count -= number; - 
count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); - } - } - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); -} - -int64_t -zfs_refcount_remove(zfs_refcount_t *rc, void *holder) -{ - return (zfs_refcount_remove_many(rc, 1, holder)); -} - -void -zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) -{ - int64_t count, removed_count; - list_t list, removed; - - list_create(&list, sizeof (reference_t), - offsetof(reference_t, ref_link)); - list_create(&removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); - - mutex_enter(&src->rc_mtx); - count = src->rc_count; - removed_count = src->rc_removed_count; - src->rc_count = 0; - src->rc_removed_count = 0; - list_move_tail(&list, &src->rc_list); - list_move_tail(&removed, &src->rc_removed); - mutex_exit(&src->rc_mtx); - - mutex_enter(&dst->rc_mtx); - dst->rc_count += count; - dst->rc_removed_count += removed_count; - list_move_tail(&dst->rc_list, &list); - list_move_tail(&dst->rc_removed, &removed); - mutex_exit(&dst->rc_mtx); - - list_destroy(&list); - list_destroy(&removed); -} - -void -zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, - void *new_holder) -{ - reference_t *ref; - boolean_t found = B_FALSE; - - mutex_enter(&rc->rc_mtx); - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); - return; - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == current_holder) { - ref->ref_holder = new_holder; - found = B_TRUE; - break; - } - } - ASSERT(found); - mutex_exit(&rc->rc_mtx); -} - -/* - * If tracking is enabled, return true if a reference exists that matches - * the "holder" tag. If tracking is disabled, then return true if a reference - * might be held. 
- */ -boolean_t -zfs_refcount_held(zfs_refcount_t *rc, void *holder) -{ - reference_t *ref; - - mutex_enter(&rc->rc_mtx); - - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); - return (rc->rc_count > 0); - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - } - mutex_exit(&rc->rc_mtx); - return (B_FALSE); -} - -/* - * If tracking is enabled, return true if a reference does not exist that - * matches the "holder" tag. If tracking is disabled, always return true - * since the reference might not be held. - */ -boolean_t -zfs_refcount_not_held(zfs_refcount_t *rc, void *holder) -{ - reference_t *ref; - - mutex_enter(&rc->rc_mtx); - - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_FALSE); - } - } - mutex_exit(&rc->rc_mtx); - return (B_TRUE); -} -#endif /* ZFS_DEBUG */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c deleted file mode 100644 index 6e7456efb2d5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c +++ /dev/null @@ -1,396 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#include - -/* - * This file contains the implementation of a re-entrant read - * reader/writer lock (aka "rrwlock"). - * - * This is a normal reader/writer lock with the additional feature - * of allowing threads who have already obtained a read lock to - * re-enter another read lock (re-entrant read) - even if there are - * waiting writers. - * - * Callers who have not obtained a read lock give waiting writers priority. - * - * The rrwlock_t lock does not allow re-entrant writers, nor does it - * allow a re-entrant mix of reads and writes (that is, it does not - * allow a caller who has already obtained a read lock to be able to - * then grab a write lock without first dropping all read locks, and - * vice versa). - * - * The rrwlock_t uses tsd (thread specific data) to keep a list of - * nodes (rrw_node_t), where each node keeps track of which specific - * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering - * should be rare, a thread that grabs multiple reads on the same rrwlock_t - * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the - * tsd list can represent a different rrwlock_t. This allows a thread - * to enter multiple and unique rrwlock_ts for read locks at the same time. - * - * Since using tsd exposes some overhead, the rrwlock_t only needs to - * keep tsd data when writers are waiting. If no writers are waiting, then - * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd - * is needed. 
Once a writer attempts to grab the lock, readers then - * keep tsd data and bump the linked readers count (rr_linked_rcount). - * - * If there are waiting writers and there are anonymous readers, then a - * reader doesn't know if it is a re-entrant lock. But since it may be one, - * we allow the read to proceed (otherwise it could deadlock). Since once - * waiting writers are active, readers no longer bump the anonymous count, - * the anonymous readers will eventually flush themselves out. At this point, - * readers will be able to tell if they are a re-entrant lock (have a - * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then - * we must let the proceed. If they are not, then the reader blocks for the - * waiting writers. Hence, we do not starve writers. - */ - -/* global key for TSD */ -uint_t rrw_tsd_key; - -typedef struct rrw_node { - struct rrw_node *rn_next; - rrwlock_t *rn_rrl; - void *rn_tag; -} rrw_node_t; - -static rrw_node_t * -rrn_find(rrwlock_t *rrl) -{ - rrw_node_t *rn; - - if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); - - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl) - return (rn); - } - return (NULL); -} - -/* - * Add a node to the head of the singly linked list. - */ -static void -rrn_add(rrwlock_t *rrl, void *tag) -{ - rrw_node_t *rn; - - rn = kmem_alloc(sizeof (*rn), KM_SLEEP); - rn->rn_rrl = rrl; - rn->rn_next = tsd_get(rrw_tsd_key); - rn->rn_tag = tag; - VERIFY(tsd_set(rrw_tsd_key, rn) == 0); -} - -/* - * If a node is found for 'rrl', then remove the node from this - * thread's list and return TRUE; otherwise return FALSE. 
- */ -static boolean_t -rrn_find_and_remove(rrwlock_t *rrl, void *tag) -{ - rrw_node_t *rn; - rrw_node_t *prev = NULL; - - if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0) - return (B_FALSE); - - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl && rn->rn_tag == tag) { - if (prev) - prev->rn_next = rn->rn_next; - else - VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); - kmem_free(rn, sizeof (*rn)); - return (B_TRUE); - } - prev = rn; - } - return (B_FALSE); -} - -void -rrw_init(rrwlock_t *rrl, boolean_t track_all) -{ - mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); - rrl->rr_writer = NULL; - zfs_refcount_create(&rrl->rr_anon_rcount); - zfs_refcount_create(&rrl->rr_linked_rcount); - rrl->rr_writer_wanted = B_FALSE; - rrl->rr_track_all = track_all; -} - -void -rrw_destroy(rrwlock_t *rrl) -{ - mutex_destroy(&rrl->rr_lock); - cv_destroy(&rrl->rr_cv); - ASSERT(rrl->rr_writer == NULL); - zfs_refcount_destroy(&rrl->rr_anon_rcount); - zfs_refcount_destroy(&rrl->rr_linked_rcount); -} - -static void -rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) -{ - mutex_enter(&rrl->rr_lock); -#if !defined(DEBUG) && defined(_KERNEL) - if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && - !rrl->rr_track_all) { - rrl->rr_anon_rcount.rc_count++; - mutex_exit(&rrl->rr_lock); - return; - } - DTRACE_PROBE(zfs__rrwfastpath__rdmiss); -#endif - ASSERT(rrl->rr_writer != curthread); - ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0); - - while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && - zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio && - rrn_find(rrl) == NULL)) - cv_wait(&rrl->rr_cv, &rrl->rr_lock); - - if (rrl->rr_writer_wanted || rrl->rr_track_all) { - /* may or may not be a re-entrant enter */ - rrn_add(rrl, tag); - (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag); - } else { - (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag); - } - ASSERT(rrl->rr_writer 
== NULL); - mutex_exit(&rrl->rr_lock); -} - -void -rrw_enter_read(rrwlock_t *rrl, void *tag) -{ - rrw_enter_read_impl(rrl, B_FALSE, tag); -} - -/* - * take a read lock even if there are pending write lock requests. if we want - * to take a lock reentrantly, but from different threads (that have a - * relationship to each other), the normal detection mechanism to overrule - * the pending writer does not work, so we have to give an explicit hint here. - */ -void -rrw_enter_read_prio(rrwlock_t *rrl, void *tag) -{ - rrw_enter_read_impl(rrl, B_TRUE, tag); -} - - -void -rrw_enter_write(rrwlock_t *rrl) -{ - mutex_enter(&rrl->rr_lock); - ASSERT(rrl->rr_writer != curthread); - - while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 || - zfs_refcount_count(&rrl->rr_linked_rcount) > 0 || - rrl->rr_writer != NULL) { - rrl->rr_writer_wanted = B_TRUE; - cv_wait(&rrl->rr_cv, &rrl->rr_lock); - } - rrl->rr_writer_wanted = B_FALSE; - rrl->rr_writer = curthread; - mutex_exit(&rrl->rr_lock); -} - -void -rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) -{ - if (rw == RW_READER) - rrw_enter_read(rrl, tag); - else - rrw_enter_write(rrl); -} - -void -rrw_exit(rrwlock_t *rrl, void *tag) -{ - mutex_enter(&rrl->rr_lock); -#if !defined(DEBUG) && defined(_KERNEL) - if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { - rrl->rr_anon_rcount.rc_count--; - if (rrl->rr_anon_rcount.rc_count == 0) - cv_broadcast(&rrl->rr_cv); - mutex_exit(&rrl->rr_lock); - return; - } - DTRACE_PROBE(zfs__rrwfastpath__exitmiss); -#endif - ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) || - !zfs_refcount_is_zero(&rrl->rr_linked_rcount) || - rrl->rr_writer != NULL); - - if (rrl->rr_writer == NULL) { - int64_t count; - if (rrn_find_and_remove(rrl, tag)) { - count = zfs_refcount_remove( - &rrl->rr_linked_rcount, tag); - } else { - ASSERT(!rrl->rr_track_all); - count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag); - } - if (count == 0) - cv_broadcast(&rrl->rr_cv); - } else { - ASSERT(rrl->rr_writer == 
curthread); - ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) && - zfs_refcount_is_zero(&rrl->rr_linked_rcount)); - rrl->rr_writer = NULL; - cv_broadcast(&rrl->rr_cv); - } - mutex_exit(&rrl->rr_lock); -} - -/* - * If the lock was created with track_all, rrw_held(RW_READER) will return - * B_TRUE iff the current thread has the lock for reader. Otherwise it may - * return B_TRUE if any thread has the lock for reader. - */ -boolean_t -rrw_held(rrwlock_t *rrl, krw_t rw) -{ - boolean_t held; - - mutex_enter(&rrl->rr_lock); - if (rw == RW_WRITER) { - held = (rrl->rr_writer == curthread); - } else { - held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) || - rrn_find(rrl) != NULL); - } - mutex_exit(&rrl->rr_lock); - - return (held); -} - -void -rrw_tsd_destroy(void *arg) -{ - rrw_node_t *rn = arg; - if (rn != NULL) { - panic("thread %p terminating with rrw lock %p held", - (void *)curthread, (void *)rn->rn_rrl); - } -} - -/* - * A reader-mostly lock implementation, tuning above reader-writer locks - * for hightly parallel read acquisitions, while pessimizing writes. - * - * The idea is to split single busy lock into array of locks, so that - * each reader can lock only one of them for read, depending on result - * of simple hash function. That proportionally reduces lock congestion. - * Writer same time has to sequentially aquire write on all the locks. - * That makes write aquisition proportionally slower, but in places where - * it is used (filesystem unmount) performance is not critical. - * - * All the functions below are direct wrappers around functions above. 
- */ -void -rrm_init(rrmlock_t *rrl, boolean_t track_all) -{ - int i; - - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_init(&rrl->locks[i], track_all); -} - -void -rrm_destroy(rrmlock_t *rrl) -{ - int i; - - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_destroy(&rrl->locks[i]); -} - -void -rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) -{ - if (rw == RW_READER) - rrm_enter_read(rrl, tag); - else - rrm_enter_write(rrl); -} - -/* - * This maps the current thread to a specific lock. Note that the lock - * must be released by the same thread that acquired it. We do this - * mapping by taking the thread pointer mod a prime number. We examine - * only the low 32 bits of the thread pointer, because 32-bit division - * is faster than 64-bit division, and the high 32 bits have little - * entropy anyway. - */ -#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) - -void -rrm_enter_read(rrmlock_t *rrl, void *tag) -{ - rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); -} - -void -rrm_enter_write(rrmlock_t *rrl) -{ - int i; - - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_enter_write(&rrl->locks[i]); -} - -void -rrm_exit(rrmlock_t *rrl, void *tag) -{ - int i; - - if (rrl->locks[0].rr_writer == curthread) { - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_exit(&rrl->locks[i], tag); - } else { - rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); - } -} - -boolean_t -rrm_held(rrmlock_t *rrl, krw_t rw) -{ - if (rw == RW_WRITER) { - return (rrw_held(&rrl->locks[0], rw)); - } else { - return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c deleted file mode 100644 index dfc9d012b08d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ /dev/null @@ -1,2012 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright 2011 iXsystems, Inc - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS System attributes: - * - * A generic mechanism to allow for arbitrary attributes - * to be stored in a dnode. The data will be stored in the bonus buffer of - * the dnode and if necessary a special "spill" block will be used to handle - * overflow situations. The spill block will be sized to fit the data - * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the - * spill block is stored at the end of the current bonus buffer. Any - * attributes that would be in the way of the blkptr_t will be relocated - * into the spill block. - * - * Attribute registration: - * - * Stored persistently on a per dataset basis - * a mapping between attribute "string" names and their actual attribute - * numeric values, length, and byteswap function. The names are only used - * during registration. 
All attributes are known by their unique attribute - * id value. If an attribute can have a variable size then the value - * 0 will be used to indicate this. - * - * Attribute Layout: - * - * Attribute layouts are a way to compactly store multiple attributes, but - * without taking the overhead associated with managing each attribute - * individually. Since you will typically have the same set of attributes - * stored in the same order a single table will be used to represent that - * layout. The ZPL for example will usually have only about 10 different - * layouts (regular files, device files, symlinks, - * regular files + scanstamp, files/dir with extended attributes, and then - * you have the possibility of all of those minus ACL, because it would - * be kicked out into the spill block) - * - * Layouts are simply an array of the attributes and their - * ordering i.e. [0, 1, 4, 5, 2] - * - * Each distinct layout is given a unique layout number and that is whats - * stored in the header at the beginning of the SA data buffer. - * - * A layout only covers a single dbuf (bonus or spill). If a set of - * attributes is split up between the bonus buffer and a spill buffer then - * two different layouts will be used. This allows us to byteswap the - * spill without looking at the bonus buffer and keeps the on disk format of - * the bonus and spill buffer the same. - * - * Adding a single attribute will cause the entire set of attributes to - * be rewritten and could result in a new layout number being constructed - * as part of the rewrite if no such layout exists for the new set of - * attribues. The new attribute will be appended to the end of the already - * existing attributes. - * - * Both the attribute registration and attribute layout information are - * stored in normal ZAP attributes. Their should be a small number of - * known layouts and the set of attributes is assumed to typically be quite - * small. 
- * - * The registered attributes and layout "table" information is maintained - * in core and a special "sa_os_t" is attached to the objset_t. - * - * A special interface is provided to allow for quickly applying - * a large set of attributes at once. sa_replace_all_by_template() is - * used to set an array of attributes. This is used by the ZPL when - * creating a brand new file. The template that is passed into the function - * specifies the attribute, size for variable length attributes, location of - * data and special "data locator" function if the data isn't in a contiguous - * location. - * - * Byteswap implications: - * - * Since the SA attributes are not entirely self describing we can't do - * the normal byteswap processing. The special ZAP layout attribute and - * attribute registration attributes define the byteswap function and the - * size of the attributes, unless it is variable sized. - * The normal ZFS byteswapping infrastructure assumes you don't need - * to read any objects in order to do the necessary byteswapping. Whereas - * SA attributes can only be properly byteswapped if the dataset is opened - * and the layout/attribute ZAP attributes are available. Because of this - * the SA attributes will be byteswapped when they are first accessed by - * the SA code that will read the SA data. 
- */ - -typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, - uint16_t length, int length_idx, boolean_t, void *userp); - -static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); -static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); -static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, - sa_hdr_phys_t *hdr); -static void sa_idx_tab_rele(objset_t *os, void *arg); -static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, - int buflen); -static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - sa_data_op_t action, sa_data_locator_t *locator, void *datastart, - uint16_t buflen, dmu_tx_t *tx); - -arc_byteswap_func_t *sa_bswap_table[] = { - byteswap_uint64_array, - byteswap_uint32_array, - byteswap_uint16_array, - byteswap_uint8_array, - zfs_acl_byteswap, -}; - -#define SA_COPY_DATA(f, s, t, l) \ - { \ - if (f == NULL) { \ - if (l == 8) { \ - *(uint64_t *)t = *(uint64_t *)s; \ - } else if (l == 16) { \ - *(uint64_t *)t = *(uint64_t *)s; \ - *(uint64_t *)((uintptr_t)t + 8) = \ - *(uint64_t *)((uintptr_t)s + 8); \ - } else { \ - bcopy(s, t, l); \ - } \ - } else \ - sa_copy_data(f, s, t, l); \ - } - -/* - * This table is fixed and cannot be changed. Its purpose is to - * allow the SA code to work with both old/new ZPL file systems. - * It contains the list of legacy attributes. These attributes aren't - * stored in the "attribute" registry zap objects, since older ZPL file systems - * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will - * use this static table. 
- */ -sa_attr_reg_t sa_legacy_attrs[] = { - {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, - {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, - {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, - {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, - {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, - {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, - {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, - {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, - {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, - {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, - {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, - {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, - {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, - {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, - {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, - {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, -}; - -/* - * This is only used for objects of type DMU_OT_ZNODE - */ -sa_attr_type_t sa_legacy_zpl_layout[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -}; - -/* - * Special dummy layout used for buffers with no attributes. 
- */ -sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; - -static int sa_legacy_attr_count = 16; -static kmem_cache_t *sa_cache = NULL; - -/*ARGSUSED*/ -static int -sa_cache_constructor(void *buf, void *unused, int kmflag) -{ - sa_handle_t *hdl = buf; - - mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -sa_cache_destructor(void *buf, void *unused) -{ - sa_handle_t *hdl = buf; - mutex_destroy(&hdl->sa_lock); -} - -void -sa_cache_init(void) -{ - sa_cache = kmem_cache_create("sa_cache", - sizeof (sa_handle_t), 0, sa_cache_constructor, - sa_cache_destructor, NULL, NULL, NULL, 0); -} - -void -sa_cache_fini(void) -{ - if (sa_cache) - kmem_cache_destroy(sa_cache); -} - -static int -layout_num_compare(const void *arg1, const void *arg2) -{ - const sa_lot_t *node1 = (const sa_lot_t *)arg1; - const sa_lot_t *node2 = (const sa_lot_t *)arg2; - - return (AVL_CMP(node1->lot_num, node2->lot_num)); -} - -static int -layout_hash_compare(const void *arg1, const void *arg2) -{ - const sa_lot_t *node1 = (const sa_lot_t *)arg1; - const sa_lot_t *node2 = (const sa_lot_t *)arg2; - - int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(node1->lot_instance, node2->lot_instance)); -} - -boolean_t -sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) -{ - int i; - - if (count != tbf->lot_attr_count) - return (1); - - for (i = 0; i != count; i++) { - if (attrs[i] != tbf->lot_attrs[i]) - return (1); - } - return (0); -} - -#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) - -static uint64_t -sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) -{ - int i; - uint64_t crc = -1ULL; - - for (i = 0; i != attr_count; i++) - crc ^= SA_ATTR_HASH(attrs[i]); - - return (crc); -} - -static int -sa_get_spill(sa_handle_t *hdl) -{ - int rc; - if (hdl->sa_spill == NULL) { - if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, - &hdl->sa_spill)) == 0) - VERIFY(0 == 
sa_build_index(hdl, SA_SPILL)); - } else { - rc = 0; - } - - return (rc); -} - -/* - * Main attribute lookup/update function - * returns 0 for success or non zero for failures - * - * Operates on bulk array, first failure will abort further processing - */ -int -sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, - sa_data_op_t data_op, dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - int i; - int error = 0; - sa_buf_type_t buftypes; - - buftypes = 0; - - ASSERT(count > 0); - for (i = 0; i != count; i++) { - ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); - - bulk[i].sa_addr = NULL; - /* First check the bonus buffer */ - - if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( - hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { - SA_ATTR_INFO(sa, hdl->sa_bonus_tab, - SA_GET_HDR(hdl, SA_BONUS), - bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); - if (tx && !(buftypes & SA_BONUS)) { - dmu_buf_will_dirty(hdl->sa_bonus, tx); - buftypes |= SA_BONUS; - } - } - if (bulk[i].sa_addr == NULL && - ((error = sa_get_spill(hdl)) == 0)) { - if (TOC_ATTR_PRESENT( - hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { - SA_ATTR_INFO(sa, hdl->sa_spill_tab, - SA_GET_HDR(hdl, SA_SPILL), - bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); - if (tx && !(buftypes & SA_SPILL) && - bulk[i].sa_size == bulk[i].sa_length) { - dmu_buf_will_dirty(hdl->sa_spill, tx); - buftypes |= SA_SPILL; - } - } - } - if (error && error != ENOENT) { - return ((error == ECKSUM) ? 
EIO : error); - } - - switch (data_op) { - case SA_LOOKUP: - if (bulk[i].sa_addr == NULL) - return (SET_ERROR(ENOENT)); - if (bulk[i].sa_data) { - SA_COPY_DATA(bulk[i].sa_data_func, - bulk[i].sa_addr, bulk[i].sa_data, - bulk[i].sa_size); - } - continue; - - case SA_UPDATE: - /* existing rewrite of attr */ - if (bulk[i].sa_addr && - bulk[i].sa_size == bulk[i].sa_length) { - SA_COPY_DATA(bulk[i].sa_data_func, - bulk[i].sa_data, bulk[i].sa_addr, - bulk[i].sa_length); - continue; - } else if (bulk[i].sa_addr) { /* attr size change */ - error = sa_modify_attrs(hdl, bulk[i].sa_attr, - SA_REPLACE, bulk[i].sa_data_func, - bulk[i].sa_data, bulk[i].sa_length, tx); - } else { /* adding new attribute */ - error = sa_modify_attrs(hdl, bulk[i].sa_attr, - SA_ADD, bulk[i].sa_data_func, - bulk[i].sa_data, bulk[i].sa_length, tx); - } - if (error) - return (error); - break; - } - } - return (error); -} - -static sa_lot_t * -sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, - uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) -{ - sa_os_t *sa = os->os_sa; - sa_lot_t *tb, *findtb; - int i; - avl_index_t loc; - - ASSERT(MUTEX_HELD(&sa->sa_lock)); - tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); - tb->lot_attr_count = attr_count; - tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); - tb->lot_num = lot_num; - tb->lot_hash = hash; - tb->lot_instance = 0; - - if (zapadd) { - char attr_name[8]; - - if (sa->sa_layout_attr_obj == 0) { - sa->sa_layout_attr_obj = zap_create_link(os, - DMU_OT_SA_ATTR_LAYOUTS, - sa->sa_master_obj, SA_LAYOUTS, tx); - } - - (void) snprintf(attr_name, sizeof (attr_name), - "%d", (int)lot_num); - VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, - attr_name, 2, attr_count, attrs, tx)); - } - - list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), - offsetof(sa_idx_tab_t, sa_next)); - - for (i = 0; i != attr_count; i++) { - if 
(sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) - tb->lot_var_sizes++; - } - - avl_add(&sa->sa_layout_num_tree, tb); - - /* verify we don't have a hash collision */ - if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { - for (; findtb && findtb->lot_hash == hash; - findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { - if (findtb->lot_instance != tb->lot_instance) - break; - tb->lot_instance++; - } - } - avl_add(&sa->sa_layout_hash_tree, tb); - return (tb); -} - -static void -sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, - int count, dmu_tx_t *tx, sa_lot_t **lot) -{ - sa_lot_t *tb, tbsearch; - avl_index_t loc; - sa_os_t *sa = os->os_sa; - boolean_t found = B_FALSE; - - mutex_enter(&sa->sa_lock); - tbsearch.lot_hash = hash; - tbsearch.lot_instance = 0; - tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); - if (tb) { - for (; tb && tb->lot_hash == hash; - tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { - if (sa_layout_equal(tb, attrs, count) == 0) { - found = B_TRUE; - break; - } - } - } - if (!found) { - tb = sa_add_layout_entry(os, attrs, count, - avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); - } - mutex_exit(&sa->sa_lock); - *lot = tb; -} - -static int -sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) -{ - int error; - uint32_t blocksize; - - if (size == 0) { - blocksize = SPA_MINBLOCKSIZE; - } else if (size > SPA_OLD_MAXBLOCKSIZE) { - ASSERT(0); - return (SET_ERROR(EFBIG)); - } else { - blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); - } - - error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); - ASSERT(error == 0); - return (error); -} - -static void -sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) -{ - if (func == NULL) { - bcopy(datastart, target, buflen); - } else { - boolean_t start; - int bytes; - void *dataptr; - void *saptr = target; - uint32_t length; - - start = B_TRUE; - bytes = 0; - while (bytes < buflen) { - 
func(&dataptr, &length, buflen, start, datastart); - bcopy(dataptr, saptr, length); - saptr = (void *)((caddr_t)saptr + length); - bytes += length; - start = B_FALSE; - } - } -} - -/* - * Determine several different sizes - * first the sa header size - * the number of bytes to be stored - * if spill would occur the index in the attribute array is returned - * - * the boolean will_spill will be set when spilling is necessary. It - * is only set when the buftype is SA_BONUS - */ -static int -sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, - int *total, boolean_t *will_spill) -{ - int var_size = 0; - int i; - int hdrsize; - int extra_hdrsize; - - if (buftype == SA_BONUS && sa->sa_force_spill) { - *total = 0; - *index = 0; - *will_spill = B_TRUE; - return (0); - } - - *index = -1; - *total = 0; - *will_spill = B_FALSE; - - extra_hdrsize = 0; - hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : - sizeof (sa_hdr_phys_t); - - ASSERT(IS_P2ALIGNED(full_space, 8)); - - for (i = 0; i != attr_count; i++) { - boolean_t is_var_sz; - - *total = P2ROUNDUP(*total, 8); - *total += attr_desc[i].sa_length; - if (*will_spill) - continue; - - is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); - if (is_var_sz) { - var_size++; - } - - if (is_var_sz && var_size > 1) { - /* - * Don't worry that the spill block might overflow. - * It will be resized if needed in sa_build_layouts(). - */ - if (buftype == SA_SPILL || - P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + - *total < full_space) { - /* - * Account for header space used by array of - * optional sizes of variable-length attributes. - * Record the extra header size in case this - * increase needs to be reversed due to - * spill-over. 
- */ - hdrsize += sizeof (uint16_t); - if (*index != -1) - extra_hdrsize += sizeof (uint16_t); - } else { - ASSERT(buftype == SA_BONUS); - if (*index == -1) - *index = i; - *will_spill = B_TRUE; - continue; - } - } - - /* - * find index of where spill *could* occur. - * Then continue to count of remainder attribute - * space. The sum is used later for sizing bonus - * and spill buffer. - */ - if (buftype == SA_BONUS && *index == -1 && - (*total + P2ROUNDUP(hdrsize, 8)) > - (full_space - sizeof (blkptr_t))) { - *index = i; - } - - if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && - buftype == SA_BONUS) - *will_spill = B_TRUE; - } - - if (*will_spill) - hdrsize -= extra_hdrsize; - - hdrsize = P2ROUNDUP(hdrsize, 8); - return (hdrsize); -} - -#define BUF_SPACE_NEEDED(total, header) (total + header) - -/* - * Find layout that corresponds to ordering of attributes - * If not found a new layout number is created and added to - * persistent layout tables. - */ -static int -sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - uint64_t hash; - sa_buf_type_t buftype; - sa_hdr_phys_t *sahdr; - void *data_start; - int buf_space; - sa_attr_type_t *attrs, *attrs_start; - int i, lot_count; - int dnodesize; - int hdrsize; - int spillhdrsize = 0; - int used; - dmu_object_type_t bonustype; - sa_lot_t *lot; - int len_idx; - int spill_used; - int bonuslen; - boolean_t spilling; - - dmu_buf_will_dirty(hdl->sa_bonus, tx); - bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); - dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); - bonuslen = DN_BONUS_SIZE(dnodesize); - - dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); - bonuslen = DN_BONUS_SIZE(dnodesize); - - /* first determine bonus header size and sum of all attributes */ - hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, - SA_BONUS, bonuslen, &i, &used, &spilling); - - if (used > SPA_OLD_MAXBLOCKSIZE) - return (SET_ERROR(EFBIG)); 
- - VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? - MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : - used + hdrsize, tx)); - - ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || - bonustype == DMU_OT_SA); - - /* setup and size spill buffer when needed */ - if (spilling) { - boolean_t dummy; - - if (hdl->sa_spill == NULL) { - VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, - &hdl->sa_spill) == 0); - } - dmu_buf_will_dirty(hdl->sa_spill, tx); - - spillhdrsize = sa_find_sizes(sa, &attr_desc[i], - attr_count - i, hdl->sa_spill, SA_SPILL, - hdl->sa_spill->db_size, &i, &spill_used, &dummy); - - if (spill_used > SPA_OLD_MAXBLOCKSIZE) - return (SET_ERROR(EFBIG)); - - buf_space = hdl->sa_spill->db_size - spillhdrsize; - if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > - hdl->sa_spill->db_size) - VERIFY(0 == sa_resize_spill(hdl, - BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); - } - - /* setup starting pointers to lay down data */ - data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); - sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; - buftype = SA_BONUS; - - if (spilling) - buf_space = (sa->sa_force_spill) ? 
- 0 : SA_BLKPTR_SPACE - hdrsize; - else - buf_space = hdl->sa_bonus->db_size - hdrsize; - - attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - lot_count = 0; - - for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { - uint16_t length; - - ASSERT(IS_P2ALIGNED(data_start, 8)); - ASSERT(IS_P2ALIGNED(buf_space, 8)); - attrs[i] = attr_desc[i].sa_attr; - length = SA_REGISTERED_LEN(sa, attrs[i]); - if (length == 0) - length = attr_desc[i].sa_length; - else - VERIFY(length == attr_desc[i].sa_length); - - if (buf_space < length) { /* switch to spill buffer */ - VERIFY(spilling); - VERIFY(bonustype == DMU_OT_SA); - if (buftype == SA_BONUS && !sa->sa_force_spill) { - sa_find_layout(hdl->sa_os, hash, attrs_start, - lot_count, tx, &lot); - SA_SET_HDR(sahdr, lot->lot_num, hdrsize); - } - - buftype = SA_SPILL; - hash = -1ULL; - len_idx = 0; - - sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; - sahdr->sa_magic = SA_MAGIC; - data_start = (void *)((uintptr_t)sahdr + - spillhdrsize); - attrs_start = &attrs[i]; - buf_space = hdl->sa_spill->db_size - spillhdrsize; - lot_count = 0; - } - hash ^= SA_ATTR_HASH(attrs[i]); - attr_desc[i].sa_addr = data_start; - attr_desc[i].sa_size = length; - SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, - data_start, length); - if (sa->sa_attr_table[attrs[i]].sa_length == 0) { - sahdr->sa_lengths[len_idx++] = length; - } - VERIFY((uintptr_t)data_start % 8 == 0); - data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + - length), 8); - buf_space -= P2ROUNDUP(length, 8); - lot_count++; - } - - sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); - - /* - * Verify that old znodes always have layout number 0. - * Must be DMU_OT_SA for arbitrary layouts - */ - VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || - (bonustype == DMU_OT_SA && lot->lot_num > 1)); - - if (bonustype == DMU_OT_SA) { - SA_SET_HDR(sahdr, lot->lot_num, - buftype == SA_BONUS ? 
hdrsize : spillhdrsize); - } - - kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); - if (hdl->sa_bonus_tab) { - sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); - hdl->sa_bonus_tab = NULL; - } - if (!sa->sa_force_spill) - VERIFY(0 == sa_build_index(hdl, SA_BONUS)); - if (hdl->sa_spill) { - sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); - if (!spilling) { - /* - * remove spill block that is no longer needed. - */ - dmu_buf_rele(hdl->sa_spill, NULL); - hdl->sa_spill = NULL; - hdl->sa_spill_tab = NULL; - VERIFY(0 == dmu_rm_spill(hdl->sa_os, - sa_handle_object(hdl), tx)); - } else { - VERIFY(0 == sa_build_index(hdl, SA_SPILL)); - } - } - - return (0); -} - -static void -sa_free_attr_table(sa_os_t *sa) -{ - int i; - - if (sa->sa_attr_table == NULL) - return; - - for (i = 0; i != sa->sa_num_attrs; i++) { - if (sa->sa_attr_table[i].sa_name) - kmem_free(sa->sa_attr_table[i].sa_name, - strlen(sa->sa_attr_table[i].sa_name) + 1); - } - - kmem_free(sa->sa_attr_table, - sizeof (sa_attr_table_t) * sa->sa_num_attrs); - - sa->sa_attr_table = NULL; -} - -static int -sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) -{ - sa_os_t *sa = os->os_sa; - uint64_t sa_attr_count = 0; - uint64_t sa_reg_count = 0; - int error = 0; - uint64_t attr_value; - sa_attr_table_t *tb; - zap_cursor_t zc; - zap_attribute_t za; - int registered_count = 0; - int i; - dmu_objset_type_t ostype = dmu_objset_type(os); - - sa->sa_user_table = - kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); - sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); - - if (sa->sa_reg_attr_obj != 0) { - error = zap_count(os, sa->sa_reg_attr_obj, - &sa_attr_count); - - /* - * Make sure we retrieved a count and that it isn't zero - */ - if (error || (error == 0 && sa_attr_count == 0)) { - if (error == 0) - error = SET_ERROR(EINVAL); - goto bail; - } - sa_reg_count = sa_attr_count; - } - - if (ostype == DMU_OST_ZFS && sa_attr_count == 0) - sa_attr_count += sa_legacy_attr_count; - - /* Allocate 
attribute numbers for attributes that aren't registered */ - for (i = 0; i != count; i++) { - boolean_t found = B_FALSE; - int j; - - if (ostype == DMU_OST_ZFS) { - for (j = 0; j != sa_legacy_attr_count; j++) { - if (strcmp(reg_attrs[i].sa_name, - sa_legacy_attrs[j].sa_name) == 0) { - sa->sa_user_table[i] = - sa_legacy_attrs[j].sa_attr; - found = B_TRUE; - } - } - } - if (found) - continue; - - if (sa->sa_reg_attr_obj) - error = zap_lookup(os, sa->sa_reg_attr_obj, - reg_attrs[i].sa_name, 8, 1, &attr_value); - else - error = SET_ERROR(ENOENT); - switch (error) { - case ENOENT: - sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; - sa_attr_count++; - break; - case 0: - sa->sa_user_table[i] = ATTR_NUM(attr_value); - break; - default: - goto bail; - } - } - - sa->sa_num_attrs = sa_attr_count; - tb = sa->sa_attr_table = - kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); - - /* - * Attribute table is constructed from requested attribute list, - * previously foreign registered attributes, and also the legacy - * ZPL set of attributes. 
- */ - - if (sa->sa_reg_attr_obj) { - for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - uint64_t value; - value = za.za_first_integer; - - registered_count++; - tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); - tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); - tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); - tb[ATTR_NUM(value)].sa_registered = B_TRUE; - - if (tb[ATTR_NUM(value)].sa_name) { - continue; - } - tb[ATTR_NUM(value)].sa_name = - kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); - (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, - strlen(za.za_name) +1); - } - zap_cursor_fini(&zc); - /* - * Make sure we processed the correct number of registered - * attributes - */ - if (registered_count != sa_reg_count) { - ASSERT(error != 0); - goto bail; - } - - } - - if (ostype == DMU_OST_ZFS) { - for (i = 0; i != sa_legacy_attr_count; i++) { - if (tb[i].sa_name) - continue; - tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; - tb[i].sa_length = sa_legacy_attrs[i].sa_length; - tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; - tb[i].sa_registered = B_FALSE; - tb[i].sa_name = - kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, - KM_SLEEP); - (void) strlcpy(tb[i].sa_name, - sa_legacy_attrs[i].sa_name, - strlen(sa_legacy_attrs[i].sa_name) + 1); - } - } - - for (i = 0; i != count; i++) { - sa_attr_type_t attr_id; - - attr_id = sa->sa_user_table[i]; - if (tb[attr_id].sa_name) - continue; - - tb[attr_id].sa_length = reg_attrs[i].sa_length; - tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; - tb[attr_id].sa_attr = attr_id; - tb[attr_id].sa_name = - kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); - (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, - strlen(reg_attrs[i].sa_name) + 1); - } - - sa->sa_need_attr_registration = - (sa_attr_count != registered_count); - - return (0); -bail: - kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); - sa->sa_user_table 
= NULL; - sa_free_attr_table(sa); - return ((error != 0) ? error : EINVAL); -} - -int -sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - sa_attr_type_t **user_table) -{ - zap_cursor_t zc; - zap_attribute_t za; - sa_os_t *sa; - dmu_objset_type_t ostype = dmu_objset_type(os); - sa_attr_type_t *tb; - int error; - - mutex_enter(&os->os_user_ptr_lock); - if (os->os_sa) { - mutex_enter(&os->os_sa->sa_lock); - mutex_exit(&os->os_user_ptr_lock); - tb = os->os_sa->sa_user_table; - mutex_exit(&os->os_sa->sa_lock); - *user_table = tb; - return (0); - } - - sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); - mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); - sa->sa_master_obj = sa_obj; - - os->os_sa = sa; - mutex_enter(&sa->sa_lock); - mutex_exit(&os->os_user_ptr_lock); - avl_create(&sa->sa_layout_num_tree, layout_num_compare, - sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); - avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, - sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); - - if (sa_obj) { - error = zap_lookup(os, sa_obj, SA_LAYOUTS, - 8, 1, &sa->sa_layout_attr_obj); - if (error != 0 && error != ENOENT) - goto fail; - error = zap_lookup(os, sa_obj, SA_REGISTRY, - 8, 1, &sa->sa_reg_attr_obj); - if (error != 0 && error != ENOENT) - goto fail; - } - - if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) - goto fail; - - if (sa->sa_layout_attr_obj != 0) { - uint64_t layout_count; - - error = zap_count(os, sa->sa_layout_attr_obj, - &layout_count); - - /* - * Layout number count should be > 0 - */ - if (error || (error == 0 && layout_count == 0)) { - if (error == 0) - error = SET_ERROR(EINVAL); - goto fail; - } - - for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - sa_attr_type_t *lot_attrs; - uint64_t lot_num; - - lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * - za.za_num_integers, KM_SLEEP); - - if ((error = (zap_lookup(os, 
sa->sa_layout_attr_obj, - za.za_name, 2, za.za_num_integers, - lot_attrs))) != 0) { - kmem_free(lot_attrs, sizeof (sa_attr_type_t) * - za.za_num_integers); - break; - } - VERIFY(ddi_strtoull(za.za_name, NULL, 10, - (unsigned long long *)&lot_num) == 0); - - (void) sa_add_layout_entry(os, lot_attrs, - za.za_num_integers, lot_num, - sa_layout_info_hash(lot_attrs, - za.za_num_integers), B_FALSE, NULL); - kmem_free(lot_attrs, sizeof (sa_attr_type_t) * - za.za_num_integers); - } - zap_cursor_fini(&zc); - - /* - * Make sure layout count matches number of entries added - * to AVL tree - */ - if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { - ASSERT(error != 0); - goto fail; - } - } - - /* Add special layout number for old ZNODES */ - if (ostype == DMU_OST_ZFS) { - (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, - sa_legacy_attr_count, 0, - sa_layout_info_hash(sa_legacy_zpl_layout, - sa_legacy_attr_count), B_FALSE, NULL); - - (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, - 0, B_FALSE, NULL); - } - *user_table = os->os_sa->sa_user_table; - mutex_exit(&sa->sa_lock); - return (0); -fail: - os->os_sa = NULL; - sa_free_attr_table(sa); - if (sa->sa_user_table) - kmem_free(sa->sa_user_table, sa->sa_user_table_sz); - mutex_exit(&sa->sa_lock); - avl_destroy(&sa->sa_layout_hash_tree); - avl_destroy(&sa->sa_layout_num_tree); - mutex_destroy(&sa->sa_lock); - kmem_free(sa, sizeof (sa_os_t)); - return ((error == ECKSUM) ? 
EIO : error); -} - -void -sa_tear_down(objset_t *os) -{ - sa_os_t *sa = os->os_sa; - sa_lot_t *layout; - void *cookie; - - kmem_free(sa->sa_user_table, sa->sa_user_table_sz); - - /* Free up attr table */ - - sa_free_attr_table(sa); - - cookie = NULL; - while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { - sa_idx_tab_t *tab; - while (tab = list_head(&layout->lot_idx_tab)) { - ASSERT(zfs_refcount_count(&tab->sa_refcount)); - sa_idx_tab_rele(os, tab); - } - } - - cookie = NULL; - while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { - kmem_free(layout->lot_attrs, - sizeof (sa_attr_type_t) * layout->lot_attr_count); - kmem_free(layout, sizeof (sa_lot_t)); - } - - avl_destroy(&sa->sa_layout_hash_tree); - avl_destroy(&sa->sa_layout_num_tree); - mutex_destroy(&sa->sa_lock); - - kmem_free(sa, sizeof (sa_os_t)); - os->os_sa = NULL; -} - -void -sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, - uint16_t length, int length_idx, boolean_t var_length, void *userp) -{ - sa_idx_tab_t *idx_tab = userp; - - if (var_length) { - ASSERT(idx_tab->sa_variable_lengths); - idx_tab->sa_variable_lengths[length_idx] = length; - } - TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, - (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); -} - -static void -sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, - sa_iterfunc_t func, sa_lot_t *tab, void *userp) -{ - void *data_start; - sa_lot_t *tb = tab; - sa_lot_t search; - avl_index_t loc; - sa_os_t *sa = os->os_sa; - int i; - uint16_t *length_start = NULL; - uint8_t length_idx = 0; - - if (tab == NULL) { - search.lot_num = SA_LAYOUT_NUM(hdr, type); - tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); - ASSERT(tb); - } - - if (IS_SA_BONUSTYPE(type)) { - data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + - offsetof(sa_hdr_phys_t, sa_lengths) + - (sizeof (uint16_t) * tb->lot_var_sizes)), 8); - length_start = hdr->sa_lengths; - } else { - data_start = hdr; - } - - for (i 
= 0; i != tb->lot_attr_count; i++) { - int attr_length, reg_length; - uint8_t idx_len; - - reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; - if (reg_length) { - attr_length = reg_length; - idx_len = 0; - } else { - attr_length = length_start[length_idx]; - idx_len = length_idx++; - } - - func(hdr, data_start, tb->lot_attrs[i], attr_length, - idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp); - - data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + - attr_length), 8); - } -} - -/*ARGSUSED*/ -void -sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, - uint16_t length, int length_idx, boolean_t variable_length, void *userp) -{ - sa_handle_t *hdl = userp; - sa_os_t *sa = hdl->sa_os->os_sa; - - sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); -} - -void -sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) -{ - sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); - dmu_buf_impl_t *db; - sa_os_t *sa = hdl->sa_os->os_sa; - int num_lengths = 1; - int i; - - ASSERT(MUTEX_HELD(&sa->sa_lock)); - if (sa_hdr_phys->sa_magic == SA_MAGIC) - return; - - db = SA_GET_DB(hdl, buftype); - - if (buftype == SA_SPILL) { - arc_release(db->db_buf, NULL); - arc_buf_thaw(db->db_buf); - } - - sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); - sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); - - /* - * Determine number of variable lenghts in header - * The standard 8 byte header has one for free and a - * 16 byte header would have 4 + 1; - */ - if (SA_HDR_SIZE(sa_hdr_phys) > 8) - num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; - for (i = 0; i != num_lengths; i++) - sa_hdr_phys->sa_lengths[i] = - BSWAP_16(sa_hdr_phys->sa_lengths[i]); - - sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, - sa_byteswap_cb, NULL, hdl); - - if (buftype == SA_SPILL) - arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); -} - -static int -sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) -{ - sa_hdr_phys_t *sa_hdr_phys; - 
dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); - dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); - sa_os_t *sa = hdl->sa_os->os_sa; - sa_idx_tab_t *idx_tab; - - sa_hdr_phys = SA_GET_HDR(hdl, buftype); - - mutex_enter(&sa->sa_lock); - - /* Do we need to byteswap? */ - - /* only check if not old znode */ - if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && - sa_hdr_phys->sa_magic != 0) { - VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); - sa_byteswap(hdl, buftype); - } - - idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); - - if (buftype == SA_BONUS) - hdl->sa_bonus_tab = idx_tab; - else - hdl->sa_spill_tab = idx_tab; - - mutex_exit(&sa->sa_lock); - return (0); -} - -/*ARGSUSED*/ -static void -sa_evict_sync(void *dbu) -{ - panic("evicting sa dbuf\n"); -} - -static void -sa_idx_tab_rele(objset_t *os, void *arg) -{ - sa_os_t *sa = os->os_sa; - sa_idx_tab_t *idx_tab = arg; - - if (idx_tab == NULL) - return; - - mutex_enter(&sa->sa_lock); - if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { - list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); - if (idx_tab->sa_variable_lengths) - kmem_free(idx_tab->sa_variable_lengths, - sizeof (uint16_t) * - idx_tab->sa_layout->lot_var_sizes); - zfs_refcount_destroy(&idx_tab->sa_refcount); - kmem_free(idx_tab->sa_idx_tab, - sizeof (uint32_t) * sa->sa_num_attrs); - kmem_free(idx_tab, sizeof (sa_idx_tab_t)); - } - mutex_exit(&sa->sa_lock); -} - -static void -sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) -{ - sa_os_t *sa = os->os_sa; - - ASSERT(MUTEX_HELD(&sa->sa_lock)); - (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL); -} - -void -sa_handle_destroy(sa_handle_t *hdl) -{ - dmu_buf_t *db = hdl->sa_bonus; - - mutex_enter(&hdl->sa_lock); - (void) dmu_buf_remove_user(db, &hdl->sa_dbu); - - if (hdl->sa_bonus_tab) - sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); - - if (hdl->sa_spill_tab) - sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); - - dmu_buf_rele(hdl->sa_bonus, 
NULL); - - if (hdl->sa_spill) - dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); - mutex_exit(&hdl->sa_lock); - - kmem_cache_free(sa_cache, hdl); -} - -int -sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, - sa_handle_type_t hdl_type, sa_handle_t **handlepp) -{ - int error = 0; - dmu_object_info_t doi; - sa_handle_t *handle = NULL; - -#ifdef ZFS_DEBUG - dmu_object_info_from_db(db, &doi); - ASSERT(doi.doi_bonus_type == DMU_OT_SA || - doi.doi_bonus_type == DMU_OT_ZNODE); -#endif - /* find handle, if it exists */ - /* if one doesn't exist then create a new one, and initialize it */ - - if (hdl_type == SA_HDL_SHARED) - handle = dmu_buf_get_user(db); - - if (handle == NULL) { - sa_handle_t *winner = NULL; - - handle = kmem_cache_alloc(sa_cache, KM_SLEEP); - handle->sa_dbu.dbu_evict_func_sync = NULL; - handle->sa_dbu.dbu_evict_func_async = NULL; - handle->sa_userp = userp; - handle->sa_bonus = db; - handle->sa_os = os; - handle->sa_spill = NULL; - handle->sa_bonus_tab = NULL; - handle->sa_spill_tab = NULL; - - error = sa_build_index(handle, SA_BONUS); - - if (hdl_type == SA_HDL_SHARED) { - dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, - NULL); - winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); - } - - if (winner != NULL) { - kmem_cache_free(sa_cache, handle); - handle = winner; - } - } - *handlepp = handle; - - return (error); -} - -int -sa_handle_get(objset_t *objset, uint64_t objid, void *userp, - sa_handle_type_t hdl_type, sa_handle_t **handlepp) -{ - dmu_buf_t *db; - int error; - - if (error = dmu_bonus_hold(objset, objid, NULL, &db)) - return (error); - - return (sa_handle_get_from_db(objset, db, userp, hdl_type, - handlepp)); -} - -int -sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) -{ - return (dmu_bonus_hold(objset, obj_num, tag, db)); -} - -void -sa_buf_rele(dmu_buf_t *db, void *tag) -{ - dmu_buf_rele(db, tag); -} - -int -sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) -{ - ASSERT(hdl); - 
ASSERT(MUTEX_HELD(&hdl->sa_lock)); - return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); -} - -int -sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_attr = attr; - bulk.sa_data = buf; - bulk.sa_length = buflen; - bulk.sa_data_func = NULL; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - error = sa_lookup_impl(hdl, &bulk, 1); - mutex_exit(&hdl->sa_lock); - return (error); -} - -#ifdef _KERNEL -int -sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_data = NULL; - bulk.sa_attr = attr; - bulk.sa_data_func = NULL; - - ASSERT(hdl); - - mutex_enter(&hdl->sa_lock); - if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { - error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, - uio->uio_resid), UIO_READ, uio); - } - mutex_exit(&hdl->sa_lock); - return (error); - -} -#endif - -static sa_idx_tab_t * -sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr) -{ - sa_idx_tab_t *idx_tab; - sa_os_t *sa = os->os_sa; - sa_lot_t *tb, search; - avl_index_t loc; - - /* - * Deterimine layout number. If SA node and header == 0 then - * force the index table to the dummy "1" empty layout. - * - * The layout number would only be zero for a newly created file - * that has not added any attributes yet, or with crypto enabled which - * doesn't write any attributes to the bonus buffer. - */ - - search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); - - tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); - - /* Verify header size is consistent with layout information */ - ASSERT(tb); - ASSERT(IS_SA_BONUSTYPE(bonustype) && - SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || - (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); - - /* - * See if any of the already existing TOC entries can be reused? 
- */ - - for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; - idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { - boolean_t valid_idx = B_TRUE; - int i; - - if (tb->lot_var_sizes != 0 && - idx_tab->sa_variable_lengths != NULL) { - for (i = 0; i != tb->lot_var_sizes; i++) { - if (hdr->sa_lengths[i] != - idx_tab->sa_variable_lengths[i]) { - valid_idx = B_FALSE; - break; - } - } - } - if (valid_idx) { - sa_idx_tab_hold(os, idx_tab); - return (idx_tab); - } - } - - /* No such luck, create a new entry */ - idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); - idx_tab->sa_idx_tab = - kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); - idx_tab->sa_layout = tb; - zfs_refcount_create(&idx_tab->sa_refcount); - if (tb->lot_var_sizes) - idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * - tb->lot_var_sizes, KM_SLEEP); - - sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, - tb, idx_tab); - sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ - sa_idx_tab_hold(os, idx_tab); /* one for layout */ - list_insert_tail(&tb->lot_idx_tab, idx_tab); - return (idx_tab); -} - -void -sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, - boolean_t start, void *userdata) -{ - ASSERT(start); - - *dataptr = userdata; - *len = total_len; -} - -static void -sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) -{ - uint64_t attr_value = 0; - sa_os_t *sa = hdl->sa_os->os_sa; - sa_attr_table_t *tb = sa->sa_attr_table; - int i; - - mutex_enter(&sa->sa_lock); - - if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { - mutex_exit(&sa->sa_lock); - return; - } - - if (sa->sa_reg_attr_obj == 0) { - sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, - DMU_OT_SA_ATTR_REGISTRATION, - sa->sa_master_obj, SA_REGISTRY, tx); - } - for (i = 0; i != sa->sa_num_attrs; i++) { - if (sa->sa_attr_table[i].sa_registered) - continue; - ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, - tb[i].sa_byteswap); - VERIFY(0 == zap_update(hdl->sa_os, 
sa->sa_reg_attr_obj, - tb[i].sa_name, 8, 1, &attr_value, tx)); - tb[i].sa_registered = B_TRUE; - } - sa->sa_need_attr_registration = B_FALSE; - mutex_exit(&sa->sa_lock); -} - -/* - * Replace all attributes with attributes specified in template. - * If dnode had a spill buffer then those attributes will be - * also be replaced, possibly with just an empty spill block - * - * This interface is intended to only be used for bulk adding of - * attributes for a new file. It will also be used by the ZPL - * when converting and old formatted znode to native SA support. - */ -int -sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, - int attr_count, dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - - if (sa->sa_need_attr_registration) - sa_attr_register_sync(hdl, tx); - return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); -} - -int -sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, - int attr_count, dmu_tx_t *tx) -{ - int error; - - mutex_enter(&hdl->sa_lock); - error = sa_replace_all_by_template_locked(hdl, attr_desc, - attr_count, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -/* - * Add/remove a single attribute or replace a variable-sized attribute value - * with a value of a different size, and then rewrite the entire set - * of attributes. - * Same-length attribute value replacement (including fixed-length attributes) - * is handled more efficiently by the upper layers. 
- */ -static int -sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - sa_data_op_t action, sa_data_locator_t *locator, void *datastart, - uint16_t buflen, dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; - dnode_t *dn; - sa_bulk_attr_t *attr_desc; - void *old_data[2]; - int bonus_attr_count = 0; - int bonus_data_size = 0; - int spill_data_size = 0; - int spill_attr_count = 0; - int error; - uint16_t length, reg_length; - int i, j, k, length_idx; - sa_hdr_phys_t *hdr; - sa_idx_tab_t *idx_tab; - int attr_count; - int count; - - ASSERT(MUTEX_HELD(&hdl->sa_lock)); - - /* First make of copy of the old data */ - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_bonuslen != 0) { - bonus_data_size = hdl->sa_bonus->db_size; - old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); - bcopy(hdl->sa_bonus->db_data, old_data[0], - hdl->sa_bonus->db_size); - bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; - } else { - old_data[0] = NULL; - } - DB_DNODE_EXIT(db); - - /* Bring spill buffer online if it isn't currently */ - - if ((error = sa_get_spill(hdl)) == 0) { - spill_data_size = hdl->sa_spill->db_size; - old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); - bcopy(hdl->sa_spill->db_data, old_data[1], - hdl->sa_spill->db_size); - spill_attr_count = - hdl->sa_spill_tab->sa_layout->lot_attr_count; - } else if (error && error != ENOENT) { - if (old_data[0]) - kmem_free(old_data[0], bonus_data_size); - return (error); - } else { - old_data[1] = NULL; - } - - /* build descriptor of all attributes */ - - attr_count = bonus_attr_count + spill_attr_count; - if (action == SA_ADD) - attr_count++; - else if (action == SA_REMOVE) - attr_count--; - - attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); - - /* - * loop through bonus and spill buffer if it exists, and - * build up new attr_descriptor to reset the attributes - */ - k = j = 0; - count = bonus_attr_count; - hdr = 
SA_GET_HDR(hdl, SA_BONUS); - idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); - for (; k != 2; k++) { - /* - * Iterate over each attribute in layout. Fetch the - * size of variable-length attributes needing rewrite - * from sa_lengths[]. - */ - for (i = 0, length_idx = 0; i != count; i++) { - sa_attr_type_t attr; - - attr = idx_tab->sa_layout->lot_attrs[i]; - reg_length = SA_REGISTERED_LEN(sa, attr); - if (reg_length == 0) { - length = hdr->sa_lengths[length_idx]; - length_idx++; - } else { - length = reg_length; - } - if (attr == newattr) { - /* - * There is nothing to do for SA_REMOVE, - * so it is just skipped. - */ - if (action == SA_REMOVE) - continue; - - /* - * Duplicate attributes are not allowed, so the - * action can not be SA_ADD here. - */ - ASSERT3S(action, ==, SA_REPLACE); - - /* - * Only a variable-sized attribute can be - * replaced here, and its size must be changing. - */ - ASSERT3U(reg_length, ==, 0); - ASSERT3U(length, !=, buflen); - SA_ADD_BULK_ATTR(attr_desc, j, attr, - locator, datastart, buflen); - } else { - SA_ADD_BULK_ATTR(attr_desc, j, attr, - NULL, (void *) - (TOC_OFF(idx_tab->sa_idx_tab[attr]) + - (uintptr_t)old_data[k]), length); - } - } - if (k == 0 && hdl->sa_spill) { - hdr = SA_GET_HDR(hdl, SA_SPILL); - idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); - count = spill_attr_count; - } else { - break; - } - } - if (action == SA_ADD) { - reg_length = SA_REGISTERED_LEN(sa, newattr); - IMPLY(reg_length != 0, reg_length == buflen); - SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, - datastart, buflen); - } - ASSERT3U(j, ==, attr_count); - - error = sa_build_layouts(hdl, attr_desc, attr_count, tx); - - if (old_data[0]) - kmem_free(old_data[0], bonus_data_size); - if (old_data[1]) - kmem_free(old_data[1], spill_data_size); - kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); - - return (error); -} - -static int -sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, - dmu_tx_t *tx) -{ - int error; - sa_os_t *sa = 
hdl->sa_os->os_sa; - dmu_object_type_t bonustype; - - bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); - - ASSERT(hdl); - ASSERT(MUTEX_HELD(&hdl->sa_lock)); - - /* sync out registration table if necessary */ - if (sa->sa_need_attr_registration) - sa_attr_register_sync(hdl, tx); - - error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); - if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) - sa->sa_update_cb(hdl, tx); - - return (error); -} - -/* - * update or add new attribute - */ -int -sa_update(sa_handle_t *hdl, sa_attr_type_t type, - void *buf, uint32_t buflen, dmu_tx_t *tx) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_attr = type; - bulk.sa_data_func = NULL; - bulk.sa_length = buflen; - bulk.sa_data = buf; - - mutex_enter(&hdl->sa_lock); - error = sa_bulk_update_impl(hdl, &bulk, 1, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -int -sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, - uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_attr = attr; - bulk.sa_data = userdata; - bulk.sa_data_func = locator; - bulk.sa_length = buflen; - - mutex_enter(&hdl->sa_lock); - error = sa_bulk_update_impl(hdl, &bulk, 1, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -/* - * Return size of an attribute - */ - -int -sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) -{ - sa_bulk_attr_t bulk; - int error; - - bulk.sa_data = NULL; - bulk.sa_attr = attr; - bulk.sa_data_func = NULL; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { - mutex_exit(&hdl->sa_lock); - return (error); - } - *size = bulk.sa_size; - - mutex_exit(&hdl->sa_lock); - return (0); -} - -int -sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) -{ - ASSERT(hdl); - ASSERT(MUTEX_HELD(&hdl->sa_lock)); - return (sa_lookup_impl(hdl, attrs, count)); -} - -int -sa_bulk_lookup(sa_handle_t *hdl, 
sa_bulk_attr_t *attrs, int count) -{ - int error; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - error = sa_bulk_lookup_locked(hdl, attrs, count); - mutex_exit(&hdl->sa_lock); - return (error); -} - -int -sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) -{ - int error; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - error = sa_bulk_update_impl(hdl, attrs, count, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -int -sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) -{ - int error; - - mutex_enter(&hdl->sa_lock); - error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, - NULL, 0, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -void -sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) -{ - dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); -} - -void -sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) -{ - dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, - blksize, nblocks); -} - -void -sa_set_userp(sa_handle_t *hdl, void *ptr) -{ - hdl->sa_userp = ptr; -} - -dmu_buf_t * -sa_get_db(sa_handle_t *hdl) -{ - return ((dmu_buf_t *)hdl->sa_bonus); -} - -void * -sa_get_userdata(sa_handle_t *hdl) -{ - return (hdl->sa_userp); -} - -void -sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) -{ - ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); - os->os_sa->sa_update_cb = func; -} - -void -sa_register_update_callback(objset_t *os, sa_update_cb_t *func) -{ - - mutex_enter(&os->os_sa->sa_lock); - sa_register_update_callback_locked(os, func); - mutex_exit(&os->os_sa->sa_lock); -} - -uint64_t -sa_handle_object(sa_handle_t *hdl) -{ - return (hdl->sa_bonus->db_object); -} - -boolean_t -sa_enabled(objset_t *os) -{ - return (os->os_sa == NULL); -} - -int -sa_set_sa_object(objset_t *os, uint64_t sa_object) -{ - sa_os_t *sa = os->os_sa; - - if (sa->sa_master_obj) - return (1); - - sa->sa_master_obj = sa_object; - - return (0); -} - -int -sa_hdrsize(void *arg) -{ - sa_hdr_phys_t 
*hdr = arg; - - return (SA_HDR_SIZE(hdr)); -} - -void -sa_handle_lock(sa_handle_t *hdl) -{ - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); -} - -void -sa_handle_unlock(sa_handle_t *hdl) -{ - ASSERT(hdl); - mutex_exit(&hdl->sa_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c deleted file mode 100644 index 34c909f0c71a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. 
- */ -#include -#include -#ifdef _KERNEL -#include -#include -#else -#include -#include -#endif -#include - -static int -sha256_incremental(void *buf, size_t size, void *arg) -{ - SHA256_CTX *ctx = arg; - SHA256_Update(ctx, buf, size); - return (0); -} - -static int -sha512_incremental(void *buf, size_t size, void *arg) -{ - SHA512_CTX *ctx = arg; - SHA512_256_Update(ctx, buf, size); - return (0); -} - -/*ARGSUSED*/ -void -abd_checksum_SHA256(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - SHA256_CTX ctx; - zio_cksum_t tmp; - - SHA256_Init(&ctx); - (void) abd_iterate_func(abd, 0, size, sha256_incremental, &ctx); - SHA256_Final((unsigned char *)&tmp, &ctx); - - /* - * A prior implementation of this function had a - * private SHA256 implementation always wrote things out in - * Big Endian and there wasn't a byteswap variant of it. - * To preserve on disk compatibility we need to force that - * behavior. - */ - zcp->zc_word[0] = BE_64(tmp.zc_word[0]); - zcp->zc_word[1] = BE_64(tmp.zc_word[1]); - zcp->zc_word[2] = BE_64(tmp.zc_word[2]); - zcp->zc_word[3] = BE_64(tmp.zc_word[3]); -} - -/*ARGSUSED*/ -void -abd_checksum_SHA512_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - SHA512_CTX ctx; - - SHA512_256_Init(&ctx); - (void) abd_iterate_func(abd, 0, size, sha512_incremental, &ctx); - SHA512_256_Final((unsigned char *)zcp, &ctx); -} - -/*ARGSUSED*/ -void -abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - zio_cksum_t tmp; - - abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); - zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); - zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); - zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); - zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c deleted file mode 100644 index 
c30f590a5fdb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ -#include -#include -#ifdef _KERNEL -#include -#else -#include -#endif -#include - -static int -skein_incremental(void *buf, size_t size, void *arg) -{ - Skein_512_Ctxt_t *ctx = arg; - (void) Skein_512_Update(ctx, buf, size); - return (0); -} - -/* - * Computes a native 256-bit skein MAC checksum. Please note that this - * function requires the presence of a ctx_template that should be allocated - * using abd_checksum_skein_tmpl_init. - */ -/*ARGSUSED*/ -void -abd_checksum_skein_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - Skein_512_Ctxt_t ctx; - - ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); - (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); - (void) Skein_512_Final(&ctx, (uint8_t *)zcp); - bzero(&ctx, sizeof (ctx)); -} - -/* - * Byteswapped version of abd_checksum_skein_native. 
This just invokes - * the native checksum function and byteswaps the resulting checksum (since - * skein is internally endian-insensitive). - */ -void -abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - zio_cksum_t tmp; - - abd_checksum_skein_native(abd, size, ctx_template, &tmp); - zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); - zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); - zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); - zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); -} - -/* - * Allocates a skein MAC template suitable for using in skein MAC checksum - * computations and returns a pointer to it. - */ -void * -abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) -{ - Skein_512_Ctxt_t *ctx; - - ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, - salt->zcs_bytes, sizeof (salt->zcs_bytes)); - return (ctx); -} - -/* - * Frees a skein context template previously allocated using - * abd_checksum_skein_tmpl_init. - */ -void -abd_checksum_skein_tmpl_free(void *ctx_template) -{ - Skein_512_Ctxt_t *ctx = ctx_template; - - bzero(ctx, sizeof (*ctx)); - kmem_free(ctx, sizeof (*ctx)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c deleted file mode 100644 index 65ae7904047b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ /dev/null @@ -1,8972 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 Martin Matuska . All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Toomas Soome - * Copyright 2018 Joyent, Inc. - * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2017 Datto Inc. - * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. - * Copyright (c) 2016 Actifio, Inc. All rights reserved. - */ - -/* - * SPA: Storage Pool Allocator - * - * This file contains all the routines used when modifying on-disk SPA state. - * This includes opening, importing, destroying, exporting a pool, and syncing a - * pool. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#include -#include -#endif /* _KERNEL */ - -#include "zfs_prop.h" -#include "zfs_comutil.h" - -/* Check hostid on import? 
*/ -static int check_hostid = 1; - -/* - * The interval, in seconds, at which failed configuration cache file writes - * should be retried. - */ -int zfs_ccw_retry_interval = 300; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, - "Check hostid on import?"); -TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); -SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, - &zfs_ccw_retry_interval, 0, - "Configuration cache file write, retry after failure, interval (seconds)"); - -typedef enum zti_modes { - ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ - ZTI_MODE_NULL, /* don't create a taskq */ - ZTI_NMODES -} zti_modes_t; - -#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } -#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } - -#define ZTI_N(n) ZTI_P(n, 1) -#define ZTI_ONE ZTI_N(1) - -typedef struct zio_taskq_info { - zti_modes_t zti_mode; - uint_t zti_value; - uint_t zti_count; -} zio_taskq_info_t; - -static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { - "issue", "issue_high", "intr", "intr_high" -}; - -/* - * This table defines the taskq settings for each ZFS I/O type. When - * initializing a pool, we use this table to create an appropriately sized - * taskq. Some operations are low volume and therefore have a small, static - * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH - * macro causes us to create a taskq oriented for throughput. Some operations - * are so high frequency and short-lived that the taskq itself can become a a - * point of lock contention. 
The ZTI_P(#, #) macro indicates that we need an - * additional degree of parallelism specified by the number of threads per- - * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. - * - * The different taskq priorities are to handle the different contexts (issue - * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that - * need to be handled with minimum delay. - */ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ - { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ - { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ -}; - -static void spa_sync_version(void *arg, dmu_tx_t *tx); -static void spa_sync_props(void *arg, dmu_tx_t *tx); -static boolean_t spa_has_active_shared_spare(spa_t *spa); -static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); -static void spa_vdev_resilver_done(spa_t *spa); - -uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ -#ifdef PSRSET_BIND -id_t zio_taskq_psrset_bind = PS_NONE; -#endif -#ifdef SYSDC -boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ -uint_t zio_taskq_basedc = 80; /* base duty cycle */ -#endif - -#ifdef _KERNEL -#define SPA_PROCESS -#endif -boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ - -extern int zfs_sync_pass_deferred_free; - -/* - * Report any spa_load_verify errors found, but do not fail spa_load. - * This is used by zdb to analyze non-idle pools. - */ -boolean_t spa_load_verify_dryrun = B_FALSE; - -/* - * This (illegal) pool name is used when temporarily importing a spa_t in order - * to get the vdev stats associated with the imported devices. 
- */ -#define TRYIMPORT_NAME "$import" - -/* - * For debugging purposes: print out vdev tree during pool import. - */ -int spa_load_print_vdev_tree = B_FALSE; - -/* - * A non-zero value for zfs_max_missing_tvds means that we allow importing - * pools with missing top-level vdevs. This is strictly intended for advanced - * pool recovery cases since missing data is almost inevitable. Pools with - * missing devices can only be imported read-only for safety reasons, and their - * fail-mode will be automatically set to "continue". - * - * With 1 missing vdev we should be able to import the pool and mount all - * datasets. User data that was not modified after the missing device has been - * added should be recoverable. This means that snapshots created prior to the - * addition of that device should be completely intact. - * - * With 2 missing vdevs, some datasets may fail to mount since there are - * dataset statistics that are stored as regular metadata. Some data might be - * recoverable if those vdevs were added recently. - * - * With 3 or more missing vdevs, the pool is severely damaged and MOS entries - * may be missing entirely. Chances of data recovery are very low. Note that - * there are also risks of performing an inadvertent rewind as we might be - * missing all the vdevs with the latest uberblocks. - */ -uint64_t zfs_max_missing_tvds = 0; - -/* - * The parameters below are similar to zfs_max_missing_tvds but are only - * intended for a preliminary open of the pool with an untrusted config which - * might be incomplete or out-dated. - * - * We are more tolerant for pools opened from a cachefile since we could have - * an out-dated cachefile where a device removal was not registered. 
- * We could have set the limit arbitrarily high but in the case where devices - * are really missing we would want to return the proper error codes; we chose - * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available - * and we get a chance to retrieve the trusted config. - */ -uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; - -/* - * In the case where config was assembled by scanning device paths (/dev/dsks - * by default) we are less tolerant since all the existing devices should have - * been detected and we want spa_load to return the right error codes. - */ -uint64_t zfs_max_missing_tvds_scan = 0; - - -SYSCTL_DECL(_vfs_zfs_zio); -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_batch_pct, CTLFLAG_RDTUN, - &zio_taskq_batch_pct, 0, - "Percentage of CPUs to run an IO worker thread"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN, - &spa_load_print_vdev_tree, 0, - "print out vdev tree during pool import"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN, - &zfs_max_missing_tvds, 0, - "allow importing pools with missing top-level vdevs"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, - &zfs_max_missing_tvds_cachefile, 0, - "allow importing pools with missing top-level vdevs in cache file"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, - &zfs_max_missing_tvds_scan, 0, - "allow importing pools with missing top-level vdevs during scan"); - -/* - * Debugging aid that pauses spa_sync() towards the end. - */ -boolean_t zfs_pause_spa_sync = B_FALSE; - -/* - * ========================================================================== - * SPA properties routines - * ========================================================================== - */ - -/* - * Add a (source=src, propname=propval) list to an nvlist. 
- */ -static void -spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, - uint64_t intval, zprop_source_t src) -{ - const char *propname = zpool_prop_to_name(prop); - nvlist_t *propval; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); - - if (strval != NULL) - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); - else - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); - - VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); - nvlist_free(propval); -} - -/* - * Get property values from the spa configuration. - */ -static void -spa_prop_get_config(spa_t *spa, nvlist_t **nvp) -{ - vdev_t *rvd = spa->spa_root_vdev; - dsl_pool_t *pool = spa->spa_dsl_pool; - uint64_t size, alloc, cap, version; - zprop_source_t src = ZPROP_SRC_NONE; - spa_config_dirent_t *dp; - metaslab_class_t *mc = spa_normal_class(spa); - - ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - - if (rvd != NULL) { - alloc = metaslab_class_get_alloc(mc); - alloc += metaslab_class_get_alloc(spa_special_class(spa)); - alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); - - size = metaslab_class_get_space(mc); - size += metaslab_class_get_space(spa_special_class(spa)); - size += metaslab_class_get_space(spa_dedup_class(spa)); - - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, - size - alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, - spa->spa_checkpoint_info.sci_dspace, src); - - spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, - metaslab_class_fragmentation(mc), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, - metaslab_class_expandable_space(mc), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, - (spa_mode(spa) == FREAD), src); - - cap = (size == 0) ? 
0 : (alloc * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); - - spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, - ddt_get_pool_dedup_ratio(spa), src); - - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - rvd->vdev_state, src); - - version = spa_version(spa); - if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) - src = ZPROP_SRC_DEFAULT; - else - src = ZPROP_SRC_LOCAL; - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); - } - - if (pool != NULL) { - /* - * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, - * when opening pools before this version freedir will be NULL. - */ - if (pool->dp_free_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, - dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, - src); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, - NULL, 0, src); - } - - if (pool->dp_leak_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, - dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, - src); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, - NULL, 0, src); - } - } - - spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); - - if (spa->spa_comment != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, - 0, ZPROP_SRC_LOCAL); - } - - if (spa->spa_root != NULL) - spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, - 0, ZPROP_SRC_LOCAL); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, - MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, - SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); - } - - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, - DNODE_MAX_SIZE, ZPROP_SRC_NONE); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, - DNODE_MIN_SIZE, ZPROP_SRC_NONE); - } - - if ((dp = 
list_head(&spa->spa_config_list)) != NULL) { - if (dp->scd_path == NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, - "none", 0, ZPROP_SRC_LOCAL); - } else if (strcmp(dp->scd_path, spa_config_path) != 0) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, - dp->scd_path, 0, ZPROP_SRC_LOCAL); - } - } -} - -/* - * Get zpool property values. - */ -int -spa_prop_get(spa_t *spa, nvlist_t **nvp) -{ - objset_t *mos = spa->spa_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - int err; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - mutex_enter(&spa->spa_props_lock); - - /* - * Get properties from the spa config. - */ - spa_prop_get_config(spa, nvp); - - /* If no pool property object, no more prop to get. */ - if (mos == NULL || spa->spa_pool_props_object == 0) { - mutex_exit(&spa->spa_props_lock); - return (0); - } - - /* - * Get properties from the MOS pool property object. - */ - for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - uint64_t intval = 0; - char *strval = NULL; - zprop_source_t src = ZPROP_SRC_DEFAULT; - zpool_prop_t prop; - - if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) - continue; - - switch (za.za_integer_length) { - case 8: - /* integer property */ - if (za.za_first_integer != - zpool_prop_default_numeric(prop)) - src = ZPROP_SRC_LOCAL; - - if (prop == ZPOOL_PROP_BOOTFS) { - dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; - - dp = spa_get_dsl(spa); - dsl_pool_config_enter(dp, FTAG); - err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds); - if (err != 0) { - dsl_pool_config_exit(dp, FTAG); - break; - } - - strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, - KM_SLEEP); - dsl_dataset_name(ds, strval); - dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); - } else { - strval = NULL; - intval = za.za_first_integer; - } - - spa_prop_add_list(*nvp, prop, strval, intval, src); - - if (strval != NULL) - 
kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); - - break; - - case 1: - /* string property */ - strval = kmem_alloc(za.za_num_integers, KM_SLEEP); - err = zap_lookup(mos, spa->spa_pool_props_object, - za.za_name, 1, za.za_num_integers, strval); - if (err) { - kmem_free(strval, za.za_num_integers); - break; - } - spa_prop_add_list(*nvp, prop, strval, 0, src); - kmem_free(strval, za.za_num_integers); - break; - - default: - break; - } - } - zap_cursor_fini(&zc); - mutex_exit(&spa->spa_props_lock); -out: - if (err && err != ENOENT) { - nvlist_free(*nvp); - *nvp = NULL; - return (err); - } - - return (0); -} - -/* - * Validate the given pool properties nvlist and modify the list - * for the property values to be set. - */ -static int -spa_prop_validate(spa_t *spa, nvlist_t *props) -{ - nvpair_t *elem; - int error = 0, reset_bootfs = 0; - uint64_t objnum = 0; - boolean_t has_feature = B_FALSE; - - elem = NULL; - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - uint64_t intval; - char *strval, *slash, *check, *fname; - const char *propname = nvpair_name(elem); - zpool_prop_t prop = zpool_name_to_prop(propname); - - switch (prop) { - case ZPOOL_PROP_INVAL: - if (!zpool_prop_feature(propname)) { - error = SET_ERROR(EINVAL); - break; - } - - /* - * Sanitize the input. 
- */ - if (nvpair_type(elem) != DATA_TYPE_UINT64) { - error = SET_ERROR(EINVAL); - break; - } - - if (nvpair_value_uint64(elem, &intval) != 0) { - error = SET_ERROR(EINVAL); - break; - } - - if (intval != 0) { - error = SET_ERROR(EINVAL); - break; - } - - fname = strchr(propname, '@') + 1; - if (zfeature_lookup_name(fname, NULL) != 0) { - error = SET_ERROR(EINVAL); - break; - } - - has_feature = B_TRUE; - break; - - case ZPOOL_PROP_VERSION: - error = nvpair_value_uint64(elem, &intval); - if (!error && - (intval < spa_version(spa) || - intval > SPA_VERSION_BEFORE_FEATURES || - has_feature)) - error = SET_ERROR(EINVAL); - break; - - case ZPOOL_PROP_DELEGATION: - case ZPOOL_PROP_AUTOREPLACE: - case ZPOOL_PROP_LISTSNAPS: - case ZPOOL_PROP_AUTOEXPAND: - error = nvpair_value_uint64(elem, &intval); - if (!error && intval > 1) - error = SET_ERROR(EINVAL); - break; - - case ZPOOL_PROP_MULTIHOST: - error = nvpair_value_uint64(elem, &intval); - if (!error && intval > 1) - error = SET_ERROR(EINVAL); - - if (!error && !spa_get_hostid()) - error = SET_ERROR(ENOTSUP); - - break; - - case ZPOOL_PROP_BOOTFS: - /* - * If the pool version is less than SPA_VERSION_BOOTFS, - * or the pool is still being created (version == 0), - * the bootfs property cannot be set. - */ - if (spa_version(spa) < SPA_VERSION_BOOTFS) { - error = SET_ERROR(ENOTSUP); - break; - } - - /* - * Make sure the vdev config is bootable - */ - if (!vdev_is_bootable(spa->spa_root_vdev)) { - error = SET_ERROR(ENOTSUP); - break; - } - - reset_bootfs = 1; - - error = nvpair_value_string(elem, &strval); - - if (!error) { - objset_t *os; - uint64_t propval; - - if (strval == NULL || strval[0] == '\0') { - objnum = zpool_prop_default_numeric( - ZPOOL_PROP_BOOTFS); - break; - } - - error = dmu_objset_hold(strval, FTAG, &os); - if (error != 0) - break; - - /* - * Must be ZPL, and its property settings - * must be supported. 
- */ - - if (dmu_objset_type(os) != DMU_OST_ZFS) { - error = SET_ERROR(ENOTSUP); - } else if ((error = - dsl_prop_get_int_ds(dmu_objset_ds(os), - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - &propval)) == 0 && - !BOOTFS_COMPRESS_VALID(propval)) { - error = SET_ERROR(ENOTSUP); - } else { - objnum = dmu_objset_id(os); - } - dmu_objset_rele(os, FTAG); - } - break; - - case ZPOOL_PROP_FAILUREMODE: - error = nvpair_value_uint64(elem, &intval); - if (!error && (intval < ZIO_FAILURE_MODE_WAIT || - intval > ZIO_FAILURE_MODE_PANIC)) - error = SET_ERROR(EINVAL); - - /* - * This is a special case which only occurs when - * the pool has completely failed. This allows - * the user to change the in-core failmode property - * without syncing it out to disk (I/Os might - * currently be blocked). We do this by returning - * EIO to the caller (spa_prop_set) to trick it - * into thinking we encountered a property validation - * error. - */ - if (!error && spa_suspended(spa)) { - spa->spa_failmode = intval; - error = SET_ERROR(EIO); - } - break; - - case ZPOOL_PROP_CACHEFILE: - if ((error = nvpair_value_string(elem, &strval)) != 0) - break; - - if (strval[0] == '\0') - break; - - if (strcmp(strval, "none") == 0) - break; - - if (strval[0] != '/') { - error = SET_ERROR(EINVAL); - break; - } - - slash = strrchr(strval, '/'); - ASSERT(slash != NULL); - - if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || - strcmp(slash, "/..") == 0) - error = SET_ERROR(EINVAL); - break; - - case ZPOOL_PROP_COMMENT: - if ((error = nvpair_value_string(elem, &strval)) != 0) - break; - for (check = strval; *check != '\0'; check++) { - /* - * The kernel doesn't have an easy isprint() - * check. For this kernel check, we merely - * check ASCII apart from DEL. Fix this if - * there is an easy-to-use kernel isprint(). 
- */ - if (*check >= 0x7f) { - error = SET_ERROR(EINVAL); - break; - } - } - if (strlen(strval) > ZPROP_MAX_COMMENT) - error = E2BIG; - break; - - case ZPOOL_PROP_DEDUPDITTO: - if (spa_version(spa) < SPA_VERSION_DEDUP) - error = SET_ERROR(ENOTSUP); - else - error = nvpair_value_uint64(elem, &intval); - if (error == 0 && - intval != 0 && intval < ZIO_DEDUPDITTO_MIN) - error = SET_ERROR(EINVAL); - break; - } - - if (error) - break; - } - - if (!error && reset_bootfs) { - error = nvlist_remove(props, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); - - if (!error) { - error = nvlist_add_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); - } - } - - return (error); -} - -void -spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) -{ - char *cachefile; - spa_config_dirent_t *dp; - - if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), - &cachefile) != 0) - return; - - dp = kmem_alloc(sizeof (spa_config_dirent_t), - KM_SLEEP); - - if (cachefile[0] == '\0') - dp->scd_path = spa_strdup(spa_config_path); - else if (strcmp(cachefile, "none") == 0) - dp->scd_path = NULL; - else - dp->scd_path = spa_strdup(cachefile); - - list_insert_head(&spa->spa_config_list, dp); - if (need_sync) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); -} - -int -spa_prop_set(spa_t *spa, nvlist_t *nvp) -{ - int error; - nvpair_t *elem = NULL; - boolean_t need_sync = B_FALSE; - - if ((error = spa_prop_validate(spa, nvp)) != 0) - return (error); - - while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { - zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); - - if (prop == ZPOOL_PROP_CACHEFILE || - prop == ZPOOL_PROP_ALTROOT || - prop == ZPOOL_PROP_READONLY) - continue; - - if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { - uint64_t ver; - - if (prop == ZPOOL_PROP_VERSION) { - VERIFY(nvpair_value_uint64(elem, &ver) == 0); - } else { - ASSERT(zpool_prop_feature(nvpair_name(elem))); - ver = SPA_VERSION_FEATURES; - need_sync 
= B_TRUE; - } - - /* Save time if the version is already set. */ - if (ver == spa_version(spa)) - continue; - - /* - * In addition to the pool directory object, we might - * create the pool properties object, the features for - * read object, the features for write object, or the - * feature descriptions object. - */ - error = dsl_sync_task(spa->spa_name, NULL, - spa_sync_version, &ver, - 6, ZFS_SPACE_CHECK_RESERVED); - if (error) - return (error); - continue; - } - - need_sync = B_TRUE; - break; - } - - if (need_sync) { - return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, - nvp, 6, ZFS_SPACE_CHECK_RESERVED)); - } - - return (0); -} - -/* - * If the bootfs property value is dsobj, clear it. - */ -void -spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) -{ - if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { - VERIFY(zap_remove(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); - spa->spa_bootfs = 0; - } -} - -/*ARGSUSED*/ -static int -spa_change_guid_check(void *arg, dmu_tx_t *tx) -{ - uint64_t *newguid = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - uint64_t vdev_state; - - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - int error = (spa_has_checkpoint(spa)) ? 
- ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (SET_ERROR(error)); - } - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - vdev_state = rvd->vdev_state; - spa_config_exit(spa, SCL_STATE, FTAG); - - if (vdev_state != VDEV_STATE_HEALTHY) - return (SET_ERROR(ENXIO)); - - ASSERT3U(spa_guid(spa), !=, *newguid); - - return (0); -} - -static void -spa_change_guid_sync(void *arg, dmu_tx_t *tx) -{ - uint64_t *newguid = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - uint64_t oldguid; - vdev_t *rvd = spa->spa_root_vdev; - - oldguid = spa_guid(spa); - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - rvd->vdev_guid = *newguid; - rvd->vdev_guid_sum += (*newguid - oldguid); - vdev_config_dirty(rvd); - spa_config_exit(spa, SCL_STATE, FTAG); - - spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", - oldguid, *newguid); -} - -/* - * Change the GUID for the pool. This is done so that we can later - * re-import a pool built from a clone of our own vdevs. We will modify - * the root vdev's guid, our own pool guid, and then mark all of our - * vdevs dirty. Note that we must make sure that all our vdevs are - * online when we do this, or else any vdevs that weren't present - * would be orphaned from our pool. We are also going to issue a - * sysevent to update any watchers. 
- */ -int -spa_change_guid(spa_t *spa) -{ - int error; - uint64_t guid; - - mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); - guid = spa_generate_guid(NULL); - - error = dsl_sync_task(spa->spa_name, spa_change_guid_check, - spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); - - if (error == 0) { - spa_write_cachefile(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); - } - - mutex_exit(&spa_namespace_lock); - mutex_exit(&spa->spa_vdev_top_lock); - - return (error); -} - -/* - * ========================================================================== - * SPA state manipulation (open/create/destroy/import/export) - * ========================================================================== - */ - -static int -spa_error_entry_compare(const void *a, const void *b) -{ - const spa_error_entry_t *sa = (const spa_error_entry_t *)a; - const spa_error_entry_t *sb = (const spa_error_entry_t *)b; - int ret; - - ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, - sizeof (zbookmark_phys_t)); - - return (AVL_ISIGN(ret)); -} - -/* - * Utility function which retrieves copies of the current logs and - * re-initializes them in the process. 
- */ -void -spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) -{ - ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); - - bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); - bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); - - avl_create(&spa->spa_errlist_scrub, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); - avl_create(&spa->spa_errlist_last, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); -} - -static void -spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) -{ - const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; - enum zti_modes mode = ztip->zti_mode; - uint_t value = ztip->zti_value; - uint_t count = ztip->zti_count; - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - char name[32]; - uint_t flags = 0; - boolean_t batch = B_FALSE; - - if (mode == ZTI_MODE_NULL) { - tqs->stqs_count = 0; - tqs->stqs_taskq = NULL; - return; - } - - ASSERT3U(count, >, 0); - - tqs->stqs_count = count; - tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); - - switch (mode) { - case ZTI_MODE_FIXED: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); - break; - - case ZTI_MODE_BATCH: - batch = B_TRUE; - flags |= TASKQ_THREADS_CPU_PCT; - value = zio_taskq_batch_pct; - break; - - default: - panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", - zio_type_name[t], zio_taskq_types[q], mode, value); - break; - } - - for (uint_t i = 0; i < count; i++) { - taskq_t *tq; - - if (count > 1) { - (void) snprintf(name, sizeof (name), "%s_%s_%u", - zio_type_name[t], zio_taskq_types[q], i); - } else { - (void) snprintf(name, sizeof (name), "%s_%s", - zio_type_name[t], zio_taskq_types[q]); - } - -#ifdef SYSDC - if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - - tq = taskq_create_sysdc(name, value, 50, INT_MAX, - spa->spa_proc, zio_taskq_basedc, flags); - } else { -#endif - pri_t pri = maxclsyspri; 
- /* - * The write issue taskq can be extremely CPU - * intensive. Run it at slightly lower priority - * than the other taskqs. - * FreeBSD notes: - * - numerically higher priorities are lower priorities; - * - if priorities divided by four (RQ_PPQ) are equal - * then a difference between them is insignificant. - */ - if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) -#ifdef illumos - pri--; -#else - pri += 4; -#endif - - tq = taskq_create_proc(name, value, pri, 50, - INT_MAX, spa->spa_proc, flags); -#ifdef SYSDC - } -#endif - - tqs->stqs_taskq[i] = tq; - } -} - -static void -spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) -{ - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - - if (tqs->stqs_taskq == NULL) { - ASSERT0(tqs->stqs_count); - return; - } - - for (uint_t i = 0; i < tqs->stqs_count; i++) { - ASSERT3P(tqs->stqs_taskq[i], !=, NULL); - taskq_destroy(tqs->stqs_taskq[i]); - } - - kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); - tqs->stqs_taskq = NULL; -} - -/* - * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. - * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). 
- */ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) -{ - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; - - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); - - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { -#ifdef _KERNEL - tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) % - tqs->stqs_count]; -#else - tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; -#endif - } - - taskq_dispatch_ent(tq, func, arg, flags, ent); -} - -static void -spa_create_zio_taskqs(spa_t *spa) -{ - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa_taskqs_init(spa, t, q); - } - } -} - -#ifdef SPA_PROCESS -static int -newproc(void (*pc)(void *), void *arg, id_t cid, int pri, - void **ct, pid_t pid) -{ - va_list ap; - spa_t *spa = (spa_t *)arg; /* XXX */ - struct proc *newp; - struct thread *td; - int error; - - ASSERT(ct == NULL); - ASSERT(pid == 0); - ASSERT(cid == syscid); - - error = kproc_create(pc, arg, &newp, 0, 0, "zpool-%s", spa->spa_name); - if (error != 0) - return (error); - td = FIRST_THREAD_IN_PROC(newp); - thread_lock(td); - sched_prio(td, pri); - thread_unlock(td); - return (0); -} - -static void -spa_thread(void *arg) -{ - callb_cpr_t cprinfo; - - spa_t *spa = arg; -#ifdef illumos - user_t *pu = PTOU(curproc); -#endif - CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, - spa->spa_name); - - ASSERT(curproc != &p0); -#ifdef illumos - (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), - "zpool-%s", spa->spa_name); - (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); -#endif - -#ifdef PSRSET_BIND - /* bind this thread to the requested psrset */ - if (zio_taskq_psrset_bind != PS_NONE) { - pool_lock(); - mutex_enter(&cpu_lock); - mutex_enter(&pidlock); - mutex_enter(&curproc->p_lock); - - if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, - 0, NULL, NULL) 
== 0) { - curthread->t_bind_pset = zio_taskq_psrset_bind; - } else { - cmn_err(CE_WARN, - "Couldn't bind process for zfs pool \"%s\" to " - "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); - } - - mutex_exit(&curproc->p_lock); - mutex_exit(&pidlock); - mutex_exit(&cpu_lock); - pool_unlock(); - } -#endif - -#ifdef SYSDC - if (zio_taskq_sysdc) { - sysdc_thread_enter(curthread, 100, 0); - } -#endif - - spa->spa_proc = curproc; - spa->spa_did = curthread->t_did; - - spa_create_zio_taskqs(spa); - - mutex_enter(&spa->spa_proc_lock); - ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); - - spa->spa_proc_state = SPA_PROC_ACTIVE; - cv_broadcast(&spa->spa_proc_cv); - - CALLB_CPR_SAFE_BEGIN(&cprinfo); - while (spa->spa_proc_state == SPA_PROC_ACTIVE) - cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); - CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); - - ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); - spa->spa_proc_state = SPA_PROC_GONE; - spa->spa_proc = &p0; - cv_broadcast(&spa->spa_proc_cv); - CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ - -#ifdef illumos - mutex_enter(&curproc->p_lock); - lwp_exit(); -#else - kthread_exit(); -#endif -} -#endif /* SPA_PROCESS */ - -/* - * Activate an uninitialized pool. - */ -static void -spa_activate(spa_t *spa, int mode) -{ - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_mode = mode; - - spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); - - /* Try to create a covering process */ - mutex_enter(&spa->spa_proc_lock); - ASSERT(spa->spa_proc_state == SPA_PROC_NONE); - ASSERT(spa->spa_proc == &p0); - spa->spa_did = 0; - -#ifdef SPA_PROCESS - /* Only create a process if we're going to be around a while. 
*/ - if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { - if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, - NULL, 0) == 0) { - spa->spa_proc_state = SPA_PROC_CREATED; - while (spa->spa_proc_state == SPA_PROC_CREATED) { - cv_wait(&spa->spa_proc_cv, - &spa->spa_proc_lock); - } - ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); - ASSERT(spa->spa_proc != &p0); - ASSERT(spa->spa_did != 0); - } else { -#ifdef _KERNEL - cmn_err(CE_WARN, - "Couldn't create process for zfs pool \"%s\"\n", - spa->spa_name); -#endif - } - } -#endif /* SPA_PROCESS */ - mutex_exit(&spa->spa_proc_lock); - - /* If we didn't create a process, we need to create our taskqs. */ -#ifndef SPA_PROCESS - ASSERT(spa->spa_proc == &p0); -#endif /* SPA_PROCESS */ - if (spa->spa_proc == &p0) { - spa_create_zio_taskqs(spa); - } - - /* - * Start TRIM thread. - */ - trim_thread_create(spa); - - /* - * This taskq is used to perform zvol-minor-related tasks - * asynchronously. This has several advantages, including easy - * resolution of various deadlocks (zfsonlinux bug #3681). - * - * The taskq must be single threaded to ensure tasks are always - * processed in the order in which they were dispatched. - * - * A taskq per pool allows one to keep the pools independent. - * This way if one pool is suspended, it will not impact another. - * - * The preferred location to dispatch a zvol minor task is a sync - * task. In this context, there is easy access to the spa_t and minimal - * error handling is required because the sync task must succeed. 
- */ - spa->spa_zvol_taskq = taskq_create("z_zvol", 1, minclsyspri, - 1, INT_MAX, 0); - - for (size_t i = 0; i < TXG_SIZE; i++) { - spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL); - } - - list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_config_dirty_node)); - list_create(&spa->spa_evicting_os_list, sizeof (objset_t), - offsetof(objset_t, os_evicting_node)); - list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_state_dirty_node)); - - txg_list_create(&spa->spa_vdev_txg_list, spa, - offsetof(struct vdev, vdev_txg_node)); - - avl_create(&spa->spa_errlist_scrub, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); - avl_create(&spa->spa_errlist_last, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); -} - -/* - * Opposite of spa_activate(). - */ -static void -spa_deactivate(spa_t *spa) -{ - ASSERT(spa->spa_sync_on == B_FALSE); - ASSERT(spa->spa_dsl_pool == NULL); - ASSERT(spa->spa_root_vdev == NULL); - ASSERT(spa->spa_async_zio_root == NULL); - ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); - - /* - * Stop TRIM thread in case spa_unload() wasn't called directly - * before spa_deactivate(). 
- */ - trim_thread_destroy(spa); - - spa_evicting_os_wait(spa); - - if (spa->spa_zvol_taskq) { - taskq_destroy(spa->spa_zvol_taskq); - spa->spa_zvol_taskq = NULL; - } - - txg_list_destroy(&spa->spa_vdev_txg_list); - - list_destroy(&spa->spa_config_dirty_list); - list_destroy(&spa->spa_evicting_os_list); - list_destroy(&spa->spa_state_dirty_list); - - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa_taskqs_fini(spa, t, q); - } - } - - for (size_t i = 0; i < TXG_SIZE; i++) { - ASSERT3P(spa->spa_txg_zio[i], !=, NULL); - VERIFY0(zio_wait(spa->spa_txg_zio[i])); - spa->spa_txg_zio[i] = NULL; - } - - metaslab_class_destroy(spa->spa_normal_class); - spa->spa_normal_class = NULL; - - metaslab_class_destroy(spa->spa_log_class); - spa->spa_log_class = NULL; - - metaslab_class_destroy(spa->spa_special_class); - spa->spa_special_class = NULL; - - metaslab_class_destroy(spa->spa_dedup_class); - spa->spa_dedup_class = NULL; - - /* - * If this was part of an import or the open otherwise failed, we may - * still have errors left in the queues. Empty them just in case. - */ - spa_errlog_drain(spa); - - avl_destroy(&spa->spa_errlist_scrub); - avl_destroy(&spa->spa_errlist_last); - - spa->spa_state = POOL_STATE_UNINITIALIZED; - - mutex_enter(&spa->spa_proc_lock); - if (spa->spa_proc_state != SPA_PROC_NONE) { - ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); - spa->spa_proc_state = SPA_PROC_DEACTIVATE; - cv_broadcast(&spa->spa_proc_cv); - while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { - ASSERT(spa->spa_proc != &p0); - cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); - } - ASSERT(spa->spa_proc_state == SPA_PROC_GONE); - spa->spa_proc_state = SPA_PROC_NONE; - } - ASSERT(spa->spa_proc == &p0); - mutex_exit(&spa->spa_proc_lock); - -#ifdef SPA_PROCESS -#ifdef illumos - /* - * We want to make sure spa_thread() has actually exited the ZFS - * module, so that the module can't be unloaded out from underneath - * it. 
- */ - if (spa->spa_did != 0) { - thread_join(spa->spa_did); - spa->spa_did = 0; - } -#endif -#endif /* SPA_PROCESS */ -} - -/* - * Verify a pool configuration, and construct the vdev tree appropriately. This - * will create all the necessary vdevs in the appropriate layout, with each vdev - * in the CLOSED state. This will prep the pool before open/creation/import. - * All vdev validation is done by the vdev_alloc() routine. - */ -static int -spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, - uint_t id, int atype) -{ - nvlist_t **child; - uint_t children; - int error; - - if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) - return (error); - - if ((*vdp)->vdev_ops->vdev_op_leaf) - return (0); - - error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children); - - if (error == ENOENT) - return (0); - - if (error) { - vdev_free(*vdp); - *vdp = NULL; - return (SET_ERROR(EINVAL)); - } - - for (int c = 0; c < children; c++) { - vdev_t *vd; - if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, - atype)) != 0) { - vdev_free(*vdp); - *vdp = NULL; - return (error); - } - } - - ASSERT(*vdp != NULL); - - return (0); -} - -/* - * Opposite of spa_load(). - */ -static void -spa_unload(spa_t *spa) -{ - int i; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_load_note(spa, "UNLOADING"); - - /* - * Stop TRIM thread. - */ - trim_thread_destroy(spa); - - /* - * Stop async tasks. - */ - spa_async_suspend(spa); - - if (spa->spa_root_vdev) { - vdev_initialize_stop_all(spa->spa_root_vdev, - VDEV_INITIALIZE_ACTIVE); - } - - /* - * Stop syncing. - */ - if (spa->spa_sync_on) { - txg_sync_stop(spa->spa_dsl_pool); - spa->spa_sync_on = B_FALSE; - } - - /* - * Even though vdev_free() also calls vdev_metaslab_fini, we need - * to call it earlier, before we wait for async i/o to complete. - * This ensures that there is no async metaslab prefetching, by - * calling taskq_wait(mg_taskq). 
- */ - if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) - vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, spa); - } - - if (spa->spa_mmp.mmp_thread) - mmp_thread_stop(spa); - - /* - * Wait for any outstanding async I/O to complete. - */ - if (spa->spa_async_zio_root != NULL) { - for (int i = 0; i < max_ncpus; i++) - (void) zio_wait(spa->spa_async_zio_root[i]); - kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); - spa->spa_async_zio_root = NULL; - } - - if (spa->spa_vdev_removal != NULL) { - spa_vdev_removal_destroy(spa->spa_vdev_removal); - spa->spa_vdev_removal = NULL; - } - - if (spa->spa_condense_zthr != NULL) { - zthr_destroy(spa->spa_condense_zthr); - spa->spa_condense_zthr = NULL; - } - - if (spa->spa_checkpoint_discard_zthr != NULL) { - zthr_destroy(spa->spa_checkpoint_discard_zthr); - spa->spa_checkpoint_discard_zthr = NULL; - } - - spa_condense_fini(spa); - - bpobj_close(&spa->spa_deferred_bpobj); - - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - - /* - * Close all vdevs. - */ - if (spa->spa_root_vdev) - vdev_free(spa->spa_root_vdev); - ASSERT(spa->spa_root_vdev == NULL); - - /* - * Close the dsl pool. 
- */ - if (spa->spa_dsl_pool) { - dsl_pool_close(spa->spa_dsl_pool); - spa->spa_dsl_pool = NULL; - spa->spa_meta_objset = NULL; - } - - ddt_unload(spa); - - /* - * Drop and purge level 2 cache - */ - spa_l2cache_drop(spa); - - for (i = 0; i < spa->spa_spares.sav_count; i++) - vdev_free(spa->spa_spares.sav_vdevs[i]); - if (spa->spa_spares.sav_vdevs) { - kmem_free(spa->spa_spares.sav_vdevs, - spa->spa_spares.sav_count * sizeof (void *)); - spa->spa_spares.sav_vdevs = NULL; - } - if (spa->spa_spares.sav_config) { - nvlist_free(spa->spa_spares.sav_config); - spa->spa_spares.sav_config = NULL; - } - spa->spa_spares.sav_count = 0; - - for (i = 0; i < spa->spa_l2cache.sav_count; i++) { - vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); - vdev_free(spa->spa_l2cache.sav_vdevs[i]); - } - if (spa->spa_l2cache.sav_vdevs) { - kmem_free(spa->spa_l2cache.sav_vdevs, - spa->spa_l2cache.sav_count * sizeof (void *)); - spa->spa_l2cache.sav_vdevs = NULL; - } - if (spa->spa_l2cache.sav_config) { - nvlist_free(spa->spa_l2cache.sav_config); - spa->spa_l2cache.sav_config = NULL; - } - spa->spa_l2cache.sav_count = 0; - - spa->spa_async_suspended = 0; - - spa->spa_indirect_vdevs_loaded = B_FALSE; - - if (spa->spa_comment != NULL) { - spa_strfree(spa->spa_comment); - spa->spa_comment = NULL; - } - - spa_config_exit(spa, SCL_ALL, spa); -} - -/* - * Load (or re-load) the current list of vdevs describing the active spares for - * this pool. When this is called, we have some form of basic information in - * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and - * then re-generate a more complete list including status information. - */ -void -spa_load_spares(spa_t *spa) -{ - nvlist_t **spares; - uint_t nspares; - int i; - vdev_t *vd, *tvd; - -#ifndef _KERNEL - /* - * zdb opens both the current state of the pool and the - * checkpointed state (if present), with a different spa_t. 
- * - * As spare vdevs are shared among open pools, we skip loading - * them when we load the checkpointed state of the pool. - */ - if (!spa_writeable(spa)) - return; -#endif - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - /* - * First, close and free any existing spare vdevs. - */ - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; - - /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL && tvd->vdev_isspare) - spa_spare_remove(tvd); - vdev_close(vd); - vdev_free(vd); - } - - if (spa->spa_spares.sav_vdevs) - kmem_free(spa->spa_spares.sav_vdevs, - spa->spa_spares.sav_count * sizeof (void *)); - - if (spa->spa_spares.sav_config == NULL) - nspares = 0; - else - VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - - spa->spa_spares.sav_count = (int)nspares; - spa->spa_spares.sav_vdevs = NULL; - - if (nspares == 0) - return; - - /* - * Construct the array of vdevs, opening them to get status in the - * process. For each spare, there is potentially two different vdev_t - * structures associated with it: one in the list of spares (used only - * for basic validation purposes) and one in the active vdev - * configuration (if it's spared in). During this phase we open and - * validate each vdev on the spare list. If the vdev also exists in the - * active configuration, then we also mark this vdev as an active spare. 
- */ - spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_spares.sav_count; i++) { - VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, - VDEV_ALLOC_SPARE) == 0); - ASSERT(vd != NULL); - - spa->spa_spares.sav_vdevs[i] = vd; - - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL) { - if (!tvd->vdev_isspare) - spa_spare_add(tvd); - - /* - * We only mark the spare active if we were successfully - * able to load the vdev. Otherwise, importing a pool - * with a bad active spare would result in strange - * behavior, because multiple pool would think the spare - * is actively in use. - * - * There is a vulnerability here to an equally bizarre - * circumstance, where a dead active spare is later - * brought back to life (onlined or otherwise). Given - * the rarity of this scenario, and the extra complexity - * it adds, we ignore the possibility. - */ - if (!vdev_is_dead(tvd)) - spa_spare_activate(tvd); - } - - vd->vdev_top = vd; - vd->vdev_aux = &spa->spa_spares; - - if (vdev_open(vd) != 0) - continue; - - if (vdev_validate_aux(vd) == 0) - spa_spare_add(vd); - } - - /* - * Recompute the stashed list of spares, with status information - * this time. - */ - VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, - DATA_TYPE_NVLIST_ARRAY) == 0); - - spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_spares.sav_count; i++) - spares[i] = vdev_config_generate(spa, - spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); - for (i = 0; i < spa->spa_spares.sav_count; i++) - nvlist_free(spares[i]); - kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); -} - -/* - * Load (or re-load) the current list of vdevs describing the active l2cache for - * this pool. 
When this is called, we have some form of basic information in - * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and - * then re-generate a more complete list including status information. - * Devices which are already active have their details maintained, and are - * not re-opened. - */ -void -spa_load_l2cache(spa_t *spa) -{ - nvlist_t **l2cache; - uint_t nl2cache; - int i, j, oldnvdevs; - uint64_t guid; - vdev_t *vd, **oldvdevs, **newvdevs; - spa_aux_vdev_t *sav = &spa->spa_l2cache; - -#ifndef _KERNEL - /* - * zdb opens both the current state of the pool and the - * checkpointed state (if present), with a different spa_t. - * - * As L2 caches are part of the ARC which is shared among open - * pools, we skip loading them when we load the checkpointed - * state of the pool. - */ - if (!spa_writeable(spa)) - return; -#endif - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if (sav->sav_config != NULL) { - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); - } else { - nl2cache = 0; - newvdevs = NULL; - } - - oldvdevs = sav->sav_vdevs; - oldnvdevs = sav->sav_count; - sav->sav_vdevs = NULL; - sav->sav_count = 0; - - /* - * Process new nvlist of vdevs. - */ - for (i = 0; i < nl2cache; i++) { - VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, - &guid) == 0); - - newvdevs[i] = NULL; - for (j = 0; j < oldnvdevs; j++) { - vd = oldvdevs[j]; - if (vd != NULL && guid == vd->vdev_guid) { - /* - * Retain previous vdev for add/remove ops. - */ - newvdevs[i] = vd; - oldvdevs[j] = NULL; - break; - } - } - - if (newvdevs[i] == NULL) { - /* - * Create new vdev - */ - VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, - VDEV_ALLOC_L2CACHE) == 0); - ASSERT(vd != NULL); - newvdevs[i] = vd; - - /* - * Commit this vdev as an l2cache device, - * even if it fails to open. 
- */ - spa_l2cache_add(vd); - - vd->vdev_top = vd; - vd->vdev_aux = sav; - - spa_l2cache_activate(vd); - - if (vdev_open(vd) != 0) - continue; - - (void) vdev_validate_aux(vd); - - if (!vdev_is_dead(vd)) - l2arc_add_vdev(spa, vd); - } - } - - /* - * Purge vdevs that were dropped - */ - for (i = 0; i < oldnvdevs; i++) { - uint64_t pool; - - vd = oldvdevs[i]; - if (vd != NULL) { - ASSERT(vd->vdev_isl2cache); - - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - vdev_clear_stats(vd); - vdev_free(vd); - } - } - - if (oldvdevs) - kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); - - if (sav->sav_config == NULL) - goto out; - - sav->sav_vdevs = newvdevs; - sav->sav_count = (int)nl2cache; - - /* - * Recompute the stashed list of l2cache devices, with status - * information this time. - */ - VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, - DATA_TYPE_NVLIST_ARRAY) == 0); - - l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); - for (i = 0; i < sav->sav_count; i++) - l2cache[i] = vdev_config_generate(spa, - sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); -out: - for (i = 0; i < sav->sav_count; i++) - nvlist_free(l2cache[i]); - if (sav->sav_count) - kmem_free(l2cache, sav->sav_count * sizeof (void *)); -} - -static int -load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) -{ - dmu_buf_t *db; - char *packed = NULL; - size_t nvsize = 0; - int error; - *value = NULL; - - error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); - if (error != 0) - return (error); - - nvsize = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, - DMU_READ_PREFETCH); - if (error == 0) - error = nvlist_unpack(packed, nvsize, value, 0); - kmem_free(packed, nvsize); - - return (error); -} - 
-/* - * Concrete top-level vdevs that are not missing and are not logs. At every - * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. - */ -static uint64_t -spa_healthy_core_tvds(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t tvds = 0; - - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *vd = rvd->vdev_child[i]; - if (vd->vdev_islog) - continue; - if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) - tvds++; - } - - return (tvds); -} - -/* - * Checks to see if the given vdev could not be opened, in which case we post a - * sysevent to notify the autoreplace code that the device has been removed. - */ -static void -spa_check_removed(vdev_t *vd) -{ - for (uint64_t c = 0; c < vd->vdev_children; c++) - spa_check_removed(vd->vdev_child[c]); - - if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && - vdev_is_concrete(vd)) { - zfs_post_autoreplace(vd->vdev_spa, vd); - spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); - } -} - -static int -spa_check_for_missing_logs(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - /* - * If we're doing a normal import, then build up any additional - * diagnostic information about missing log devices. - * We'll pass this up to the user for further processing. - */ - if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { - nvlist_t **child, *nv; - uint64_t idx = 0; - - child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), - KM_SLEEP); - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - /* - * We consider a device as missing only if it failed - * to open (i.e. offline or faulted is not considered - * as missing). 
- */ - if (tvd->vdev_islog && - tvd->vdev_state == VDEV_STATE_CANT_OPEN) { - child[idx++] = vdev_config_generate(spa, tvd, - B_FALSE, VDEV_CONFIG_MISSING); - } - } - - if (idx > 0) { - fnvlist_add_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, child, idx); - fnvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_MISSING_DEVICES, nv); - - for (uint64_t i = 0; i < idx; i++) - nvlist_free(child[i]); - } - nvlist_free(nv); - kmem_free(child, rvd->vdev_children * sizeof (char **)); - - if (idx > 0) { - spa_load_failed(spa, "some log devices are missing"); - vdev_dbgmsg_print_tree(rvd, 2); - return (SET_ERROR(ENXIO)); - } - } else { - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - if (tvd->vdev_islog && - tvd->vdev_state == VDEV_STATE_CANT_OPEN) { - spa_set_log_state(spa, SPA_LOG_CLEAR); - spa_load_note(spa, "some log devices are " - "missing, ZIL is dropped."); - vdev_dbgmsg_print_tree(rvd, 2); - break; - } - } - } - - return (0); -} - -/* - * Check for missing log devices - */ -static boolean_t -spa_check_logs(spa_t *spa) -{ - boolean_t rv = B_FALSE; - dsl_pool_t *dp = spa_get_dsl(spa); - - switch (spa->spa_log_state) { - case SPA_LOG_MISSING: - /* need to recheck in case slog has been restored */ - case SPA_LOG_UNKNOWN: - rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); - if (rv) - spa_set_log_state(spa, SPA_LOG_MISSING); - break; - } - return (rv); -} - -static boolean_t -spa_passivate_log(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - boolean_t slog_found = B_FALSE; - - ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - - if (!spa_has_slogs(spa)) - return (B_FALSE); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - if (tvd->vdev_islog) { - metaslab_group_passivate(mg); - slog_found = B_TRUE; - } - } - - return (slog_found); -} - -static void -spa_activate_log(spa_t *spa) -{ - vdev_t 
*rvd = spa->spa_root_vdev; - - ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - if (tvd->vdev_islog) - metaslab_group_activate(mg); - } -} - -int -spa_reset_logs(spa_t *spa) -{ - int error; - - error = dmu_objset_find(spa_name(spa), zil_reset, - NULL, DS_FIND_CHILDREN); - if (error == 0) { - /* - * We successfully offlined the log device, sync out the - * current txg so that the "stubby" block can be removed - * by zil_sync(). - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - } - return (error); -} - -static void -spa_aux_check_removed(spa_aux_vdev_t *sav) -{ - int i; - - for (i = 0; i < sav->sav_count; i++) - spa_check_removed(sav->sav_vdevs[i]); -} - -void -spa_claim_notify(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - if (zio->io_error) - return; - - mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) - spa->spa_claim_max_txg = zio->io_bp->blk_birth; - mutex_exit(&spa->spa_props_lock); -} - -typedef struct spa_load_error { - uint64_t sle_meta_count; - uint64_t sle_data_count; -} spa_load_error_t; - -static void -spa_load_verify_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - spa_load_error_t *sle = zio->io_private; - dmu_object_type_t type = BP_GET_TYPE(bp); - int error = zio->io_error; - spa_t *spa = zio->io_spa; - - abd_free(zio->io_abd); - if (error) { - if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && - type != DMU_OT_INTENT_LOG) - atomic_inc_64(&sle->sle_meta_count); - else - atomic_inc_64(&sle->sle_data_count); - } - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_load_verify_ios--; - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); -} - -/* - * Maximum number of concurrent scrub i/os to create while verifying - * a pool while importing it. 
- */ -int spa_load_verify_maxinflight = 10000; -boolean_t spa_load_verify_metadata = B_TRUE; -boolean_t spa_load_verify_data = B_TRUE; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, - &spa_load_verify_maxinflight, 0, - "Maximum number of concurrent scrub I/Os to create while verifying a " - "pool while importing it"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, - &spa_load_verify_metadata, 0, - "Check metadata on import?"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, - &spa_load_verify_data, 0, - "Check user data on import?"); - -/*ARGSUSED*/ -static int -spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return (0); - /* - * Note: normally this routine will not be called if - * spa_load_verify_metadata is not set. However, it may be useful - * to manually set the flag after the traversal has begun. 
- */ - if (!spa_load_verify_metadata) - return (0); - if (!BP_IS_METADATA(bp) && !spa_load_verify_data) - return (0); - - zio_t *rio = arg; - size_t size = BP_GET_PSIZE(bp); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_load_verify_ios++; - mutex_exit(&spa->spa_scrub_lock); - - zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, - spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); - return (0); -} - -/* ARGSUSED */ -int -verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - return (0); -} - -static int -spa_load_verify(spa_t *spa) -{ - zio_t *rio; - spa_load_error_t sle = { 0 }; - zpool_load_policy_t policy; - boolean_t verify_ok = B_FALSE; - int error = 0; - - zpool_get_load_policy(spa->spa_config, &policy); - - if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) - return (0); - - dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); - error = dmu_objset_find_dp(spa->spa_dsl_pool, - spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, - DS_FIND_CHILDREN); - dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); - if (error != 0) - return (error); - - rio = zio_root(spa, NULL, &sle, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); - - if (spa_load_verify_metadata) { - if (spa->spa_extreme_rewind) { - spa_load_note(spa, "performing a complete scan of the " - "pool since extreme rewind is on. 
This may take " - "a very long time.\n (spa_load_verify_data=%u, " - "spa_load_verify_metadata=%u)", - spa_load_verify_data, spa_load_verify_metadata); - } - error = traverse_pool(spa, spa->spa_verify_min_txg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, - spa_load_verify_cb, rio); - } - - (void) zio_wait(rio); - - spa->spa_load_meta_errors = sle.sle_meta_count; - spa->spa_load_data_errors = sle.sle_data_count; - - if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { - spa_load_note(spa, "spa_load_verify found %llu metadata errors " - "and %llu data errors", (u_longlong_t)sle.sle_meta_count, - (u_longlong_t)sle.sle_data_count); - } - - if (spa_load_verify_dryrun || - (!error && sle.sle_meta_count <= policy.zlp_maxmeta && - sle.sle_data_count <= policy.zlp_maxdata)) { - int64_t loss = 0; - - verify_ok = B_TRUE; - spa->spa_load_txg = spa->spa_uberblock.ub_txg; - spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; - - loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; - VERIFY(nvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); - VERIFY(nvlist_add_int64(spa->spa_load_info, - ZPOOL_CONFIG_REWIND_TIME, loss) == 0); - VERIFY(nvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); - } else { - spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; - } - - if (spa_load_verify_dryrun) - return (0); - - if (error) { - if (error != ENXIO && error != EIO) - error = SET_ERROR(EIO); - return (error); - } - - return (verify_ok ? 0 : EIO); -} - -/* - * Find a value in the pool props object. - */ -static void -spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) -{ - (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); -} - -/* - * Find a value in the pool directory object. 
- */ -static int -spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) -{ - int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - name, sizeof (uint64_t), 1, val); - - if (error != 0 && (error != ENOENT || log_enoent)) { - spa_load_failed(spa, "couldn't get '%s' value in MOS directory " - "[error=%d]", name, error); - } - - return (error); -} - -static int -spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) -{ - vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); - return (SET_ERROR(err)); -} - -static void -spa_spawn_aux_threads(spa_t *spa) -{ - ASSERT(spa_writeable(spa)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_start_indirect_condensing_thread(spa); - - ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); - spa->spa_checkpoint_discard_zthr = - zthr_create(spa_checkpoint_discard_thread_check, - spa_checkpoint_discard_thread, spa); -} - -/* - * Fix up config after a partly-completed split. This is done with the - * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off - * pool have that entry in their config, but only the splitting one contains - * a list of all the guids of the vdevs that are being split off. - * - * This function determines what to do with that list: either rejoin - * all the disks to the pool, or complete the splitting process. To attempt - * the rejoin, each disk that is offlined is marked online again, and - * we do a reopen() call. If the vdev label for every disk that was - * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) - * then we call vdev_split() on each disk, and complete the split. - * - * Otherwise we leave the config alone, with all the vdevs in place in - * the original pool. 
- */ -static void -spa_try_repair(spa_t *spa, nvlist_t *config) -{ - uint_t extracted; - uint64_t *glist; - uint_t i, gcount; - nvlist_t *nvl; - vdev_t **vd; - boolean_t attempt_reopen; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) - return; - - /* check that the config is complete */ - if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, - &glist, &gcount) != 0) - return; - - vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); - - /* attempt to online all the vdevs & validate */ - attempt_reopen = B_TRUE; - for (i = 0; i < gcount; i++) { - if (glist[i] == 0) /* vdev is hole */ - continue; - - vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); - if (vd[i] == NULL) { - /* - * Don't bother attempting to reopen the disks; - * just do the split. - */ - attempt_reopen = B_FALSE; - } else { - /* attempt to re-online it */ - vd[i]->vdev_offline = B_FALSE; - } - } - - if (attempt_reopen) { - vdev_reopen(spa->spa_root_vdev); - - /* check each device to see what state it's in */ - for (extracted = 0, i = 0; i < gcount; i++) { - if (vd[i] != NULL && - vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) - break; - ++extracted; - } - } - - /* - * If every disk has been moved to the new pool, or if we never - * even attempted to look at them, then we split them off for - * good. - */ - if (!attempt_reopen || gcount == extracted) { - for (i = 0; i < gcount; i++) - if (vd[i] != NULL) - vdev_split(vd[i]); - vdev_reopen(spa->spa_root_vdev); - } - - kmem_free(vd, gcount * sizeof (vdev_t *)); -} - -static int -spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) -{ - char *ereport = FM_EREPORT_ZFS_POOL; - int error; - - spa->spa_load_state = state; - - gethrestime(&spa->spa_loaded_ts); - error = spa_load_impl(spa, type, &ereport); - - /* - * Don't count references from objsets that are already closed - * and are making their way through the eviction process. 
- */ - spa_evicting_os_wait(spa); - spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); - if (error) { - if (error != EEXIST) { - spa->spa_loaded_ts.tv_sec = 0; - spa->spa_loaded_ts.tv_nsec = 0; - } - if (error != EBADF) { - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); - } - } - spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; - spa->spa_ena = 0; - - return (error); -} - -/* - * Count the number of per-vdev ZAPs associated with all of the vdevs in the - * vdev tree rooted in the given vd, and ensure that each ZAP is present in the - * spa's per-vdev ZAP list. - */ -static uint64_t -vdev_count_verify_zaps(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - uint64_t total = 0; - if (vd->vdev_top_zap != 0) { - total++; - ASSERT0(zap_lookup_int(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, vd->vdev_top_zap)); - } - if (vd->vdev_leaf_zap != 0) { - total++; - ASSERT0(zap_lookup_int(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - total += vdev_count_verify_zaps(vd->vdev_child[i]); - } - - return (total); -} - -/* - * Determine whether the activity check is required. 
- */ -static boolean_t -spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, - nvlist_t *config) -{ - uint64_t state = 0; - uint64_t hostid = 0; - uint64_t tryconfig_txg = 0; - uint64_t tryconfig_timestamp = 0; - uint16_t tryconfig_mmp_seq = 0; - nvlist_t *nvinfo; - - if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); - (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, - &tryconfig_txg); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, - &tryconfig_timestamp); - (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, - &tryconfig_mmp_seq); - } - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); - - /* - * Disable the MMP activity check - This is used by zdb which - * is intended to be used on potentially active pools. - */ - if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) - return (B_FALSE); - - /* - * Skip the activity check when the MMP feature is disabled. - */ - if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) - return (B_FALSE); - - /* - * If the tryconfig_ values are nonzero, they are the results of an - * earlier tryimport. If they all match the uberblock we just found, - * then the pool has not changed and we return false so we do not test - * a second time. - */ - if (tryconfig_txg && tryconfig_txg == ub->ub_txg && - tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && - tryconfig_mmp_seq && tryconfig_mmp_seq == - (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) - return (B_FALSE); - - /* - * Allow the activity check to be skipped when importing the pool - * on the same host which last imported it. Since the hostid from - * configuration may be stale use the one read from the label. 
- */ - if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) - hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); - - if (hostid == spa_get_hostid()) - return (B_FALSE); - - /* - * Skip the activity test when the pool was cleanly exported. - */ - if (state != POOL_STATE_ACTIVE) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Nanoseconds the activity check must watch for changes on-disk. - */ -static uint64_t -spa_activity_check_duration(spa_t *spa, uberblock_t *ub) -{ - uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); - uint64_t multihost_interval = MSEC2NSEC( - MMP_INTERVAL_OK(zfs_multihost_interval)); - uint64_t import_delay = MAX(NANOSEC, import_intervals * - multihost_interval); - - /* - * Local tunables determine a minimum duration except for the case - * where we know when the remote host will suspend the pool if MMP - * writes do not land. - * - * See Big Theory comment at the top of mmp.c for the reasoning behind - * these cases and times. - */ - - ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); - - if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && - MMP_FAIL_INT(ub) > 0) { - - /* MMP on remote host will suspend pool after failed writes */ - import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * - MMP_IMPORT_SAFETY_FACTOR / 100; - - zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " - "mmp_fails=%llu ub_mmp mmp_interval=%llu " - "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), - MMP_INTERVAL(ub), import_intervals); - - } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && - MMP_FAIL_INT(ub) == 0) { - - /* MMP on remote host will never suspend pool */ - import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + - ub->ub_mmp_delay) * import_intervals); - - zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " - "mmp_interval=%llu ub_mmp_delay=%llu " - "import_intervals=%u", import_delay, MMP_INTERVAL(ub), - ub->ub_mmp_delay, import_intervals); - - } else if (MMP_VALID(ub)) { - /* - * zfs-0.7 
compatability case - */ - - import_delay = MAX(import_delay, (multihost_interval + - ub->ub_mmp_delay) * import_intervals); - - zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " - "import_intervals=%u leaves=%u", import_delay, - ub->ub_mmp_delay, import_intervals, - vdev_count_leaves(spa)); - } else { - /* Using local tunings is the only reasonable option */ - zfs_dbgmsg("pool last imported on non-MMP aware " - "host using import_delay=%llu multihost_interval=%llu " - "import_intervals=%u", import_delay, multihost_interval, - import_intervals); - } - - return (import_delay); -} - -/* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. - */ -static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) -{ - uint64_t txg = ub->ub_txg; - uint64_t timestamp = ub->ub_timestamp; - uint64_t mmp_config = ub->ub_mmp_config; - uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; - uint64_t import_delay; - hrtime_t import_expire; - nvlist_t *mmp_label = NULL; - vdev_t *rvd = spa->spa_root_vdev; - kcondvar_t cv; - kmutex_t mtx; - int error = 0; - - cv_init(&cv, NULL, CV_DEFAULT, NULL); - mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_enter(&mtx); - - /* - * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed - * during the earlier tryimport. If the txg recorded there is 0 then - * the pool is known to be active on another host. - * - * Otherwise, the pool might be in use on another host. Check for - * changes in the uberblocks on disk if necessary. 
- */ - if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { - nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_LOAD_INFO); - - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && - fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { - vdev_uberblock_load(rvd, ub, &mmp_label); - error = SET_ERROR(EREMOTEIO); - goto out; - } - } - - import_delay = spa_activity_check_duration(spa, ub); - - /* Add a small random factor in case of simultaneous imports (0-25%) */ - import_delay += import_delay * spa_get_random(250) / 1000; - - import_expire = gethrtime() + import_delay; - - while (gethrtime() < import_expire) { - vdev_uberblock_load(rvd, ub, &mmp_label); - - if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || - mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { - zfs_dbgmsg("multihost activity detected " - "txg %llu ub_txg %llu " - "timestamp %llu ub_timestamp %llu " - "mmp_config %#llx ub_mmp_config %#llx", - txg, ub->ub_txg, timestamp, ub->ub_timestamp, - mmp_config, ub->ub_mmp_config); - - error = SET_ERROR(EREMOTEIO); - break; - } - - if (mmp_label) { - nvlist_free(mmp_label); - mmp_label = NULL; - } - error = cv_timedwait_sig(&cv, &mtx, hz); -#if defined(illumos) || !defined(_KERNEL) - if (error != -1) { -#else - if (error != EWOULDBLOCK) { -#endif - error = SET_ERROR(EINTR); - break; - } - error = 0; - } - -out: - mutex_exit(&mtx); - mutex_destroy(&mtx); - cv_destroy(&cv); - - /* - * If the pool is determined to be active store the status in the - * spa->spa_load_info nvlist. If the remote hostname or hostid are - * available from configuration read from disk store them as well. - * This allows 'zpool import' to generate a more useful message. 
- * - * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) - * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool - * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool - */ - if (error == EREMOTEIO) { - char *hostname = ""; - uint64_t hostid = 0; - - if (mmp_label) { - if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { - hostname = fnvlist_lookup_string(mmp_label, - ZPOOL_CONFIG_HOSTNAME); - fnvlist_add_string(spa->spa_load_info, - ZPOOL_CONFIG_MMP_HOSTNAME, hostname); - } - - if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { - hostid = fnvlist_lookup_uint64(mmp_label, - ZPOOL_CONFIG_HOSTID); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_HOSTID, hostid); - } - } - - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_TXG, 0); - - error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); - } - - if (mmp_label) - nvlist_free(mmp_label); - - return (error); -} - -static int -spa_verify_host(spa_t *spa, nvlist_t *mos_config) -{ - uint64_t hostid; - char *hostname; - uint64_t myhostid = 0; - - if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, - ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - hostname = fnvlist_lookup_string(mos_config, - ZPOOL_CONFIG_HOSTNAME); - - myhostid = zone_get_hostid(NULL); - - if (hostid != 0 && myhostid != 0 && hostid != myhostid) { - cmn_err(CE_WARN, "pool '%s' could not be " - "loaded as it was last accessed by " - "another system (host: %s hostid: 0x%llx). 
" - "See: http://illumos.org/msg/ZFS-8000-EY", - spa_name(spa), hostname, (u_longlong_t)hostid); - spa_load_failed(spa, "hostid verification failed: pool " - "last accessed by host: %s (hostid: 0x%llx)", - hostname, (u_longlong_t)hostid); - return (SET_ERROR(EBADF)); - } - } - - return (0); -} - -static int -spa_ld_parse_config(spa_t *spa, spa_import_type_t type) -{ - int error = 0; - nvlist_t *nvtree, *nvl, *config = spa->spa_config; - int parse; - vdev_t *rvd; - uint64_t pool_guid; - char *comment; - - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - spa_load_failed(spa, "invalid config provided: '%s' missing", - ZPOOL_CONFIG_POOL_GUID); - return (SET_ERROR(EINVAL)); - } - - /* - * If we are doing an import, ensure that the pool is not already - * imported by checking if its pool guid already exists in the - * spa namespace. - * - * The only case that we allow an already imported pool to be - * imported again, is when the pool is checkpointed and we want to - * look at its checkpointed state from userland tools like zdb. 
- */ -#ifdef _KERNEL - if ((spa->spa_load_state == SPA_LOAD_IMPORT || - spa->spa_load_state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { -#else - if ((spa->spa_load_state == SPA_LOAD_IMPORT || - spa->spa_load_state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0) && - !spa_importing_readonly_checkpoint(spa)) { -#endif - spa_load_failed(spa, "a pool with guid %llu is already open", - (u_longlong_t)pool_guid); - return (SET_ERROR(EEXIST)); - } - - spa->spa_config_guid = pool_guid; - - nvlist_free(spa->spa_load_info); - spa->spa_load_info = fnvlist_alloc(); - - ASSERT(spa->spa_comment == NULL); - if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) - spa->spa_comment = spa_strdup(comment); - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) - spa->spa_config_splitting = fnvlist_dup(nvl); - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { - spa_load_failed(spa, "invalid config provided: '%s' missing", - ZPOOL_CONFIG_VDEV_TREE); - return (SET_ERROR(EINVAL)); - } - - /* - * Create "The Godfather" zio to hold all async IOs - */ - spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), - KM_SLEEP); - for (int i = 0; i < max_ncpus; i++) { - spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - } - - /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - parse = (type == SPA_IMPORT_EXISTING ? 
- VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); - error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_load_failed(spa, "unable to parse config [error=%d]", - error); - return (error); - } - - ASSERT(spa->spa_root_vdev == rvd); - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); - - if (type != SPA_IMPORT_ASSEMBLE) { - ASSERT(spa_guid(spa) == pool_guid); - } - - return (0); -} - -/* - * Recursively open all vdevs in the vdev tree. This function is called twice: - * first with the untrusted config, then with the trusted config. - */ -static int -spa_ld_open_vdevs(spa_t *spa) -{ - int error = 0; - - /* - * spa_missing_tvds_allowed defines how many top-level vdevs can be - * missing/unopenable for the root vdev to be still considered openable. - */ - if (spa->spa_trust_config) { - spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; - } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { - spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; - } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { - spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; - } else { - spa->spa_missing_tvds_allowed = 0; - } - - spa->spa_missing_tvds_allowed = - MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(spa->spa_root_vdev); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (spa->spa_missing_tvds != 0) { - spa_load_note(spa, "vdev tree has %lld missing top-level " - "vdevs.", (u_longlong_t)spa->spa_missing_tvds); - if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { - /* - * Although theoretically we could allow users to open - * incomplete pools in RW mode, we'd need to add a lot - * of extra logic (e.g. adjust pool space to account - * for missing vdevs). 
- * This limitation also prevents users from accidentally - * opening the pool in RW mode during data recovery and - * damaging it further. - */ - spa_load_note(spa, "pools with missing top-level " - "vdevs can only be opened in read-only mode."); - error = SET_ERROR(ENXIO); - } else { - spa_load_note(spa, "current settings allow for maximum " - "%lld missing top-level vdevs at this stage.", - (u_longlong_t)spa->spa_missing_tvds_allowed); - } - } - if (error != 0) { - spa_load_failed(spa, "unable to open vdev tree [error=%d]", - error); - } - if (spa->spa_missing_tvds != 0 || error != 0) - vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); - - return (error); -} - -/* - * We need to validate the vdev labels against the configuration that - * we have in hand. This function is called twice: first with an untrusted - * config, then with a trusted config. The validation is more strict when the - * config is trusted. - */ -static int -spa_ld_validate_vdevs(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_load_failed(spa, "vdev_validate failed [error=%d]", error); - return (error); - } - - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - spa_load_failed(spa, "cannot open vdev tree after invalidating " - "some vdevs"); - vdev_dbgmsg_print_tree(rvd, 2); - return (SET_ERROR(ENXIO)); - } - - return (0); -} - -static void -spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) -{ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_verify_min_txg = spa->spa_extreme_rewind ? - TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; - spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
- spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; - spa->spa_claim_max_txg = spa->spa_first_txg; - spa->spa_prev_software_version = ub->ub_software_version; -} - -static int -spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) -{ - vdev_t *rvd = spa->spa_root_vdev; - nvlist_t *label; - uberblock_t *ub = &spa->spa_uberblock; - boolean_t activity_check = B_FALSE; - - /* - * If we are opening the checkpointed state of the pool by - * rewinding to it, at this point we will have written the - * checkpointed uberblock to the vdev labels, so searching - * the labels will find the right uberblock. However, if - * we are opening the checkpointed state read-only, we have - * not modified the labels. Therefore, we must ignore the - * labels and continue using the spa_uberblock that was set - * by spa_ld_checkpoint_rewind. - * - * Note that it would be fine to ignore the labels when - * rewinding (opening writeable) as well. However, if we - * crash just after writing the labels, we will end up - * searching the labels. Doing so in the common case means - * that this code path gets exercised normally, rather than - * just in the edge case. - */ - if (ub->ub_checkpoint_txg != 0 && - spa_importing_readonly_checkpoint(spa)) { - spa_ld_select_uberblock_done(spa, ub); - return (0); - } - - /* - * Find the best uberblock. - */ - vdev_uberblock_load(rvd, ub, &label); - - /* - * If we weren't able to find a single valid uberblock, return failure. - */ - if (ub->ub_txg == 0) { - nvlist_free(label); - spa_load_failed(spa, "no valid uberblock found"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); - } - - spa_load_note(spa, "using uberblock with txg=%llu", - (u_longlong_t)ub->ub_txg); - - /* - * For pools which have the multihost property on determine if the - * pool is truly inactive and can be safely imported. Prevent - * hosts which don't have a hostid set from importing the pool. 
- */ - activity_check = spa_activity_check_required(spa, ub, label, - spa->spa_config); - if (activity_check) { - if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && - spa_get_hostid() == 0) { - nvlist_free(label); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); - return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); - } - - int error = spa_activity_check(spa, ub, spa->spa_config); - if (error) { - nvlist_free(label); - return (error); - } - - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); - fnvlist_add_uint16(spa->spa_load_info, - ZPOOL_CONFIG_MMP_SEQ, - (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); - } - - /* - * If the pool has an unsupported version we can't open it. - */ - if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { - nvlist_free(label); - spa_load_failed(spa, "version %llu is not supported", - (u_longlong_t)ub->ub_version); - return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); - } - - if (ub->ub_version >= SPA_VERSION_FEATURES) { - nvlist_t *features; - - /* - * If we weren't able to find what's necessary for reading the - * MOS in the label, return failure. - */ - if (label == NULL) { - spa_load_failed(spa, "label config unavailable"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - ENXIO)); - } - - if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, - &features) != 0) { - nvlist_free(label); - spa_load_failed(spa, "invalid label: '%s' missing", - ZPOOL_CONFIG_FEATURES_FOR_READ); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - ENXIO)); - } - - /* - * Update our in-core representation with the definitive values - * from the label. - */ - nvlist_free(spa->spa_label_features); - VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); - } - - nvlist_free(label); - - /* - * Look through entries in the label nvlist's features_for_read. 
If - * there is a feature listed there which we don't understand then we - * cannot open a pool. - */ - if (ub->ub_version >= SPA_VERSION_FEATURES) { - nvlist_t *unsup_feat; - - VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == - 0); - - for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, - NULL); nvp != NULL; - nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { - if (!zfeature_is_supported(nvpair_name(nvp))) { - VERIFY(nvlist_add_string(unsup_feat, - nvpair_name(nvp), "") == 0); - } - } - - if (!nvlist_empty(unsup_feat)) { - VERIFY(nvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); - nvlist_free(unsup_feat); - spa_load_failed(spa, "some features are unsupported"); - return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); - } - - nvlist_free(unsup_feat); - } - - if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_try_repair(spa, spa->spa_config); - spa_config_exit(spa, SCL_ALL, FTAG); - nvlist_free(spa->spa_config_splitting); - spa->spa_config_splitting = NULL; - } - - /* - * Initialize internal SPA structures. 
- */ - spa_ld_select_uberblock_done(spa, ub); - - return (0); -} - -static int -spa_ld_open_rootbp(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error != 0) { - spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " - "[error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - - return (0); -} - -static int -spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, - boolean_t reloading) -{ - vdev_t *mrvd, *rvd = spa->spa_root_vdev; - nvlist_t *nv, *mos_config, *policy; - int error = 0, copy_error; - uint64_t healthy_tvds, healthy_tvds_mos; - uint64_t mos_config_txg; - - if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) - != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * If we're assembling a pool from a split, the config provided is - * already trusted so there is nothing to do. - */ - if (type == SPA_IMPORT_ASSEMBLE) - return (0); - - healthy_tvds = spa_healthy_core_tvds(spa); - - if (load_nvlist(spa, spa->spa_config_object, &mos_config) - != 0) { - spa_load_failed(spa, "unable to retrieve MOS config"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - /* - * If we are doing an open, pool owner wasn't verified yet, thus do - * the verification here. - */ - if (spa->spa_load_state == SPA_LOAD_OPEN) { - error = spa_verify_host(spa, mos_config); - if (error != 0) { - nvlist_free(mos_config); - return (error); - } - } - - nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - /* - * Build a new vdev tree from the trusted config - */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); - - /* - * Vdev paths in the MOS may be obsolete. 
If the untrusted config was - * obtained by scanning /dev/dsk, then it will have the right vdev - * paths. We update the trusted MOS config with this information. - * We first try to copy the paths with vdev_copy_path_strict, which - * succeeds only when both configs have exactly the same vdev tree. - * If that fails, we fall back to a more flexible method that has a - * best effort policy. - */ - copy_error = vdev_copy_path_strict(rvd, mrvd); - if (copy_error != 0 || spa_load_print_vdev_tree) { - spa_load_note(spa, "provided vdev tree:"); - vdev_dbgmsg_print_tree(rvd, 2); - spa_load_note(spa, "MOS vdev tree:"); - vdev_dbgmsg_print_tree(mrvd, 2); - } - if (copy_error != 0) { - spa_load_note(spa, "vdev_copy_path_strict failed, falling " - "back to vdev_copy_path_relaxed"); - vdev_copy_path_relaxed(rvd, mrvd); - } - - vdev_close(rvd); - vdev_free(rvd); - spa->spa_root_vdev = mrvd; - rvd = mrvd; - spa_config_exit(spa, SCL_ALL, FTAG); - - /* - * We will use spa_config if we decide to reload the spa or if spa_load - * fails and we rewind. We must thus regenerate the config using the - * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to - * pass settings on how to load the pool and is not stored in the MOS. - * We copy it over to our new, trusted config. - */ - mos_config_txg = fnvlist_lookup_uint64(mos_config, - ZPOOL_CONFIG_POOL_TXG); - nvlist_free(mos_config); - mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); - if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, - &policy) == 0) - fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); - spa_config_set(spa, mos_config); - spa->spa_config_source = SPA_CONFIG_SRC_MOS; - - /* - * Now that we got the config from the MOS, we should be more strict - * in checking blkptrs and can make assumptions about the consistency - * of the vdev tree. spa_trust_config must be set to true before opening - * vdevs in order for them to be writeable. 
- */ - spa->spa_trust_config = B_TRUE; - - /* - * Open and validate the new vdev tree - */ - error = spa_ld_open_vdevs(spa); - if (error != 0) - return (error); - - error = spa_ld_validate_vdevs(spa); - if (error != 0) - return (error); - - if (copy_error != 0 || spa_load_print_vdev_tree) { - spa_load_note(spa, "final vdev tree:"); - vdev_dbgmsg_print_tree(rvd, 2); - } - - if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && - !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { - /* - * Sanity check to make sure that we are indeed loading the - * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds - * in the config provided and they happened to be the only ones - * to have the latest uberblock, we could involuntarily perform - * an extreme rewind. - */ - healthy_tvds_mos = spa_healthy_core_tvds(spa); - if (healthy_tvds_mos - healthy_tvds >= - SPA_SYNC_MIN_VDEVS) { - spa_load_note(spa, "config provided misses too many " - "top-level vdevs compared to MOS (%lld vs %lld). ", - (u_longlong_t)healthy_tvds, - (u_longlong_t)healthy_tvds_mos); - spa_load_note(spa, "vdev tree:"); - vdev_dbgmsg_print_tree(rvd, 2); - if (reloading) { - spa_load_failed(spa, "config was already " - "provided from MOS. 
Aborting."); - return (spa_vdev_err(rvd, - VDEV_AUX_CORRUPT_DATA, EIO)); - } - spa_load_note(spa, "spa must be reloaded using MOS " - "config"); - return (SET_ERROR(EAGAIN)); - } - } - - error = spa_check_for_missing_logs(spa); - if (error != 0) - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); - - if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { - spa_load_failed(spa, "uberblock guid sum doesn't match MOS " - "guid sum (%llu != %llu)", - (u_longlong_t)spa->spa_uberblock.ub_guid_sum, - (u_longlong_t)rvd->vdev_guid_sum); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, - ENXIO)); - } - - return (0); -} - -static int -spa_ld_open_indirect_vdev_metadata(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * Everything that we read before spa_remove_init() must be stored - * on concreted vdevs. Therefore we do this as early as possible. - */ - error = spa_remove_init(spa); - if (error != 0) { - spa_load_failed(spa, "spa_remove_init failed [error=%d]", - error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - /* - * Retrieve information needed to condense indirect vdev mappings. 
- */ - error = spa_condense_init(spa); - if (error != 0) { - spa_load_failed(spa, "spa_condense_init failed [error=%d]", - error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); - } - - return (0); -} - -static int -spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - if (spa_version(spa) >= SPA_VERSION_FEATURES) { - boolean_t missing_feat_read = B_FALSE; - nvlist_t *unsup_feat, *enabled_feat; - - if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, - &spa->spa_feat_for_read_obj, B_TRUE) != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, - &spa->spa_feat_for_write_obj, B_TRUE) != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, - &spa->spa_feat_desc_obj, B_TRUE) != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - enabled_feat = fnvlist_alloc(); - unsup_feat = fnvlist_alloc(); - - if (!spa_features_check(spa, B_FALSE, - unsup_feat, enabled_feat)) - missing_feat_read = B_TRUE; - - if (spa_writeable(spa) || - spa->spa_load_state == SPA_LOAD_TRYIMPORT) { - if (!spa_features_check(spa, B_TRUE, - unsup_feat, enabled_feat)) { - *missing_feat_writep = B_TRUE; - } - } - - fnvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); - - if (!nvlist_empty(unsup_feat)) { - fnvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); - } - - fnvlist_free(enabled_feat); - fnvlist_free(unsup_feat); - - if (!missing_feat_read) { - fnvlist_add_boolean(spa->spa_load_info, - ZPOOL_CONFIG_CAN_RDONLY); - } - - /* - * If the state is SPA_LOAD_TRYIMPORT, our objective is - * twofold: to determine whether the pool is available for - * import in read-write mode and (if it is not) whether the - * pool is available for import in read-only mode. 
If the pool - * is available for import in read-write mode, it is displayed - * as available in userland; if it is not available for import - * in read-only mode, it is displayed as unavailable in - * userland. If the pool is available for import in read-only - * mode but not read-write mode, it is displayed as unavailable - * in userland with a special note that the pool is actually - * available for open in read-only mode. - * - * As a result, if the state is SPA_LOAD_TRYIMPORT and we are - * missing a feature for write, we must first determine whether - * the pool can be opened read-only before returning to - * userland in order to know whether to display the - * abovementioned note. - */ - if (missing_feat_read || (*missing_feat_writep && - spa_writeable(spa))) { - spa_load_failed(spa, "pool uses unsupported features"); - return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); - } - - /* - * Load refcounts for ZFS features from disk into an in-memory - * cache during SPA initialization. 
- */ - for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { - uint64_t refcount; - - error = feature_get_refcount_from_disk(spa, - &spa_feature_table[i], &refcount); - if (error == 0) { - spa->spa_feat_refcount_cache[i] = refcount; - } else if (error == ENOTSUP) { - spa->spa_feat_refcount_cache[i] = - SPA_FEATURE_DISABLED; - } else { - spa_load_failed(spa, "error getting refcount " - "for feature %s [error=%d]", - spa_feature_table[i].fi_guid, error); - return (spa_vdev_err(rvd, - VDEV_AUX_CORRUPT_DATA, EIO)); - } - } - } - - if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { - if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, - &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - return (0); -} - -static int -spa_ld_load_special_directories(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - spa->spa_is_initializing = B_TRUE; - error = dsl_pool_open(spa->spa_dsl_pool); - spa->spa_is_initializing = B_FALSE; - if (error != 0) { - spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - return (0); -} - -static int -spa_ld_get_props(spa_t *spa) -{ - int error = 0; - uint64_t obj; - vdev_t *rvd = spa->spa_root_vdev; - - /* Grab the secret checksum salt from the MOS. 
*/ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CHECKSUM_SALT, 1, - sizeof (spa->spa_cksum_salt.zcs_bytes), - spa->spa_cksum_salt.zcs_bytes); - if (error == ENOENT) { - /* Generate a new salt for subsequent use */ - (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, - sizeof (spa->spa_cksum_salt.zcs_bytes)); - } else if (error != 0) { - spa_load_failed(spa, "unable to retrieve checksum salt from " - "MOS [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); - if (error != 0) { - spa_load_failed(spa, "error opening deferred-frees bpobj " - "[error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - /* - * Load the bit that tells us to use the new accounting function - * (raid-z deflation). If we have an older pool, this will not - * be present. - */ - error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, - &spa->spa_creation_version, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * Load the persistent error log. If we have an older pool, this will - * not be present. - */ - error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, - B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, - &spa->spa_errlog_scrub, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * Load the history object. If we have an older pool, this - * will not be present. 
- */ - error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * Load the per-vdev ZAP map. If we have an older pool, this will not - * be present; in this case, defer its creation to a later time to - * avoid dirtying the MOS this early / out of sync context. See - * spa_sync_config_object. - */ - - /* The sentinel is only available in the MOS config. */ - nvlist_t *mos_config; - if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { - spa_load_failed(spa, "unable to retrieve MOS config"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, - &spa->spa_all_vdev_zaps, B_FALSE); - - if (error == ENOENT) { - VERIFY(!nvlist_exists(mos_config, - ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); - spa->spa_avz_action = AVZ_ACTION_INITIALIZE; - ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); - } else if (error != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { - /* - * An older version of ZFS overwrote the sentinel value, so - * we have orphaned per-vdev ZAPs in the MOS. Defer their - * destruction to later; see spa_sync_config_object. - */ - spa->spa_avz_action = AVZ_ACTION_DESTROY; - /* - * We're assuming that no vdevs have had their ZAPs created - * before this. Better be sure of it. 
- */ - ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); - } - nvlist_free(mos_config); - - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - - error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, - B_FALSE); - if (error && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - if (error == 0) { - uint64_t autoreplace; - - spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); - spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); - spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); - spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); - spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); - spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); - spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, - &spa->spa_dedup_ditto); - - spa->spa_autoreplace = (autoreplace != 0); - } - - /* - * If we are importing a pool with missing top-level vdevs, - * we enforce that the pool doesn't panic or get suspended on - * error since the likelihood of missing data is extremely high. - */ - if (spa->spa_missing_tvds > 0 && - spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && - spa->spa_load_state != SPA_LOAD_TRYIMPORT) { - spa_load_note(spa, "forcing failmode to 'continue' " - "as some top level vdevs are missing"); - spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; - } - - return (0); -} - -static int -spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * If we're assembling the pool from the split-off vdevs of - * an existing pool, we don't want to attach the spares & cache - * devices. - */ - - /* - * Load any hot spares for this pool. 
- */ - error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, - B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { - ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); - if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) { - spa_load_failed(spa, "error loading spares nvlist"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - } else if (error == 0) { - spa->spa_spares.sav_sync = B_TRUE; - } - - /* - * Load any level 2 ARC devices for this pool. - */ - error = spa_dir_prop(spa, DMU_POOL_L2CACHE, - &spa->spa_l2cache.sav_object, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { - ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); - if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) { - spa_load_failed(spa, "error loading l2cache nvlist"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - } else if (error == 0) { - spa->spa_l2cache.sav_sync = B_TRUE; - } - - return (0); -} - -static int -spa_ld_load_vdev_metadata(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * If the 'multihost' property is set, then never allow a pool to - * be imported when the system hostid is zero. The exception to - * this rule is zdb which is always allowed to access pools. 
- */ - if (spa_multihost(spa) && spa_get_hostid() == 0 && - (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); - return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); - } - - /* - * If the 'autoreplace' property is set, then post a resource notifying - * the ZFS DE that it should not issue any faults for unopenable - * devices. We also iterate over the vdevs, and post a sysevent for any - * unopenable vdevs so that the normal autoreplace handler can take - * over. - */ - if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { - spa_check_removed(spa->spa_root_vdev); - /* - * For the import case, this is done in spa_import(), because - * at this point we're using the spare definitions from - * the MOS config, not necessarily from the userland config. - */ - if (spa->spa_load_state != SPA_LOAD_IMPORT) { - spa_aux_check_removed(&spa->spa_spares); - spa_aux_check_removed(&spa->spa_l2cache); - } - } - - /* - * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. - */ - error = vdev_load(rvd); - if (error != 0) { - spa_load_failed(spa, "vdev_load failed [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); - } - - /* - * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); - spa_config_exit(spa, SCL_ALL, FTAG); - - return (0); -} - -static int -spa_ld_load_dedup_tables(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - error = ddt_load(spa); - if (error != 0) { - spa_load_failed(spa, "ddt_load failed [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - return (0); -} - -static int -spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) -{ - vdev_t *rvd = spa->spa_root_vdev; - - if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { - boolean_t missing = spa_check_logs(spa); - if (missing) { - if (spa->spa_missing_tvds != 0) { - spa_load_note(spa, "spa_check_logs failed " - "so dropping the logs"); - } else { - *ereport = FM_EREPORT_ZFS_LOG_REPLAY; - spa_load_failed(spa, "spa_check_logs failed"); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, - ENXIO)); - } - } - } - - return (0); -} - -static int -spa_ld_verify_pool_data(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * We've successfully opened the pool, verify that we're ready - * to start pushing transactions. - */ - if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { - error = spa_load_verify(spa); - if (error != 0) { - spa_load_failed(spa, "spa_load_verify failed " - "[error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } - } - - return (0); -} - -static void -spa_ld_claim_log_blocks(spa_t *spa) -{ - dmu_tx_t *tx; - dsl_pool_t *dp = spa_get_dsl(spa); - - /* - * Claim log blocks that haven't been committed yet. - * This must all happen in a single txg. - * Note: spa_claim_max_txg is updated by spa_claim_notify(), - * invoked from zil_claim_log_block()'s i/o done callback. - * Price of rollback is that we abandon the log. 
- */ - spa->spa_claiming = B_TRUE; - - tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); - (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - zil_claim, tx, DS_FIND_CHILDREN); - dmu_tx_commit(tx); - - spa->spa_claiming = B_FALSE; - - spa_set_log_state(spa, SPA_LOG_GOOD); -} - -static void -spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, - boolean_t update_config_cache) -{ - vdev_t *rvd = spa->spa_root_vdev; - int need_update = B_FALSE; - - /* - * If the config cache is stale, or we have uninitialized - * metaslabs (see spa_vdev_add()), then update the config. - * - * If this is a verbatim import, trust the current - * in-core spa_config and update the disk labels. - */ - if (update_config_cache || config_cache_txg != spa->spa_config_txg || - spa->spa_load_state == SPA_LOAD_IMPORT || - spa->spa_load_state == SPA_LOAD_RECOVER || - (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) - need_update = B_TRUE; - - for (int c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_ms_array == 0) - need_update = B_TRUE; - - /* - * Update the config cache asychronously in case we're the - * root pool, in which case the config cache isn't writable yet. - */ - if (need_update) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); -} - -static void -spa_ld_prepare_for_reload(spa_t *spa) -{ - int mode = spa->spa_mode; - int async_suspended = spa->spa_async_suspended; - - spa_unload(spa); - spa_deactivate(spa); - spa_activate(spa, mode); - - /* - * We save the value of spa_async_suspended as it gets reset to 0 by - * spa_unload(). We want to restore it back to the original value before - * returning as we might be calling spa_async_resume() later. 
- */ - spa->spa_async_suspended = async_suspended; -} - -static int -spa_ld_read_checkpoint_txg(spa_t *spa) -{ - uberblock_t checkpoint; - int error = 0; - - ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), - sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); - - if (error == ENOENT) - return (0); - - if (error != 0) - return (error); - - ASSERT3U(checkpoint.ub_txg, !=, 0); - ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); - ASSERT3U(checkpoint.ub_timestamp, !=, 0); - spa->spa_checkpoint_txg = checkpoint.ub_txg; - spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; - - return (0); -} - -static int -spa_ld_mos_init(spa_t *spa, spa_import_type_t type) -{ - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); - - /* - * Never trust the config that is provided unless we are assembling - * a pool following a split. - * This means don't trust blkptrs and the vdev tree in general. This - * also effectively puts the spa in read-only mode since - * spa_writeable() checks for spa_trust_config to be true. - * We will later load a trusted config from the MOS. - */ - if (type != SPA_IMPORT_ASSEMBLE) - spa->spa_trust_config = B_FALSE; - - /* - * Parse the config provided to create a vdev tree. - */ - error = spa_ld_parse_config(spa, type); - if (error != 0) - return (error); - - /* - * Now that we have the vdev tree, try to open each vdev. This involves - * opening the underlying physical device, retrieving its geometry and - * probing the vdev with a dummy I/O. The state of each vdev will be set - * based on the success of those operations. After this we'll be ready - * to read from the vdevs. 
- */ - error = spa_ld_open_vdevs(spa); - if (error != 0) - return (error); - - /* - * Read the label of each vdev and make sure that the GUIDs stored - * there match the GUIDs in the config provided. - * If we're assembling a new pool that's been split off from an - * existing pool, the labels haven't yet been updated so we skip - * validation for now. - */ - if (type != SPA_IMPORT_ASSEMBLE) { - error = spa_ld_validate_vdevs(spa); - if (error != 0) - return (error); - } - - /* - * Read all vdev labels to find the best uberblock (i.e. latest, - * unless spa_load_max_txg is set) and store it in spa_uberblock. We - * get the list of features required to read blkptrs in the MOS from - * the vdev label with the best uberblock and verify that our version - * of zfs supports them all. - */ - error = spa_ld_select_uberblock(spa, type); - if (error != 0) - return (error); - - /* - * Pass that uberblock to the dsl_pool layer which will open the root - * blkptr. This blkptr points to the latest version of the MOS and will - * allow us to read its contents. - */ - error = spa_ld_open_rootbp(spa); - if (error != 0) - return (error); - - return (0); -} - -static int -spa_ld_checkpoint_rewind(spa_t *spa) -{ - uberblock_t checkpoint; - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), - sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); - - if (error != 0) { - spa_load_failed(spa, "unable to retrieve checkpointed " - "uberblock from the MOS config [error=%d]", error); - - if (error == ENOENT) - error = ZFS_ERR_NO_CHECKPOINT; - - return (error); - } - - ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); - ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); - - /* - * We need to update the txg and timestamp of the checkpointed - * uberblock to be higher than the latest one. 
This ensures that - * the checkpointed uberblock is selected if we were to close and - * reopen the pool right after we've written it in the vdev labels. - * (also see block comment in vdev_uberblock_compare) - */ - checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; - checkpoint.ub_timestamp = gethrestime_sec(); - - /* - * Set current uberblock to be the checkpointed uberblock. - */ - spa->spa_uberblock = checkpoint; - - /* - * If we are doing a normal rewind, then the pool is open for - * writing and we sync the "updated" checkpointed uberblock to - * disk. Once this is done, we've basically rewound the whole - * pool and there is no way back. - * - * There are cases when we don't want to attempt and sync the - * checkpointed uberblock to disk because we are opening a - * pool as read-only. Specifically, verifying the checkpointed - * state with zdb, and importing the checkpointed state to get - * a "preview" of its content. - */ - if (spa_writeable(spa)) { - vdev_t *rvd = spa->spa_root_vdev; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; - int svdcount = 0; - int children = rvd->vdev_children; - int c0 = spa_get_random(children); - - for (int c = 0; c < children; c++) { - vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; - - /* Stop when revisiting the first vdev */ - if (c > 0 && svd[0] == vd) - break; - - if (vd->vdev_ms_array == 0 || vd->vdev_islog || - !vdev_is_concrete(vd)) - continue; - - svd[svdcount++] = vd; - if (svdcount == SPA_SYNC_MIN_VDEVS) - break; - } - error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); - if (error == 0) - spa->spa_last_synced_guid = rvd->vdev_guid; - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_load_failed(spa, "failed to write checkpointed " - "uberblock to the vdev labels [error=%d]", error); - return (error); - } - } - - return (0); -} - -static int -spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, - boolean_t 
*update_config_cache) -{ - int error; - - /* - * Parse the config for pool, open and validate vdevs, - * select an uberblock, and use that uberblock to open - * the MOS. - */ - error = spa_ld_mos_init(spa, type); - if (error != 0) - return (error); - - /* - * Retrieve the trusted config stored in the MOS and use it to create - * a new, exact version of the vdev tree, then reopen all vdevs. - */ - error = spa_ld_trusted_config(spa, type, B_FALSE); - if (error == EAGAIN) { - if (update_config_cache != NULL) - *update_config_cache = B_TRUE; - - /* - * Redo the loading process with the trusted config if it is - * too different from the untrusted config. - */ - spa_ld_prepare_for_reload(spa); - spa_load_note(spa, "RELOADING"); - error = spa_ld_mos_init(spa, type); - if (error != 0) - return (error); - - error = spa_ld_trusted_config(spa, type, B_TRUE); - if (error != 0) - return (error); - - } else if (error != 0) { - return (error); - } - - return (0); -} - -/* - * Load an existing storage pool, using the config provided. This config - * describes which vdevs are part of the pool and is later validated against - * partial configs present in each vdev's label and an entire copy of the - * config stored in the MOS. - */ -static int -spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) -{ - int error = 0; - boolean_t missing_feat_write = B_FALSE; - boolean_t checkpoint_rewind = - (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); - boolean_t update_config_cache = B_FALSE; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); - - spa_load_note(spa, "LOADING"); - - error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); - if (error != 0) - return (error); - - /* - * If we are rewinding to the checkpoint then we need to repeat - * everything we've done so far in this function but this time - * selecting the checkpointed uberblock and using that to open - * the MOS. 
- */ - if (checkpoint_rewind) { - /* - * If we are rewinding to the checkpoint update config cache - * anyway. - */ - update_config_cache = B_TRUE; - - /* - * Extract the checkpointed uberblock from the current MOS - * and use this as the pool's uberblock from now on. If the - * pool is imported as writeable we also write the checkpoint - * uberblock to the labels, making the rewind permanent. - */ - error = spa_ld_checkpoint_rewind(spa); - if (error != 0) - return (error); - - /* - * Redo the loading process process again with the - * checkpointed uberblock. - */ - spa_ld_prepare_for_reload(spa); - spa_load_note(spa, "LOADING checkpointed uberblock"); - error = spa_ld_mos_with_trusted_config(spa, type, NULL); - if (error != 0) - return (error); - } - - /* - * Retrieve the checkpoint txg if the pool has a checkpoint. - */ - error = spa_ld_read_checkpoint_txg(spa); - if (error != 0) - return (error); - - /* - * Retrieve the mapping of indirect vdevs. Those vdevs were removed - * from the pool and their contents were re-mapped to other vdevs. Note - * that everything that we read before this step must have been - * rewritten on concrete vdevs after the last device removal was - * initiated. Otherwise we could be reading from indirect vdevs before - * we have loaded their mappings. - */ - error = spa_ld_open_indirect_vdev_metadata(spa); - if (error != 0) - return (error); - - /* - * Retrieve the full list of active features from the MOS and check if - * they are all supported. - */ - error = spa_ld_check_features(spa, &missing_feat_write); - if (error != 0) - return (error); - - /* - * Load several special directories from the MOS needed by the dsl_pool - * layer. - */ - error = spa_ld_load_special_directories(spa); - if (error != 0) - return (error); - - /* - * Retrieve pool properties from the MOS. 
- */ - error = spa_ld_get_props(spa); - if (error != 0) - return (error); - - /* - * Retrieve the list of auxiliary devices - cache devices and spares - - * and open them. - */ - error = spa_ld_open_aux_vdevs(spa, type); - if (error != 0) - return (error); - - /* - * Load the metadata for all vdevs. Also check if unopenable devices - * should be autoreplaced. - */ - error = spa_ld_load_vdev_metadata(spa); - if (error != 0) - return (error); - - error = spa_ld_load_dedup_tables(spa); - if (error != 0) - return (error); - - /* - * Verify the logs now to make sure we don't have any unexpected errors - * when we claim log blocks later. - */ - error = spa_ld_verify_logs(spa, type, ereport); - if (error != 0) - return (error); - - if (missing_feat_write) { - ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); - - /* - * At this point, we know that we can open the pool in - * read-only mode but not read-write mode. We now have enough - * information and can return to userland. - */ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); - } - - /* - * Traverse the last txgs to make sure the pool was left off in a safe - * state. When performing an extreme rewind, we verify the whole pool, - * which can take a very long time. - */ - error = spa_ld_verify_pool_data(spa); - if (error != 0) - return (error); - - /* - * Calculate the deflated space for the pool. This must be done before - * we write anything to the pool because we'd need to update the space - * accounting using the deflated sizes. - */ - spa_update_dspace(spa); - - /* - * We have now retrieved all the information we needed to open the - * pool. If we are importing the pool in read-write mode, a few - * additional steps must be performed to finish the import. 
- */ - if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || - spa->spa_load_max_txg == UINT64_MAX)) { - uint64_t config_cache_txg = spa->spa_config_txg; - - ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); - - /* - * In case of a checkpoint rewind, log the original txg - * of the checkpointed uberblock. - */ - if (checkpoint_rewind) { - spa_history_log_internal(spa, "checkpoint rewind", - NULL, "rewound state to txg=%llu", - (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); - } - - /* - * Traverse the ZIL and claim all blocks. - */ - spa_ld_claim_log_blocks(spa); - - /* - * Kick-off the syncing thread. - */ - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); - mmp_thread_start(spa); - - /* - * Wait for all claims to sync. We sync up to the highest - * claimed log block birth time so that claimed log blocks - * don't appear to be from the future. spa_claim_max_txg - * will have been set for us by ZIL traversal operations - * performed above. - */ - txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); - - /* - * Check if we need to request an update of the config. On the - * next sync, we would update the config stored in vdev labels - * and the cachefile (by default /etc/zfs/zpool.cache). - */ - spa_ld_check_for_config_update(spa, config_cache_txg, - update_config_cache); - - /* - * Check all DTLs to see if anything needs resilvering. - */ - if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - spa_async_request(spa, SPA_ASYNC_RESILVER); - - /* - * Log the fact that we booted up (so that we can detect if - * we rebooted in the middle of an operation). - */ - spa_history_log_version(spa, "open"); - - spa_restart_removal(spa); - spa_spawn_aux_threads(spa); - - /* - * Delete any inconsistent datasets. 
- * - * Note: - * Since we may be issuing deletes for clones here, - * we make sure to do so after we've spawned all the - * auxiliary threads above (from which the livelist - * deletion zthr is part of). - */ - (void) dmu_objset_find(spa_name(spa), - dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); - - /* - * Clean up any stale temporary dataset userrefs. - */ - dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_initialize_restart(spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - - spa_load_note(spa, "LOADED"); - - return (0); -} - -static int -spa_load_retry(spa_t *spa, spa_load_state_t state) -{ - int mode = spa->spa_mode; - - spa_unload(spa); - spa_deactivate(spa); - - spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; - - spa_activate(spa, mode); - spa_async_suspend(spa); - - spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", - (u_longlong_t)spa->spa_load_max_txg); - - return (spa_load(spa, state, SPA_IMPORT_EXISTING)); -} - -/* - * If spa_load() fails this function will try loading prior txg's. If - * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool - * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this - * function will not rewind the pool and will return the same error as - * spa_load(). 
- */ -static int -spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, - int rewind_flags) -{ - nvlist_t *loadinfo = NULL; - nvlist_t *config = NULL; - int load_error, rewind_error; - uint64_t safe_rewind_txg; - uint64_t min_txg; - - if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { - spa->spa_load_max_txg = spa->spa_load_txg; - spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - spa->spa_load_max_txg = max_request; - if (max_request != UINT64_MAX) - spa->spa_extreme_rewind = B_TRUE; - } - - load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); - if (load_error == 0) - return (0); - if (load_error == ZFS_ERR_NO_CHECKPOINT) { - /* - * When attempting checkpoint-rewind on a pool with no - * checkpoint, we should not attempt to load uberblocks - * from previous txgs when spa_load fails. - */ - ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); - return (load_error); - } - - if (spa->spa_root_vdev != NULL) - config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - - spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; - spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; - - if (rewind_flags & ZPOOL_NEVER_REWIND) { - nvlist_free(config); - return (load_error); - } - - if (state == SPA_LOAD_RECOVER) { - /* Price of rolling back is discarding txgs, including log */ - spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - /* - * If we aren't rolling back save the load info from our first - * import attempt so that we can restore it after attempting - * to rewind. - */ - loadinfo = spa->spa_load_info; - spa->spa_load_info = fnvlist_alloc(); - } - - spa->spa_load_max_txg = spa->spa_last_ubsync_txg; - safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; - min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
- TXG_INITIAL : safe_rewind_txg; - - /* - * Continue as long as we're finding errors, we're still within - * the acceptable rewind range, and we're still finding uberblocks - */ - while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && - spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { - if (spa->spa_load_max_txg < safe_rewind_txg) - spa->spa_extreme_rewind = B_TRUE; - rewind_error = spa_load_retry(spa, state); - } - - spa->spa_extreme_rewind = B_FALSE; - spa->spa_load_max_txg = UINT64_MAX; - - if (config && (rewind_error || state != SPA_LOAD_RECOVER)) - spa_config_set(spa, config); - else - nvlist_free(config); - - if (state == SPA_LOAD_RECOVER) { - ASSERT3P(loadinfo, ==, NULL); - return (rewind_error); - } else { - /* Store the rewind info as part of the initial load info */ - fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, - spa->spa_load_info); - - /* Restore the initial load info */ - fnvlist_free(spa->spa_load_info); - spa->spa_load_info = loadinfo; - - return (load_error); - } -} - -/* - * Pool Open/Import - * - * The import case is identical to an open except that the configuration is sent - * down from userland, instead of grabbed from the configuration cache. For the - * case of an open, the pool configuration will exist in the - * POOL_STATE_UNINITIALIZED state. - * - * The stats information (gen/count/ustats) is used to gather vdev statistics at - * the same time open the pool, without having to keep around the spa_t in some - * ambiguous state. - */ -static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - nvlist_t **config) -{ - spa_t *spa; - spa_load_state_t state = SPA_LOAD_OPEN; - int error; - int locked = B_FALSE; - int firstopen = B_FALSE; - - *spapp = NULL; - - /* - * As disgusting as this is, we need to support recursive calls to this - * function because dsl_dir_open() is called during spa_load(), and ends - * up calling spa_open() again. 
The real fix is to figure out how to - * avoid dsl_dir_open() calling this in the first place. - */ - if (mutex_owner(&spa_namespace_lock) != curthread) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; - } - - if ((spa = spa_lookup(pool)) == NULL) { - if (locked) - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENOENT)); - } - - if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - zpool_load_policy_t policy; - - firstopen = B_TRUE; - - zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, - &policy); - if (policy.zlp_rewind & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - - spa_activate(spa, spa_mode_global); - - if (state != SPA_LOAD_RECOVER) - spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; - spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; - - zfs_dbgmsg("spa_open_common: opening %s", pool); - error = spa_load_best(spa, state, policy.zlp_txg, - policy.zlp_rewind); - - if (error == EBADF) { - /* - * If vdev_validate() returns failure (indicated by - * EBADF), it indicates that one of the vdevs indicates - * that the pool has been exported or destroyed. If - * this is the case, the config cache is out of sync and - * we should remove the pool from the namespace. - */ - spa_unload(spa); - spa_deactivate(spa); - spa_write_cachefile(spa, B_TRUE, B_TRUE); - spa_remove(spa); - if (locked) - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENOENT)); - } - - if (error) { - /* - * We can't open the pool, but we still have useful - * information: the state of each vdev after the - * attempted vdev_open(). Return this to the user. 
- */ - if (config != NULL && spa->spa_config) { - VERIFY(nvlist_dup(spa->spa_config, config, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist(*config, - ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - } - spa_unload(spa); - spa_deactivate(spa); - spa->spa_last_open_failed = error; - if (locked) - mutex_exit(&spa_namespace_lock); - *spapp = NULL; - return (error); - } - } - - spa_open_ref(spa, tag); - - if (config != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - - /* - * If we've recovered the pool, pass back any information we - * gathered while doing the load. - */ - if (state == SPA_LOAD_RECOVER) { - VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - } - - if (locked) { - spa->spa_last_open_failed = 0; - spa->spa_last_ubsync_txg = 0; - spa->spa_load_txg = 0; - mutex_exit(&spa_namespace_lock); -#ifdef __FreeBSD__ -#ifdef _KERNEL - if (firstopen) - zvol_create_minors(spa, spa->spa_name); -#endif -#endif - } - - *spapp = spa; - - return (0); -} - -int -spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, - nvlist_t **config) -{ - return (spa_open_common(name, spapp, tag, policy, config)); -} - -int -spa_open(const char *name, spa_t **spapp, void *tag) -{ - return (spa_open_common(name, spapp, tag, NULL, NULL)); -} - -/* - * Lookup the given spa_t, incrementing the inject count in the process, - * preventing it from being exported or destroyed. - */ -spa_t * -spa_inject_addref(char *name) -{ - spa_t *spa; - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(name)) == NULL) { - mutex_exit(&spa_namespace_lock); - return (NULL); - } - spa->spa_inject_ref++; - mutex_exit(&spa_namespace_lock); - - return (spa); -} - -void -spa_inject_delref(spa_t *spa) -{ - mutex_enter(&spa_namespace_lock); - spa->spa_inject_ref--; - mutex_exit(&spa_namespace_lock); -} - -/* - * Add spares device information to the nvlist. 
- */ -static void -spa_add_spares(spa_t *spa, nvlist_t *config) -{ - nvlist_t **spares; - uint_t i, nspares; - nvlist_t *nvroot; - uint64_t guid; - vdev_stat_t *vs; - uint_t vsc; - uint64_t pool; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - if (spa->spa_spares.sav_count == 0) - return; - - VERIFY(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - if (nspares != 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - - /* - * Go through and find any spares which have since been - * repurposed as an active spare. If this is the case, update - * their status appropriately. - */ - for (i = 0; i < nspares; i++) { - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &guid) == 0); - if (spa_spare_exists(guid, &pool, NULL) && - pool != 0ULL) { - VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0); - vs->vs_state = VDEV_STATE_CANT_OPEN; - vs->vs_aux = VDEV_AUX_SPARED; - } - } - } -} - -/* - * Add l2cache device information to the nvlist, including vdev stats. 
- */ -static void -spa_add_l2cache(spa_t *spa, nvlist_t *config) -{ - nvlist_t **l2cache; - uint_t i, j, nl2cache; - nvlist_t *nvroot; - uint64_t guid; - vdev_t *vd; - vdev_stat_t *vs; - uint_t vsc; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - if (spa->spa_l2cache.sav_count == 0) - return; - - VERIFY(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - if (nl2cache != 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - - /* - * Update level 2 cache device stats. - */ - - for (i = 0; i < nl2cache; i++) { - VERIFY(nvlist_lookup_uint64(l2cache[i], - ZPOOL_CONFIG_GUID, &guid) == 0); - - vd = NULL; - for (j = 0; j < spa->spa_l2cache.sav_count; j++) { - if (guid == - spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { - vd = spa->spa_l2cache.sav_vdevs[j]; - break; - } - } - ASSERT(vd != NULL); - - VERIFY(nvlist_lookup_uint64_array(l2cache[i], - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) - == 0); - vdev_get_stats(vd, vs); - } - } -} - -static void -spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) -{ - zap_cursor_t zc; - zap_attribute_t za; - - /* We may be unable to read features if pool is suspended. 
*/ - if (spa_suspended(spa)) - return; - - if (spa->spa_feat_for_read_obj != 0) { - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_feat_for_read_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == sizeof (uint64_t) && - za.za_num_integers == 1); - VERIFY0(nvlist_add_uint64(features, za.za_name, - za.za_first_integer)); - } - zap_cursor_fini(&zc); - } - - if (spa->spa_feat_for_write_obj != 0) { - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_feat_for_write_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == sizeof (uint64_t) && - za.za_num_integers == 1); - VERIFY0(nvlist_add_uint64(features, za.za_name, - za.za_first_integer)); - } - zap_cursor_fini(&zc); - } -} - -static void -spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) -{ - int i; - - for (i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t feature = spa_feature_table[i]; - uint64_t refcount; - - if (feature_get_refcount(spa, &feature, &refcount) != 0) - continue; - - VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); - } -} - -/* - * Store a list of pool features and their reference counts in the - * config. - * - * The first time this is called on a spa, allocate a new nvlist, fetch - * the pool features and reference counts from disk, then save the list - * in the spa. In subsequent calls on the same spa use the saved nvlist - * and refresh its values from the cached reference counts. This - * ensures we don't block here on I/O on a suspended pool so 'zpool - * clear' can resume the pool. 
- */ -static void -spa_add_feature_stats(spa_t *spa, nvlist_t *config) -{ - nvlist_t *features; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - mutex_enter(&spa->spa_feat_stats_lock); - features = spa->spa_feat_stats; - - if (features != NULL) { - spa_feature_stats_from_cache(spa, features); - } else { - VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); - spa->spa_feat_stats = features; - spa_feature_stats_from_disk(spa, features); - } - - VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, - features)); - - mutex_exit(&spa->spa_feat_stats_lock); -} - -int -spa_get_stats(const char *name, nvlist_t **config, - char *altroot, size_t buflen) -{ - int error; - spa_t *spa; - - *config = NULL; - error = spa_open_common(name, &spa, FTAG, NULL, config); - - if (spa != NULL) { - /* - * This still leaves a window of inconsistency where the spares - * or l2cache devices could change and the config would be - * self-inconsistent. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - if (*config != NULL) { - uint64_t loadtimes[2]; - - loadtimes[0] = spa->spa_loaded_ts.tv_sec; - loadtimes[1] = spa->spa_loaded_ts.tv_nsec; - VERIFY(nvlist_add_uint64_array(*config, - ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); - - VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); - - if (spa_suspended(spa)) { - VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED, - spa->spa_failmode) == 0); - VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED_REASON, - spa->spa_suspended) == 0); - } - - spa_add_spares(spa, *config); - spa_add_l2cache(spa, *config); - spa_add_feature_stats(spa, *config); - } - } - - /* - * We want to get the alternate root even for faulted pools, so we cheat - * and call spa_lookup() directly. 
- */ - if (altroot) { - if (spa == NULL) { - mutex_enter(&spa_namespace_lock); - spa = spa_lookup(name); - if (spa) - spa_altroot(spa, altroot, buflen); - else - altroot[0] = '\0'; - spa = NULL; - mutex_exit(&spa_namespace_lock); - } else { - spa_altroot(spa, altroot, buflen); - } - } - - if (spa != NULL) { - spa_config_exit(spa, SCL_CONFIG, FTAG); - spa_close(spa, FTAG); - } - - return (error); -} - -/* - * Validate that the auxiliary device array is well formed. We must have an - * array of nvlists, each which describes a valid leaf vdev. If this is an - * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be - * specified, as long as they are well-formed. - */ -static int -spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - spa_aux_vdev_t *sav, const char *config, uint64_t version, - vdev_labeltype_t label) -{ - nvlist_t **dev; - uint_t i, ndev; - vdev_t *vd; - int error; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - /* - * It's acceptable to have no devs specified. - */ - if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) - return (0); - - if (ndev == 0) - return (SET_ERROR(EINVAL)); - - /* - * Make sure the pool is formatted with a version that supports this - * device type. - */ - if (spa_version(spa) < version) - return (SET_ERROR(ENOTSUP)); - - /* - * Set the pending device list so we correctly handle device in-use - * checking. 
- */ - sav->sav_pending = dev; - sav->sav_npending = ndev; - - for (i = 0; i < ndev; i++) { - if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, - mode)) != 0) - goto out; - - if (!vd->vdev_ops->vdev_op_leaf) { - vdev_free(vd); - error = SET_ERROR(EINVAL); - goto out; - } - - vd->vdev_top = vd; - - if ((error = vdev_open(vd)) == 0 && - (error = vdev_label_init(vd, crtxg, label)) == 0) { - VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } - - vdev_free(vd); - - if (error && - (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) - goto out; - else - error = 0; - } - -out: - sav->sav_pending = NULL; - sav->sav_npending = 0; - return (error); -} - -static int -spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) -{ - int error; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, - &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, - VDEV_LABEL_SPARE)) != 0) { - return (error); - } - - return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, - &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, - VDEV_LABEL_L2CACHE)); -} - -static void -spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, - const char *config) -{ - int i; - - if (sav->sav_config != NULL) { - nvlist_t **olddevs; - uint_t oldndevs; - nvlist_t **newdevs; - - /* - * Generate new dev list by concatentating with the - * current dev list. 
- */ - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, - &olddevs, &oldndevs) == 0); - - newdevs = kmem_alloc(sizeof (void *) * - (ndevs + oldndevs), KM_SLEEP); - for (i = 0; i < oldndevs; i++) - VERIFY(nvlist_dup(olddevs[i], &newdevs[i], - KM_SLEEP) == 0); - for (i = 0; i < ndevs; i++) - VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], - KM_SLEEP) == 0); - - VERIFY(nvlist_remove(sav->sav_config, config, - DATA_TYPE_NVLIST_ARRAY) == 0); - - VERIFY(nvlist_add_nvlist_array(sav->sav_config, - config, newdevs, ndevs + oldndevs) == 0); - for (i = 0; i < oldndevs + ndevs; i++) - nvlist_free(newdevs[i]); - kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); - } else { - /* - * Generate a new dev list. - */ - VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, - devs, ndevs) == 0); - } -} - -/* - * Stop and drop level 2 ARC devices - */ -void -spa_l2cache_drop(spa_t *spa) -{ - vdev_t *vd; - int i; - spa_aux_vdev_t *sav = &spa->spa_l2cache; - - for (i = 0; i < sav->sav_count; i++) { - uint64_t pool; - - vd = sav->sav_vdevs[i]; - ASSERT(vd != NULL); - - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - } -} - -/* - * Pool Creation - */ -int -spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - nvlist_t *zplprops) -{ - spa_t *spa; - char *altroot = NULL; - vdev_t *rvd; - dsl_pool_t *dp; - dmu_tx_t *tx; - int error = 0; - uint64_t txg = TXG_INITIAL; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - uint64_t version, obj; - boolean_t has_features; - char *poolname; - nvlist_t *nvl; - - if (props == NULL || - nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) - poolname = (char *)pool; - - /* - * If this pool already exists, return failure. 
- */ - mutex_enter(&spa_namespace_lock); - if (spa_lookup(poolname) != NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EEXIST)); - } - - /* - * Allocate a new spa_t structure. - */ - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(poolname, nvl, altroot); - fnvlist_free(nvl); - spa_activate(spa, spa_mode_global); - - if (props && (error = spa_prop_validate(spa, props))) { - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - /* - * Temporary pool names should never be written to disk. - */ - if (poolname != pool) - spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; - - has_features = B_FALSE; - for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); - elem != NULL; elem = nvlist_next_nvpair(props, elem)) { - if (zpool_prop_feature(nvpair_name(elem))) - has_features = B_TRUE; - } - - if (has_features || nvlist_lookup_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { - version = SPA_VERSION; - } - ASSERT(SPA_VERSION_IS_SUPPORTED(version)); - - spa->spa_first_txg = txg; - spa->spa_uberblock.ub_txg = txg - 1; - spa->spa_uberblock.ub_version = version; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_load_state = SPA_LOAD_CREATE; - spa->spa_removing_phys.sr_state = DSS_NONE; - spa->spa_removing_phys.sr_removing_vdev = -1; - spa->spa_removing_phys.sr_prev_indirect_vdev = -1; - spa->spa_indirect_vdevs_loaded = B_TRUE; - - /* - * Create "The Godfather" zio to hold all async IOs - */ - spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), - KM_SLEEP); - for (int i = 0; i < max_ncpus; i++) { - spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - } - - /* - * Create the root vdev. 
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); - - ASSERT(error != 0 || rvd != NULL); - ASSERT(error != 0 || spa->spa_root_vdev == rvd); - - if (error == 0 && !zfs_allocatable_devs(nvroot)) - error = SET_ERROR(EINVAL); - - if (error == 0 && - (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { - /* - * instantiate the metaslab groups (this will dirty the vdevs) - * we can no longer error exit past this point - */ - for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - vdev_ashift_optimize(vd); - vdev_metaslab_set_size(vd); - vdev_expand(vd, txg); - } - } - - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - /* - * Get the list of spares, if specified. - */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_spares.sav_sync = B_TRUE; - } - - /* - * Get the list of level 2 cache devices, if specified. 
- */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_l2cache.sav_sync = B_TRUE; - } - - spa->spa_is_initializing = B_TRUE; - spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); - spa->spa_meta_objset = dp->dp_meta_objset; - spa->spa_is_initializing = B_FALSE; - - /* - * Create DDTs (dedup tables). - */ - ddt_create(spa); - - spa_update_dspace(spa); - - tx = dmu_tx_create_assigned(dp, txg); - - /* - * Create the pool config object. - */ - spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, - DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); - - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { - cmn_err(CE_PANIC, "failed to add pool config"); - } - - if (spa_version(spa) >= SPA_VERSION_FEATURES) - spa_feature_create_zap_objects(spa, tx); - - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, - sizeof (uint64_t), 1, &version, tx) != 0) { - cmn_err(CE_PANIC, "failed to add pool version"); - } - - /* Newly created pools with the right version are always deflated. */ - if (version >= SPA_VERSION_RAIDZ_DEFLATE) { - spa->spa_deflate = TRUE; - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { - cmn_err(CE_PANIC, "failed to add deflate"); - } - } - - /* - * Create the deferred-free bpobj. Turn off compression - * because sync-to-convergence takes longer if the blocksize - * keeps changing. 
- */ - obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, obj, - ZIO_COMPRESS_OFF, tx); - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, - sizeof (uint64_t), 1, &obj, tx) != 0) { - cmn_err(CE_PANIC, "failed to add bpobj"); - } - VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, - spa->spa_meta_objset, obj)); - - /* - * Create the pool's history object. - */ - if (version >= SPA_VERSION_ZPOOL_HISTORY) - spa_history_create_obj(spa, tx); - - /* - * Generate some random noise for salted checksums to operate on. - */ - (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, - sizeof (spa->spa_cksum_salt.zcs_bytes)); - - /* - * Set pool properties. - */ - spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); - spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); - spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); - - if (props != NULL) { - spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(props, tx); - } - - dmu_tx_commit(tx); - - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); - mmp_thread_start(spa); - - /* - * We explicitly wait for the first transaction to complete so that our - * bean counters are appropriately updated. - */ - txg_wait_synced(spa->spa_dsl_pool, txg); - - spa_spawn_aux_threads(spa); - - spa_write_cachefile(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); - - spa_history_log_version(spa, "create"); - - /* - * Don't count references from objsets that are already closed - * and are making their way through the eviction process. 
- */ - spa_evicting_os_wait(spa); - spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); - spa->spa_load_state = SPA_LOAD_NONE; - - mutex_exit(&spa_namespace_lock); - - return (0); -} - -#ifdef _KERNEL -#ifdef illumos -/* - * Get the root pool information from the root disk, then import the root pool - * during the system boot up time. - */ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - -static nvlist_t * -spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) -{ - nvlist_t *config; - nvlist_t *nvtop, *nvroot; - uint64_t pgid; - - if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) - return (NULL); - - /* - * Add this top-level vdev to the child array. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pgid) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); - - /* - * Put this pool's top-level vdevs into a root vdev. - */ - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &nvtop, 1) == 0); - - /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). - */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - nvlist_free(nvroot); - return (config); -} - -/* - * Walk the vdev tree and see if we can find a device with "better" - * configuration. A configuration is "better" if the label on that - * device has a more recent txg. 
- */ -static void -spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) -{ - for (int c = 0; c < vd->vdev_children; c++) - spa_alt_rootvdev(vd->vdev_child[c], avd, txg); - - if (vd->vdev_ops->vdev_op_leaf) { - nvlist_t *label; - uint64_t label_txg; - - if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, - &label) != 0) - return; - - VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, - &label_txg) == 0); - - /* - * Do we have a better boot device? - */ - if (label_txg > *txg) { - *txg = label_txg; - *avd = vd; - } - nvlist_free(label); - } -} - -/* - * Import a root pool. - * - * For x86. devpath_list will consist of devid and/or physpath name of - * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). - * The GRUB "findroot" command will return the vdev we should boot. - * - * For Sparc, devpath_list consists the physpath name of the booting device - * no matter the rootpool is a single device pool or a mirrored pool. - * e.g. - * "/pci@1f,0/ide@d/disk@0,0:a" - */ -int -spa_import_rootpool(char *devpath, char *devid) -{ - spa_t *spa; - vdev_t *rvd, *bvd, *avd = NULL; - nvlist_t *config, *nvtop; - uint64_t guid, txg; - char *pname; - int error; - - /* - * Read the label from the boot device and generate a configuration. 
- */ - config = spa_generate_rootconf(devpath, devid, &guid); -#if defined(_OBP) && defined(_KERNEL) - if (config == NULL) { - if (strstr(devpath, "/iscsi/ssd") != NULL) { - /* iscsi boot */ - get_iscsi_bootpath_phy(devpath); - config = spa_generate_rootconf(devpath, devid, &guid); - } - } -#endif - if (config == NULL) { - cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", - devpath); - return (SET_ERROR(EIO)); - } - - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &pname) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pname)) != NULL) { - /* - * Remove the existing root pool from the namespace so that we - * can replace it with the correct config we just read in. - */ - spa_remove(spa); - } - - spa = spa_add(pname, config, NULL); - spa->spa_is_root = B_TRUE; - spa->spa_import_flags = ZFS_IMPORT_VERBATIM; - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - - /* - * Build up a vdev tree based on the boot device's label config. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, - VDEV_ALLOC_ROOTPOOL); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", - pname); - return (error); - } - - /* - * Get the boot vdev. - */ - if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { - cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", - (u_longlong_t)guid); - error = SET_ERROR(ENOENT); - goto out; - } - - /* - * Determine if there is a better boot device. - */ - avd = bvd; - spa_alt_rootvdev(rvd, &avd, &txg); - if (avd != bvd) { - cmn_err(CE_NOTE, "The boot device is 'degraded'. 
Please " - "try booting from '%s'", avd->vdev_path); - error = SET_ERROR(EINVAL); - goto out; - } - - /* - * If the boot device is part of a spare vdev then ensure that - * we're booting off the active spare. - */ - if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && - !bvd->vdev_isspare) { - cmn_err(CE_NOTE, "The boot device is currently spared. Please " - "try booting from '%s'", - bvd->vdev_parent-> - vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); - error = SET_ERROR(EINVAL); - goto out; - } - - error = 0; -out: - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_free(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&spa_namespace_lock); - - nvlist_free(config); - return (error); -} - -#else /* !illumos */ - -extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, - uint64_t *count); - -static nvlist_t * -spa_generate_rootconf(const char *name) -{ - nvlist_t **configs, **tops; - nvlist_t *config; - nvlist_t *best_cfg, *nvtop, *nvroot; - uint64_t *holes; - uint64_t best_txg; - uint64_t nchildren; - uint64_t pgid; - uint64_t count; - uint64_t i; - uint_t nholes; - - if (vdev_geom_read_pool_label(name, &configs, &count) != 0) - return (NULL); - - ASSERT3U(count, !=, 0); - best_txg = 0; - for (i = 0; i < count; i++) { - uint64_t txg; - - VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, - &txg) == 0); - if (txg > best_txg) { - best_txg = txg; - best_cfg = configs[i]; - } - } - - nchildren = 1; - nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); - holes = NULL; - nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, - &holes, &nholes); - - tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); - for (i = 0; i < nchildren; i++) { - if (i >= count) - break; - if (configs[i] == NULL) - continue; - VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - nvlist_dup(nvtop, &tops[i], KM_SLEEP); - } - for (i = 0; holes != NULL && i < nholes; i++) { - if 
(i >= nchildren) - continue; - if (tops[holes[i]] != NULL) - continue; - nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); - VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, - VDEV_TYPE_HOLE) == 0); - VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, - holes[i]) == 0); - VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, - 0) == 0); - } - for (i = 0; i < nchildren; i++) { - if (tops[i] != NULL) - continue; - nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); - VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, - VDEV_TYPE_MISSING) == 0); - VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, - i) == 0); - VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, - 0) == 0); - } - - /* - * Create pool config based on the best vdev config. - */ - nvlist_dup(best_cfg, &config, KM_SLEEP); - - /* - * Put this pool's top-level vdevs into a root vdev. - */ - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pgid) == 0); - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - tops, nchildren) == 0); - - /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). - */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - - /* - * Drop vdev config elements that should not be present at pool level. 
- */ - nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); - nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); - - for (i = 0; i < count; i++) - nvlist_free(configs[i]); - kmem_free(configs, count * sizeof(void *)); - for (i = 0; i < nchildren; i++) - nvlist_free(tops[i]); - kmem_free(tops, nchildren * sizeof(void *)); - nvlist_free(nvroot); - return (config); -} - -int -spa_import_rootpool(const char *name, bool checkpointrewind) -{ - spa_t *spa; - vdev_t *rvd, *bvd, *avd = NULL; - nvlist_t *config, *nvtop; - uint64_t txg; - char *pname; - int error; - - /* - * Read the label from the boot device and generate a configuration. - */ - config = spa_generate_rootconf(name); - - mutex_enter(&spa_namespace_lock); - if (config != NULL) { - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &pname) == 0 && strcmp(name, pname) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) - == 0); - - if ((spa = spa_lookup(pname)) != NULL) { - /* - * The pool could already be imported, - * e.g., after reboot -r. - */ - if (spa->spa_state == POOL_STATE_ACTIVE) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - return (0); - } - - /* - * Remove the existing root pool from the namespace so - * that we can replace it with the correct config - * we just read in. - */ - spa_remove(spa); - } - spa = spa_add(pname, config, NULL); - - /* - * Set spa_ubsync.ub_version as it can be used in vdev_alloc() - * via spa_version(). 
- */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - } else if ((spa = spa_lookup(name)) == NULL) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", - name); - return (EIO); - } else { - VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); - } - spa->spa_is_root = B_TRUE; - spa->spa_import_flags = ZFS_IMPORT_VERBATIM; - if (checkpointrewind) { - spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT; - } - - /* - * Build up a vdev tree based on the boot device's label config. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, - VDEV_ALLOC_ROOTPOOL); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", - pname); - return (error); - } - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_free(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&spa_namespace_lock); - - nvlist_free(config); - return (0); -} - -#endif /* illumos */ -#endif /* _KERNEL */ - -/* - * Import a non-root pool into the system. - */ -int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) -{ - spa_t *spa; - char *altroot = NULL; - spa_load_state_t state = SPA_LOAD_IMPORT; - zpool_load_policy_t policy; - uint64_t mode = spa_mode_global; - uint64_t readonly = B_FALSE; - int error; - nvlist_t *nvroot; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - - /* - * If a pool with this name exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EEXIST)); - } - - /* - * Create and initialize the spa structure. 
- */ - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - (void) nvlist_lookup_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); - if (readonly) - mode = FREAD; - spa = spa_add(pool, config, altroot); - spa->spa_import_flags = flags; - - /* - * Verbatim import - Take a pool and insert it into the namespace - * as if it had been loaded at boot. - */ - if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); - - spa_write_cachefile(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); - zfs_dbgmsg("spa_import: verbatim import of %s", pool); - mutex_exit(&spa_namespace_lock); - return (0); - } - - spa_activate(spa, mode); - - /* - * Don't start async tasks until we know everything is healthy. - */ - spa_async_suspend(spa); - - zpool_get_load_policy(config, &policy); - if (policy.zlp_rewind & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - - spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; - - if (state != SPA_LOAD_RECOVER) { - spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; - zfs_dbgmsg("spa_import: importing %s", pool); - } else { - zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " - "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); - } - error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); - - /* - * Propagate anything learned while loading the pool and pass it - * back to caller (i.e. rewind info, missing devices, etc). - */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - /* - * Toss any existing sparelist, as it doesn't have any validity - * anymore, and conflicts with spa_has_spare(). 
- */ - if (spa->spa_spares.sav_config) { - nvlist_free(spa->spa_spares.sav_config); - spa->spa_spares.sav_config = NULL; - spa_load_spares(spa); - } - if (spa->spa_l2cache.sav_config) { - nvlist_free(spa->spa_l2cache.sav_config); - spa->spa_l2cache.sav_config = NULL; - spa_load_l2cache(spa); - } - - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_SPARE); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_L2CACHE); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); - - if (error != 0 || (props && spa_writeable(spa) && - (error = spa_prop_set(spa, props)))) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - spa_async_resume(spa); - - /* - * Override any spares and level 2 cache devices as specified by - * the user, as these may have correct device names/devids, etc. 
- */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - if (spa->spa_spares.sav_config) - VERIFY(nvlist_remove(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_spares.sav_sync = B_TRUE; - } - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - if (spa->spa_l2cache.sav_config) - VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_l2cache.sav_sync = B_TRUE; - } - - /* - * Check for any removed devices. - */ - if (spa->spa_autoreplace) { - spa_aux_check_removed(&spa->spa_spares); - spa_aux_check_removed(&spa->spa_l2cache); - } - - if (spa_writeable(spa)) { - /* - * Update the config cache to include the newly-imported pool. - */ - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - } - - /* - * It's possible that the pool was expanded while it was exported. - * We kick off an async task to handle this for us. 
- */ - spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); - - spa_history_log_version(spa, "import"); - - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); - - mutex_exit(&spa_namespace_lock); - -#ifdef __FreeBSD__ -#ifdef _KERNEL - zvol_create_minors(spa, pool); -#endif -#endif - return (0); -} - -nvlist_t * -spa_tryimport(nvlist_t *tryconfig) -{ - nvlist_t *config = NULL; - char *poolname, *cachefile; - spa_t *spa; - uint64_t state; - int error; - zpool_load_policy_t policy; - - if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) - return (NULL); - - if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) - return (NULL); - - /* - * Create and initialize the spa structure. - */ - mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); - spa_activate(spa, FREAD); - - /* - * Rewind pool if a max txg was provided. - */ - zpool_get_load_policy(spa->spa_config, &policy); - if (policy.zlp_txg != UINT64_MAX) { - spa->spa_load_max_txg = policy.zlp_txg; - spa->spa_extreme_rewind = B_TRUE; - zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", - poolname, (longlong_t)policy.zlp_txg); - } else { - zfs_dbgmsg("spa_tryimport: importing %s", poolname); - } - - if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) - == 0) { - zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); - spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; - } else { - spa->spa_config_source = SPA_CONFIG_SRC_SCAN; - } - - error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); - - /* - * If 'tryconfig' was at least parsable, return the current config. 
- */ - if (spa->spa_root_vdev != NULL) { - config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, - poolname) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - state) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, - spa->spa_uberblock.ub_timestamp) == 0); - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - - /* - * If the bootfs property exists on this pool then we - * copy it out so that external consumers can tell which - * pools are bootable. - */ - if ((!error || error == EEXIST) && spa->spa_bootfs) { - char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - /* - * We have to play games with the name since the - * pool was opened as TRYIMPORT_NAME. - */ - if (dsl_dsobj_to_dsname(spa_name(spa), - spa->spa_bootfs, tmpname) == 0) { - char *cp; - char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - cp = strchr(tmpname, '/'); - if (cp == NULL) { - (void) strlcpy(dsname, tmpname, - MAXPATHLEN); - } else { - (void) snprintf(dsname, MAXPATHLEN, - "%s/%s", poolname, ++cp); - } - VERIFY(nvlist_add_string(config, - ZPOOL_CONFIG_BOOTFS, dsname) == 0); - kmem_free(dsname, MAXPATHLEN); - } - kmem_free(tmpname, MAXPATHLEN); - } - - /* - * Add the list of hot spares and level 2 cache devices. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - spa_add_spares(spa, config); - spa_add_l2cache(spa, config); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - - return (config); -} - -/* - * Pool export/destroy - * - * The act of destroying or exporting a pool is very simple. We make sure there - * is no more pending I/O and any references to the pool are gone. Then, we - * update the pool state and sync all the labels to disk, removing the - * configuration from the cache afterwards. 
If the 'hardforce' flag is set, then - * we don't sync the labels or remove the configuration cache. - */ -static int -spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - boolean_t force, boolean_t hardforce) -{ - spa_t *spa; - - if (oldconfig) - *oldconfig = NULL; - - if (!(spa_mode_global & FWRITE)) - return (SET_ERROR(EROFS)); - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) == NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENOENT)); - } - - /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. - */ - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - spa_async_suspend(spa); - if (spa->spa_zvol_taskq) { -#ifdef _KERNEL - zvol_remove_minors(spa, spa_name(spa)); -#endif - taskq_wait(spa->spa_zvol_taskq); - } - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - - /* - * The pool will be in core if it's openable, - * in which case we can modify its state. - */ - if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { - - /* - * Objsets may be open only because they're dirty, so we - * have to force it to sync before checking spa_refcnt. - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - spa_evicting_os_wait(spa); - - /* - * A pool cannot be exported or destroyed if there are active - * references. If we are resetting a pool, allow references by - * fault injection handlers. - */ - if (!spa_refcount_zero(spa) || - (spa->spa_inject_ref != 0 && - new_state != POOL_STATE_UNINITIALIZED)) { - spa_async_resume(spa); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EBUSY)); - } - - /* - * A pool cannot be exported if it has an active shared spare. - * This is to prevent other pools stealing the active spare - * from an exported pool. At user's own will, such pool can - * be forcedly exported. 
- */ - if (!force && new_state == POOL_STATE_EXPORTED && - spa_has_active_shared_spare(spa)) { - spa_async_resume(spa); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EXDEV)); - } - - /* - * We're about to export or destroy this pool. Make sure - * we stop all initializtion activity here before we - * set the spa_final_txg. This will ensure that all - * dirty data resulting from the initialization is - * committed to disk before we unload the pool. - */ - if (spa->spa_root_vdev != NULL) { - vdev_initialize_stop_all(spa->spa_root_vdev, - VDEV_INITIALIZE_ACTIVE); - } - - /* - * We want this to be reflected on every label, - * so mark them all dirty. spa_unload() will do the - * final sync that pushes these changes out. - */ - if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa->spa_state = new_state; - spa->spa_final_txg = spa_last_synced_txg(spa) + - TXG_DEFER_SIZE + 1; - vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, SCL_ALL, FTAG); - } - } - - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); - - if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); - spa_deactivate(spa); - } - - if (oldconfig && spa->spa_config) - VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); - - if (new_state != POOL_STATE_UNINITIALIZED) { - if (!hardforce) - spa_write_cachefile(spa, B_TRUE, B_TRUE); - spa_remove(spa); - } - mutex_exit(&spa_namespace_lock); - - return (0); -} - -/* - * Destroy a storage pool. - */ -int -spa_destroy(char *pool) -{ - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, - B_FALSE, B_FALSE)); -} - -/* - * Export a storage pool. - */ -int -spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, - boolean_t hardforce) -{ - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, - force, hardforce)); -} - -/* - * Similar to spa_export(), this unloads the spa_t without actually removing it - * from the namespace in any way. 
- */ -int -spa_reset(char *pool) -{ - return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, - B_FALSE, B_FALSE)); -} - -/* - * ========================================================================== - * Device manipulation - * ========================================================================== - */ - -/* - * Add a device to a storage pool. - */ -int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) -{ - uint64_t txg, id; - int error; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd, *tvd; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0) - return (spa_vdev_exit(spa, NULL, txg, error)); - - spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, - &nspares) != 0) - nspares = 0; - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, - &nl2cache) != 0) - nl2cache = 0; - - if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - - if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) - return (spa_vdev_exit(spa, vd, txg, error)); - - /* - * We must validate the spares and l2cache devices after checking the - * children. Otherwise, vdev_inuse() will blindly overwrite the spare. - */ - if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) - return (spa_vdev_exit(spa, vd, txg, error)); - - /* - * If we are in the middle of a device removal, we can only add - * devices which match the existing devices in the pool. - * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. 
- */ - if (spa->spa_vdev_removal != NULL || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { - for (int c = 0; c < vd->vdev_children; c++) { - tvd = vd->vdev_child[c]; - if (spa->spa_vdev_removal != NULL && - tvd->vdev_ashift != spa->spa_max_ashift) { - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } - /* - * Need the top level mirror to be - * a mirror of leaf vdevs only - */ - if (tvd->vdev_ops == &vdev_mirror_ops) { - for (uint64_t cid = 0; - cid < tvd->vdev_children; cid++) { - vdev_t *cvd = tvd->vdev_child[cid]; - if (!cvd->vdev_ops->vdev_op_leaf) { - return (spa_vdev_exit(spa, vd, - txg, EINVAL)); - } - } - } - } - } - - for (int c = 0; c < vd->vdev_children; c++) { - - /* - * Set the vdev id to the first hole, if one exists. - */ - for (id = 0; id < rvd->vdev_children; id++) { - if (rvd->vdev_child[id]->vdev_ishole) { - vdev_free(rvd->vdev_child[id]); - break; - } - } - tvd = vd->vdev_child[c]; - vdev_remove_child(vd, tvd); - tvd->vdev_id = id; - vdev_add_child(rvd, tvd); - vdev_config_dirty(tvd); - } - - if (nspares != 0) { - spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, - ZPOOL_CONFIG_SPARES); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - } - - if (nl2cache != 0) { - spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, - ZPOOL_CONFIG_L2CACHE); - spa_load_l2cache(spa); - spa->spa_l2cache.sav_sync = B_TRUE; - } - - /* - * We have to be careful when adding new vdevs to an existing pool. - * If other threads start allocating from these vdevs before we - * sync the config cache, and we lose power, then upon reboot we may - * fail to open the pool because there are DVAs that the config cache - * can't translate. Therefore, we first add the vdevs without - * initializing metaslabs; sync the config cache (via spa_vdev_exit()); - * and then let spa_config_update() initialize the new metaslabs. 
- * - * spa_load() checks for added-but-not-initialized vdevs, so that - * if we lose power at any point in this sequence, the remaining - * steps will be completed the next time we load the pool. - */ - (void) spa_vdev_exit(spa, vd, txg, 0); - - mutex_enter(&spa_namespace_lock); - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); - mutex_exit(&spa_namespace_lock); - - return (0); -} - -/* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. - * - * If 'replacing' is specified, the new device is intended to replace the - * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally identical to - * the mirror vdev (it actually reuses all the same ops) but has a few - * extra rules: you can't attach to it after it's been created, and upon - * completion of resilvering, the first disk (the one being replaced) - * is automatically detached. - */ -int -spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) -{ - uint64_t txg, dtl_max_txg; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; - vdev_ops_t *pvops; - char *oldvdpath, *newvdpath; - int newvd_isspare; - int error; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? 
- ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - if (oldvd == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!oldvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - pvd = oldvd->vdev_parent; - - if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ATTACH)) != 0) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - if (newrootvd->vdev_children != 1) - return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); - - newvd = newrootvd->vdev_child[0]; - - if (!newvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); - - if ((error = vdev_create(newrootvd, txg, replacing)) != 0) - return (spa_vdev_exit(spa, newrootvd, txg, error)); - - /* - * Spares can't replace logs - */ - if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - if (!replacing) { - /* - * For attach, the only allowable parent is a mirror or the root - * vdev. - */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - pvops = &vdev_mirror_ops; - } else { - /* - * Active hot spares can only be replaced by inactive hot - * spares. - */ - if (pvd->vdev_ops == &vdev_spare_ops && - oldvd->vdev_isspare && - !spa_has_spare(spa, newvd->vdev_guid)) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - /* - * If the source is a hot spare, and the parent isn't already a - * spare, then we want to create a new hot spare. Otherwise, we - * want to create a replacing vdev. The user is not allowed to - * attach to a spared vdev child unless the 'isspare' state is - * the same (spare replaces spare, non-spare replaces - * non-spare). 
- */ - if (pvd->vdev_ops == &vdev_replacing_ops && - spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - } else if (pvd->vdev_ops == &vdev_spare_ops && - newvd->vdev_isspare != oldvd->vdev_isspare) { - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - } - - if (newvd->vdev_isspare) - pvops = &vdev_spare_ops; - else - pvops = &vdev_replacing_ops; - } - - /* - * Make sure the new device is big enough. - */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) - return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); - - /* - * The new device cannot have a higher alignment requirement - * than the top-level vdev. - */ - if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); - - /* - * If this is an in-place replacement, update oldvd's path and devid - * to make it distinguishable from newvd, and unopenable from now on. - */ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { - spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, - KM_SLEEP); - (void) sprintf(oldvd->vdev_path, "%s/%s", - newvd->vdev_path, "old"); - if (oldvd->vdev_devid != NULL) { - spa_strfree(oldvd->vdev_devid); - oldvd->vdev_devid = NULL; - } - } - - /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; - - /* - * If the parent is not a mirror, or if we're replacing, insert the new - * mirror/replacing/spare vdev above oldvd. - */ - if (pvd->vdev_ops != pvops) - pvd = vdev_add_parent(oldvd, pvops); - - ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); - - /* - * Extract the new device from its root and add it to pvd. 
- */ - vdev_remove_child(newrootvd, newvd); - newvd->vdev_id = pvd->vdev_children; - newvd->vdev_crtxg = oldvd->vdev_crtxg; - vdev_add_child(pvd, newvd); - - tvd = newvd->vdev_top; - ASSERT(pvd->vdev_top == tvd); - ASSERT(tvd->vdev_parent == rvd); - - vdev_config_dirty(tvd); - - /* - * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account - * for any dmu_sync-ed blocks. It will propagate upward when - * spa_vdev_exit() calls vdev_dtl_reassess(). - */ - dtl_max_txg = txg + TXG_CONCURRENT_STATES; - - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); - - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } - - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; - - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); - - /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); - - if (spa->spa_bootfs) - spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); - - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); - - /* - * Commit the config - */ - (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); - - spa_history_log_internal(spa, "vdev attach", NULL, - "%s vdev=%s %s vdev=%s", - replacing && newvd_isspare ? "spare in" : - replacing ? "replace" : "attach", newvdpath, - replacing ? "for" : "to", oldvdpath); - - spa_strfree(oldvdpath); - spa_strfree(newvdpath); - - return (0); -} - -/* - * Detach a device from a mirror or replacing vdev. - * - * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. 
- */ -int -spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) -{ - uint64_t txg; - int error; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd, *pvd, *cvd, *tvd; - boolean_t unspare = B_FALSE; - uint64_t unspare_guid = 0; - char *vdpath; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - vd = spa_lookup_by_guid(spa, guid, B_FALSE); - - /* - * Besides being called directly from the userland through the - * ioctl interface, spa_vdev_detach() can be potentially called - * at the end of spa_vdev_resilver_done(). - * - * In the regular case, when we have a checkpoint this shouldn't - * happen as we never empty the DTLs of a vdev during the scrub - * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() - * should never get here when we have a checkpoint. - * - * That said, even in a case when we checkpoint the pool exactly - * as spa_vdev_resilver_done() calls this function everything - * should be fine as the resilver will return right away. - */ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - if (vd == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - pvd = vd->vdev_parent; - - /* - * If the parent/child relationship is not as expected, don't do it. - * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing - * vdev that's replacing B with C. The user's intent in replacing - * is to go from M(A,B) to M(A,C). If the user decides to cancel - * the replace by detaching C, the expected behavior is to end up - * M(A,B). But suppose that right after deciding to detach C, - * the replacement of B completes. 
We would have M(A,C), and then - * ask to detach C, which would leave us with just A -- not what - * the user wanted. To prevent this, we make sure that the - * parent/child relationship hasn't changed -- in this example, - * that C's parent is still the replacing vdev R. - */ - if (pvd->vdev_guid != pguid && pguid != 0) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - /* - * Only 'replacing' or 'spare' vdevs can be replaced. - */ - if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && - pvd->vdev_ops != &vdev_spare_ops) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - ASSERT(pvd->vdev_ops != &vdev_spare_ops || - spa_version(spa) >= SPA_VERSION_SPARES); - - /* - * Only mirror, replacing, and spare vdevs support detach. - */ - if (pvd->vdev_ops != &vdev_replacing_ops && - pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_spare_ops) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - /* - * If this device has the only valid copy of some data, - * we cannot safely detach it. - */ - if (vdev_dtl_required(vd)) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - ASSERT(pvd->vdev_children >= 2); - - /* - * If we are detaching the second disk from a replacing vdev, then - * check to see if we changed the original vdev's path to have "/old" - * at the end in spa_vdev_attach(). If so, undo that change now. 
- */ - if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && - vd->vdev_path != NULL) { - size_t len = strlen(vd->vdev_path); - - for (int c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - - if (cvd == vd || cvd->vdev_path == NULL) - continue; - - if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && - strcmp(cvd->vdev_path + len, "/old") == 0) { - spa_strfree(cvd->vdev_path); - cvd->vdev_path = spa_strdup(vd->vdev_path); - break; - } - } - } - - /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. - */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; - - /* - * Erase the disk labels so the disk can be used for other things. - * This must be done after all other error cases are handled, - * but before we disembowel vd (so we can still do I/O to it). - * But if we can't do it, don't treat the error as fatal -- - * it may be that the unwritability of the disk is the reason - * it's being detached! - */ - error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - /* - * Remove vd from its parent and compact the parent's children. - */ - vdev_remove_child(pvd, vd); - vdev_compact_children(pvd); - - /* - * Remember one of the remaining children so we can get tvd below. - */ - cvd = pvd->vdev_child[pvd->vdev_children - 1]; - - /* - * If we need to remove the remaining child from the list of hot spares, - * do it now, marking the vdev as no longer a spare in the process. - * We must do this before vdev_remove_parent(), because that can - * change the GUID if it creates a new toplevel GUID. For a similar - * reason, we must remove the spare now, in the same txg as the detach; - * otherwise someone could attach a new sibling, change the GUID, and - * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 
- */ - if (unspare) { - ASSERT(cvd->vdev_isspare); - spa_spare_remove(cvd); - unspare_guid = cvd->vdev_guid; - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); - cvd->vdev_unspare = B_TRUE; - } - - /* - * If the parent mirror/replacing vdev only has one child, - * the parent is no longer needed. Remove it from the tree. - */ - if (pvd->vdev_children == 1) { - if (pvd->vdev_ops == &vdev_spare_ops) - cvd->vdev_unspare = B_FALSE; - vdev_remove_parent(cvd); - } - - - /* - * We don't set tvd until now because the parent we just removed - * may have been the previous top-level vdev. - */ - tvd = cvd->vdev_top; - ASSERT(tvd->vdev_parent == rvd); - - /* - * Reevaluate the parent vdev state. - */ - vdev_propagate_state(cvd); - - /* - * If the 'autoexpand' property is set on the pool then automatically - * try to expand the size of the pool. For example if the device we - * just detached was smaller than the others, it may be possible to - * add metaslabs (i.e. grow the pool). We need to reopen the vdev - * first so that we can obtain the updated sizes of the leaf vdevs. - */ - if (spa->spa_autoexpand) { - vdev_reopen(tvd); - vdev_expand(tvd, txg); - } - - vdev_config_dirty(tvd); - - /* - * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that - * vd->vdev_detached is set and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, to - * prevent vd from being accessed after it's freed. 
- */ - vdpath = spa_strdup(vd->vdev_path); - for (int t = 0; t < TXG_SIZE; t++) - (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - vd->vdev_detached = B_TRUE; - vdev_dirty(tvd, VDD_DTL, vd, txg); - - spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); - - /* hang on to the spa before we release the lock */ - spa_open_ref(spa, FTAG); - - error = spa_vdev_exit(spa, vd, txg, 0); - - spa_history_log_internal(spa, "detach", NULL, - "vdev=%s", vdpath); - spa_strfree(vdpath); - - /* - * If this was the removal of the original device in a hot spare vdev, - * then we want to go through and remove the device from the hot spare - * list of every other pool. - */ - if (unspare) { - spa_t *altspa = NULL; - - mutex_enter(&spa_namespace_lock); - while ((altspa = spa_next(altspa)) != NULL) { - if (altspa->spa_state != POOL_STATE_ACTIVE || - altspa == spa) - continue; - - spa_open_ref(altspa, FTAG); - mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); - spa_close(altspa, FTAG); - } - mutex_exit(&spa_namespace_lock); - - /* search the rest of the vdevs for spares to remove */ - spa_vdev_resilver_done(spa); - } - - /* all done with the spa; OK to release */ - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - mutex_exit(&spa_namespace_lock); - - return (error); -} - -int -spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) -{ - /* - * We hold the namespace lock through the whole function - * to prevent any changes to the pool while we're starting or - * stopping initialization. The config and state locks are held so that - * we can properly assess the vdev state before we commit to - * the initializing operation. - */ - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); - - /* Look up vdev and ensure it's a leaf. 
*/ - vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_detached) { - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENODEV)); - } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EINVAL)); - } else if (!vdev_writeable(vd)) { - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EROFS)); - } - mutex_enter(&vd->vdev_initialize_lock); - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - - /* - * When we activate an initialize action we check to see - * if the vdev_initialize_thread is NULL. We do this instead - * of using the vdev_initialize_state since there might be - * a previous initialization process which has completed but - * the thread is not exited. - */ - if (cmd_type == POOL_INITIALIZE_DO && - (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { - mutex_exit(&vd->vdev_initialize_lock); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EBUSY)); - } else if (cmd_type == POOL_INITIALIZE_CANCEL && - (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && - vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { - mutex_exit(&vd->vdev_initialize_lock); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ESRCH)); - } else if (cmd_type == POOL_INITIALIZE_SUSPEND && - vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { - mutex_exit(&vd->vdev_initialize_lock); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ESRCH)); - } - - switch (cmd_type) { - case POOL_INITIALIZE_DO: - vdev_initialize(vd); - break; - case POOL_INITIALIZE_CANCEL: - vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); - break; - case POOL_INITIALIZE_SUSPEND: - vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); - break; - default: - panic("invalid cmd_type %llu", (unsigned long 
long)cmd_type); - } - mutex_exit(&vd->vdev_initialize_lock); - - /* Sync out the initializing state */ - txg_wait_synced(spa->spa_dsl_pool, 0); - mutex_exit(&spa_namespace_lock); - - return (0); -} - - -/* - * Split a set of devices from their mirrors, and create a new pool from them. - */ -int -spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - nvlist_t *props, boolean_t exp) -{ - int error = 0; - uint64_t txg, *glist; - spa_t *newspa; - uint_t c, children, lastlog; - nvlist_t **child, *nvl, *tmp; - dmu_tx_t *tx; - char *altroot = NULL; - vdev_t *rvd, **vml = NULL; /* vdev modify list */ - boolean_t activate_slog; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - /* clear the log and flush everything up to now */ - activate_slog = spa_passivate_log(spa); - (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - error = spa_reset_logs(spa); - txg = spa_vdev_config_enter(spa); - - if (activate_slog) - spa_activate_log(spa); - - if (error != 0) - return (spa_vdev_exit(spa, NULL, txg, error)); - - /* check new spa name before going any further */ - if (spa_lookup(newname) != NULL) - return (spa_vdev_exit(spa, NULL, txg, EEXIST)); - - /* - * scan through all the children to ensure they're all mirrors - */ - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || - nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, - &children) != 0) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - /* first, check to ensure we've got the right child count */ - rvd = spa->spa_root_vdev; - lastlog = 0; - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - /* don't count the holes & logs as children */ - if (vd->vdev_islog || 
!vdev_is_concrete(vd)) { - if (lastlog == 0) - lastlog = c; - continue; - } - - lastlog = 0; - } - if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - /* next, ensure no spare or cache devices are part of the split */ - if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || - nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); - glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); - - /* then, loop over each vdev and validate it */ - for (c = 0; c < children; c++) { - uint64_t is_hole = 0; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, - &is_hole); - - if (is_hole != 0) { - if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || - spa->spa_root_vdev->vdev_child[c]->vdev_islog) { - continue; - } else { - error = SET_ERROR(EINVAL); - break; - } - } - - /* which disk is going to be split? 
*/ - if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, - &glist[c]) != 0) { - error = SET_ERROR(EINVAL); - break; - } - - /* look it up in the spa */ - vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); - if (vml[c] == NULL) { - error = SET_ERROR(ENODEV); - break; - } - - /* make sure there's nothing stopping the split */ - if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || - vml[c]->vdev_islog || - !vdev_is_concrete(vml[c]) || - vml[c]->vdev_isspare || - vml[c]->vdev_isl2cache || - !vdev_writeable(vml[c]) || - vml[c]->vdev_children != 0 || - vml[c]->vdev_state != VDEV_STATE_HEALTHY || - c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { - error = SET_ERROR(EINVAL); - break; - } - - if (vdev_dtl_required(vml[c])) { - error = SET_ERROR(EBUSY); - break; - } - - /* we need certain info from the top level */ - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, - vml[c]->vdev_top->vdev_ms_array) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, - vml[c]->vdev_top->vdev_ms_shift) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, - vml[c]->vdev_top->vdev_asize) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, - vml[c]->vdev_top->vdev_ashift) == 0); - - /* transfer per-vdev ZAPs */ - ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); - VERIFY0(nvlist_add_uint64(child[c], - ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); - - ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); - VERIFY0(nvlist_add_uint64(child[c], - ZPOOL_CONFIG_VDEV_TOP_ZAP, - vml[c]->vdev_parent->vdev_top_zap)); - } - - if (error != 0) { - kmem_free(vml, children * sizeof (vdev_t *)); - kmem_free(glist, children * sizeof (uint64_t)); - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - /* stop writers from using the disks */ - for (c = 0; c < children; c++) { - if (vml[c] != NULL) - vml[c]->vdev_offline = B_TRUE; - } - vdev_reopen(spa->spa_root_vdev); - - /* - * Temporarily record the splitting vdevs in the spa config. 
This - * will disappear once the config is regenerated. - */ - VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, - glist, children) == 0); - kmem_free(glist, children * sizeof (uint64_t)); - - mutex_enter(&spa->spa_props_lock); - VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, - nvl) == 0); - mutex_exit(&spa->spa_props_lock); - spa->spa_config_splitting = nvl; - vdev_config_dirty(spa->spa_root_vdev); - - /* configure and create the new pool */ - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, - spa->spa_config_txg) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, - spa_generate_guid(NULL)) == 0); - VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - - /* add the new pool to the namespace */ - newspa = spa_add(newname, config, altroot); - newspa->spa_avz_action = AVZ_ACTION_REBUILD; - newspa->spa_config_txg = spa->spa_config_txg; - spa_set_log_state(newspa, SPA_LOG_CLEAR); - - /* release the spa config lock, retaining the namespace lock */ - spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 1); - - spa_activate(newspa, spa_mode_global); - spa_async_suspend(newspa); - - for (c = 0; c < children; c++) { - if (vml[c] != NULL) { - /* - * Temporarily stop the initializing activity. We set - * the state to ACTIVE so that we know to resume - * the initializing once the split has completed. 
- */ - mutex_enter(&vml[c]->vdev_initialize_lock); - vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); - mutex_exit(&vml[c]->vdev_initialize_lock); - } - } - -#ifndef illumos - /* mark that we are creating new spa by splitting */ - newspa->spa_splitting_newspa = B_TRUE; -#endif - newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; - - /* create the new pool from the disks of the original pool */ - error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); -#ifndef illumos - newspa->spa_splitting_newspa = B_FALSE; -#endif - if (error) - goto out; - - /* if that worked, generate a real config for the new pool */ - if (newspa->spa_root_vdev != NULL) { - VERIFY(nvlist_alloc(&newspa->spa_config_splitting, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, - ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); - spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, - B_TRUE)); - } - - /* set the props */ - if (props != NULL) { - spa_configfile_set(newspa, props, B_FALSE); - error = spa_prop_set(newspa, props); - if (error) - goto out; - } - - /* flush everything */ - txg = spa_vdev_config_enter(newspa); - vdev_config_dirty(newspa->spa_root_vdev); - (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 2); - - spa_async_resume(newspa); - - /* finally, update the original pool's config */ - txg = spa_vdev_config_enter(spa); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) - dmu_tx_abort(tx); - for (c = 0; c < children; c++) { - if (vml[c] != NULL) { - vdev_split(vml[c]); - if (error == 0) - spa_history_log_internal(spa, "detach", tx, - "vdev=%s", vml[c]->vdev_path); - - vdev_free(vml[c]); - } - } - spa->spa_avz_action = AVZ_ACTION_REBUILD; - vdev_config_dirty(spa->spa_root_vdev); - spa->spa_config_splitting = NULL; - nvlist_free(nvl); - if (error == 0) - dmu_tx_commit(tx); - (void) 
spa_vdev_exit(spa, NULL, txg, 0); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 3); - - /* split is complete; log a history record */ - spa_history_log_internal(newspa, "split", NULL, - "from pool %s", spa_name(spa)); - - kmem_free(vml, children * sizeof (vdev_t *)); - - /* if we're not going to mount the filesystems in userland, export */ - if (exp) - error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, - B_FALSE, B_FALSE); - - return (error); - -out: - spa_unload(newspa); - spa_deactivate(newspa); - spa_remove(newspa); - - txg = spa_vdev_config_enter(spa); - - /* re-online all offlined disks */ - for (c = 0; c < children; c++) { - if (vml[c] != NULL) - vml[c]->vdev_offline = B_FALSE; - } - - /* restart initializing disks as necessary */ - spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); - - vdev_reopen(spa->spa_root_vdev); - - nvlist_free(spa->spa_config_splitting); - spa->spa_config_splitting = NULL; - (void) spa_vdev_exit(spa, NULL, txg, error); - - kmem_free(vml, children * sizeof (vdev_t *)); - return (error); -} - -/* - * Find any device that's done replacing, or a vdev marked 'unspare' that's - * currently spared, so we can detach it. - */ -static vdev_t * -spa_vdev_resilver_done_hunt(vdev_t *vd) -{ - vdev_t *newvd, *oldvd; - - for (int c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); - if (oldvd != NULL) - return (oldvd); - } - - /* - * Check for a completed replacement. We always consider the first - * vdev in the list to be the oldest vdev, and the last one to be - * the newest (see spa_vdev_attach() for how that works). In - * the case where the newest vdev is faulted, we will not automatically - * remove it after a resilver completes. This is OK as it will require - * user intervention to determine which disk the admin wishes to keep. 
- */ - if (vd->vdev_ops == &vdev_replacing_ops) { - ASSERT(vd->vdev_children > 1); - - newvd = vd->vdev_child[vd->vdev_children - 1]; - oldvd = vd->vdev_child[0]; - - if (vdev_dtl_empty(newvd, DTL_MISSING) && - vdev_dtl_empty(newvd, DTL_OUTAGE) && - !vdev_dtl_required(oldvd)) - return (oldvd); - } - - /* - * Check for a completed resilver with the 'unspare' flag set. - * Also potentially update faulted state. - */ - if (vd->vdev_ops == &vdev_spare_ops) { - vdev_t *first = vd->vdev_child[0]; - vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; - - if (last->vdev_unspare) { - oldvd = first; - newvd = last; - } else if (first->vdev_unspare) { - oldvd = last; - newvd = first; - } else { - oldvd = NULL; - } - - if (oldvd != NULL && - vdev_dtl_empty(newvd, DTL_MISSING) && - vdev_dtl_empty(newvd, DTL_OUTAGE) && - !vdev_dtl_required(oldvd)) - return (oldvd); - - vdev_propagate_state(vd); - - /* - * If there are more than two spares attached to a disk, - * and those spares are not required, then we want to - * attempt to free them up now so that they can be used - * by other pools. Once we're back down to a single - * disk+spare, we stop removing them. - */ - if (vd->vdev_children > 2) { - newvd = vd->vdev_child[1]; - - if (newvd->vdev_isspare && last->vdev_isspare && - vdev_dtl_empty(last, DTL_MISSING) && - vdev_dtl_empty(last, DTL_OUTAGE) && - !vdev_dtl_required(newvd)) - return (newvd); - } - } - - return (NULL); -} - -static void -spa_vdev_resilver_done(spa_t *spa) -{ - vdev_t *vd, *pvd, *ppvd; - uint64_t guid, sguid, pguid, ppguid; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { - pvd = vd->vdev_parent; - ppvd = pvd->vdev_parent; - guid = vd->vdev_guid; - pguid = pvd->vdev_guid; - ppguid = ppvd->vdev_guid; - sguid = 0; - /* - * If we have just finished replacing a hot spared device, then - * we need to detach the parent's first child (the original hot - * spare) as well. 
- */ - if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && - ppvd->vdev_children == 2) { - ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - sguid = ppvd->vdev_child[1]->vdev_guid; - } - ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); - - spa_config_exit(spa, SCL_ALL, FTAG); - if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) - return; - if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) - return; - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - } - - spa_config_exit(spa, SCL_ALL, FTAG); -} - -/* - * Update the stored path or FRU for this vdev. - */ -int -spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, - boolean_t ispath) -{ - vdev_t *vd; - boolean_t sync = B_FALSE; - - ASSERT(spa_writeable(spa)); - - spa_vdev_state_enter(spa, SCL_ALL); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENOENT)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - if (ispath) { - if (strcmp(value, vd->vdev_path) != 0) { - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(value); - sync = B_TRUE; - } - } else { - if (vd->vdev_fru == NULL) { - vd->vdev_fru = spa_strdup(value); - sync = B_TRUE; - } else if (strcmp(value, vd->vdev_fru) != 0) { - spa_strfree(vd->vdev_fru); - vd->vdev_fru = spa_strdup(value); - sync = B_TRUE; - } - } - - return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); -} - -int -spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) -{ - return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); -} - -int -spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) -{ - return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); -} - -/* - * ========================================================================== - * SPA Scanning - * ========================================================================== - */ -int -spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) -{ - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - - if (dsl_scan_resilvering(spa->spa_dsl_pool)) - return (SET_ERROR(EBUSY)); - - return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); -} - -int -spa_scan_stop(spa_t *spa) -{ - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - if (dsl_scan_resilvering(spa->spa_dsl_pool)) - return (SET_ERROR(EBUSY)); - return (dsl_scan_cancel(spa->spa_dsl_pool)); -} - -int -spa_scan(spa_t *spa, pool_scan_func_t func) -{ - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - - if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) - return (SET_ERROR(ENOTSUP)); - - /* - * If a resilver was requested, but there is no DTL on a - * writeable leaf device, we have nothing to do. 
- */ - if (func == POOL_SCAN_RESILVER && - !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { - spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); - return (0); - } - - return (dsl_scan(spa->spa_dsl_pool, func)); -} - -/* - * ========================================================================== - * SPA async task processing - * ========================================================================== - */ - -static void -spa_async_remove(spa_t *spa, vdev_t *vd) -{ - if (vd->vdev_remove_wanted) { - vd->vdev_remove_wanted = B_FALSE; - vd->vdev_delayed_close = B_FALSE; - vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - - /* - * We want to clear the stats, but we don't want to do a full - * vdev_clear() as that will cause us to throw away - * degraded/faulted state as well as attempt to reopen the - * device, all of which is a waste. - */ - vd->vdev_stat.vs_read_errors = 0; - vd->vdev_stat.vs_write_errors = 0; - vd->vdev_stat.vs_checksum_errors = 0; - - vdev_state_dirty(vd->vdev_top); - /* Tell userspace that the vdev is gone. 
*/ - zfs_post_remove(spa, vd); - } - - for (int c = 0; c < vd->vdev_children; c++) - spa_async_remove(spa, vd->vdev_child[c]); -} - -static void -spa_async_probe(spa_t *spa, vdev_t *vd) -{ - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ - } - - for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); -} - -static void -spa_async_autoexpand(spa_t *spa, vdev_t *vd) -{ - sysevent_id_t eid; - nvlist_t *attr; - char *physpath; - - if (!spa->spa_autoexpand) - return; - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - spa_async_autoexpand(spa, cvd); - } - - if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) - return; - - physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); - - VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); - - (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, - ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); - - nvlist_free(attr); - kmem_free(physpath, MAXPATHLEN); -} - -static void -spa_async_thread(void *arg) -{ - spa_t *spa = (spa_t *)arg; - int tasks; - - ASSERT(spa->spa_sync_on); - - mutex_enter(&spa->spa_async_lock); - tasks = spa->spa_async_tasks; - spa->spa_async_tasks &= SPA_ASYNC_REMOVE; - mutex_exit(&spa->spa_async_lock); - - /* - * See if the config needs to be updated. 
- */ - if (tasks & SPA_ASYNC_CONFIG_UPDATE) { - uint64_t old_space, new_space; - - mutex_enter(&spa_namespace_lock); - old_space = metaslab_class_get_space(spa_normal_class(spa)); - old_space += metaslab_class_get_space(spa_special_class(spa)); - old_space += metaslab_class_get_space(spa_dedup_class(spa)); - - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - - new_space = metaslab_class_get_space(spa_normal_class(spa)); - new_space += metaslab_class_get_space(spa_special_class(spa)); - new_space += metaslab_class_get_space(spa_dedup_class(spa)); - mutex_exit(&spa_namespace_lock); - - /* - * If the pool grew as a result of the config update, - * then log an internal history event. - */ - if (new_space != old_space) { - spa_history_log_internal(spa, "vdev online", NULL, - "pool '%s' size: %llu(+%llu)", - spa_name(spa), new_space, new_space - old_space); - } - } - - if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - spa_async_autoexpand(spa, spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - - /* - * See if any devices need to be probed. - */ - if (tasks & SPA_ASYNC_PROBE) { - spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); - (void) spa_vdev_state_exit(spa, NULL, 0); - } - - /* - * If any devices are done replacing, detach them. - */ - if (tasks & SPA_ASYNC_RESILVER_DONE) - spa_vdev_resilver_done(spa); - - /* - * Kick off a resilver. - */ - if (tasks & SPA_ASYNC_RESILVER) - dsl_resilver_restart(spa->spa_dsl_pool, 0); - - if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_initialize_restart(spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_exit(&spa_namespace_lock); - } - - /* - * Let the world know that we're done. 
- */ - mutex_enter(&spa->spa_async_lock); - spa->spa_async_thread = NULL; - cv_broadcast(&spa->spa_async_cv); - mutex_exit(&spa->spa_async_lock); - thread_exit(); -} - -static void -spa_async_thread_vd(void *arg) -{ - spa_t *spa = arg; - int tasks; - - mutex_enter(&spa->spa_async_lock); - tasks = spa->spa_async_tasks; -retry: - spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; - mutex_exit(&spa->spa_async_lock); - - /* - * See if any devices need to be marked REMOVED. - */ - if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa, SCL_NONE); - spa_async_remove(spa, spa->spa_root_vdev); - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) - spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); - for (int i = 0; i < spa->spa_spares.sav_count; i++) - spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); - (void) spa_vdev_state_exit(spa, NULL, 0); - } - - /* - * Let the world know that we're done. - */ - mutex_enter(&spa->spa_async_lock); - tasks = spa->spa_async_tasks; - if ((tasks & SPA_ASYNC_REMOVE) != 0) - goto retry; - spa->spa_async_thread_vd = NULL; - cv_broadcast(&spa->spa_async_cv); - mutex_exit(&spa->spa_async_lock); - thread_exit(); -} - -void -spa_async_suspend(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - spa->spa_async_suspended++; - while (spa->spa_async_thread != NULL || - spa->spa_async_thread_vd != NULL) - cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); - mutex_exit(&spa->spa_async_lock); - - spa_vdev_remove_suspend(spa); - - zthr_t *condense_thread = spa->spa_condense_zthr; - if (condense_thread != NULL) - zthr_cancel(condense_thread); - - zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; - if (discard_thread != NULL) - zthr_cancel(discard_thread); -} - -void -spa_async_resume(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - ASSERT(spa->spa_async_suspended != 0); - spa->spa_async_suspended--; - mutex_exit(&spa->spa_async_lock); - spa_restart_removal(spa); - - zthr_t *condense_thread = spa->spa_condense_zthr; - if 
(condense_thread != NULL) - zthr_resume(condense_thread); - - zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; - if (discard_thread != NULL) - zthr_resume(discard_thread); -} - -static boolean_t -spa_async_tasks_pending(spa_t *spa) -{ - uint_t non_config_tasks; - uint_t config_task; - boolean_t config_task_suspended; - - non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | - SPA_ASYNC_REMOVE); - config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; - if (spa->spa_ccw_fail_time == 0) { - config_task_suspended = B_FALSE; - } else { - config_task_suspended = - (gethrtime() - spa->spa_ccw_fail_time) < - (zfs_ccw_retry_interval * NANOSEC); - } - - return (non_config_tasks || (config_task && !config_task_suspended)); -} - -static void -spa_async_dispatch(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - if (spa_async_tasks_pending(spa) && - !spa->spa_async_suspended && - spa->spa_async_thread == NULL && - rootdir != NULL) - spa->spa_async_thread = thread_create(NULL, 0, - spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); - mutex_exit(&spa->spa_async_lock); -} - -static void -spa_async_dispatch_vd(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && - !spa->spa_async_suspended && - spa->spa_async_thread_vd == NULL && - rootdir != NULL) - spa->spa_async_thread_vd = thread_create(NULL, 0, - spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); - mutex_exit(&spa->spa_async_lock); -} - -void -spa_async_request(spa_t *spa, int task) -{ - zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); - mutex_enter(&spa->spa_async_lock); - spa->spa_async_tasks |= task; - mutex_exit(&spa->spa_async_lock); - spa_async_dispatch_vd(spa); -} - -/* - * ========================================================================== - * SPA syncing routines - * ========================================================================== - */ - -static int -bpobj_enqueue_cb(void *arg, const 
blkptr_t *bp, dmu_tx_t *tx) -{ - bpobj_t *bpo = arg; - bpobj_enqueue(bpo, bp, tx); - return (0); -} - -static int -spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - zio_t *zio = arg; - - zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, - BP_GET_PSIZE(bp), zio->io_flags)); - return (0); -} - -/* - * Note: this simple function is not inlined to make it easier to dtrace the - * amount of time spent syncing frees. - */ -static void -spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) -{ - zio_t *zio = zio_root(spa, NULL, NULL, 0); - bplist_iterate(bpl, spa_free_sync_cb, zio, tx); - VERIFY(zio_wait(zio) == 0); -} - -/* - * Note: this simple function is not inlined to make it easier to dtrace the - * amount of time spent syncing deferred frees. - */ -static void -spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) -{ - zio_t *zio = zio_root(spa, NULL, NULL, 0); - VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, - spa_free_sync_cb, zio, tx), ==, 0); - VERIFY0(zio_wait(zio)); -} - - -static void -spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) -{ - char *packed = NULL; - size_t bufsize; - size_t nvsize = 0; - dmu_buf_t *db; - - VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); - - /* - * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration - * information. This avoids the dmu_buf_will_dirty() path and - * saves us a pre-read to get data we don't actually care about. 
- */ - bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); - packed = kmem_alloc(bufsize, KM_SLEEP); - - VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, - KM_SLEEP) == 0); - bzero(packed + nvsize, bufsize - nvsize); - - dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); - - kmem_free(packed, bufsize); - - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = nvsize; - dmu_buf_rele(db, FTAG); -} - -static void -spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, - const char *config, const char *entry) -{ - nvlist_t *nvroot; - nvlist_t **list; - int i; - - if (!sav->sav_sync) - return; - - /* - * Update the MOS nvlist describing the list of available devices. - * spa_validate_aux() will have already made sure this nvlist is - * valid and the vdevs are labeled appropriately. - */ - if (sav->sav_object == 0) { - sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, - &sav->sav_object, tx) == 0); - } - - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (sav->sav_count == 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); - } else { - list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); - for (i = 0; i < sav->sav_count; i++) - list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], - B_FALSE, VDEV_CONFIG_L2CACHE); - VERIFY(nvlist_add_nvlist_array(nvroot, config, list, - sav->sav_count) == 0); - for (i = 0; i < sav->sav_count; i++) - nvlist_free(list[i]); - kmem_free(list, sav->sav_count * sizeof (void *)); - } - - spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); - nvlist_free(nvroot); - - sav->sav_sync = B_FALSE; -} - -/* - * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 
- * The all-vdev ZAP must be empty. - */ -static void -spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - if (vd->vdev_top_zap != 0) { - VERIFY0(zap_add_int(spa->spa_meta_objset, avz, - vd->vdev_top_zap, tx)); - } - if (vd->vdev_leaf_zap != 0) { - VERIFY0(zap_add_int(spa->spa_meta_objset, avz, - vd->vdev_leaf_zap, tx)); - } - for (uint64_t i = 0; i < vd->vdev_children; i++) { - spa_avz_build(vd->vdev_child[i], avz, tx); - } -} - -static void -spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) -{ - nvlist_t *config; - - /* - * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, - * its config may not be dirty but we still need to build per-vdev ZAPs. - * Similarly, if the pool is being assembled (e.g. after a split), we - * need to rebuild the AVZ although the config may not be dirty. - */ - if (list_is_empty(&spa->spa_config_dirty_list) && - spa->spa_avz_action == AVZ_ACTION_NONE) - return; - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || - spa->spa_avz_action == AVZ_ACTION_INITIALIZE || - spa->spa_all_vdev_zaps != 0); - - if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { - /* Make and build the new AVZ */ - uint64_t new_avz = zap_create(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); - spa_avz_build(spa->spa_root_vdev, new_avz, tx); - - /* Diff old AVZ with new one */ - zap_cursor_t zc; - zap_attribute_t za; - - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_all_vdev_zaps); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t vdzap = za.za_first_integer; - if (zap_lookup_int(spa->spa_meta_objset, new_avz, - vdzap) == ENOENT) { - /* - * ZAP is listed in old AVZ but not in new one; - * destroy it - */ - VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, - tx)); - } - } - - zap_cursor_fini(&zc); - - /* Destroy the old AVZ */ - VERIFY0(zap_destroy(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, tx)); - - 
/* Replace the old AVZ in the dir obj with the new one */ - VERIFY0(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, - sizeof (new_avz), 1, &new_avz, tx)); - - spa->spa_all_vdev_zaps = new_avz; - } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { - zap_cursor_t zc; - zap_attribute_t za; - - /* Walk through the AVZ and destroy all listed ZAPs */ - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_all_vdev_zaps); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t zap = za.za_first_integer; - VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); - } - - zap_cursor_fini(&zc); - - /* Destroy and unlink the AVZ itself */ - VERIFY0(zap_destroy(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, tx)); - VERIFY0(zap_remove(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); - spa->spa_all_vdev_zaps = 0; - } - - if (spa->spa_all_vdev_zaps == 0) { - spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_VDEV_ZAP_MAP, tx); - } - spa->spa_avz_action = AVZ_ACTION_NONE; - - /* Create ZAPs for vdevs that don't have them. */ - vdev_construct_zaps(spa->spa_root_vdev, tx); - - config = spa_config_generate(spa, spa->spa_root_vdev, - dmu_tx_get_txg(tx), B_FALSE); - - /* - * If we're upgrading the spa version then make sure that - * the config object gets updated with the correct version. 
- */ - if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) - fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, - spa->spa_uberblock.ub_version); - - spa_config_exit(spa, SCL_STATE, FTAG); - - nvlist_free(spa->spa_config_syncing); - spa->spa_config_syncing = config; - - spa_sync_nvlist(spa, spa->spa_config_object, config, tx); -} - -static void -spa_sync_version(void *arg, dmu_tx_t *tx) -{ - uint64_t *versionp = arg; - uint64_t version = *versionp; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - /* - * Setting the version is special cased when first creating the pool. - */ - ASSERT(tx->tx_txg != TXG_INITIAL); - - ASSERT(SPA_VERSION_IS_SUPPORTED(version)); - ASSERT(version >= spa_version(spa)); - - spa->spa_uberblock.ub_version = version; - vdev_config_dirty(spa->spa_root_vdev); - spa_history_log_internal(spa, "set", tx, "version=%lld", version); -} - -/* - * Set zpool properties. - */ -static void -spa_sync_props(void *arg, dmu_tx_t *tx) -{ - nvlist_t *nvp = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; - nvpair_t *elem = NULL; - - mutex_enter(&spa->spa_props_lock); - - while ((elem = nvlist_next_nvpair(nvp, elem))) { - uint64_t intval; - char *strval, *fname; - zpool_prop_t prop; - const char *propname; - zprop_type_t proptype; - spa_feature_t fid; - - switch (prop = zpool_name_to_prop(nvpair_name(elem))) { - case ZPOOL_PROP_INVAL: - /* - * We checked this earlier in spa_prop_validate(). - */ - ASSERT(zpool_prop_feature(nvpair_name(elem))); - - fname = strchr(nvpair_name(elem), '@') + 1; - VERIFY0(zfeature_lookup_name(fname, &fid)); - - spa_feature_enable(spa, fid, tx); - spa_history_log_internal(spa, "set", tx, - "%s=enabled", nvpair_name(elem)); - break; - - case ZPOOL_PROP_VERSION: - intval = fnvpair_value_uint64(elem); - /* - * The version is synced seperatly before other - * properties and should be correct by now. 
- */ - ASSERT3U(spa_version(spa), >=, intval); - break; - - case ZPOOL_PROP_ALTROOT: - /* - * 'altroot' is a non-persistent property. It should - * have been set temporarily at creation or import time. - */ - ASSERT(spa->spa_root != NULL); - break; - - case ZPOOL_PROP_READONLY: - case ZPOOL_PROP_CACHEFILE: - /* - * 'readonly' and 'cachefile' are also non-persisitent - * properties. - */ - break; - case ZPOOL_PROP_COMMENT: - strval = fnvpair_value_string(elem); - if (spa->spa_comment != NULL) - spa_strfree(spa->spa_comment); - spa->spa_comment = spa_strdup(strval); - /* - * We need to dirty the configuration on all the vdevs - * so that their labels get updated. It's unnecessary - * to do this for pool creation since the vdev's - * configuratoin has already been dirtied. - */ - if (tx->tx_txg != TXG_INITIAL) - vdev_config_dirty(spa->spa_root_vdev); - spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); - break; - default: - /* - * Set pool property values in the poolprops mos object. 
- */ - if (spa->spa_pool_props_object == 0) { - spa->spa_pool_props_object = - zap_create_link(mos, DMU_OT_POOL_PROPS, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, - tx); - } - - /* normalize the property name */ - propname = zpool_prop_to_name(prop); - proptype = zpool_prop_get_type(prop); - - if (nvpair_type(elem) == DATA_TYPE_STRING) { - ASSERT(proptype == PROP_TYPE_STRING); - strval = fnvpair_value_string(elem); - VERIFY0(zap_update(mos, - spa->spa_pool_props_object, propname, - 1, strlen(strval) + 1, strval, tx)); - spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { - intval = fnvpair_value_uint64(elem); - - if (proptype == PROP_TYPE_INDEX) { - const char *unused; - VERIFY0(zpool_prop_index_to_string( - prop, intval, &unused)); - } - VERIFY0(zap_update(mos, - spa->spa_pool_props_object, propname, - 8, 1, &intval, tx)); - spa_history_log_internal(spa, "set", tx, - "%s=%lld", nvpair_name(elem), intval); - } else { - ASSERT(0); /* not allowed */ - } - - switch (prop) { - case ZPOOL_PROP_DELEGATION: - spa->spa_delegation = intval; - break; - case ZPOOL_PROP_BOOTFS: - spa->spa_bootfs = intval; - break; - case ZPOOL_PROP_FAILUREMODE: - spa->spa_failmode = intval; - break; - case ZPOOL_PROP_AUTOEXPAND: - spa->spa_autoexpand = intval; - if (tx->tx_txg != TXG_INITIAL) - spa_async_request(spa, - SPA_ASYNC_AUTOEXPAND); - break; - case ZPOOL_PROP_MULTIHOST: - spa->spa_multihost = intval; - break; - case ZPOOL_PROP_DEDUPDITTO: - spa->spa_dedup_ditto = intval; - break; - default: - break; - } - } - - } - - mutex_exit(&spa->spa_props_lock); -} - -/* - * Perform one-time upgrade on-disk changes. spa_version() does not - * reflect the new version this txg, so there must be no changes this - * txg to anything that the upgrade code depends on after it executes. - * Therefore this must be called after dsl_pool_sync() does the sync - * tasks. 
- */ -static void -spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - - ASSERT(spa->spa_sync_pass == 1); - - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - - if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && - spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { - dsl_pool_create_origin(dp, tx); - - /* Keeping the origin open increases spa_minref */ - spa->spa_minref += 3; - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && - spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { - dsl_pool_upgrade_clones(dp, tx); - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && - spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { - dsl_pool_upgrade_dir_clones(dp, tx); - - /* Keeping the freedir open increases spa_minref */ - spa->spa_minref += 3; - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && - spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { - spa_feature_create_zap_objects(spa, tx); - } - - /* - * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable - * when possibility to use lz4 compression for metadata was added - * Old pools that have this feature enabled must be upgraded to have - * this feature active - */ - if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { - boolean_t lz4_en = spa_feature_is_enabled(spa, - SPA_FEATURE_LZ4_COMPRESS); - boolean_t lz4_ac = spa_feature_is_active(spa, - SPA_FEATURE_LZ4_COMPRESS); - - if (lz4_en && !lz4_ac) - spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); - } - - /* - * If we haven't written the salt, do so now. Note that the - * feature may not be activated yet, but that's fine since - * the presence of this ZAP entry is backwards compatible. 
- */ - if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CHECKSUM_SALT) == ENOENT) { - VERIFY0(zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, - sizeof (spa->spa_cksum_salt.zcs_bytes), - spa->spa_cksum_salt.zcs_bytes, tx)); - } - - rrw_exit(&dp->dp_config_rwlock, FTAG); -} - -static void -vdev_indirect_state_sync_verify(vdev_t *vd) -{ - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - vdev_indirect_births_t *vib = vd->vdev_indirect_births; - - if (vd->vdev_ops == &vdev_indirect_ops) { - ASSERT(vim != NULL); - ASSERT(vib != NULL); - } - - if (vdev_obsolete_sm_object(vd) != 0) { - ASSERT(vd->vdev_obsolete_sm != NULL); - ASSERT(vd->vdev_removing || - vd->vdev_ops == &vdev_indirect_ops); - ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); - ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); - - ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, - space_map_allocated(vd->vdev_obsolete_sm)); - } - ASSERT(vd->vdev_obsolete_segments != NULL); - - /* - * Since frees / remaps to an indirect vdev can only - * happen in syncing context, the obsolete segments - * tree must be empty when we start syncing. - */ - ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); -} - -/* - * Sync the specified transaction group. New blocks may be dirtied as - * part of the process, so we iterate until it converges. 
- */ -void -spa_sync(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - objset_t *mos = spa->spa_meta_objset; - bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; - metaslab_class_t *normal = spa_normal_class(spa); - metaslab_class_t *special = spa_special_class(spa); - metaslab_class_t *dedup = spa_dedup_class(spa); - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd; - dmu_tx_t *tx; - int error; - uint32_t max_queue_depth = zfs_vdev_async_write_max_active * - zfs_vdev_queue_depth_pct / 100; - - VERIFY(spa_writeable(spa)); - - /* - * Wait for i/os issued in open context that need to complete - * before this txg syncs. - */ - (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); - spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL); - - /* - * Lock out configuration changes. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - spa->spa_syncing_txg = txg; - spa->spa_sync_pass = 0; - - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); - } - - /* - * If there are any pending vdev state changes, convert them - * into config changes that go out with this transaction group. - */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while (list_head(&spa->spa_state_dirty_list) != NULL) { - /* - * We need the write lock here because, for aux vdevs, - * calling vdev_config_dirty() modifies sav_config. - * This is ugly and will become unnecessary when we - * eliminate the aux vdev wart by integrating all vdevs - * into the root vdev tree. 
- */ - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); - } - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); - } - spa_config_exit(spa, SCL_STATE, FTAG); - - tx = dmu_tx_create_assigned(dp, txg); - - spa->spa_sync_starttime = gethrtime(); -#ifdef illumos - VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, - spa->spa_sync_starttime + spa->spa_deadman_synctime)); -#else /* !illumos */ -#ifdef _KERNEL - callout_schedule(&spa->spa_deadman_cycid, - hz * spa->spa_deadman_synctime / NANOSEC); -#endif -#endif /* illumos */ - - /* - * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, - * set spa_deflate if we have no raid-z vdevs. - */ - if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && - spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { - int i; - - for (i = 0; i < rvd->vdev_children; i++) { - vd = rvd->vdev_child[i]; - if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) - break; - } - if (i == rvd->vdev_children) { - spa->spa_deflate = TRUE; - VERIFY(0 == zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx)); - } - } - - /* - * Set the top-level vdev's max queue depth. Evaluate each - * top-level's async write queue depth in case it changed. - * The max queue depth will not change in the middle of syncing - * out this txg. 
- */ - uint64_t slots_per_allocator = 0; - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - metaslab_class_t *mc; - - if (mg == NULL || !metaslab_group_initialized(mg)) - continue; - - mc = mg->mg_class; - if (mc != normal && mc != special && mc != dedup) - continue; - - /* - * It is safe to do a lock-free check here because only async - * allocations look at mg_max_alloc_queue_depth, and async - * allocations all happen from spa_sync(). - */ - for (int i = 0; i < spa->spa_alloc_count; i++) - ASSERT0(zfs_refcount_count( - &(mg->mg_alloc_queue_depth[i]))); - mg->mg_max_alloc_queue_depth = max_queue_depth; - - for (int i = 0; i < spa->spa_alloc_count; i++) { - mg->mg_cur_max_alloc_queue_depth[i] = - zfs_vdev_def_queue_depth; - } - slots_per_allocator += zfs_vdev_def_queue_depth; - } - - for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); - normal->mc_alloc_max_slots[i] = slots_per_allocator; - special->mc_alloc_max_slots[i] = slots_per_allocator; - dedup->mc_alloc_max_slots[i] = slots_per_allocator; - } - normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - vdev_indirect_state_sync_verify(vd); - - if (vdev_indirect_should_condense(vd)) { - spa_condense_indirect_start_sync(vd, tx); - break; - } - } - - /* - * Iterate to convergence. 
- */ - do { - int pass = ++spa->spa_sync_pass; - - spa_sync_config_object(spa, tx); - spa_sync_aux_dev(spa, &spa->spa_spares, tx, - ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); - spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, - ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); - spa_errlog_sync(spa, txg); - dsl_pool_sync(dp, txg); - - if (pass < zfs_sync_pass_deferred_free) { - spa_sync_frees(spa, free_bpl, tx); - } else { - /* - * We can not defer frees in pass 1, because - * we sync the deferred frees later in pass 1. - */ - ASSERT3U(pass, >, 1); - bplist_iterate(free_bpl, bpobj_enqueue_cb, - &spa->spa_deferred_bpobj, tx); - } - - ddt_sync(spa, txg); - dsl_scan_sync(dp, tx); - - if (spa->spa_vdev_removal != NULL) - svr_sync(spa, tx); - - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) - != NULL) - vdev_sync(vd, txg); - - if (pass == 1) { - spa_sync_upgrades(spa, tx); - ASSERT3U(txg, >=, - spa->spa_uberblock.ub_rootbp.blk_birth); - /* - * Note: We need to check if the MOS is dirty - * because we could have marked the MOS dirty - * without updating the uberblock (e.g. if we - * have sync tasks but no dirty user data). We - * need to check the uberblock's rootbp because - * it is updated if we have synced out dirty - * data (though in this case the MOS will most - * likely also be dirty due to second order - * effects, we don't want to rely on that here). - */ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && - !dmu_objset_is_dirty(mos, txg)) { - /* - * Nothing changed on the first pass, - * therefore this TXG is a no-op. Avoid - * syncing deferred frees, so that we - * can keep this TXG as a no-op. 
- */ - ASSERT(txg_list_empty(&dp->dp_dirty_datasets, - txg)); - ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); - ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); - ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, - txg)); - break; - } - spa_sync_deferred_frees(spa, tx); - } - - } while (dmu_objset_is_dirty(mos, txg)); - - if (!list_is_empty(&spa->spa_config_dirty_list)) { - /* - * Make sure that the number of ZAPs for all the vdevs matches - * the number of ZAPs in the per-vdev ZAP list. This only gets - * called if the config is dirty; otherwise there may be - * outstanding AVZ operations that weren't completed in - * spa_sync_config_object. - */ - uint64_t all_vdev_zap_entry_count; - ASSERT0(zap_count(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); - ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, - all_vdev_zap_entry_count); - } - - if (spa->spa_vdev_removal != NULL) { - ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); - } - - /* - * Rewrite the vdev configuration (which includes the uberblock) - * to commit the transaction group. - * - * If there are no dirty vdevs, we sync the uberblock to a few - * random top-level vdevs that are known to be visible in the - * config cache (see spa_vdev_add() for a complete description). - * If there *are* dirty vdevs, sync the uberblock to all vdevs. - */ - for (;;) { - /* - * We hold SCL_STATE to prevent vdev open/close/etc. - * while we're attempting to write the vdev labels. 
- */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - if (list_is_empty(&spa->spa_config_dirty_list)) { - vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; - int svdcount = 0; - int children = rvd->vdev_children; - int c0 = spa_get_random(children); - - for (int c = 0; c < children; c++) { - vd = rvd->vdev_child[(c0 + c) % children]; - - /* Stop when revisiting the first vdev */ - if (c > 0 && svd[0] == vd) - break; - - if (vd->vdev_ms_array == 0 || vd->vdev_islog || - !vdev_is_concrete(vd)) - continue; - - svd[svdcount++] = vd; - if (svdcount == SPA_SYNC_MIN_VDEVS) - break; - } - error = vdev_config_sync(svd, svdcount, txg); - } else { - error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg); - } - - if (error == 0) - spa->spa_last_synced_guid = rvd->vdev_guid; - - spa_config_exit(spa, SCL_STATE, FTAG); - - if (error == 0) - break; - zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); - zio_resume_wait(spa); - } - dmu_tx_commit(tx); - -#ifdef illumos - VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); -#else /* !illumos */ -#ifdef _KERNEL - callout_drain(&spa->spa_deadman_cycid); -#endif -#endif /* illumos */ - - /* - * Clear the dirty config list. - */ - while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) - vdev_config_clean(vd); - - /* - * Now that the new config has synced transactionally, - * let it become visible to the config cache. - */ - if (spa->spa_config_syncing != NULL) { - spa_config_set(spa, spa->spa_config_syncing); - spa->spa_config_txg = txg; - spa->spa_config_syncing = NULL; - } - - dsl_pool_sync_done(dp, txg); - - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); - } - - /* - * Update usable space statistics. 
- */ - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) - != NULL) - vdev_sync_done(vd, txg); - - spa_update_dspace(spa); - - /* - * It had better be the case that we didn't dirty anything - * since vdev_config_sync(). - */ - ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); - ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); - ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - - while (zfs_pause_spa_sync) - delay(1); - - spa->spa_sync_pass = 0; - - /* - * Update the last synced uberblock here. We want to do this at - * the end of spa_sync() so that consumers of spa_last_synced_txg() - * will be guaranteed that all the processing associated with - * that txg has been completed. - */ - spa->spa_ubsync = spa->spa_uberblock; - spa_config_exit(spa, SCL_CONFIG, FTAG); - - spa_handle_ignored_writes(spa); - - /* - * If any async tasks have been requested, kick them off. - */ - spa_async_dispatch(spa); - spa_async_dispatch_vd(spa); -} - -/* - * Sync all pools. We don't want to hold the namespace lock across these - * operations, so we take a reference on the spa_t and drop the lock during the - * sync. - */ -void -spa_sync_allpools(void) -{ - spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || - !spa_writeable(spa) || spa_suspended(spa)) - continue; - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - } - mutex_exit(&spa_namespace_lock); -} - -/* - * ========================================================================== - * Miscellaneous routines - * ========================================================================== - */ - -/* - * Remove all pools in the system. - */ -void -spa_evict_all(void) -{ - spa_t *spa; - - /* - * Remove all cached state. All pools should be closed now, - * so every spa in the AVL tree should be unreferenced. 
- */ - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(NULL)) != NULL) { - /* - * Stop async tasks. The async thread may need to detach - * a device that's been replaced, which requires grabbing - * spa_namespace_lock, so we must drop it here. - */ - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - spa_async_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - - if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); - spa_deactivate(spa); - } - spa_remove(spa); - } - mutex_exit(&spa_namespace_lock); -} - -vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) -{ - vdev_t *vd; - int i; - - if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) - return (vd); - - if (aux) { - for (i = 0; i < spa->spa_l2cache.sav_count; i++) { - vd = spa->spa_l2cache.sav_vdevs[i]; - if (vd->vdev_guid == guid) - return (vd); - } - - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; - if (vd->vdev_guid == guid) - return (vd); - } - } - - return (NULL); -} - -void -spa_upgrade(spa_t *spa, uint64_t version) -{ - ASSERT(spa_writeable(spa)); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - /* - * This should only be called for a non-faulted pool, and since a - * future version would result in an unopenable pool, this shouldn't be - * possible. 
- */ - ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); - ASSERT3U(version, >=, spa->spa_uberblock.ub_version); - - spa->spa_uberblock.ub_version = version; - vdev_config_dirty(spa->spa_root_vdev); - - spa_config_exit(spa, SCL_ALL, FTAG); - - txg_wait_synced(spa_get_dsl(spa), 0); -} - -boolean_t -spa_has_spare(spa_t *spa, uint64_t guid) -{ - int i; - uint64_t spareguid; - spa_aux_vdev_t *sav = &spa->spa_spares; - - for (i = 0; i < sav->sav_count; i++) - if (sav->sav_vdevs[i]->vdev_guid == guid) - return (B_TRUE); - - for (i = 0; i < sav->sav_npending; i++) { - if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, - &spareguid) == 0 && spareguid == guid) - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * Check if a pool has an active shared spare device. - * Note: reference count of an active spare is 2, as a spare and as a replace - */ -static boolean_t -spa_has_active_shared_spare(spa_t *spa) -{ - int i, refcnt; - uint64_t pool; - spa_aux_vdev_t *sav = &spa->spa_spares; - - for (i = 0; i < sav->sav_count; i++) { - if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, - &refcnt) && pool != 0ULL && pool == spa_guid(spa) && - refcnt > 2) - return (B_TRUE); - } - - return (B_FALSE); -} - -sysevent_t * -spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) -{ - sysevent_t *ev = NULL; -#ifdef _KERNEL - sysevent_attr_list_t *attr = NULL; - sysevent_value_t value; - - ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", - SE_SLEEP); - ASSERT(ev != NULL); - - value.value_type = SE_DATA_TYPE_STRING; - value.value.sv_string = spa_name(spa); - if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) - goto done; - - value.value_type = SE_DATA_TYPE_UINT64; - value.value.sv_uint64 = spa_guid(spa); - if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) - goto done; - - if (vd) { - value.value_type = SE_DATA_TYPE_UINT64; - value.value.sv_uint64 = vd->vdev_guid; - if 
(sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, - SE_SLEEP) != 0) - goto done; - - if (vd->vdev_path) { - value.value_type = SE_DATA_TYPE_STRING; - value.value.sv_string = vd->vdev_path; - if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, - &value, SE_SLEEP) != 0) - goto done; - } - } - - if (hist_nvl != NULL) { - fnvlist_merge((nvlist_t *)attr, hist_nvl); - } - - if (sysevent_attach_attributes(ev, attr) != 0) - goto done; - attr = NULL; - -done: - if (attr) - sysevent_free_attr(attr); - -#endif - return (ev); -} - -void -spa_event_post(sysevent_t *ev) -{ -#ifdef _KERNEL - sysevent_id_t eid; - - (void) log_sysevent(ev, SE_SLEEP, &eid); - sysevent_free(ev); -#endif -} - -void -spa_event_discard(sysevent_t *ev) -{ -#ifdef _KERNEL - sysevent_free(ev); -#endif -} - -/* - * Post a sysevent corresponding to the given event. The 'name' must be one of - * the event definitions in sys/sysevent/eventdefs.h. The payload will be - * filled in from the spa and (optionally) the vdev and history nvl. This - * doesn't do anything in the userland libzpool, as we don't want consumers to - * misinterpret ztest or zdb as real changes. - */ -void -spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) -{ - spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c deleted file mode 100644 index 62c3137cd590..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c +++ /dev/null @@ -1,623 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -/* - * Storage Pool Checkpoint - * - * A storage pool checkpoint can be thought of as a pool-wide snapshot or - * a stable version of extreme rewind that guarantees no blocks from the - * checkpointed state will have been overwritten. It remembers the entire - * state of the storage pool (e.g. snapshots, dataset names, etc..) from the - * point that it was taken and the user can rewind back to that point even if - * they applied destructive operations on their datasets or even enabled new - * zpool on-disk features. If a pool has a checkpoint that is no longer - * needed, the user can discard it. - * - * == On disk data structures used == - * - * - The pool has a new feature flag and a new entry in the MOS. The feature - * flag is set to active when we create the checkpoint and remains active - * until the checkpoint is fully discarded. The entry in the MOS config - * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that - * references the state of the pool when we take the checkpoint. The entry - * remains populated until we start discarding the checkpoint or we rewind - * back to it. - * - * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, - * which persists until the checkpoint is fully discarded. The space map - * contains entries that have been freed in the current state of the pool - * but we want to keep around in case we decide to rewind to the checkpoint. 
- * [see vdev_checkpoint_sm] - * - * - Each metaslab's ms_sm space map behaves the same as without the - * checkpoint, with the only exception being the scenario when we free - * blocks that belong to the checkpoint. In this case, these blocks remain - * ALLOCATED in the metaslab's space map and they are added as FREE in the - * vdev's checkpoint space map. - * - * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that - * the uberblock was checkpointed. For normal uberblocks this field is 0. - * - * == Overview of operations == - * - * - To create a checkpoint, we first wait for the current TXG to be synced, - * so we can use the most recently synced uberblock (spa_ubsync) as the - * checkpointed uberblock. Then we use an early synctask to place that - * uberblock in MOS config, increment the feature flag for the checkpoint - * (marking it active), and setting spa_checkpoint_txg (see its use below) - * to the TXG of the checkpointed uberblock. We use an early synctask for - * the aforementioned operations to ensure that no blocks were dirtied - * between the current TXG and the TXG of the checkpointed uberblock - * (e.g the previous txg). - * - * - When a checkpoint exists, we need to ensure that the blocks that - * belong to the checkpoint are freed but never reused. This means that - * these blocks should never end up in the ms_allocatable or the ms_freeing - * trees of a metaslab. Therefore, whenever there is a checkpoint the new - * ms_checkpointing tree is used in addition to the aforementioned ones. - * - * Whenever a block is freed and we find out that it is referenced by the - * checkpoint (we find out by comparing its birth to spa_checkpoint_txg), - * we place it in the ms_checkpointing tree instead of the ms_freeingtree. - * This way, we divide the blocks that are being freed into checkpointed - * and not-checkpointed blocks. 
- * - * In order to persist these frees, we write the extents from the - * ms_freeingtree to the ms_sm as usual, and the extents from the - * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these - * checkpointed extents will remain allocated in the metaslab's ms_sm space - * map, and therefore won't be reused [see metaslab_sync()]. In addition, - * when we discard the checkpoint, we can find the entries that have - * actually been freed in vdev_checkpoint_sm. - * [see spa_checkpoint_discard_thread_sync()] - * - * - To discard the checkpoint we use an early synctask to delete the - * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, - * and wakeup the discarding zthr thread (an open-context async thread). - * We use an early synctask to ensure that the operation happens before any - * new data end up in the checkpoint's data structures. - * - * Once the synctask is done and the discarding zthr is awake, we discard - * the checkpointed data over multiple TXGs by having the zthr prefetching - * entries from vdev_checkpoint_sm and then starting a synctask that places - * them as free blocks in to their respective ms_allocatable and ms_sm - * structures. - * [see spa_checkpoint_discard_thread()] - * - * When there are no entries left in the vdev_checkpoint_sm of all - * top-level vdevs, a final synctask runs that decrements the feature flag. - * - * - To rewind to the checkpoint, we first use the current uberblock and - * open the MOS so we can access the checkpointed uberblock from the MOS - * config. After we retrieve the checkpointed uberblock, we use it as the - * current uberblock for the pool by writing it to disk with an updated - * TXG, opening its version of the MOS, and moving on as usual from there. - * [see spa_ld_checkpoint_rewind()] - * - * An important note on rewinding to the checkpoint has to do with how we - * handle ZIL blocks. 
In the scenario of a rewind, we clear out any ZIL - * blocks that have not been claimed by the time we took the checkpoint - * as they should no longer be valid. - * [see comment in zil_claim()] - * - * == Miscellaneous information == - * - * - In the hypothetical event that we take a checkpoint, remove a vdev, - * and attempt to rewind, the rewind would fail as the checkpointed - * uberblock would reference data in the removed device. For this reason - * and others of similar nature, we disallow the following operations that - * can change the config: - * vdev removal and attach/detach, mirror splitting, and pool reguid. - * - * - As most of the checkpoint logic is implemented in the SPA and doesn't - * distinguish datasets when it comes to space accounting, having a - * checkpoint can potentially break the boundaries set by dataset - * reservations. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The following parameter limits the amount of memory to be used for the - * prefetching of the checkpoint space map done on each vdev while - * discarding the checkpoint. - * - * The reason it exists is because top-level vdevs with long checkpoint - * space maps can potentially take up a lot of memory depending on the - * amount of checkpointed data that has been freed within them while - * the pool had a checkpoint. 
- */ -uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; - -int -spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) -{ - if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); - - bzero(pcs, sizeof (pool_checkpoint_stat_t)); - - int error = zap_contains(spa_meta_objset(spa), - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); - ASSERT(error == 0 || error == ENOENT); - - if (error == ENOENT) - pcs->pcs_state = CS_CHECKPOINT_DISCARDING; - else - pcs->pcs_state = CS_CHECKPOINT_EXISTS; - - pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; - pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; - - return (0); -} - -static void -spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = arg; - - spa->spa_checkpoint_info.sci_timestamp = 0; - - spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); - - spa_history_log_internal(spa, "spa discard checkpoint", tx, - "finished discarding checkpointed state from the pool"); -} - -typedef struct spa_checkpoint_discard_sync_callback_arg { - vdev_t *sdc_vd; - uint64_t sdc_txg; - uint64_t sdc_entry_limit; -} spa_checkpoint_discard_sync_callback_arg_t; - -static int -spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) -{ - spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; - vdev_t *vd = sdc->sdc_vd; - metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; - uint64_t end = sme->sme_offset + sme->sme_run; - - if (sdc->sdc_entry_limit == 0) - return (EINTR); - - /* - * Since the space map is not condensed, we know that - * none of its entries is crossing the boundaries of - * its respective metaslab. - * - * That said, there is no fundamental requirement that - * the checkpoint's space map entries should not cross - * metaslab boundaries. So if needed we could add code - * that handles metaslab-crossing segments in the future. 
- */ - VERIFY3U(sme->sme_type, ==, SM_FREE); - VERIFY3U(sme->sme_offset, >=, ms->ms_start); - VERIFY3U(end, <=, ms->ms_start + ms->ms_size); - - /* - * At this point we should not be processing any - * other frees concurrently, so the lock is technically - * unnecessary. We use the lock anyway though to - * potentially save ourselves from future headaches. - */ - mutex_enter(&ms->ms_lock); - if (range_tree_is_empty(ms->ms_freeing)) - vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); - range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); - mutex_exit(&ms->ms_lock); - - ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, - sme->sme_run); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); - - vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; - vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; - sdc->sdc_entry_limit--; - - return (0); -} - -static void -spa_checkpoint_accounting_verify(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t ckpoint_sm_space_sum = 0; - uint64_t vs_ckpoint_space_sum = 0; - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - if (vd->vdev_checkpoint_sm != NULL) { - ckpoint_sm_space_sum += - -space_map_allocated(vd->vdev_checkpoint_sm); - vs_ckpoint_space_sum += - vd->vdev_stat.vs_checkpoint_space; - ASSERT3U(ckpoint_sm_space_sum, ==, - vs_ckpoint_space_sum); - } else { - ASSERT0(vd->vdev_stat.vs_checkpoint_space); - } - } - ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); -} - -static void -spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) -{ - vdev_t *vd = arg; - int error; - - /* - * The space map callback is applied only to non-debug entries. - * Because the number of debug entries is less or equal to the - * number of non-debug entries, we want to ensure that we only - * read what we prefetched from open-context. 
- * - * Thus, we set the maximum entries that the space map callback - * will be applied to be half the entries that could fit in the - * imposed memory limit. - * - * Note that since this is a conservative estimate we also - * assume the worst case scenario in our computation where each - * entry is two-word. - */ - uint64_t max_entry_limit = - (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; - - /* - * Iterate from the end of the space map towards the beginning, - * placing its entries on ms_freeing and removing them from the - * space map. The iteration stops if one of the following - * conditions is true: - * - * 1] We reached the beginning of the space map. At this point - * the space map should be completely empty and - * space_map_incremental_destroy should have returned 0. - * The next step would be to free and close the space map - * and remove its entry from its vdev's top zap. This allows - * spa_checkpoint_discard_thread() to move on to the next vdev. - * - * 2] We reached the memory limit (amount of memory used to hold - * space map entries in memory) and space_map_incremental_destroy - * returned EINTR. This means that there are entries remaining - * in the space map that will be cleared in a future invocation - * of this function by spa_checkpoint_discard_thread(). 
- */ - spa_checkpoint_discard_sync_callback_arg_t sdc; - sdc.sdc_vd = vd; - sdc.sdc_txg = tx->tx_txg; - sdc.sdc_entry_limit = max_entry_limit; - - uint64_t words_before = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); - - error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, - spa_checkpoint_discard_sync_callback, &sdc, tx); - - uint64_t words_after = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); - -#ifdef DEBUG - spa_checkpoint_accounting_verify(vd->vdev_spa); -#endif - - zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " - "deleted %llu words - %llu words are left", - tx->tx_txg, vd->vdev_id, (words_before - words_after), - words_after); - - if (error != EINTR) { - if (error != 0) { - zfs_panic_recover("zfs: error %d was returned " - "while incrementally destroying the checkpoint " - "space map of vdev %llu\n", - error, vd->vdev_id); - } - ASSERT0(words_after); - ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); - ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); - - space_map_free(vd->vdev_checkpoint_sm, tx); - space_map_close(vd->vdev_checkpoint_sm); - vd->vdev_checkpoint_sm = NULL; - - VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), - vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); - } -} - -static boolean_t -spa_checkpoint_discard_is_done(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(!spa_has_checkpoint(spa)); - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) - return (B_FALSE); - ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); - } - - return (B_TRUE); -} - -/* ARGSUSED */ -boolean_t -spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - - if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (B_FALSE); - - if (spa_has_checkpoint(spa)) - return (B_FALSE); - - return (B_TRUE); -} - -void 
-spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - vdev_t *rvd = spa->spa_root_vdev; - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - while (vd->vdev_checkpoint_sm != NULL) { - space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; - int numbufs; - dmu_buf_t **dbp; - - if (zthr_iscancelled(zthr)) - return; - - ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); - - uint64_t size = MIN(space_map_length(checkpoint_sm), - zfs_spa_discard_memory_limit); - uint64_t offset = - space_map_length(checkpoint_sm) - size; - - /* - * Ensure that the part of the space map that will - * be destroyed by the synctask, is prefetched in - * memory before the synctask runs. - */ - int error = dmu_buf_hold_array_by_bonus( - checkpoint_sm->sm_dbuf, offset, size, - B_TRUE, FTAG, &numbufs, &dbp); - if (error != 0) { - zfs_panic_recover("zfs: error %d was returned " - "while prefetching checkpoint space map " - "entries of vdev %llu\n", - error, vd->vdev_id); - } - - VERIFY0(dsl_sync_task(spa->spa_name, NULL, - spa_checkpoint_discard_thread_sync, vd, - 0, ZFS_SPACE_CHECK_NONE)); - - dmu_buf_rele_array(dbp, numbufs, FTAG); - } - } - - VERIFY(spa_checkpoint_discard_is_done(spa)); - VERIFY0(spa->spa_checkpoint_info.sci_dspace); - VERIFY0(dsl_sync_task(spa->spa_name, NULL, - spa_checkpoint_discard_complete_sync, spa, - 0, ZFS_SPACE_CHECK_NONE)); -} - - -/* ARGSUSED */ -static int -spa_checkpoint_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (SET_ERROR(ENOTSUP)); - - if (!spa_top_vdevs_spacemap_addressable(spa)) - return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); - - if (spa->spa_vdev_removal != NULL) - return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); - - if (spa->spa_checkpoint_txg != 0) - return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); - - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return 
(SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); - - return (0); -} - -/* ARGSUSED */ -static void -spa_checkpoint_sync(void *arg, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - spa_t *spa = dp->dp_spa; - uberblock_t checkpoint = spa->spa_ubsync; - - /* - * At this point, there should not be a checkpoint in the MOS. - */ - ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); - - ASSERT0(spa->spa_checkpoint_info.sci_timestamp); - ASSERT0(spa->spa_checkpoint_info.sci_dspace); - - /* - * Since the checkpointed uberblock is the one that just got synced - * (we use spa_ubsync), its txg must be equal to the txg number of - * the txg we are syncing, minus 1. - */ - ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); - - /* - * Once the checkpoint is in place, we need to ensure that none of - * its blocks will be marked for reuse after it has been freed. - * When there is a checkpoint and a block is freed, we compare its - * birth txg to the txg of the checkpointed uberblock to see if the - * block is part of the checkpoint or not. Therefore, we have to set - * spa_checkpoint_txg before any frees happen in this txg (which is - * why this is done as an early_synctask as explained in the comment - * in spa_checkpoint()). - */ - spa->spa_checkpoint_txg = checkpoint.ub_txg; - spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; - - checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; - VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, - sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), - &checkpoint, tx)); - - /* - * Increment the feature refcount and thus activate the feature. - * Note that the feature will be deactivated when we've - * completely discarded all checkpointed state (both vdev - * space maps and uberblock). 
- */ - spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); - - spa_history_log_internal(spa, "spa checkpoint", tx, - "checkpointed uberblock txg=%llu", checkpoint.ub_txg); -} - -/* - * Create a checkpoint for the pool. - */ -int -spa_checkpoint(const char *pool) -{ - int error; - spa_t *spa; - - error = spa_open(pool, &spa, FTAG); - if (error != 0) - return (error); - - mutex_enter(&spa->spa_vdev_top_lock); - - /* - * Wait for current syncing txg to finish so the latest synced - * uberblock (spa_ubsync) has all the changes that we expect - * to see if we were to revert later to the checkpoint. In other - * words we want the checkpointed uberblock to include/reference - * all the changes that were pending at the time that we issued - * the checkpoint command. - */ - txg_wait_synced(spa_get_dsl(spa), 0); - - /* - * As the checkpointed uberblock references blocks from the previous - * txg (spa_ubsync) we want to ensure that are not freeing any of - * these blocks in the same txg that the following synctask will - * run. Thus, we run it as an early synctask, so the dirty changes - * that are synced to disk afterwards during zios and other synctasks - * do not reuse checkpointed blocks. 
- */ - error = dsl_early_sync_task(pool, spa_checkpoint_check, - spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); - - mutex_exit(&spa->spa_vdev_top_lock); - - spa_close(spa, FTAG); - return (error); -} - -/* ARGSUSED */ -static int -spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); - - if (spa->spa_checkpoint_txg == 0) - return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); - - VERIFY0(zap_contains(spa_meta_objset(spa), - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); - - return (0); -} - -/* ARGSUSED */ -static void -spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT, tx)); - - spa->spa_checkpoint_txg = 0; - - zthr_wakeup(spa->spa_checkpoint_discard_zthr); - - spa_history_log_internal(spa, "spa discard checkpoint", tx, - "started discarding checkpointed state from the pool"); -} - -/* - * Discard the checkpoint from a pool. - */ -int -spa_checkpoint_discard(const char *pool) -{ - /* - * Similarly to spa_checkpoint(), we want our synctask to run - * before any pending dirty data are written to disk so they - * won't end up in the checkpoint's data structures (e.g. - * ms_checkpointing and vdev_checkpoint_sm) and re-create any - * space maps that the discarding open-context thread has - * deleted. 
- * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread] - */ - return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, - spa_checkpoint_discard_sync, NULL, 0, - ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c deleted file mode 100644 index b616a439f7b8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ /dev/null @@ -1,594 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#endif - -/* - * Pool configuration repository. - * - * Pool configuration is stored as a packed nvlist on the filesystem. 
By - * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot - * (when the ZFS module is loaded). Pools can also have the 'cachefile' - * property set that allows them to be stored in an alternate location until - * the control of external software. - * - * For each cache file, we have a single nvlist which holds all the - * configuration information. When the module loads, we read this information - * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is - * maintained independently in spa.c. Whenever the namespace is modified, or - * the configuration of a pool is changed, we call spa_write_cachefile(), which - * walks through all the active pools and writes the configuration to disk. - */ - -static uint64_t spa_config_generation = 1; - -/* - * This can be overridden in userland to preserve an alternate namespace for - * userland pools when doing testing. - */ -const char *spa_config_path = ZPOOL_CACHE; - -/* - * Called when the module is first loaded, this routine loads the configuration - * file into the SPA namespace. It does not actually open or load the pools; it - * only populates the namespace. - */ -void -spa_config_load(void) -{ - void *buf = NULL; - nvlist_t *nvlist, *child; - nvpair_t *nvpair; - char *pathname; - struct _buf *file; - uint64_t fsize; - - /* - * Open the configuration file. - */ - pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); - - file = kobj_open_file(pathname); - - kmem_free(pathname, MAXPATHLEN); - - if (file == (struct _buf *)-1) - return; - - if (kobj_get_filesize(file, &fsize) != 0) - goto out; - - buf = kmem_alloc(fsize, KM_SLEEP); - - /* - * Read the nvlist from the file. - */ - if (kobj_read_file(file, buf, fsize, 0) < 0) - goto out; - - /* - * Unpack the nvlist. 
- */ - if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) - goto out; - - /* - * Iterate over all elements in the nvlist, creating a new spa_t for - * each one with the specified configuration. - */ - mutex_enter(&spa_namespace_lock); - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) - continue; - - child = fnvpair_value_nvlist(nvpair); - - if (spa_lookup(nvpair_name(nvpair)) != NULL) - continue; - (void) spa_add(nvpair_name(nvpair), child, NULL); - } - mutex_exit(&spa_namespace_lock); - - nvlist_free(nvlist); - -out: - if (buf != NULL) - kmem_free(buf, fsize); - - kobj_close_file(file); -} - -static void -spa_config_clean(nvlist_t *nvl) -{ - nvlist_t **child; - nvlist_t *nvroot = NULL; - uint_t c, children; - - if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0) { - for (c = 0; c < children; c++) - spa_config_clean(child[c]); - } - - if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0) - spa_config_clean(nvroot); - - nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY); - nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY); -} - -static int -spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) -{ - size_t buflen; - char *buf; - vnode_t *vp; - int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; - char *temp; - int err; - - /* - * If the nvlist is empty (NULL), then remove the old cachefile. - */ - if (nvl == NULL) { - err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); - return (err); - } - - /* - * Pack the configuration into a buffer. - */ - buf = fnvlist_pack(nvl, &buflen); - temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - - /* - * Write the configuration to disk. We need to do the traditional - * 'write to temporary file, sync, move over original' to make sure we - * always have a consistent view of the data. 
- */ - (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); - - err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); - if (err == 0) { - err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL); - if (err == 0) - err = VOP_FSYNC(vp, FSYNC, kcred, NULL); - if (err == 0) - err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); - (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); - } - - (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); - - fnvlist_pack_free(buf, buflen); - kmem_free(temp, MAXPATHLEN); - return (err); -} - -/* - * Synchronize pool configuration to disk. This must be called with the - * namespace lock held. Synchronizing the pool cache is typically done after - * the configuration has been synced to the MOS. This exposes a window where - * the MOS config will have been updated but the cache file has not. If - * the system were to crash at that instant then the cached config may not - * contain the correct information to open the pool and an explicit import - * would be required. - */ -void -spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) -{ - spa_config_dirent_t *dp, *tdp; - nvlist_t *nvl; - boolean_t ccw_failure; - int error; - char *pool_name; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (rootdir == NULL || !(spa_mode_global & FWRITE)) - return; - - /* - * Iterate over all cachefiles for the pool, past or present. When the - * cachefile is changed, the new one is pushed onto this list, allowing - * us to update previous cachefiles that no longer contain this pool. - */ - ccw_failure = B_FALSE; - for (dp = list_head(&target->spa_config_list); dp != NULL; - dp = list_next(&target->spa_config_list, dp)) { - spa_t *spa = NULL; - if (dp->scd_path == NULL) - continue; - - /* - * Iterate over all pools, adding any matching pools to 'nvl'. 
- */ - nvl = NULL; - while ((spa = spa_next(spa)) != NULL) { - nvlist_t *nvroot = NULL; - /* - * Skip over our own pool if we're about to remove - * ourselves from the spa namespace or any pool that - * is readonly. Since we cannot guarantee that a - * readonly pool would successfully import upon reboot, - * we don't allow them to be written to the cache file. - */ - if ((spa == target && removing) || - (spa_state(spa) == POOL_STATE_ACTIVE && - !spa_writeable(spa))) - continue; - - mutex_enter(&spa->spa_props_lock); - tdp = list_head(&spa->spa_config_list); - if (spa->spa_config == NULL || - tdp->scd_path == NULL || - strcmp(tdp->scd_path, dp->scd_path) != 0) { - mutex_exit(&spa->spa_props_lock); - continue; - } - - if (nvl == NULL) - nvl = fnvlist_alloc(); - - if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { - pool_name = fnvlist_lookup_string( - spa->spa_config, ZPOOL_CONFIG_POOL_NAME); - } else { - pool_name = spa_name(spa); - } - - fnvlist_add_nvlist(nvl, pool_name, - spa->spa_config); - mutex_exit(&spa->spa_props_lock); - - if (nvlist_lookup_nvlist(nvl, pool_name, &nvroot) == 0) - spa_config_clean(nvroot); - } - - error = spa_config_write(dp, nvl); - if (error != 0) - ccw_failure = B_TRUE; - nvlist_free(nvl); - } - - if (ccw_failure) { - /* - * Keep trying so that configuration data is - * written if/when any temporary filesystem - * resource issues are resolved. - */ - if (target->spa_ccw_fail_time == 0) { - zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, 0, 0); - } - target->spa_ccw_fail_time = gethrtime(); - spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); - } else { - /* - * Do not rate limit future attempts to update - * the config cache. - */ - target->spa_ccw_fail_time = 0; - } - - /* - * Remove any config entries older than the current one. 
- */ - dp = list_head(&target->spa_config_list); - while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { - list_remove(&target->spa_config_list, tdp); - if (tdp->scd_path != NULL) - spa_strfree(tdp->scd_path); - kmem_free(tdp, sizeof (spa_config_dirent_t)); - } - - spa_config_generation++; - - if (postsysevent) - spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); -} - -/* - * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, - * and we don't want to allow the local zone to see all the pools anyway. - * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration - * information for all pool visible within the zone. - */ -nvlist_t * -spa_all_configs(uint64_t *generation) -{ - nvlist_t *pools; - spa_t *spa = NULL; - - if (*generation == spa_config_generation) - return (NULL); - - pools = fnvlist_alloc(); - - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (INGLOBALZONE(curthread) || - zone_dataset_visible(spa_name(spa), NULL)) { - mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), - spa->spa_config); - mutex_exit(&spa->spa_props_lock); - } - } - *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); - - return (pools); -} - -void -spa_config_set(spa_t *spa, nvlist_t *config) -{ - mutex_enter(&spa->spa_props_lock); - if (spa->spa_config != NULL && spa->spa_config != config) - nvlist_free(spa->spa_config); - spa->spa_config = config; - mutex_exit(&spa->spa_props_lock); -} - -/* - * Generate the pool's configuration based on the current in-core state. - * - * We infer whether to generate a complete config or just one top-level config - * based on whether vd is the root vdev. 
- */ -nvlist_t * -spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) -{ - nvlist_t *config, *nvroot; - vdev_t *rvd = spa->spa_root_vdev; - unsigned long hostid = 0; - boolean_t locked = B_FALSE; - uint64_t split_guid; - char *pool_name; - - if (vd == NULL) { - vd = rvd; - locked = B_TRUE; - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); - } - - ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == - (SCL_CONFIG | SCL_STATE)); - - /* - * If txg is -1, report the current value of spa->spa_config_txg. - */ - if (txg == -1ULL) - txg = spa->spa_config_txg; - - /* - * Originally, users had to handle spa namespace collisions by either - * exporting the already imported pool or by specifying a new name for - * the pool with a conflicting name. In the case of root pools from - * virtual guests, neither approach to collision resolution is - * reasonable. This is addressed by extending the new name syntax with - * an option to specify that the new name is temporary. When specified, - * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us - * to use the previous name, which we do below. 
- */ - if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { - pool_name = fnvlist_lookup_string(spa->spa_config, - ZPOOL_CONFIG_POOL_NAME); - } else { - pool_name = spa_name(spa); - } - - config = fnvlist_alloc(); - - fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); - fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); - fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); - fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); - fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); - if (spa->spa_comment != NULL) { - fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, - spa->spa_comment); - } - - hostid = spa_get_hostid(); - if (hostid != 0) { - fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); - } - fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename); - - int config_gen_flags = 0; - if (vd != rvd) { - fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, - vd->vdev_top->vdev_guid); - fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, - vd->vdev_guid); - if (vd->vdev_isspare) { - fnvlist_add_uint64(config, - ZPOOL_CONFIG_IS_SPARE, 1ULL); - } - if (vd->vdev_islog) { - fnvlist_add_uint64(config, - ZPOOL_CONFIG_IS_LOG, 1ULL); - } - vd = vd->vdev_top; /* label contains top config */ - } else { - /* - * Only add the (potentially large) split information - * in the mos config, and not in the vdev labels - */ - if (spa->spa_config_splitting != NULL) - fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, - spa->spa_config_splitting); - fnvlist_add_boolean(config, - ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); - - config_gen_flags |= VDEV_CONFIG_MOS; - } - - /* - * Add the top-level config. We even add this on pools which - * don't support holes in the namespace. - */ - vdev_top_config_generate(spa, config); - - /* - * If we're splitting, record the original pool's guid. 
- */ - if (spa->spa_config_splitting != NULL && - nvlist_lookup_uint64(spa->spa_config_splitting, - ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { - fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, - split_guid); - } - - nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); - fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); - nvlist_free(nvroot); - - /* - * Store what's necessary for reading the MOS in the label. - */ - fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, - spa->spa_label_features); - - if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { - ddt_histogram_t *ddh; - ddt_stat_t *dds; - ddt_object_t *ddo; - - ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); - ddt_get_dedup_histogram(spa, ddh); - fnvlist_add_uint64_array(config, - ZPOOL_CONFIG_DDT_HISTOGRAM, - (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); - kmem_free(ddh, sizeof (ddt_histogram_t)); - - ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); - ddt_get_dedup_object_stats(spa, ddo); - fnvlist_add_uint64_array(config, - ZPOOL_CONFIG_DDT_OBJ_STATS, - (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); - kmem_free(ddo, sizeof (ddt_object_t)); - - dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); - ddt_get_dedup_stats(spa, dds); - fnvlist_add_uint64_array(config, - ZPOOL_CONFIG_DDT_STATS, - (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); - kmem_free(dds, sizeof (ddt_stat_t)); - } - - if (locked) - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - - return (config); -} - -/* - * Update all disk labels, generate a fresh config based on the current - * in-core state, and sync the global config cache (do not sync the config - * cache if this is a booting rootpool). 
- */ -void -spa_config_update(spa_t *spa, int what) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t txg; - int c; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - txg = spa_last_synced_txg(spa) + 1; - if (what == SPA_CONFIG_UPDATE_POOL) { - vdev_config_dirty(rvd); - } else { - /* - * If we have top-level vdevs that were added but have - * not yet been prepared for allocation, do that now. - * (It's safe now because the config cache is up to date, - * so it will be able to translate the new DVAs.) - * See comments in spa_vdev_add() for full details. - */ - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - /* - * Explicitly skip vdevs that are indirect or - * log vdevs that are being removed. The reason - * is that both of those can have vdev_ms_array - * set to 0 and we wouldn't want to change their - * metaslab size nor call vdev_expand() on them. - */ - if (!vdev_is_concrete(tvd) || - (tvd->vdev_islog && tvd->vdev_removing)) - continue; - - if (tvd->vdev_ms_array == 0) { - vdev_ashift_optimize(tvd); - vdev_metaslab_set_size(tvd); - } - vdev_expand(tvd, txg); - } - } - spa_config_exit(spa, SCL_ALL, FTAG); - - /* - * Wait for the mosconfig to be regenerated and synced. - */ - txg_wait_synced(spa->spa_dsl_pool, txg); - - /* - * Update the global config cache to reflect the new mosconfig. 
- */ - spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); - - if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c deleted file mode 100644 index 8ce780537abb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. - */ - -/* - * Routines to manage the on-disk persistent error log. - * - * Each pool stores a log of all logical data errors seen during normal - * operation. This is actually the union of two distinct logs: the last log, - * and the current log. All errors seen are logged to the current log. When a - * scrub completes, the current log becomes the last log, the last log is thrown - * out, and the current log is reinitialized. 
This way, if an error is somehow - * corrected, a new scrub will show that that it no longer exists, and will be - * deleted from the log when the scrub completes. - * - * The log is stored using a ZAP object whose key is a string form of the - * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an - * optional 'objset:object' human-readable string describing the data. When an - * error is first logged, this string will be empty, indicating that no name is - * known. This prevents us from having to issue a potentially large amount of - * I/O to discover the object name during an error path. Instead, we do the - * calculation when the data is requested, storing the result so future queries - * will be faster. - * - * This log is then shipped into an nvlist where the key is the dataset name and - * the value is the object name. Userland is then responsible for uniquifying - * this list and displaying it to the user. - */ - -#include -#include -#include -#include -#include - - -/* - * Convert a bookmark to a string. - */ -static void -bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) -{ - (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", - (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); -} - -/* - * Convert a string to a bookmark - */ -#ifdef _KERNEL -static void -name_to_bookmark(char *buf, zbookmark_phys_t *zb) -{ - zb->zb_objset = zfs_strtonum(buf, &buf); - ASSERT(*buf == ':'); - zb->zb_object = zfs_strtonum(buf + 1, &buf); - ASSERT(*buf == ':'); - zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); - ASSERT(*buf == ':'); - zb->zb_blkid = zfs_strtonum(buf + 1, &buf); - ASSERT(*buf == '\0'); -} -#endif - -/* - * Log an uncorrectable error to the persistent error log. We add it to the - * spa's list of pending errors. The changes are actually synced out to disk - * during spa_errlog_sync(). 
- */ -void -spa_log_error(spa_t *spa, zio_t *zio) -{ - zbookmark_phys_t *zb = &zio->io_logical->io_bookmark; - spa_error_entry_t search; - spa_error_entry_t *new; - avl_tree_t *tree; - avl_index_t where; - - /* - * If we are trying to import a pool, ignore any errors, as we won't be - * writing to the pool any time soon. - */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) - return; - - mutex_enter(&spa->spa_errlist_lock); - - /* - * If we have had a request to rotate the log, log it to the next list - * instead of the current one. - */ - if (spa->spa_scrub_active || spa->spa_scrub_finished) - tree = &spa->spa_errlist_scrub; - else - tree = &spa->spa_errlist_last; - - search.se_bookmark = *zb; - if (avl_find(tree, &search, &where) != NULL) { - mutex_exit(&spa->spa_errlist_lock); - return; - } - - new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); - new->se_bookmark = *zb; - avl_insert(tree, new, where); - - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Return the number of errors currently in the error log. This is actually the - * sum of both the last log and the current log, since we don't know the union - * of these logs until we reach userland. 
- */ -uint64_t -spa_get_errlog_size(spa_t *spa) -{ - uint64_t total = 0, count; - - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && - zap_count(spa->spa_meta_objset, spa->spa_errlog_last, - &count) == 0) - total += count; - mutex_exit(&spa->spa_errlog_lock); - - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); - - return (total); -} - -#ifdef _KERNEL -static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) -{ - zap_cursor_t zc; - zap_attribute_t za; - zbookmark_phys_t zb; - - if (obj == 0) - return (0); - - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - - if (*count == 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); - } - - name_to_bookmark(za.za_name, &zb); - - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); - } - - *count -= 1; - } - - zap_cursor_fini(&zc); - - return (0); -} - -static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) -{ - spa_error_entry_t *se; - - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { - - if (*count == 0) - return (SET_ERROR(ENOMEM)); - - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); - - *count -= 1; - } - - return (0); -} -#endif - -/* - * Copy all known errors to userland as an array of bookmarks. This is - * actually a union of the on-disk last log and current log, as well as any - * pending error requests. 
- * - * Because the act of reading the on-disk log could cause errors to be - * generated, we have two separate locks: one for the error log and one for the - * in-core error lists. We only need the error list lock to log and error, so - * we grab the error log lock while we read the on-disk logs, and only pick up - * the error list lock when we are finished. - */ -int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) -{ - int ret = 0; - -#ifdef _KERNEL - mutex_enter(&spa->spa_errlog_lock); - - ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); - - if (!ret && !spa->spa_scrub_finished) - ret = process_error_log(spa, spa->spa_errlog_last, uaddr, - count); - - mutex_enter(&spa->spa_errlist_lock); - if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, - count); - if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, - count); - mutex_exit(&spa->spa_errlist_lock); - - mutex_exit(&spa->spa_errlog_lock); -#endif - - return (ret); -} - -/* - * Called when a scrub completes. This simply set a bit which tells which AVL - * tree to add new errors. spa_errlog_sync() is responsible for actually - * syncing the changes to the underlying objects. - */ -void -spa_errlog_rotate(spa_t *spa) -{ - mutex_enter(&spa->spa_errlist_lock); - spa->spa_scrub_finished = B_TRUE; - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Discard any pending errors from the spa_t. Called when unloading a faulted - * pool, as the errors encountered during the open cannot be synced to disk. 
- */ -void -spa_errlog_drain(spa_t *spa) -{ - spa_error_entry_t *se; - void *cookie; - - mutex_enter(&spa->spa_errlist_lock); - - cookie = NULL; - while ((se = avl_destroy_nodes(&spa->spa_errlist_last, - &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - cookie = NULL; - while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, - &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Process a list of errors into the current on-disk log. - */ -static void -sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) -{ - spa_error_entry_t *se; - char buf[64]; - void *cookie; - - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); - - /* add errors to the current log */ - for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { - char *name = se->se_name ? se->se_name : ""; - - bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); - - (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); - } - - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - } -} - -/* - * Sync the error log out to disk. This is a little tricky because the act of - * writing the error log requires the spa_errlist_lock. So, we need to lock the - * error lists, take a copy of the lists, and then reinitialize them. Then, we - * drop the error list lock and take the error log lock, at which point we - * do the errlog processing. Then, if we encounter an I/O error during this - * process, we can successfully add the error to the list. Note that this will - * result in the perpetual recycling of errors, but it is an unlikely situation - * and not a performance critical operation. 
- */ -void -spa_errlog_sync(spa_t *spa, uint64_t txg) -{ - dmu_tx_t *tx; - avl_tree_t scrub, last; - int scrub_finished; - - mutex_enter(&spa->spa_errlist_lock); - - /* - * Bail out early under normal circumstances. - */ - if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && - avl_numnodes(&spa->spa_errlist_last) == 0 && - !spa->spa_scrub_finished) { - mutex_exit(&spa->spa_errlist_lock); - return; - } - - spa_get_errlists(spa, &last, &scrub); - scrub_finished = spa->spa_scrub_finished; - spa->spa_scrub_finished = B_FALSE; - - mutex_exit(&spa->spa_errlist_lock); - mutex_enter(&spa->spa_errlog_lock); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - /* - * Sync out the current list of errors. - */ - sync_error_list(spa, &last, &spa->spa_errlog_last, tx); - - /* - * Rotate the log if necessary. - */ - if (scrub_finished) { - if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); - spa->spa_errlog_last = spa->spa_errlog_scrub; - spa->spa_errlog_scrub = 0; - - sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); - } - - /* - * Sync out any pending scrub errors. - */ - sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); - - /* - * Update the MOS to reflect the new values. 
- */ - (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, - &spa->spa_errlog_last, tx); - (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, - &spa->spa_errlog_scrub, tx); - - dmu_tx_commit(tx); - - mutex_exit(&spa->spa_errlog_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c deleted file mode 100644 index 4b080fc48cdf..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ /dev/null @@ -1,628 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_comutil.h" -#ifdef _KERNEL -#include -#include -#endif - -/* - * Routines to manage the on-disk history log. 
- * - * The history log is stored as a dmu object containing - * tuples. - * - * Where "record nvlist" is a nvlist containing uint64_ts and strings, and - * "packed record length" is the packed length of the "record nvlist" stored - * as a little endian uint64_t. - * - * The log is implemented as a ring buffer, though the original creation - * of the pool ('zpool create') is never overwritten. - * - * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer - * of 'spa_history' stores the offsets for logging/retrieving history as - * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of - * where the 'zpool create' record is stored. This allows us to never - * overwrite the original creation of the pool. 'sh_phys_max_off' is the - * physical ending offset in bytes of the log. This tells you the length of - * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record - * is added, 'sh_eof' is incremented by the the size of the record. - * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). - * This is where the consumer should start reading from after reading in - * the 'zpool create' portion of the log. - * - * 'sh_records_lost' keeps track of how many records have been overwritten - * and permanently lost. 
- */ - -/* convert a logical offset to physical */ -static uint64_t -spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) -{ - uint64_t phys_len; - - phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; - return ((log_off - shpp->sh_pool_create_len) % phys_len - + shpp->sh_pool_create_len); -} - -void -spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) -{ - dmu_buf_t *dbp; - spa_history_phys_t *shpp; - objset_t *mos = spa->spa_meta_objset; - - ASSERT(spa->spa_history == 0); - spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, - SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, - sizeof (spa_history_phys_t), tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_HISTORY, sizeof (uint64_t), 1, - &spa->spa_history, tx) == 0); - - VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); - - shpp = dbp->db_data; - dmu_buf_will_dirty(dbp, tx); - - /* - * Figure out maximum size of history log. We set it at - * 0.1% of pool size, with a max of 1G and min of 128KB. - */ - shpp->sh_phys_max_off = - metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; - shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); - shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); - - dmu_buf_rele(dbp, FTAG); -} - -/* - * Change 'sh_bof' to the beginning of the next record. 
- */ -static int -spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) -{ - objset_t *mos = spa->spa_meta_objset; - uint64_t firstread, reclen, phys_bof; - char buf[sizeof (reclen)]; - int err; - - phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); - firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); - - if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf, DMU_READ_PREFETCH)) != 0) - return (err); - if (firstread != sizeof (reclen)) { - if ((err = dmu_read(mos, spa->spa_history, - shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread, DMU_READ_PREFETCH)) != 0) - return (err); - } - - reclen = LE_64(*((uint64_t *)buf)); - shpp->sh_bof += reclen + sizeof (reclen); - shpp->sh_records_lost++; - return (0); -} - -static int -spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, - dmu_tx_t *tx) -{ - uint64_t firstwrite, phys_eof; - objset_t *mos = spa->spa_meta_objset; - int err; - - ASSERT(MUTEX_HELD(&spa->spa_history_lock)); - - /* see if we need to reset logical BOF */ - while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - - (shpp->sh_eof - shpp->sh_bof) <= len) { - if ((err = spa_history_advance_bof(spa, shpp)) != 0) { - return (err); - } - } - - phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); - firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); - shpp->sh_eof += len; - dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); - - len -= firstwrite; - if (len > 0) { - /* write out the rest at the beginning of physical file */ - dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, - len, (char *)buf + firstwrite, tx); - } - - return (0); -} - -static char * -spa_history_zone(void) -{ -#ifdef _KERNEL - /* XXX: pr_hostname can be changed by default from within a jail! */ - if (jailed(curthread->td_ucred)) - return (curthread->td_ucred->cr_prison->pr_hostname); -#endif - return (NULL); -} - -/* - * Post a history sysevent. 
- * - * The nvlist_t* passed into this function will be transformed into a new - * nvlist where: - * - * 1. Nested nvlists will be flattened to a single level - * 2. Keys will have their names normalized (to remove any problematic - * characters, such as whitespace) - * - * The nvlist_t passed into this function will duplicated and should be freed - * by caller. - * - */ -static void -spa_history_log_notify(spa_t *spa, nvlist_t *nvl) -{ - nvlist_t *hist_nvl = fnvlist_alloc(); - uint64_t uint64; - char *string; - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, 
ZFS_EV_HIST_WHO, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64); - - spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT); - - nvlist_free(hist_nvl); -} - -/* - * Write out a history event. - */ -/*ARGSUSED*/ -static void -spa_history_log_sync(void *arg, dmu_tx_t *tx) -{ - nvlist_t *nvl = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *dbp; - spa_history_phys_t *shpp; - size_t reclen; - uint64_t le_len; - char *record_packed = NULL; - int ret; - - /* - * If we have an older pool that doesn't have a command - * history object, create it now. - */ - mutex_enter(&spa->spa_history_lock); - if (!spa->spa_history) - spa_history_create_obj(spa, tx); - mutex_exit(&spa->spa_history_lock); - - /* - * Get the offset of where we need to write via the bonus buffer. - * Update the offset when the write completes. - */ - VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; - - dmu_buf_will_dirty(dbp, tx); - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbp, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); - } -#endif - - fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); -#ifdef _KERNEL - fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); -#endif - if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { - zfs_dbgmsg("command: %s", - fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); - } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { - if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { - zfs_dbgmsg("txg %lld %s %s (id %llu) %s", - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), - fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), - fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), - fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); - } else { - zfs_dbgmsg("txg %lld %s %s", - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), 
- fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), - fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); - } - /* - * The history sysevent is posted only for internal history - * messages to show what has happened, not how it happened. For - * example, the following command: - * - * # zfs destroy -r tank/foo - * - * will result in one sysevent posted per dataset that is - * destroyed as a result of the command - which could be more - * than one event in total. By contrast, if the sysevent was - * posted as a result of the ZPOOL_HIST_CMD key being present - * it would result in only one sysevent being posted with the - * full command line arguments, requiring the consumer to know - * how to parse and understand zfs(1M) command invocations. - */ - spa_history_log_notify(spa, nvl); - } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { - zfs_dbgmsg("ioctl %s", - fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); - } - - record_packed = fnvlist_pack(nvl, &reclen); - - mutex_enter(&spa->spa_history_lock); - - /* write out the packed length as little endian */ - le_len = LE_64((uint64_t)reclen); - ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); - if (!ret) - ret = spa_history_write(spa, record_packed, reclen, shpp, tx); - - /* The first command is the create, which we keep forever */ - if (ret == 0 && shpp->sh_pool_create_len == 0 && - nvlist_exists(nvl, ZPOOL_HIST_CMD)) { - shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; - } - - mutex_exit(&spa->spa_history_lock); - fnvlist_pack_free(record_packed, reclen); - dmu_buf_rele(dbp, FTAG); - fnvlist_free(nvl); -} - -/* - * Write out a history event. 
- */ -int -spa_history_log(spa_t *spa, const char *msg) -{ - int err; - nvlist_t *nvl = fnvlist_alloc(); - - fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); - err = spa_history_log_nvl(spa, nvl); - fnvlist_free(nvl); - return (err); -} - -int -spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) -{ - int err = 0; - dmu_tx_t *tx; - nvlist_t *nvarg; - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) - return (EINVAL); - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) - return (SET_ERROR(EINVAL)); - - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - nvarg = fnvlist_dup(nvl); - if (spa_history_zone() != NULL) { - fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, - spa_history_zone()); - } - fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); - - /* Kick this off asynchronously; errors are ignored. */ - dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, - nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); - dmu_tx_commit(tx); - - /* spa_history_log_sync will free nvl */ - return (err); - -} - -/* - * Read out the command history. - */ -int -spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) -{ - objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *dbp; - uint64_t read_len, phys_read_off, phys_eof; - uint64_t leftover = 0; - spa_history_phys_t *shpp; - int err; - - /* - * If the command history doesn't exist (older pool), - * that's ok, just return ENOENT. - */ - if (!spa->spa_history) - return (SET_ERROR(ENOENT)); - - /* - * The history is logged asynchronously, so when they request - * the first chunk of history, make sure everything has been - * synced to disk so that we get it. 
- */ - if (*offp == 0 && spa_writeable(spa)) - txg_wait_synced(spa_get_dsl(spa), 0); - - if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) - return (err); - shpp = dbp->db_data; - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbp, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); - } -#endif - - mutex_enter(&spa->spa_history_lock); - phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); - - if (*offp < shpp->sh_pool_create_len) { - /* read in just the zpool create history */ - phys_read_off = *offp; - read_len = MIN(*len, shpp->sh_pool_create_len - - phys_read_off); - } else { - /* - * Need to reset passed in offset to BOF if the passed in - * offset has since been overwritten. - */ - *offp = MAX(*offp, shpp->sh_bof); - phys_read_off = spa_history_log_to_phys(*offp, shpp); - - /* - * Read up to the minimum of what the user passed down or - * the EOF (physical or logical). If we hit physical EOF, - * use 'leftover' to read from the physical BOF. - */ - if (phys_read_off <= phys_eof) { - read_len = MIN(*len, phys_eof - phys_read_off); - } else { - read_len = MIN(*len, - shpp->sh_phys_max_off - phys_read_off); - if (phys_read_off + *len > shpp->sh_phys_max_off) { - leftover = MIN(*len - read_len, - phys_eof - shpp->sh_pool_create_len); - } - } - } - - /* offset for consumer to use next */ - *offp += read_len + leftover; - - /* tell the consumer how much you actually read */ - *len = read_len + leftover; - - if (read_len == 0) { - mutex_exit(&spa->spa_history_lock); - dmu_buf_rele(dbp, FTAG); - return (0); - } - - err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, - DMU_READ_PREFETCH); - if (leftover && err == 0) { - err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len, DMU_READ_PREFETCH); - } - mutex_exit(&spa->spa_history_lock); - - dmu_buf_rele(dbp, FTAG); - return (err); -} - -/* - * The nvlist will be consumed by this call. 
- */ -static void -log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, - dmu_tx_t *tx, const char *fmt, va_list adx) -{ - char *msg; - va_list adx2; - - /* - * If this is part of creating a pool, not everything is - * initialized yet, so don't bother logging the internal events. - * Likewise if the pool is not writeable. - */ - if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { - fnvlist_free(nvl); - return; - } - - va_copy(adx2, adx); - - msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); - (void) vsprintf(msg, fmt, adx2); - fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); - strfree(msg); - - va_end(adx2); - - fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); - fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); - - if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(nvl, tx); - } else { - dsl_sync_task_nowait(spa_get_dsl(spa), - spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); - } - /* spa_history_log_sync() will free nvl */ -} - -void -spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) -{ - dmu_tx_t *htx = tx; - va_list adx; - - /* create a tx if we didn't get one */ - if (tx == NULL) { - htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(htx, TXG_WAIT) != 0) { - dmu_tx_abort(htx); - return; - } - } - - va_start(adx, fmt); - log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); - va_end(adx); - - /* if we didn't get a tx from the caller, commit the one we made */ - if (tx == NULL) - dmu_tx_commit(htx); -} - -void -spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) 
-{ - va_list adx; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - nvlist_t *nvl = fnvlist_alloc(); - - ASSERT(tx != NULL); - - dsl_dataset_name(ds, namebuf); - fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); - fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); - - va_start(adx, fmt); - log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); - va_end(adx); -} - -void -spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) -{ - va_list adx; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - nvlist_t *nvl = fnvlist_alloc(); - - ASSERT(tx != NULL); - - dsl_dir_name(dd, namebuf); - fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); - fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, - dsl_dir_phys(dd)->dd_head_dataset_obj); - - va_start(adx, fmt); - log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); - va_end(adx); -} - -void -spa_history_log_version(spa_t *spa, const char *operation) -{ - spa_history_log_internal(spa, operation, NULL, - "pool version %llu; software version %llu/%llu; uts %s %s %s %s", - (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, - utsname.nodename, utsname.release, utsname.version, - utsname.machine); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c deleted file mode 100644 index 0706767a9d1f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ /dev/null @@ -1,2523 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_prop.h" -#include - -#if defined(__FreeBSD__) && defined(_KERNEL) -#include -#include -#endif - -/* - * SPA locking - * - * There are four basic locks for managing spa_t structures: - * - * spa_namespace_lock (global mutex) - * - * This lock must be acquired to do any of the following: - * - * - Lookup a spa_t by name - * - Add or remove a spa_t from the namespace - * - Increase spa_refcount from non-zero - * - Check if spa_refcount is zero - * - Rename a spa_t - * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export - * - * It does not need to handle recursion. A create or destroy may - * reference objects (files or zvols) in other pools, but by - * definition they must have an existing reference, and will never need - * to lookup a spa_t by name. 
- * - * spa_refcount (per-spa zfs_refcount_t protected by mutex) - * - * This reference count keep track of any active users of the spa_t. The - * spa_t cannot be destroyed or freed while this is non-zero. Internally, - * the refcount is never really 'zero' - opening a pool implicitly keeps - * some references in the DMU. Internally we check against spa_minref, but - * present the image of a zero/non-zero value to consumers. - * - * spa_config_lock[] (per-spa array of rwlocks) - * - * This protects the spa_t from config changes, and must be held in - * the following circumstances: - * - * - RW_READER to perform I/O to the spa - * - RW_WRITER to change the vdev config - * - * The locking order is fairly straightforward: - * - * spa_namespace_lock -> spa_refcount - * - * The namespace lock must be acquired to increase the refcount from 0 - * or to check if it is zero. - * - * spa_refcount -> spa_config_lock[] - * - * There must be at least one valid reference on the spa_t to acquire - * the config lock. - * - * spa_namespace_lock -> spa_config_lock[] - * - * The namespace lock must always be taken before the config lock. - * - * - * The spa_namespace_lock can be acquired directly and is globally visible. - * - * The namespace is manipulated using the following functions, all of which - * require the spa_namespace_lock to be held. - * - * spa_lookup() Lookup a spa_t by name. - * - * spa_add() Create a new spa_t in the namespace. - * - * spa_remove() Remove a spa_t from the namespace. This also - * frees up any memory associated with the spa_t. - * - * spa_next() Returns the next spa_t in the system, or the - * first if NULL is passed. - * - * spa_evict_all() Shutdown and remove all spa_t structures in - * the system. - * - * spa_guid_exists() Determine whether a pool/device guid exists. - * - * The spa_refcount is manipulated using the following functions: - * - * spa_open_ref() Adds a reference to the given spa_t. 
Must be - * called with spa_namespace_lock held if the - * refcount is currently zero. - * - * spa_close() Remove a reference from the spa_t. This will - * not free the spa_t or remove it from the - * namespace. No locking is required. - * - * spa_refcount_zero() Returns true if the refcount is currently - * zero. Must be called with spa_namespace_lock - * held. - * - * The spa_config_lock[] is an array of rwlocks, ordered as follows: - * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. - * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). - * - * To read the configuration, it suffices to hold one of these locks as reader. - * To modify the configuration, you must hold all locks as writer. To modify - * vdev state without altering the vdev tree's topology (e.g. online/offline), - * you must hold SCL_STATE and SCL_ZIO as writer. - * - * We use these distinct config locks to avoid recursive lock entry. - * For example, spa_sync() (which holds SCL_CONFIG as reader) induces - * block allocations (SCL_ALLOC), which may require reading space maps - * from disk (dmu_read() -> zio_read() -> SCL_ZIO). - * - * The spa config locks cannot be normal rwlocks because we need the - * ability to hand off ownership. For example, SCL_ZIO is acquired - * by the issuing thread and later released by an interrupt thread. - * They do, however, obey the usual write-wanted semantics to prevent - * writer (i.e. system administrator) starvation. - * - * The lock acquisition rules are as follows: - * - * SCL_CONFIG - * Protects changes to the vdev tree topology, such as vdev - * add/remove/attach/detach. Protects the dirty config list - * (spa_config_dirty_list) and the set of spares and l2arc devices. - * - * SCL_STATE - * Protects changes to pool state and vdev state, such as vdev - * online/offline/fault/degrade/clear. Protects the dirty state list - * (spa_state_dirty_list) and global pool state (spa_state). 
- * - * SCL_ALLOC - * Protects changes to metaslab groups and classes. - * Held as reader by metaslab_alloc() and metaslab_claim(). - * - * SCL_ZIO - * Held by bp-level zios (those which have no io_vd upon entry) - * to prevent changes to the vdev tree. The bp-level zio implicitly - * protects all of its vdev child zios, which do not hold SCL_ZIO. - * - * SCL_FREE - * Protects changes to metaslab groups and classes. - * Held as reader by metaslab_free(). SCL_FREE is distinct from - * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free - * blocks in zio_done() while another i/o that holds either - * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. - * - * SCL_VDEV - * Held as reader to prevent changes to the vdev tree during trivial - * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the - * other locks, and lower than all of them, to ensure that it's safe - * to acquire regardless of caller context. - * - * In addition, the following rules apply: - * - * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. - * The lock ordering is SCL_CONFIG > spa_props_lock. - * - * (b) I/O operations on leaf vdevs. For any zio operation that takes - * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), - * or zio_write_phys() -- the caller must ensure that the config cannot - * cannot change in the interim, and that the vdev cannot be reopened. - * SCL_STATE as reader suffices for both. - * - * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). - * - * spa_vdev_enter() Acquire the namespace lock and the config lock - * for writing. - * - * spa_vdev_exit() Release the config lock, wait for all I/O - * to complete, sync the updated configs to the - * cache, and release the namespace lock. - * - * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). 
- * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual - * locking is, always, based on spa_namespace_lock and spa_config_lock[]. - */ - -static avl_tree_t spa_namespace_avl; -kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; -static int spa_active_count; -int spa_max_replication_override = SPA_DVAS_PER_BP; - -static kmutex_t spa_spare_lock; -static avl_tree_t spa_spare_avl; -static kmutex_t spa_l2cache_lock; -static avl_tree_t spa_l2cache_avl; - -kmem_cache_t *spa_buffer_pool; -int spa_mode_global; - -#ifdef ZFS_DEBUG -/* - * Everything except dprintf, spa, and indirect_remap is on by default - * in debug builds. - */ -int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP); -#else -int zfs_flags = 0; -#endif - -/* - * zfs_recover can be set to nonzero to attempt to recover from - * otherwise-fatal errors, typically caused by on-disk corruption. When - * set, calls to zfs_panic_recover() will turn into warning messages. - * This should only be used as a last resort, as it typically results - * in leaked space, or worse. - */ -boolean_t zfs_recover = B_FALSE; - -/* - * If destroy encounters an EIO while reading metadata (e.g. indirect - * blocks), space referenced by the missing metadata can not be freed. - * Normally this causes the background destroy to become "stalled", as - * it is unable to make forward progress. While in this stalled state, - * all remaining space to free from the error-encountering filesystem is - * "temporarily leaked". Set this flag to cause it to ignore the EIO, - * permanently leak the space from indirect blocks that can not be read, - * and continue to free everything else that it can. - * - * The default, "stalling" behavior is useful if the storage partially - * fails (i.e. some but not all i/os fail), and then later recovers. 
In - * this case, we will be able to continue pool operations while it is - * partially failed, and when it recovers, we can continue to free the - * space, with no leaks. However, note that this case is actually - * fairly rare. - * - * Typically pools either (a) fail completely (but perhaps temporarily, - * e.g. a top-level vdev going offline), or (b) have localized, - * permanent errors (e.g. disk returns the wrong data due to bit flip or - * firmware bug). In case (a), this setting does not matter because the - * pool will be suspended and the sync thread will not be able to make - * forward progress regardless. In case (b), because the error is - * permanent, the best we can do is leak the minimum amount of space, - * which is what setting this flag will do. Therefore, it is reasonable - * for this flag to normally be set, but we chose the more conservative - * approach of not setting it, so that there is no possibility of - * leaking space in the "partial temporary" failure case. - */ -boolean_t zfs_free_leak_on_eio = B_FALSE; - -/* - * Expiration time in milliseconds. This value has two meanings. First it is - * used to determine when the spa_deadman() logic should fire. By default the - * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. - * Secondly, the value determines if an I/O is considered "hung". Any I/O that - * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting - * in a system panic. - */ -uint64_t zfs_deadman_synctime_ms = 1000000ULL; - -/* - * Check time in milliseconds. This defines the frequency at which we check - * for hung I/O. - */ -uint64_t zfs_deadman_checktime_ms = 5000ULL; - -/* - * Default value of -1 for zfs_deadman_enabled is resolved in - * zfs_deadman_init() - */ -int zfs_deadman_enabled = -1; - -/* - * The worst case is single-sector max-parity RAID-Z blocks, in which - * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) - * times the size; so just assume that. 
Add to this the fact that - * we can have up to 3 DVAs per bp, and one more factor of 2 because - * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, - * the worst case is: - * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 - */ -int spa_asize_inflation = 24; - -#if defined(__FreeBSD__) && defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, - "Try to recover from otherwise-fatal errors."); - -static int -sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) -{ - int err, val; - - val = zfs_flags; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - /* - * ZFS_DEBUG_MODIFY must be enabled prior to boot so all - * arc buffers in the system have the necessary additional - * checksum data. However, it is safe to disable at any - * time. - */ - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - val &= ~ZFS_DEBUG_MODIFY; - zfs_flags = val; - - return (0); -} - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RWTUN, - &zfs_deadman_synctime_ms, 0, - "Stalled ZFS I/O expiration time in milliseconds"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RWTUN, - &zfs_deadman_checktime_ms, 0, - "Period of checks for stalled ZFS I/O in milliseconds"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RWTUN, - &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, - &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); -#endif - -#ifndef illumos -#ifdef _KERNEL -static void -zfs_deadman_init() -{ - /* - * If we are not i386 or amd64 or in a virtual machine, - * disable ZFS deadman thread by default - */ - if (zfs_deadman_enabled == -1) { -#if 
defined(__amd64__) || defined(__i386__) - zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; -#else - zfs_deadman_enabled = 0; -#endif - } -} -#endif /* _KERNEL */ -#endif /* !illumos */ - -/* - * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in - * the pool to be consumed. This ensures that we don't run the pool - * completely out of space, due to unaccounted changes (e.g. to the MOS). - * It also limits the worst-case time to allocate space. If we have - * less than this amount of free space, most ZPL operations (e.g. write, - * create) will return ENOSPC. - * - * Certain operations (e.g. file removal, most administrative actions) can - * use half the slop space. They will only return ENOSPC if less than half - * the slop space is free. Typically, once the pool has less than the slop - * space free, the user will use these operations to free up space in the pool. - * These are the operations that call dsl_pool_adjustedsize() with the netfree - * argument set to TRUE. - * - * Operations that are almost guaranteed to free up space in the absence of - * a pool checkpoint can use up to three quarters of the slop space - * (e.g zfs destroy). - * - * A very restricted set of operations are always permitted, regardless of - * the amount of free space. These are the operations that call - * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net - * increase in the amount of space used, it is possible to run the pool - * completely out of space, causing it to be permanently read-only. - * - * Note that on very small pools, the slop space will be larger than - * 3.2%, in an effort to have it be at least spa_min_slop (128MB), - * but we never allow it to be more than half the pool size. - * - * See also the comments in zfs_space_check_t. 
- */ -int spa_slop_shift = 5; -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN, - &spa_slop_shift, 0, - "Shift value of reserved space (1/(2^spa_slop_shift))."); -uint64_t spa_min_slop = 128 * 1024 * 1024; -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, - &spa_min_slop, 0, - "Minimal value of reserved space"); - -int spa_allocators = 4; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN, - &spa_allocators, 0, - "Number of allocators per metaslab group"); - -/*PRINTFLIKE2*/ -void -spa_load_failed(spa_t *spa, const char *fmt, ...) -{ - va_list adx; - char buf[256]; - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, - spa->spa_trust_config ? "trusted" : "untrusted", buf); -} - -/*PRINTFLIKE2*/ -void -spa_load_note(spa_t *spa, const char *fmt, ...) -{ - va_list adx; - char buf[256]; - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, - spa->spa_trust_config ? "trusted" : "untrusted", buf); -} - -/* - * By default dedup and user data indirects land in the special class - */ -int zfs_ddt_data_is_special = B_TRUE; -int zfs_user_indirect_is_special = B_TRUE; - -/* - * The percentage of special class final space reserved for metadata only. - * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only - * let metadata into the class. 
- */ -int zfs_special_class_metadata_reserve_pct = 25; - -#if defined(__FreeBSD__) && defined(_KERNEL) -SYSCTL_INT(_vfs_zfs, OID_AUTO, ddt_data_is_special, CTLFLAG_RWTUN, - &zfs_ddt_data_is_special, 0, - "Whether DDT data is eligible for the special class vdevs"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, user_indirect_is_special, CTLFLAG_RWTUN, - &zfs_user_indirect_is_special, 0, - "Whether indirect blocks are eligible for the special class vdevs"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, special_class_metadata_reserve_pct, - CTLFLAG_RWTUN, &zfs_special_class_metadata_reserve_pct, 0, - "Percentage of space in the special class reserved solely for metadata"); -#endif - -/* - * ========================================================================== - * SPA config locking - * ========================================================================== - */ -static void -spa_config_lock_init(spa_t *spa) -{ - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); - zfs_refcount_create_untracked(&scl->scl_count); - scl->scl_writer = NULL; - scl->scl_write_wanted = 0; - } -} - -static void -spa_config_lock_destroy(spa_t *spa) -{ - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - mutex_destroy(&scl->scl_lock); - cv_destroy(&scl->scl_cv); - zfs_refcount_destroy(&scl->scl_count); - ASSERT(scl->scl_writer == NULL); - ASSERT(scl->scl_write_wanted == 0); - } -} - -int -spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) -{ - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - mutex_enter(&scl->scl_lock); - if (rw == RW_READER) { - if (scl->scl_writer || scl->scl_write_wanted) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - } else { - ASSERT(scl->scl_writer != 
curthread); - if (!zfs_refcount_is_zero(&scl->scl_count)) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - scl->scl_writer = curthread; - } - (void) zfs_refcount_add(&scl->scl_count, tag); - mutex_exit(&scl->scl_lock); - } - return (1); -} - -void -spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) -{ - int wlocks_held = 0; - - ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); - - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (scl->scl_writer == curthread) - wlocks_held |= (1 << i); - if (!(locks & (1 << i))) - continue; - mutex_enter(&scl->scl_lock); - if (rw == RW_READER) { - while (scl->scl_writer || scl->scl_write_wanted) { - cv_wait(&scl->scl_cv, &scl->scl_lock); - } - } else { - ASSERT(scl->scl_writer != curthread); - while (!zfs_refcount_is_zero(&scl->scl_count)) { - scl->scl_write_wanted++; - cv_wait(&scl->scl_cv, &scl->scl_lock); - scl->scl_write_wanted--; - } - scl->scl_writer = curthread; - } - (void) zfs_refcount_add(&scl->scl_count, tag); - mutex_exit(&scl->scl_lock); - } - ASSERT3U(wlocks_held, <=, locks); -} - -void -spa_config_exit(spa_t *spa, int locks, void *tag) -{ - for (int i = SCL_LOCKS - 1; i >= 0; i--) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - mutex_enter(&scl->scl_lock); - ASSERT(!zfs_refcount_is_zero(&scl->scl_count)); - if (zfs_refcount_remove(&scl->scl_count, tag) == 0) { - ASSERT(scl->scl_writer == NULL || - scl->scl_writer == curthread); - scl->scl_writer = NULL; /* OK in either case */ - cv_broadcast(&scl->scl_cv); - } - mutex_exit(&scl->scl_lock); - } -} - -int -spa_config_held(spa_t *spa, int locks, krw_t rw) -{ - int locks_held = 0; - - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - if ((rw == RW_READER && - !zfs_refcount_is_zero(&scl->scl_count)) || - (rw == RW_WRITER && 
scl->scl_writer == curthread)) - locks_held |= 1 << i; - } - - return (locks_held); -} - -/* - * ========================================================================== - * SPA namespace functions - * ========================================================================== - */ - -/* - * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. - * Returns NULL if no matching spa_t is found. - */ -spa_t * -spa_lookup(const char *name) -{ - static spa_t search; /* spa_t is large; don't allocate on stack */ - spa_t *spa; - avl_index_t where; - char *cp; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); - - /* - * If it's a full dataset name, figure out the pool name and - * just use that. - */ - cp = strpbrk(search.spa_name, "/@#"); - if (cp != NULL) - *cp = '\0'; - - spa = avl_find(&spa_namespace_avl, &search, &where); - - return (spa); -} - -/* - * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. - * If the zfs_deadman_enabled flag is set then it inspects all vdev queues - * looking for potentially hung I/Os. - */ -static void -spa_deadman(void *arg, int pending) -{ - spa_t *spa = arg; - - /* - * Disable the deadman timer if the pool is suspended. - */ - if (spa_suspended(spa)) { -#ifdef illumos - VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); -#else - /* Nothing. just don't schedule any future callouts. 
*/ -#endif - return; - } - - zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", - (gethrtime() - spa->spa_sync_starttime) / NANOSEC, - ++spa->spa_deadman_calls); - if (zfs_deadman_enabled) - vdev_deadman(spa->spa_root_vdev); -#ifdef __FreeBSD__ -#ifdef _KERNEL - callout_schedule(&spa->spa_deadman_cycid, - hz * zfs_deadman_checktime_ms / MILLISEC); -#endif -#endif -} - -#if defined(__FreeBSD__) && defined(_KERNEL) -static void -spa_deadman_timeout(void *arg) -{ - spa_t *spa = arg; - - taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task); -} -#endif - -/* - * Create an uninitialized spa_t with the given name. Requires - * spa_namespace_lock. The caller must ensure that the spa_t doesn't already - * exist by calling spa_lookup() first. - */ -spa_t * -spa_add(const char *name, nvlist_t *config, const char *altroot) -{ - spa_t *spa; - spa_config_dirent_t *dp; -#ifdef illumos - cyc_handler_t hdlr; - cyc_time_t when; -#endif - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); - - mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); - - cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, 
NULL); - cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); - - for (int t = 0; t < TXG_SIZE; t++) - bplist_create(&spa->spa_free_bplist[t]); - - (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); - spa->spa_state = POOL_STATE_UNINITIALIZED; - spa->spa_freeze_txg = UINT64_MAX; - spa->spa_final_txg = UINT64_MAX; - spa->spa_load_max_txg = UINT64_MAX; - spa->spa_proc = &p0; - spa->spa_proc_state = SPA_PROC_NONE; - spa->spa_trust_config = B_TRUE; - -#ifdef illumos - hdlr.cyh_func = spa_deadman; - hdlr.cyh_arg = spa; - hdlr.cyh_level = CY_LOW_LEVEL; -#endif - - spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); - -#ifdef illumos - /* - * This determines how often we need to check for hung I/Os after - * the cyclic has already fired. Since checking for hung I/Os is - * an expensive operation we don't want to check too frequently. - * Instead wait for 5 seconds before checking again. - */ - when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); - when.cyt_when = CY_INFINITY; - mutex_enter(&cpu_lock); - spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); - mutex_exit(&cpu_lock); -#else /* !illumos */ -#ifdef _KERNEL - /* - * callout(9) does not provide a way to initialize a callout with - * a function and an argument, so we use callout_reset() to schedule - * the callout in the very distant future. Even if that event ever - * fires, it should be okayas we won't have any active zio-s. - * But normally spa_sync() will reschedule the callout with a proper - * timeout. - * callout(9) does not allow the callback function to sleep but - * vdev_deadman() needs to acquire vq_lock and illumos mutexes are - * emulated using sx(9). For this reason spa_deadman_timeout() - * will schedule spa_deadman() as task on a taskqueue that allows - * sleeping. 
- */ - TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa); - callout_init(&spa->spa_deadman_cycid, 1); - callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0, - spa_deadman_timeout, spa, 0); -#endif -#endif - zfs_refcount_create(&spa->spa_refcount); - spa_config_lock_init(spa); - - avl_add(&spa_namespace_avl, spa); - - /* - * Set the alternate root, if there is one. - */ - if (altroot) { - spa->spa_root = spa_strdup(altroot); - spa_active_count++; - } - - spa->spa_alloc_count = spa_allocators; - spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * - sizeof (kmutex_t), KM_SLEEP); - spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * - sizeof (avl_tree_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); - avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); - } - - /* - * Every pool starts with the default cachefile - */ - list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), - offsetof(spa_config_dirent_t, scd_link)); - - dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); - dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); - list_insert_head(&spa->spa_config_list, dp); - - VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - - if (config != NULL) { - nvlist_t *features; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, - &features) == 0) { - VERIFY(nvlist_dup(features, &spa->spa_label_features, - 0) == 0); - } - - VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); - } - - if (spa->spa_label_features == NULL) { - VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - } - - spa->spa_min_ashift = INT_MAX; - spa->spa_max_ashift = 0; - - /* - * As a pool is being created, treat all features as disabled by - * setting SPA_FEATURE_DISABLED for all entries in the feature - * refcount cache. 
- */ - for (int i = 0; i < SPA_FEATURES; i++) { - spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; - } - - list_create(&spa->spa_leaf_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_leaf_node)); - - return (spa); -} - -/* - * Removes a spa_t from the namespace, freeing up any memory used. Requires - * spa_namespace_lock. This is called only after the spa_t has been closed and - * deactivated. - */ -void -spa_remove(spa_t *spa) -{ - spa_config_dirent_t *dp; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); - - nvlist_free(spa->spa_config_splitting); - - avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); - - if (spa->spa_root) { - spa_strfree(spa->spa_root); - spa_active_count--; - } - - while ((dp = list_head(&spa->spa_config_list)) != NULL) { - list_remove(&spa->spa_config_list, dp); - if (dp->scd_path != NULL) - spa_strfree(dp->scd_path); - kmem_free(dp, sizeof (spa_config_dirent_t)); - } - - for (int i = 0; i < spa->spa_alloc_count; i++) { - avl_destroy(&spa->spa_alloc_trees[i]); - mutex_destroy(&spa->spa_alloc_locks[i]); - } - kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * - sizeof (kmutex_t)); - kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * - sizeof (avl_tree_t)); - - list_destroy(&spa->spa_config_list); - list_destroy(&spa->spa_leaf_list); - - nvlist_free(spa->spa_label_features); - nvlist_free(spa->spa_load_info); - nvlist_free(spa->spa_feat_stats); - spa_config_set(spa, NULL); - -#ifdef illumos - mutex_enter(&cpu_lock); - if (spa->spa_deadman_cycid != CYCLIC_NONE) - cyclic_remove(spa->spa_deadman_cycid); - mutex_exit(&cpu_lock); - spa->spa_deadman_cycid = CYCLIC_NONE; -#else /* !illumos */ -#ifdef _KERNEL - callout_drain(&spa->spa_deadman_cycid); - taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task); -#endif -#endif - - zfs_refcount_destroy(&spa->spa_refcount); - - spa_config_lock_destroy(spa); - - for 
(int t = 0; t < TXG_SIZE; t++) - bplist_destroy(&spa->spa_free_bplist[t]); - - zio_checksum_templates_free(spa); - - cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_evicting_os_cv); - cv_destroy(&spa->spa_proc_cv); - cv_destroy(&spa->spa_scrub_io_cv); - cv_destroy(&spa->spa_suspend_cv); - - mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_errlog_lock); - mutex_destroy(&spa->spa_evicting_os_lock); - mutex_destroy(&spa->spa_history_lock); - mutex_destroy(&spa->spa_proc_lock); - mutex_destroy(&spa->spa_props_lock); - mutex_destroy(&spa->spa_cksum_tmpls_lock); - mutex_destroy(&spa->spa_scrub_lock); - mutex_destroy(&spa->spa_suspend_lock); - mutex_destroy(&spa->spa_vdev_top_lock); - mutex_destroy(&spa->spa_feat_stats_lock); - - kmem_free(spa, sizeof (spa_t)); -} - -/* - * Given a pool, return the next pool in the namespace, or NULL if there is - * none. If 'prev' is NULL, return the first pool. - */ -spa_t * -spa_next(spa_t *prev) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (prev) - return (AVL_NEXT(&spa_namespace_avl, prev)); - else - return (avl_first(&spa_namespace_avl)); -} - -/* - * ========================================================================== - * SPA refcount functions - * ========================================================================== - */ - -/* - * Add a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. - */ -void -spa_open_ref(spa_t *spa, void *tag) -{ - ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); - (void) zfs_refcount_add(&spa->spa_refcount, tag); -} - -/* - * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. 
- */ -void -spa_close(spa_t *spa, void *tag) -{ - ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); - (void) zfs_refcount_remove(&spa->spa_refcount, tag); -} - -/* - * Remove a reference to the given spa_t held by a dsl dir that is - * being asynchronously released. Async releases occur from a taskq - * performing eviction of dsl datasets and dirs. The namespace lock - * isn't held and the hold by the object being evicted may contribute to - * spa_minref (e.g. dataset or directory released during pool export), - * so the asserts in spa_close() do not apply. - */ -void -spa_async_close(spa_t *spa, void *tag) -{ - (void) zfs_refcount_remove(&spa->spa_refcount, tag); -} - -/* - * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool - */ -boolean_t -spa_refcount_zero(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); -} - -/* - * ========================================================================== - * SPA spare and l2cache tracking - * ========================================================================== - */ - -/* - * Hot spares and cache devices are tracked using the same code below, - * for 'auxiliary' devices. 
- */ - -typedef struct spa_aux { - uint64_t aux_guid; - uint64_t aux_pool; - avl_node_t aux_avl; - int aux_count; -} spa_aux_t; - -static inline int -spa_aux_compare(const void *a, const void *b) -{ - const spa_aux_t *sa = (const spa_aux_t *)a; - const spa_aux_t *sb = (const spa_aux_t *)b; - - return (AVL_CMP(sa->aux_guid, sb->aux_guid)); -} - -void -spa_aux_add(vdev_t *vd, avl_tree_t *avl) -{ - avl_index_t where; - spa_aux_t search; - spa_aux_t *aux; - - search.aux_guid = vd->vdev_guid; - if ((aux = avl_find(avl, &search, &where)) != NULL) { - aux->aux_count++; - } else { - aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); - aux->aux_guid = vd->vdev_guid; - aux->aux_count = 1; - avl_insert(avl, aux, where); - } -} - -void -spa_aux_remove(vdev_t *vd, avl_tree_t *avl) -{ - spa_aux_t search; - spa_aux_t *aux; - avl_index_t where; - - search.aux_guid = vd->vdev_guid; - aux = avl_find(avl, &search, &where); - - ASSERT(aux != NULL); - - if (--aux->aux_count == 0) { - avl_remove(avl, aux); - kmem_free(aux, sizeof (spa_aux_t)); - } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { - aux->aux_pool = 0ULL; - } -} - -boolean_t -spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) -{ - spa_aux_t search, *found; - - search.aux_guid = guid; - found = avl_find(avl, &search, NULL); - - if (pool) { - if (found) - *pool = found->aux_pool; - else - *pool = 0ULL; - } - - if (refcnt) { - if (found) - *refcnt = found->aux_count; - else - *refcnt = 0; - } - - return (found != NULL); -} - -void -spa_aux_activate(vdev_t *vd, avl_tree_t *avl) -{ - spa_aux_t search, *found; - avl_index_t where; - - search.aux_guid = vd->vdev_guid; - found = avl_find(avl, &search, &where); - ASSERT(found != NULL); - ASSERT(found->aux_pool == 0ULL); - - found->aux_pool = spa_guid(vd->vdev_spa); -} - -/* - * Spares are tracked globally due to the following constraints: - * - * - A spare may be part of multiple pools. 
- * - A spare may be added to a pool even if it's actively in use within - * another pool. - * - A spare in use in any pool can only be the source of a replacement if - * the target is a spare in the same pool. - * - * We keep track of all spares on the system through the use of a reference - * counted AVL tree. When a vdev is added as a spare, or used as a replacement - * spare, then we bump the reference count in the AVL tree. In addition, we set - * the 'vdev_isspare' member to indicate that the device is a spare (active or - * inactive). When a spare is made active (used to replace a device in the - * pool), we also keep track of which pool its been made a part of. - * - * The 'spa_spare_lock' protects the AVL tree. These functions are normally - * called under the spa_namespace lock as part of vdev reconfiguration. The - * separate spare lock exists for the status query path, which does not need to - * be completely consistent with respect to other vdev configuration changes. - */ - -static int -spa_spare_compare(const void *a, const void *b) -{ - return (spa_aux_compare(a, b)); -} - -void -spa_spare_add(vdev_t *vd) -{ - mutex_enter(&spa_spare_lock); - ASSERT(!vd->vdev_isspare); - spa_aux_add(vd, &spa_spare_avl); - vd->vdev_isspare = B_TRUE; - mutex_exit(&spa_spare_lock); -} - -void -spa_spare_remove(vdev_t *vd) -{ - mutex_enter(&spa_spare_lock); - ASSERT(vd->vdev_isspare); - spa_aux_remove(vd, &spa_spare_avl); - vd->vdev_isspare = B_FALSE; - mutex_exit(&spa_spare_lock); -} - -boolean_t -spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) -{ - boolean_t found; - - mutex_enter(&spa_spare_lock); - found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); - mutex_exit(&spa_spare_lock); - - return (found); -} - -void -spa_spare_activate(vdev_t *vd) -{ - mutex_enter(&spa_spare_lock); - ASSERT(vd->vdev_isspare); - spa_aux_activate(vd, &spa_spare_avl); - mutex_exit(&spa_spare_lock); -} - -/* - * Level 2 ARC devices are tracked globally for the same reasons 
as spares. - * Cache devices currently only support one pool per cache device, and so - * for these devices the aux reference count is currently unused beyond 1. - */ - -static int -spa_l2cache_compare(const void *a, const void *b) -{ - return (spa_aux_compare(a, b)); -} - -void -spa_l2cache_add(vdev_t *vd) -{ - mutex_enter(&spa_l2cache_lock); - ASSERT(!vd->vdev_isl2cache); - spa_aux_add(vd, &spa_l2cache_avl); - vd->vdev_isl2cache = B_TRUE; - mutex_exit(&spa_l2cache_lock); -} - -void -spa_l2cache_remove(vdev_t *vd) -{ - mutex_enter(&spa_l2cache_lock); - ASSERT(vd->vdev_isl2cache); - spa_aux_remove(vd, &spa_l2cache_avl); - vd->vdev_isl2cache = B_FALSE; - mutex_exit(&spa_l2cache_lock); -} - -boolean_t -spa_l2cache_exists(uint64_t guid, uint64_t *pool) -{ - boolean_t found; - - mutex_enter(&spa_l2cache_lock); - found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); - mutex_exit(&spa_l2cache_lock); - - return (found); -} - -void -spa_l2cache_activate(vdev_t *vd) -{ - mutex_enter(&spa_l2cache_lock); - ASSERT(vd->vdev_isl2cache); - spa_aux_activate(vd, &spa_l2cache_avl); - mutex_exit(&spa_l2cache_lock); -} - -/* - * ========================================================================== - * SPA vdev locking - * ========================================================================== - */ - -/* - * Lock the given spa_t for the purpose of adding or removing a vdev. - * Grabs the global spa_namespace_lock plus the spa config lock for writing. - * It returns the next transaction group for the spa_t. - */ -uint64_t -spa_vdev_enter(spa_t *spa) -{ - mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); - return (spa_vdev_config_enter(spa)); -} - -/* - * Internal implementation for spa_vdev_enter(). Used when a vdev - * operation requires multiple syncs (i.e. removing a device) while - * keeping the spa_namespace_lock held. 
- */ -uint64_t -spa_vdev_config_enter(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - - return (spa_last_synced_txg(spa) + 1); -} - -/* - * Used in combination with spa_vdev_config_enter() to allow the syncing - * of multiple transactions without releasing the spa_namespace_lock. - */ -void -spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - int config_changed = B_FALSE; - - ASSERT(txg > spa_last_synced_txg(spa)); - - spa->spa_pending_vdev = NULL; - - /* - * Reassess the DTLs. - */ - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); - - if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { - config_changed = B_TRUE; - spa->spa_config_generation++; - } - - /* - * Verify the metaslab classes. - */ - ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); - - spa_config_exit(spa, SCL_ALL, spa); - - /* - * Panic the system if the specified tag requires it. This - * is useful for ensuring that configurations are updated - * transactionally. - */ - if (zio_injection_enabled) - zio_handle_panic_injection(spa, tag, 0); - - /* - * Note: this txg_wait_synced() is important because it ensures - * that there won't be more than one config change per txg. - * This allows us to use the txg as the generation number. 
- */ - if (error == 0) - txg_wait_synced(spa->spa_dsl_pool, txg); - - if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); - if (vd->vdev_ops->vdev_op_leaf) { - mutex_enter(&vd->vdev_initialize_lock); - vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); - mutex_exit(&vd->vdev_initialize_lock); - } - - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - vdev_free(vd); - spa_config_exit(spa, SCL_ALL, spa); - } - - /* - * If the config changed, update the config cache. - */ - if (config_changed) - spa_write_cachefile(spa, B_FALSE, B_TRUE); -} - -/* - * Unlock the spa_t after adding or removing a vdev. Besides undoing the - * locking of spa_vdev_enter(), we also want make sure the transactions have - * synced to disk, and then update the global configuration cache with the new - * information. - */ -int -spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) -{ - spa_vdev_config_exit(spa, vd, txg, error, FTAG); - mutex_exit(&spa_namespace_lock); - mutex_exit(&spa->spa_vdev_top_lock); - - return (error); -} - -/* - * Lock the given spa_t for the purpose of changing vdev state. - */ -void -spa_vdev_state_enter(spa_t *spa, int oplocks) -{ - int locks = SCL_STATE_ALL | oplocks; - - /* - * Root pools may need to read of the underlying devfs filesystem - * when opening up a vdev. Unfortunately if we're holding the - * SCL_ZIO lock it will result in a deadlock when we try to issue - * the read from the root filesystem. Instead we "prefetch" - * the associated vnodes that we need prior to opening the - * underlying devices and cache them so that we can prevent - * any I/O when we are doing the actual open. 
- */ - if (spa_is_root(spa)) { - int low = locks & ~(SCL_ZIO - 1); - int high = locks & ~low; - - spa_config_enter(spa, high, spa, RW_WRITER); - vdev_hold(spa->spa_root_vdev); - spa_config_enter(spa, low, spa, RW_WRITER); - } else { - spa_config_enter(spa, locks, spa, RW_WRITER); - } - spa->spa_vdev_locks = locks; -} - -int -spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) -{ - boolean_t config_changed = B_FALSE; - - if (vd != NULL || error == 0) - vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, - 0, 0, B_FALSE); - - if (vd != NULL) { - vdev_state_dirty(vd->vdev_top); - config_changed = B_TRUE; - spa->spa_config_generation++; - } - - if (spa_is_root(spa)) - vdev_rele(spa->spa_root_vdev); - - ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); - spa_config_exit(spa, spa->spa_vdev_locks, spa); - - /* - * If anything changed, wait for it to sync. This ensures that, - * from the system administrator's perspective, zpool(1M) commands - * are synchronous. This is important for things like zpool offline: - * when the command completes, you expect no further I/O from ZFS. - */ - if (vd != NULL) - txg_wait_synced(spa->spa_dsl_pool, 0); - - /* - * If the config changed, update the config cache. - */ - if (config_changed) { - mutex_enter(&spa_namespace_lock); - spa_write_cachefile(spa, B_FALSE, B_TRUE); - mutex_exit(&spa_namespace_lock); - } - - return (error); -} - -/* - * ========================================================================== - * Miscellaneous functions - * ========================================================================== - */ - -void -spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) -{ - if (!nvlist_exists(spa->spa_label_features, feature)) { - fnvlist_add_boolean(spa->spa_label_features, feature); - /* - * When we are creating the pool (tx_txg==TXG_INITIAL), we can't - * dirty the vdev config because lock SCL_CONFIG is not held. 
- * Thankfully, in this case we don't need to dirty the config - * because it will be written out anyway when we finish - * creating the pool. - */ - if (tx->tx_txg != TXG_INITIAL) - vdev_config_dirty(spa->spa_root_vdev); - } -} - -void -spa_deactivate_mos_feature(spa_t *spa, const char *feature) -{ - if (nvlist_remove_all(spa->spa_label_features, feature) == 0) - vdev_config_dirty(spa->spa_root_vdev); -} - -/* - * Return the spa_t associated with given pool_guid, if it exists. If - * device_guid is non-zero, determine whether the pool exists *and* contains - * a device with the specified device_guid. - */ -spa_t * -spa_by_guid(uint64_t pool_guid, uint64_t device_guid) -{ - spa_t *spa; - avl_tree_t *t = &spa_namespace_avl; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { - if (spa->spa_state == POOL_STATE_UNINITIALIZED) - continue; - if (spa->spa_root_vdev == NULL) - continue; - if (spa_guid(spa) == pool_guid) { - if (device_guid == 0) - break; - - if (vdev_lookup_by_guid(spa->spa_root_vdev, - device_guid) != NULL) - break; - - /* - * Check any devices we may be in the process of adding. - */ - if (spa->spa_pending_vdev) { - if (vdev_lookup_by_guid(spa->spa_pending_vdev, - device_guid) != NULL) - break; - } - } - } - - return (spa); -} - -/* - * Determine whether a pool with the given pool_guid exists. 
- */ -boolean_t -spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) -{ - return (spa_by_guid(pool_guid, device_guid) != NULL); -} - -char * -spa_strdup(const char *s) -{ - size_t len; - char *new; - - len = strlen(s); - new = kmem_alloc(len + 1, KM_SLEEP); - bcopy(s, new, len); - new[len] = '\0'; - - return (new); -} - -void -spa_strfree(char *s) -{ - kmem_free(s, strlen(s) + 1); -} - -uint64_t -spa_get_random(uint64_t range) -{ - uint64_t r; - - ASSERT(range != 0); - - if (range == 1) - return (0); - - (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); - - return (r % range); -} - -uint64_t -spa_generate_guid(spa_t *spa) -{ - uint64_t guid = spa_get_random(-1ULL); - - if (spa != NULL) { - while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); - } else { - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); - } - - return (guid); -} - -void -snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) -{ - char type[256]; - char *checksum = NULL; - char *compress = NULL; - - if (bp != NULL) { - if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); - (void) snprintf(type, sizeof (type), "bswap %s %s", - DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? 
- "metadata" : "data", - dmu_ot_byteswap[bswap].ob_name); - } else { - (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, - sizeof (type)); - } - if (!BP_IS_EMBEDDED(bp)) { - checksum = - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; - } - compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; - } - - SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, - compress); -} - -void -spa_freeze(spa_t *spa) -{ - uint64_t freeze_txg = 0; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - if (spa->spa_freeze_txg == UINT64_MAX) { - freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; - spa->spa_freeze_txg = freeze_txg; - } - spa_config_exit(spa, SCL_ALL, FTAG); - if (freeze_txg != 0) - txg_wait_synced(spa_get_dsl(spa), freeze_txg); -} - -void -zfs_panic_recover(const char *fmt, ...) -{ - va_list adx; - - va_start(adx, fmt); - vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); - va_end(adx); -} - -/* - * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexadecimal numbers that don't overflow. 
- */ -uint64_t -zfs_strtonum(const char *str, char **nptr) -{ - uint64_t val = 0; - char c; - int digit; - - while ((c = *str) != '\0') { - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - break; - - val *= 16; - val += digit; - - str++; - } - - if (nptr) - *nptr = (char *)str; - - return (val); -} - -void -spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) -{ - /* - * We bump the feature refcount for each special vdev added to the pool - */ - ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); - spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); -} - -/* - * ========================================================================== - * Accessor functions - * ========================================================================== - */ - -boolean_t -spa_shutting_down(spa_t *spa) -{ - return (spa->spa_async_suspended); -} - -dsl_pool_t * -spa_get_dsl(spa_t *spa) -{ - return (spa->spa_dsl_pool); -} - -boolean_t -spa_is_initializing(spa_t *spa) -{ - return (spa->spa_is_initializing); -} - -boolean_t -spa_indirect_vdevs_loaded(spa_t *spa) -{ - return (spa->spa_indirect_vdevs_loaded); -} - -blkptr_t * -spa_get_rootblkptr(spa_t *spa) -{ - return (&spa->spa_ubsync.ub_rootbp); -} - -void -spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) -{ - spa->spa_uberblock.ub_rootbp = *bp; -} - -void -spa_altroot(spa_t *spa, char *buf, size_t buflen) -{ - if (spa->spa_root == NULL) - buf[0] = '\0'; - else - (void) strncpy(buf, spa->spa_root, buflen); -} - -int -spa_sync_pass(spa_t *spa) -{ - return (spa->spa_sync_pass); -} - -char * -spa_name(spa_t *spa) -{ - return (spa->spa_name); -} - -uint64_t -spa_guid(spa_t *spa) -{ - dsl_pool_t *dp = spa_get_dsl(spa); - uint64_t guid; - - /* - * If we fail to parse the config during spa_load(), we can go through - * the error path (which posts an ereport) and end up here with no root - * vdev. 
We stash the original pool guid in 'spa_config_guid' to handle - * this case. - */ - if (spa->spa_root_vdev == NULL) - return (spa->spa_config_guid); - - guid = spa->spa_last_synced_guid != 0 ? - spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; - - /* - * Return the most recently synced out guid unless we're - * in syncing context. - */ - if (dp && dsl_pool_sync_context(dp)) - return (spa->spa_root_vdev->vdev_guid); - else - return (guid); -} - -uint64_t -spa_load_guid(spa_t *spa) -{ - /* - * This is a GUID that exists solely as a reference for the - * purposes of the arc. It is generated at load time, and - * is never written to persistent storage. - */ - return (spa->spa_load_guid); -} - -uint64_t -spa_last_synced_txg(spa_t *spa) -{ - return (spa->spa_ubsync.ub_txg); -} - -uint64_t -spa_first_txg(spa_t *spa) -{ - return (spa->spa_first_txg); -} - -uint64_t -spa_syncing_txg(spa_t *spa) -{ - return (spa->spa_syncing_txg); -} - -/* - * Return the last txg where data can be dirtied. The final txgs - * will be used to just clear out any deferred frees that remain. - */ -uint64_t -spa_final_dirty_txg(spa_t *spa) -{ - return (spa->spa_final_txg - TXG_DEFER_SIZE); -} - -pool_state_t -spa_state(spa_t *spa) -{ - return (spa->spa_state); -} - -spa_load_state_t -spa_load_state(spa_t *spa) -{ - return (spa->spa_load_state); -} - -uint64_t -spa_freeze_txg(spa_t *spa) -{ - return (spa->spa_freeze_txg); -} - -/* ARGSUSED */ -uint64_t -spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) -{ - return (lsize * spa_asize_inflation); -} - -/* - * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), - * or at least 128MB, unless that would cause it to be more than half the - * pool size. - * - * See the comment above spa_slop_shift for details. 
- */ -uint64_t -spa_get_slop_space(spa_t *spa) -{ - uint64_t space = spa_get_dspace(spa); - return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); -} - -uint64_t -spa_get_dspace(spa_t *spa) -{ - return (spa->spa_dspace); -} - -uint64_t -spa_get_checkpoint_space(spa_t *spa) -{ - return (spa->spa_checkpoint_info.sci_dspace); -} - -void -spa_update_dspace(spa_t *spa) -{ - spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa); - if (spa->spa_vdev_removal != NULL) { - /* - * We can't allocate from the removing device, so - * subtract its size. This prevents the DMU/DSL from - * filling up the (now smaller) pool while we are in the - * middle of removing the device. - * - * Note that the DMU/DSL doesn't actually know or care - * how much space is allocated (it does its own tracking - * of how much space has been logically used). So it - * doesn't matter that the data we are moving may be - * allocated twice (on the old device and the new - * device). - */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vdev_t *vd = - vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - spa->spa_dspace -= spa_deflate(spa) ? - vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; - spa_config_exit(spa, SCL_VDEV, FTAG); - } -} - -/* - * Return the failure mode that has been set to this pool. The default - * behavior will be to block all I/Os when a complete failure occurs. 
- */ -uint8_t -spa_get_failmode(spa_t *spa) -{ - return (spa->spa_failmode); -} - -boolean_t -spa_suspended(spa_t *spa) -{ - return (spa->spa_suspended != ZIO_SUSPEND_NONE); -} - -uint64_t -spa_version(spa_t *spa) -{ - return (spa->spa_ubsync.ub_version); -} - -boolean_t -spa_deflate(spa_t *spa) -{ - return (spa->spa_deflate); -} - -metaslab_class_t * -spa_normal_class(spa_t *spa) -{ - return (spa->spa_normal_class); -} - -metaslab_class_t * -spa_log_class(spa_t *spa) -{ - return (spa->spa_log_class); -} - -metaslab_class_t * -spa_special_class(spa_t *spa) -{ - return (spa->spa_special_class); -} - -metaslab_class_t * -spa_dedup_class(spa_t *spa) -{ - return (spa->spa_dedup_class); -} - -/* - * Locate an appropriate allocation class - */ -metaslab_class_t * -spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, - uint_t level, uint_t special_smallblk) -{ - if (DMU_OT_IS_ZIL(objtype)) { - if (spa->spa_log_class->mc_groups != 0) - return (spa_log_class(spa)); - else - return (spa_normal_class(spa)); - } - - boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; - - if (DMU_OT_IS_DDT(objtype)) { - if (spa->spa_dedup_class->mc_groups != 0) - return (spa_dedup_class(spa)); - else if (has_special_class && zfs_ddt_data_is_special) - return (spa_special_class(spa)); - else - return (spa_normal_class(spa)); - } - - /* Indirect blocks for user data can land in special if allowed */ - if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { - if (has_special_class && zfs_user_indirect_is_special) - return (spa_special_class(spa)); - else - return (spa_normal_class(spa)); - } - - if (DMU_OT_IS_METADATA(objtype) || level > 0) { - if (has_special_class) - return (spa_special_class(spa)); - else - return (spa_normal_class(spa)); - } - - /* - * Allow small file blocks in special class in some cases (like - * for the dRAID vdev feature). 
But always leave a reserve of - * zfs_special_class_metadata_reserve_pct exclusively for metadata. - */ - if (DMU_OT_IS_FILE(objtype) && - has_special_class && size <= special_smallblk) { - metaslab_class_t *special = spa_special_class(spa); - uint64_t alloc = metaslab_class_get_alloc(special); - uint64_t space = metaslab_class_get_space(special); - uint64_t limit = - (space * (100 - zfs_special_class_metadata_reserve_pct)) - / 100; - - if (alloc < limit) - return (special); - } - - return (spa_normal_class(spa)); -} - -void -spa_evicting_os_register(spa_t *spa, objset_t *os) -{ - mutex_enter(&spa->spa_evicting_os_lock); - list_insert_head(&spa->spa_evicting_os_list, os); - mutex_exit(&spa->spa_evicting_os_lock); -} - -void -spa_evicting_os_deregister(spa_t *spa, objset_t *os) -{ - mutex_enter(&spa->spa_evicting_os_lock); - list_remove(&spa->spa_evicting_os_list, os); - cv_broadcast(&spa->spa_evicting_os_cv); - mutex_exit(&spa->spa_evicting_os_lock); -} - -void -spa_evicting_os_wait(spa_t *spa) -{ - mutex_enter(&spa->spa_evicting_os_lock); - while (!list_is_empty(&spa->spa_evicting_os_list)) - cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); - mutex_exit(&spa->spa_evicting_os_lock); - - dmu_buf_user_evict_wait(); -} - -int -spa_max_replication(spa_t *spa) -{ - /* - * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to - * handle BPs with more than one DVA allocated. Set our max - * replication level accordingly. 
- */ - if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) - return (1); - return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); -} - -int -spa_prev_software_version(spa_t *spa) -{ - return (spa->spa_prev_software_version); -} - -uint64_t -spa_deadman_synctime(spa_t *spa) -{ - return (spa->spa_deadman_synctime); -} - -struct proc * -spa_proc(spa_t *spa) -{ - return (spa->spa_proc); -} - -uint64_t -dva_get_dsize_sync(spa_t *spa, const dva_t *dva) -{ - uint64_t asize = DVA_GET_ASIZE(dva); - uint64_t dsize = asize; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - if (asize != 0 && spa->spa_deflate) { - uint64_t vdev = DVA_GET_VDEV(dva); - vdev_t *vd = vdev_lookup_top(spa, vdev); - if (vd == NULL) { - panic( - "dva_get_dsize_sync(): bad DVA %llu:%llu", - (u_longlong_t)vdev, (u_longlong_t)asize); - } - dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; - } - - return (dsize); -} - -uint64_t -bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) -{ - uint64_t dsize = 0; - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) - dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); - - return (dsize); -} - -uint64_t -bp_get_dsize(spa_t *spa, const blkptr_t *bp) -{ - uint64_t dsize = 0; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) - dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); - - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (dsize); -} - -uint64_t -spa_dirty_data(spa_t *spa) -{ - return (spa->spa_dsl_pool->dp_dirty_total); -} - -/* - * ========================================================================== - * Initialization and Termination - * ========================================================================== - */ - -static int -spa_name_compare(const void *a1, const void *a2) -{ - const spa_t *s1 = a1; - const spa_t *s2 = a2; - int s; - - s = strcmp(s1->spa_name, s2->spa_name); - - return (AVL_ISIGN(s)); -} - -int -spa_busy(void) -{ - return (spa_active_count); -} - -void 
-spa_boot_init() -{ - spa_config_load(); -} - -#ifdef _KERNEL -EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); -#endif - -void -spa_init(int mode) -{ - mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); - - avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), - offsetof(spa_t, spa_avl)); - - avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), - offsetof(spa_aux_t, aux_avl)); - - avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), - offsetof(spa_aux_t, aux_avl)); - - spa_mode_global = mode; - -#ifdef illumos -#ifdef _KERNEL - spa_arch_init(); -#else - if (spa_mode_global != FREAD && dprintf_find_string("watch")) { - arc_procfd = open("/proc/self/ctl", O_WRONLY); - if (arc_procfd == -1) { - perror("could not enable watchpoints: " - "opening /proc/self/ctl failed: "); - } else { - arc_watch = B_TRUE; - } - } -#endif -#endif /* illumos */ - - zfs_refcount_init(); - unique_init(); - range_tree_init(); - metaslab_alloc_trace_init(); - zio_init(); - lz4_init(); - dmu_init(); - zil_init(); - vdev_cache_stat_init(); - vdev_file_init(); - zfs_prop_init(); - zpool_prop_init(); - zpool_feature_init(); - spa_config_load(); - l2arc_start(); - scan_init(); - dsl_scan_global_init(); -#ifndef illumos -#ifdef _KERNEL - zfs_deadman_init(); -#endif -#endif /* !illumos */ -} - -void -spa_fini(void) -{ - l2arc_stop(); - - spa_evict_all(); - - vdev_file_fini(); - vdev_cache_stat_fini(); - zil_fini(); - dmu_fini(); - lz4_fini(); - zio_fini(); - metaslab_alloc_trace_fini(); - range_tree_fini(); - unique_fini(); - zfs_refcount_fini(); - scan_fini(); - - avl_destroy(&spa_namespace_avl); - avl_destroy(&spa_spare_avl); - avl_destroy(&spa_l2cache_avl); - - cv_destroy(&spa_namespace_cv); - mutex_destroy(&spa_namespace_lock); - mutex_destroy(&spa_spare_lock); - 
mutex_destroy(&spa_l2cache_lock); -} - -/* - * Return whether this pool has slogs. No locking needed. - * It's not a problem if the wrong answer is returned as it's only for - * performance and not correctness - */ -boolean_t -spa_has_slogs(spa_t *spa) -{ - return (spa->spa_log_class->mc_rotor != NULL); -} - -spa_log_state_t -spa_get_log_state(spa_t *spa) -{ - return (spa->spa_log_state); -} - -void -spa_set_log_state(spa_t *spa, spa_log_state_t state) -{ - spa->spa_log_state = state; -} - -boolean_t -spa_is_root(spa_t *spa) -{ - return (spa->spa_is_root); -} - -boolean_t -spa_writeable(spa_t *spa) -{ - return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config); -} - -/* - * Returns true if there is a pending sync task in any of the current - * syncing txg, the current quiescing txg, or the current open txg. - */ -boolean_t -spa_has_pending_synctask(spa_t *spa) -{ - return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || - !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); -} - -int -spa_mode(spa_t *spa) -{ - return (spa->spa_mode); -} - -uint64_t -spa_bootfs(spa_t *spa) -{ - return (spa->spa_bootfs); -} - -uint64_t -spa_delegation(spa_t *spa) -{ - return (spa->spa_delegation); -} - -objset_t * -spa_meta_objset(spa_t *spa) -{ - return (spa->spa_meta_objset); -} - -enum zio_checksum -spa_dedup_checksum(spa_t *spa) -{ - return (spa->spa_dedup_checksum); -} - -/* - * Reset pool scan stat per scan pass (or reboot). 
- */ -void -spa_scan_stat_init(spa_t *spa) -{ - /* data not stored on disk */ - spa->spa_scan_pass_start = gethrestime_sec(); - if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) - spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; - else - spa->spa_scan_pass_scrub_pause = 0; - spa->spa_scan_pass_scrub_spent_paused = 0; - spa->spa_scan_pass_exam = 0; - spa->spa_scan_pass_issued = 0; - vdev_scan_stat_init(spa->spa_root_vdev); -} - -/* - * Get scan stats for zpool status reports - */ -int -spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) -{ - dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; - - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) - return (SET_ERROR(ENOENT)); - bzero(ps, sizeof (pool_scan_stat_t)); - - /* data stored on disk */ - ps->pss_func = scn->scn_phys.scn_func; - ps->pss_state = scn->scn_phys.scn_state; - ps->pss_start_time = scn->scn_phys.scn_start_time; - ps->pss_end_time = scn->scn_phys.scn_end_time; - ps->pss_to_examine = scn->scn_phys.scn_to_examine; - ps->pss_to_process = scn->scn_phys.scn_to_process; - ps->pss_processed = scn->scn_phys.scn_processed; - ps->pss_errors = scn->scn_phys.scn_errors; - ps->pss_examined = scn->scn_phys.scn_examined; - ps->pss_issued = - scn->scn_issued_before_pass + spa->spa_scan_pass_issued; - /* data not stored on disk */ - ps->pss_pass_start = spa->spa_scan_pass_start; - ps->pss_pass_exam = spa->spa_scan_pass_exam; - ps->pss_pass_issued = spa->spa_scan_pass_issued; - ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; - ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; - - return (0); -} - -int -spa_maxblocksize(spa_t *spa) -{ - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SPA_MAXBLOCKSIZE); - else - return (SPA_OLD_MAXBLOCKSIZE); -} - -int -spa_maxdnodesize(spa_t *spa) -{ - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) - return (DNODE_MAX_SIZE); - else - return (DNODE_MIN_SIZE); -} - 
-boolean_t -spa_multihost(spa_t *spa) -{ - return (spa->spa_multihost ? B_TRUE : B_FALSE); -} - -unsigned long -spa_get_hostid(void) -{ - unsigned long myhostid; - -#ifdef _KERNEL - myhostid = zone_get_hostid(NULL); -#else /* _KERNEL */ - /* - * We're emulating the system's hostid in userland, so - * we can't use zone_get_hostid(). - */ - (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); -#endif /* _KERNEL */ - - return (myhostid); -} - -/* - * Returns the txg that the last device removal completed. No indirect mappings - * have been added since this txg. - */ -uint64_t -spa_get_last_removal_txg(spa_t *spa) -{ - uint64_t vdevid; - uint64_t ret = -1ULL; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - /* - * sr_prev_indirect_vdev is only modified while holding all the - * config locks, so it is sufficient to hold SCL_VDEV as reader when - * examining it. - */ - vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; - - while (vdevid != -1ULL) { - vdev_t *vd = vdev_lookup_top(spa, vdevid); - vdev_indirect_births_t *vib = vd->vdev_indirect_births; - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - - /* - * If the removal did not remap any data, we don't care. 
- */ - if (vdev_indirect_births_count(vib) != 0) { - ret = vdev_indirect_births_last_entry_txg(vib); - break; - } - - vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - IMPLY(ret != -1ULL, - spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - return (ret); -} - -boolean_t -spa_trust_config(spa_t *spa) -{ - return (spa->spa_trust_config); -} - -uint64_t -spa_missing_tvds_allowed(spa_t *spa) -{ - return (spa->spa_missing_tvds_allowed); -} - -void -spa_set_missing_tvds(spa_t *spa, uint64_t missing) -{ - spa->spa_missing_tvds = missing; -} - -boolean_t -spa_top_vdevs_spacemap_addressable(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) - return (B_FALSE); - } - return (B_TRUE); -} - -boolean_t -spa_has_checkpoint(spa_t *spa) -{ - return (spa->spa_checkpoint_txg != 0); -} - -boolean_t -spa_importing_readonly_checkpoint(spa_t *spa) -{ - return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && - spa->spa_mode == FREAD); -} - -uint64_t -spa_min_claim_txg(spa_t *spa) -{ - uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; - - if (checkpoint_txg != 0) - return (checkpoint_txg + 1); - - return (spa->spa_first_txg); -} - -/* - * If there is a checkpoint, async destroys may consume more space from - * the pool instead of freeing it. In an attempt to save the pool from - * getting suspended when it is about to run out of space, we stop - * processing async destroys. - */ -boolean_t -spa_suspend_async_destroy(spa_t *spa) -{ - dsl_pool_t *dp = spa_get_dsl(spa); - - uint64_t unreserved = dsl_pool_unreserved_space(dp, - ZFS_SPACE_CHECK_EXTRA_RESERVED); - uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; - uint64_t avail = (unreserved > used) ? 
(unreserved - used) : 0; - - if (spa_has_checkpoint(spa) && avail == 0) - return (B_TRUE); - - return (B_FALSE); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c deleted file mode 100644 index 9ed7a1f4b761..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ /dev/null @@ -1,1073 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); - -/* - * Note on space map block size: - * - * The data for a given space map can be kept on blocks of any size. - * Larger blocks entail fewer I/O operations, but they also cause the - * DMU to keep more data in-core, and also to waste more I/O bandwidth - * when only a few blocks have changed since the last transaction group. - */ - -/* - * Enabled whenever we want to stress test the use of double-word - * space map entries. 
- */ -boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; - -/* - * Override the default indirect block size of 128K, instead using 16K for - * spacemaps (2^14 bytes). This dramatically reduces write inflation since - * appending to a spacemap typically has to write one data block (4KB) and one - * or two indirect blocks (16K-32K, rather than 128K). - */ -int space_map_ibs = 14; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, - &space_map_ibs, 0, "Space map indirect block shift"); - -boolean_t -sm_entry_is_debug(uint64_t e) -{ - return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); -} - -boolean_t -sm_entry_is_single_word(uint64_t e) -{ - uint8_t prefix = SM_PREFIX_DECODE(e); - return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); -} - -boolean_t -sm_entry_is_double_word(uint64_t e) -{ - return (SM_PREFIX_DECODE(e) == SM2_PREFIX); -} - -/* - * Iterate through the space map, invoking the callback on each (non-debug) - * space map entry. Stop after reading 'end' bytes of the space map. 
- */ -int -space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) -{ - uint64_t blksz = sm->sm_blksz; - - ASSERT3U(blksz, !=, 0); - ASSERT3U(end, <=, space_map_length(sm)); - ASSERT0(P2PHASE(end, sizeof (uint64_t))); - - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end, - ZIO_PRIORITY_SYNC_READ); - - int error = 0; - for (uint64_t block_base = 0; block_base < end && error == 0; - block_base += blksz) { - dmu_buf_t *db; - error = dmu_buf_hold(sm->sm_os, space_map_object(sm), - block_base, FTAG, &db, DMU_READ_PREFETCH); - if (error != 0) - return (error); - - uint64_t *block_start = db->db_data; - uint64_t block_length = MIN(end - block_base, blksz); - uint64_t *block_end = block_start + - (block_length / sizeof (uint64_t)); - - VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); - VERIFY3U(block_length, !=, 0); - ASSERT3U(blksz, ==, db->db_size); - - for (uint64_t *block_cursor = block_start; - block_cursor < block_end && error == 0; block_cursor++) { - uint64_t e = *block_cursor; - - if (sm_entry_is_debug(e)) /* Skip debug entries */ - continue; - - uint64_t raw_offset, raw_run, vdev_id; - maptype_t type; - if (sm_entry_is_single_word(e)) { - type = SM_TYPE_DECODE(e); - vdev_id = SM_NO_VDEVID; - raw_offset = SM_OFFSET_DECODE(e); - raw_run = SM_RUN_DECODE(e); - } else { - /* it is a two-word entry */ - ASSERT(sm_entry_is_double_word(e)); - raw_run = SM2_RUN_DECODE(e); - vdev_id = SM2_VDEV_DECODE(e); - - /* move on to the second word */ - block_cursor++; - e = *block_cursor; - VERIFY3P(block_cursor, <=, block_end); - - type = SM2_TYPE_DECODE(e); - raw_offset = SM2_OFFSET_DECODE(e); - } - - uint64_t entry_offset = (raw_offset << sm->sm_shift) + - sm->sm_start; - uint64_t entry_run = raw_run << sm->sm_shift; - - VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); - ASSERT3U(entry_offset, >=, sm->sm_start); - ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); - ASSERT3U(entry_run, 
<=, sm->sm_size); - ASSERT3U(entry_offset + entry_run, <=, - sm->sm_start + sm->sm_size); - - space_map_entry_t sme = { - .sme_type = type, - .sme_vdev = vdev_id, - .sme_offset = entry_offset, - .sme_run = entry_run - }; - error = callback(&sme, arg); - } - dmu_buf_rele(db, FTAG); - } - return (error); -} - -/* - * Reads the entries from the last block of the space map into - * buf in reverse order. Populates nwords with number of words - * in the last block. - * - * Refer to block comment within space_map_incremental_destroy() - * to understand why this function is needed. - */ -static int -space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, - uint64_t bufsz, uint64_t *nwords) -{ - int error = 0; - dmu_buf_t *db; - - /* - * Find the offset of the last word in the space map and use - * that to read the last block of the space map with - * dmu_buf_hold(). - */ - uint64_t last_word_offset = - sm->sm_phys->smp_length - sizeof (uint64_t); - error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, - FTAG, &db, DMU_READ_NO_PREFETCH); - if (error != 0) - return (error); - - ASSERT3U(sm->sm_object, ==, db->db_object); - ASSERT3U(sm->sm_blksz, ==, db->db_size); - ASSERT3U(bufsz, >=, db->db_size); - ASSERT(nwords != NULL); - - uint64_t *words = db->db_data; - *nwords = - (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); - - ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); - - uint64_t n = *nwords; - uint64_t j = n - 1; - for (uint64_t i = 0; i < n; i++) { - uint64_t entry = words[i]; - if (sm_entry_is_double_word(entry)) { - /* - * Since we are populating the buffer backwards - * we have to be extra careful and add the two - * words of the double-word entry in the right - * order. 
- */ - ASSERT3U(j, >, 0); - buf[j - 1] = entry; - - i++; - ASSERT3U(i, <, n); - entry = words[i]; - buf[j] = entry; - j -= 2; - } else { - ASSERT(sm_entry_is_debug(entry) || - sm_entry_is_single_word(entry)); - buf[j] = entry; - j--; - } - } - - /* - * Assert that we wrote backwards all the - * way to the beginning of the buffer. - */ - ASSERT3S(j, ==, -1); - - dmu_buf_rele(db, FTAG); - return (error); -} - -/* - * Note: This function performs destructive actions - specifically - * it deletes entries from the end of the space map. Thus, callers - * should ensure that they are holding the appropriate locks for - * the space map that they provide. - */ -int -space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, - dmu_tx_t *tx) -{ - uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - uint64_t *buf = zio_buf_alloc(bufsz); - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - /* - * Ideally we would want to iterate from the beginning of the - * space map to the end in incremental steps. The issue with this - * approach is that we don't have any field on-disk that points - * us where to start between each step. We could try zeroing out - * entries that we've destroyed, but this doesn't work either as - * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). - * - * As a result, we destroy its entries incrementally starting from - * the end after applying the callback to each of them. - * - * The problem with this approach is that we cannot literally - * iterate through the words in the space map backwards as we - * can't distinguish two-word space map entries from their second - * word. Thus we do the following: - * - * 1] We get all the entries from the last block of the space map - * and put them into a buffer in reverse order. This way the - * last entry comes first in the buffer, the second to last is - * second, etc. - * 2] We iterate through the entries in the buffer and we apply - * the callback to each one. 
As we move from entry to entry we - * we decrease the size of the space map, deleting effectively - * each entry. - * 3] If there are no more entries in the space map or the callback - * returns a value other than 0, we stop iterating over the - * space map. If there are entries remaining and the callback - * returned 0, we go back to step [1]. - */ - int error = 0; - while (space_map_length(sm) > 0 && error == 0) { - uint64_t nwords = 0; - error = space_map_reversed_last_block_entries(sm, buf, bufsz, - &nwords); - if (error != 0) - break; - - ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); - - for (uint64_t i = 0; i < nwords; i++) { - uint64_t e = buf[i]; - - if (sm_entry_is_debug(e)) { - sm->sm_phys->smp_length -= sizeof (uint64_t); - continue; - } - - int words = 1; - uint64_t raw_offset, raw_run, vdev_id; - maptype_t type; - if (sm_entry_is_single_word(e)) { - type = SM_TYPE_DECODE(e); - vdev_id = SM_NO_VDEVID; - raw_offset = SM_OFFSET_DECODE(e); - raw_run = SM_RUN_DECODE(e); - } else { - ASSERT(sm_entry_is_double_word(e)); - words = 2; - - raw_run = SM2_RUN_DECODE(e); - vdev_id = SM2_VDEV_DECODE(e); - - /* move to the second word */ - i++; - e = buf[i]; - - ASSERT3P(i, <=, nwords); - - type = SM2_TYPE_DECODE(e); - raw_offset = SM2_OFFSET_DECODE(e); - } - - uint64_t entry_offset = - (raw_offset << sm->sm_shift) + sm->sm_start; - uint64_t entry_run = raw_run << sm->sm_shift; - - VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); - VERIFY3U(entry_offset, >=, sm->sm_start); - VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); - VERIFY3U(entry_run, <=, sm->sm_size); - VERIFY3U(entry_offset + entry_run, <=, - sm->sm_start + sm->sm_size); - - space_map_entry_t sme = { - .sme_type = type, - .sme_vdev = vdev_id, - .sme_offset = entry_offset, - .sme_run = entry_run - }; - error = callback(&sme, arg); - if (error != 0) - break; - - if (type == SM_ALLOC) - sm->sm_phys->smp_alloc -= entry_run; - else - 
sm->sm_phys->smp_alloc += entry_run; - sm->sm_phys->smp_length -= words * sizeof (uint64_t); - } - } - - if (space_map_length(sm) == 0) { - ASSERT0(error); - ASSERT0(space_map_allocated(sm)); - } - - zio_buf_free(buf, bufsz); - return (error); -} - -typedef struct space_map_load_arg { - space_map_t *smla_sm; - range_tree_t *smla_rt; - maptype_t smla_type; -} space_map_load_arg_t; - -static int -space_map_load_callback(space_map_entry_t *sme, void *arg) -{ - space_map_load_arg_t *smla = arg; - if (sme->sme_type == smla->smla_type) { - VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, - smla->smla_sm->sm_size); - range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); - } else { - range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); - } - - return (0); -} - -/* - * Load the spacemap into the rangetree, like space_map_load. But only - * read the first 'length' bytes of the spacemap. - */ -int -space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t length) -{ - space_map_load_arg_t smla; - - VERIFY0(range_tree_space(rt)); - - if (maptype == SM_FREE) - range_tree_add(rt, sm->sm_start, sm->sm_size); - - smla.smla_rt = rt; - smla.smla_sm = sm; - smla.smla_type = maptype; - int err = space_map_iterate(sm, length, - space_map_load_callback, &smla); - - if (err != 0) - range_tree_vacate(rt, NULL, NULL); - - return (err); -} - -/* - * Load the space map disk into the specified range tree. Segments of maptype - * are added to the range tree, other segment types are removed. 
- */ -int -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) -{ - return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); -} - -void -space_map_histogram_clear(space_map_t *sm) -{ - if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) - return; - - bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); -} - -boolean_t -space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) -{ - /* - * Verify that the in-core range tree does not have any - * ranges smaller than our sm_shift size. - */ - for (int i = 0; i < sm->sm_shift; i++) { - if (rt->rt_histogram[i] != 0) - return (B_FALSE); - } - return (B_TRUE); -} - -void -space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) -{ - int idx = 0; - - ASSERT(dmu_tx_is_syncing(tx)); - VERIFY3U(space_map_object(sm), !=, 0); - - if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) - return; - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - ASSERT(space_map_histogram_verify(sm, rt)); - /* - * Transfer the content of the range tree histogram to the space - * map histogram. The space map histogram contains 32 buckets ranging - * between 2^sm_shift to 2^(32+sm_shift-1). The range tree, - * however, can represent ranges from 2^0 to 2^63. Since the space - * map only cares about allocatable blocks (minimum of sm_shift) we - * can safely ignore all ranges in the range tree smaller than sm_shift. - */ - for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - - /* - * Since the largest histogram bucket in the space map is - * 2^(32+sm_shift-1), we need to normalize the values in - * the range tree for any bucket larger than that size. For - * example given an sm_shift of 9, ranges larger than 2^40 - * would get normalized as if they were 1TB ranges. Assume - * the range tree had a count of 5 in the 2^44 (16TB) bucket, - * the calculation below would normalize this to 5 * 2^4 (16). 
- */ - ASSERT3U(i, >=, idx + sm->sm_shift); - sm->sm_phys->smp_histogram[idx] += - rt->rt_histogram[i] << (i - idx - sm->sm_shift); - - /* - * Increment the space map's index as long as we haven't - * reached the maximum bucket size. Accumulate all ranges - * larger than the max bucket size into the last bucket. - */ - if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { - ASSERT3U(idx + sm->sm_shift, ==, i); - idx++; - ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); - } - } -} - -static void -space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) -{ - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | - SM_DEBUG_ACTION_ENCODE(maptype) | - SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | - SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - - dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, - sizeof (dentry), &dentry, tx); - - sm->sm_phys->smp_length += sizeof (dentry); -} - -/* - * Writes one or more entries given a segment. - * - * Note: The function may release the dbuf from the pointer initially - * passed to it, and return a different dbuf. Also, the space map's - * dbuf must be dirty for the changes in sm_phys to take effect. - */ -static void -space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, - uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) -{ - ASSERT3U(words, !=, 0); - ASSERT3U(words, <=, 2); - - /* ensure the vdev_id can be represented by the space map */ - ASSERT3U(vdev_id, <=, SM_NO_VDEVID); - - /* - * if this is a single word entry, ensure that no vdev was - * specified. 
- */ - IMPLY(words == 1, vdev_id == SM_NO_VDEVID); - - dmu_buf_t *db = *dbp; - ASSERT3U(db->db_size, ==, sm->sm_blksz); - - uint64_t *block_base = db->db_data; - uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); - uint64_t *block_cursor = block_base + - (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); - - ASSERT3P(block_cursor, <=, block_end); - - uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; - - ASSERT3U(rs->rs_start, >=, sm->sm_start); - ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); - ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); - ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); - - while (size != 0) { - ASSERT3P(block_cursor, <=, block_end); - - /* - * If we are at the end of this block, flush it and start - * writing again from the beginning. - */ - if (block_cursor == block_end) { - dmu_buf_rele(db, tag); - - uint64_t next_word_offset = sm->sm_phys->smp_length; - VERIFY0(dmu_buf_hold(sm->sm_os, - space_map_object(sm), next_word_offset, - tag, &db, DMU_READ_PREFETCH)); - dmu_buf_will_dirty(db, tx); - - /* update caller's dbuf */ - *dbp = db; - - ASSERT3U(db->db_size, ==, sm->sm_blksz); - - block_base = db->db_data; - block_cursor = block_base; - block_end = block_base + - (db->db_size / sizeof (uint64_t)); - } - - /* - * If we are writing a two-word entry and we only have one - * word left on this block, just pad it with an empty debug - * entry and write the two-word entry in the next block. 
- */ - uint64_t *next_entry = block_cursor + 1; - if (next_entry == block_end && words > 1) { - ASSERT3U(words, ==, 2); - *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | - SM_DEBUG_ACTION_ENCODE(0) | - SM_DEBUG_SYNCPASS_ENCODE(0) | - SM_DEBUG_TXG_ENCODE(0); - block_cursor++; - sm->sm_phys->smp_length += sizeof (uint64_t); - ASSERT3P(block_cursor, ==, block_end); - continue; - } - - uint64_t run_len = MIN(size, run_max); - switch (words) { - case 1: - *block_cursor = SM_OFFSET_ENCODE(start) | - SM_TYPE_ENCODE(maptype) | - SM_RUN_ENCODE(run_len); - block_cursor++; - break; - case 2: - /* write the first word of the entry */ - *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | - SM2_RUN_ENCODE(run_len) | - SM2_VDEV_ENCODE(vdev_id); - block_cursor++; - - /* move on to the second word of the entry */ - ASSERT3P(block_cursor, <, block_end); - *block_cursor = SM2_TYPE_ENCODE(maptype) | - SM2_OFFSET_ENCODE(start); - block_cursor++; - break; - default: - panic("%d-word space map entries are not supported", - words); - break; - } - sm->sm_phys->smp_length += words * sizeof (uint64_t); - - start += run_len; - size -= run_len; - } - ASSERT0(size); - -} - -/* - * Note: The space map's dbuf must be dirty for the changes in sm_phys to - * take effect. - */ -static void -space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t vdev_id, dmu_tx_t *tx) -{ - spa_t *spa = tx->tx_pool->dp_spa; - dmu_buf_t *db; - - space_map_write_intro_debug(sm, maptype, tx); - -#ifdef DEBUG - /* - * We do this right after we write the intro debug entry - * because the estimate does not take it into account. 
- */ - uint64_t initial_objsize = sm->sm_phys->smp_length; - uint64_t estimated_growth = - space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); - uint64_t estimated_final_objsize = initial_objsize + estimated_growth; -#endif - - /* - * Find the offset right after the last word in the space map - * and use that to get a hold of the last block, so we can - * start appending to it. - */ - uint64_t next_word_offset = sm->sm_phys->smp_length; - VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), - next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); - ASSERT3U(db->db_size, ==, sm->sm_blksz); - - dmu_buf_will_dirty(db, tx); - - avl_tree_t *t = &rt->rt_root; - for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint8_t words = 1; - - /* - * We only write two-word entries when both of the following - * are true: - * - * [1] The feature is enabled. - * [2] The offset or run is too big for a single-word entry, - * or the vdev_id is set (meaning not equal to - * SM_NO_VDEVID). - * - * Note that for purposes of testing we've added the case that - * we write two-word entries occasionally when the feature is - * enabled and zfs_force_some_double_word_sm_entries has been - * set. - */ - if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && - (offset >= (1ULL << SM_OFFSET_BITS) || - length > SM_RUN_MAX || - vdev_id != SM_NO_VDEVID || - (zfs_force_some_double_word_sm_entries && - spa_get_random(100) == 0))) - words = 2; - - space_map_write_seg(sm, rs, maptype, vdev_id, words, - &db, FTAG, tx); - } - - dmu_buf_rele(db, FTAG); - -#ifdef DEBUG - /* - * We expect our estimation to be based on the worst case - * scenario [see comment in space_map_estimate_optimal_size()]. - * Therefore we expect the actual objsize to be equal or less - * than whatever we estimated it to be. 
- */ - ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length); -#endif -} - -/* - * Note: This function manipulates the state of the given space map but - * does not hold any locks implicitly. Thus the caller is responsible - * for synchronizing writes to the space map. - */ -void -space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t vdev_id, dmu_tx_t *tx) -{ - objset_t *os = sm->sm_os; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); - VERIFY3U(space_map_object(sm), !=, 0); - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - /* - * This field is no longer necessary since the in-core space map - * now contains the object number but is maintained for backwards - * compatibility. - */ - sm->sm_phys->smp_object = sm->sm_object; - - if (range_tree_is_empty(rt)) { - VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); - return; - } - - if (maptype == SM_ALLOC) - sm->sm_phys->smp_alloc += range_tree_space(rt); - else - sm->sm_phys->smp_alloc -= range_tree_space(rt); - - uint64_t nodes = avl_numnodes(&rt->rt_root); - uint64_t rt_space = range_tree_space(rt); - - space_map_write_impl(sm, rt, maptype, vdev_id, tx); - - /* - * Ensure that the space_map's accounting wasn't changed - * while we were in the middle of writing it out. 
- */ - VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); - VERIFY3U(range_tree_space(rt), ==, rt_space); -} - -static int -space_map_open_impl(space_map_t *sm) -{ - int error; - u_longlong_t blocks; - - error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf); - if (error) - return (error); - - dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); - sm->sm_phys = sm->sm_dbuf->db_data; - return (0); -} - -int -space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift) -{ - space_map_t *sm; - int error; - - ASSERT(*smp == NULL); - ASSERT(os != NULL); - ASSERT(object != 0); - - sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); - - sm->sm_start = start; - sm->sm_size = size; - sm->sm_shift = shift; - sm->sm_os = os; - sm->sm_object = object; - - error = space_map_open_impl(sm); - if (error != 0) { - space_map_close(sm); - return (error); - } - *smp = sm; - - return (0); -} - -void -space_map_close(space_map_t *sm) -{ - if (sm == NULL) - return; - - if (sm->sm_dbuf != NULL) - dmu_buf_rele(sm->sm_dbuf, sm); - sm->sm_dbuf = NULL; - sm->sm_phys = NULL; - - kmem_free(sm, sizeof (*sm)); -} - -void -space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) -{ - objset_t *os = sm->sm_os; - spa_t *spa = dmu_objset_spa(os); - dmu_object_info_t doi; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); - ASSERT(dmu_tx_is_syncing(tx)); - VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); - - dmu_object_info_from_db(sm->sm_dbuf, &doi); - - /* - * If the space map has the wrong bonus size (because - * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or - * the wrong block size (because space_map_blksz has changed), - * free and re-allocate its object with the updated sizes. - * - * Otherwise, just truncate the current object. 
- */ - if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && - doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != blocksize || - doi.doi_metadata_block_size != 1 << space_map_ibs) { - zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " - "object[%llu]: old bonus %u, old blocksz %u", - dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, - doi.doi_bonus_size, doi.doi_data_block_size); - - space_map_free(sm, tx); - dmu_buf_rele(sm->sm_dbuf, sm); - - sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); - VERIFY0(space_map_open_impl(sm)); - } else { - VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); - - /* - * If the spacemap is reallocated, its histogram - * will be reset. Do the same in the common case so that - * bugs related to the uncommon case do not go unnoticed. - */ - bzero(sm->sm_phys->smp_histogram, - sizeof (sm->sm_phys->smp_histogram)); - } - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - sm->sm_phys->smp_length = 0; - sm->sm_phys->smp_alloc = 0; -} - -uint64_t -space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - uint64_t object; - int bonuslen; - - if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { - spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); - bonuslen = sizeof (space_map_phys_t); - ASSERT3U(bonuslen, <=, dmu_bonus_max()); - } else { - bonuslen = SPACE_MAP_SIZE_V0; - } - - object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, - space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); - - return (object); -} - -void -space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { - dmu_object_info_t doi; - - VERIFY0(dmu_object_info(os, smobj, &doi)); - if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { - spa_feature_decr(spa, - SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); - } - } - - VERIFY0(dmu_object_free(os, smobj, 
tx)); -} - -void -space_map_free(space_map_t *sm, dmu_tx_t *tx) -{ - if (sm == NULL) - return; - - space_map_free_obj(sm->sm_os, space_map_object(sm), tx); - sm->sm_object = 0; -} - -/* - * Given a range tree, it makes a worst-case estimate of how much - * space would the tree's segments take if they were written to - * the given space map. - */ -uint64_t -space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, - uint64_t vdev_id) -{ - spa_t *spa = dmu_objset_spa(sm->sm_os); - uint64_t shift = sm->sm_shift; - uint64_t *histogram = rt->rt_histogram; - uint64_t entries_for_seg = 0; - - /* - * In order to get a quick estimate of the optimal size that this - * range tree would have on-disk as a space map, we iterate through - * its histogram buckets instead of iterating through its nodes. - * - * Note that this is a highest-bound/worst-case estimate for the - * following reasons: - * - * 1] We assume that we always add a debug padding for each block - * we write and we also assume that we start at the last word - * of a block attempting to write a two-word entry. - * 2] Rounding up errors due to the way segments are distributed - * in the buckets of the range tree's histogram. - * 3] The activation of zfs_force_some_double_word_sm_entries - * (tunable) when testing. - * - * = Math and Rounding Errors = - * - * rt_histogram[i] bucket of a range tree represents the number - * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given - * that, we want to divide the buckets into groups: Buckets that - * can be represented using a single-word entry, ones that can - * be represented with a double-word entry, and ones that can - * only be represented with multiple two-word entries. - * - * [Note that if the new encoding feature is not enabled there - * are only two groups: single-word entry buckets and multiple - * single-word entry buckets. 
The information below assumes - * two-word entries enabled, but it can easily applied when - * the feature is not enabled] - * - * To find the highest bucket that can be represented with a - * single-word entry we look at the maximum run that such entry - * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that - * the run of a space map entry is shifted by sm_shift, thus we - * add it to the exponent]. This way, excluding the value of the - * maximum run that can be represented by a single-word entry, - * all runs that are smaller exist in buckets 0 to - * SM_RUN_BITS + shift - 1. - * - * To find the highest bucket that can be represented with a - * double-word entry, we follow the same approach. Finally, any - * bucket higher than that are represented with multiple two-word - * entries. To be more specific, if the highest bucket whose - * segments can be represented with a single two-word entry is X, - * then bucket X+1 will need 2 two-word entries for each of its - * segments, X+2 will need 4, X+3 will need 8, ...etc. - * - * With all of the above we make our estimation based on bucket - * groups. There is a rounding error though. As we mentioned in - * the example with the one-word entry, the maximum run that can - * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is - * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of - * that length fall into the next bucket (and bucket group) where - * we start counting two-word entries and this is one more reason - * why the estimated size may end up being bigger than the actual - * size written. - */ - uint64_t size = 0; - uint64_t idx = 0; - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || - (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { - - /* - * If we are trying to force some double word entries just - * assume the worst-case of every single word entry being - * written as a double word entry. 
- */ - uint64_t entry_size = - (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && - zfs_force_some_double_word_sm_entries) ? - (2 * sizeof (uint64_t)) : sizeof (uint64_t); - - uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; - for (; idx <= single_entry_max_bucket; idx++) - size += histogram[idx] * entry_size; - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { - ASSERT3U(idx, >=, single_entry_max_bucket); - entries_for_seg = - 1ULL << (idx - single_entry_max_bucket); - size += histogram[idx] * - entries_for_seg * entry_size; - } - return (size); - } - } - - ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); - - uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; - for (; idx <= double_entry_max_bucket; idx++) - size += histogram[idx] * 2 * sizeof (uint64_t); - - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { - ASSERT3U(idx, >=, double_entry_max_bucket); - entries_for_seg = 1ULL << (idx - double_entry_max_bucket); - size += histogram[idx] * - entries_for_seg * 2 * sizeof (uint64_t); - } - - /* - * Assume the worst case where we start with the padding at the end - * of the current block and we add an extra padding entry at the end - * of all subsequent blocks. - */ - size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); - - return (size); -} - -uint64_t -space_map_object(space_map_t *sm) -{ - return (sm != NULL ? sm->sm_object : 0); -} - -int64_t -space_map_allocated(space_map_t *sm) -{ - return (sm != NULL ? sm->sm_phys->smp_alloc : 0); -} - -uint64_t -space_map_length(space_map_t *sm) -{ - return (sm != NULL ? 
sm->sm_phys->smp_length : 0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c deleted file mode 100644 index aa289ba1061d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include - -/* - * Space reference trees. - * - * A range tree is a collection of integers. Every integer is either - * in the tree, or it's not. A space reference tree generalizes - * the idea: it allows its members to have arbitrary reference counts, - * as opposed to the implicit reference count of 0 or 1 in a range tree. - * This representation comes in handy when computing the union or - * intersection of multiple space maps. For example, the union of - * N range trees is the subset of the reference tree with refcnt >= 1. - * The intersection of N range trees is the subset with refcnt >= N. 
- * - * [It's very much like a Fourier transform. Unions and intersections - * are hard to perform in the 'range tree domain', so we convert the trees - * into the 'reference count domain', where it's trivial, then invert.] - * - * vdev_dtl_reassess() uses computations of this form to determine - * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev - * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev - * has an outage wherever refcnt >= vdev_children. - */ -static int -space_reftree_compare(const void *x1, const void *x2) -{ - const space_ref_t *sr1 = (const space_ref_t *)x1; - const space_ref_t *sr2 = (const space_ref_t *)x2; - - int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); - if (likely(cmp)) - return (cmp); - - return (AVL_PCMP(sr1, sr2)); -} - -void -space_reftree_create(avl_tree_t *t) -{ - avl_create(t, space_reftree_compare, - sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); -} - -void -space_reftree_destroy(avl_tree_t *t) -{ - space_ref_t *sr; - void *cookie = NULL; - - while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(sr, sizeof (*sr)); - - avl_destroy(t); -} - -static void -space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) -{ - space_ref_t *sr; - - sr = kmem_alloc(sizeof (*sr), KM_SLEEP); - sr->sr_offset = offset; - sr->sr_refcnt = refcnt; - - avl_add(t, sr); -} - -void -space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, - int64_t refcnt) -{ - space_reftree_add_node(t, start, refcnt); - space_reftree_add_node(t, end, -refcnt); -} - -/* - * Convert (or add) a range tree into a reference tree. - */ -void -space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) -{ - range_seg_t *rs; - - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); -} - -/* - * Convert a reference tree into a range tree. 
The range tree will contain - * all members of the reference tree for which refcnt >= minref. - */ -void -space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) -{ - uint64_t start = -1ULL; - int64_t refcnt = 0; - space_ref_t *sr; - - range_tree_vacate(rt, NULL, NULL); - - for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { - refcnt += sr->sr_refcnt; - if (refcnt >= minref) { - if (start == -1ULL) { - start = sr->sr_offset; - } - } else { - if (start != -1ULL) { - uint64_t end = sr->sr_offset; - ASSERT(start <= end); - if (end > start) - range_tree_add(rt, start, end - start); - start = -1ULL; - } - } - } - ASSERT(refcnt == 0); - ASSERT(start == -1ULL); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h deleted file mode 100644 index 9689f931fb29..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h +++ /dev/null @@ -1,154 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _ABD_H -#define _ABD_H - -#include -#ifdef illumos -#include -#else -#include -#endif -#include -#include -#ifdef _KERNEL -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum abd_flags { - ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ - ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ - ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? 
*/ -} abd_flags_t; - -typedef struct abd { - abd_flags_t abd_flags; - uint_t abd_size; /* excludes scattered abd_offset */ - struct abd *abd_parent; - zfs_refcount_t abd_children; - union { - struct abd_scatter { - uint_t abd_offset; - uint_t abd_chunk_size; - void *abd_chunks[]; - } abd_scatter; - struct abd_linear { - void *abd_buf; - } abd_linear; - } abd_u; -} abd_t; - -typedef int abd_iter_func_t(void *, size_t, void *); -typedef int abd_iter_func2_t(void *, void *, size_t, void *); - -extern boolean_t zfs_abd_scatter_enabled; - -inline boolean_t -abd_is_linear(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); -} - -/* - * Allocations and deallocations - */ - -abd_t *abd_alloc(size_t, boolean_t); -abd_t *abd_alloc_linear(size_t, boolean_t); -abd_t *abd_alloc_for_io(size_t, boolean_t); -abd_t *abd_alloc_sametype(abd_t *, size_t); -void abd_free(abd_t *); -abd_t *abd_get_offset(abd_t *, size_t); -abd_t *abd_get_from_buf(void *, size_t); -void abd_put(abd_t *); - -/* - * Conversion to and from a normal buffer - */ - -void *abd_to_buf(abd_t *); -void *abd_borrow_buf(abd_t *, size_t); -void *abd_borrow_buf_copy(abd_t *, size_t); -void abd_return_buf(abd_t *, void *, size_t); -void abd_return_buf_copy(abd_t *, void *, size_t); -void abd_take_ownership_of_buf(abd_t *, boolean_t); -void abd_release_ownership_of_buf(abd_t *); - -/* - * ABD operations - */ - -int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); -int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, - abd_iter_func2_t *, void *); -void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); -void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); -void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); -int abd_cmp(abd_t *, abd_t *, size_t); -int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); -void abd_zero_off(abd_t *, size_t, size_t); - -/* - * Wrappers for calls with offsets of 0 - */ - -inline void 
-abd_copy(abd_t *dabd, abd_t *sabd, size_t size) -{ - abd_copy_off(dabd, sabd, 0, 0, size); -} - -inline void -abd_copy_from_buf(abd_t *abd, const void *buf, size_t size) -{ - abd_copy_from_buf_off(abd, buf, 0, size); -} - -inline void -abd_copy_to_buf(void* buf, abd_t *abd, size_t size) -{ - abd_copy_to_buf_off(buf, abd, 0, size); -} - -inline int -abd_cmp_buf(abd_t *abd, const void *buf, size_t size) -{ - return (abd_cmp_buf_off(abd, buf, 0, size)); -} - -inline void -abd_zero(abd_t *abd, size_t size) -{ - abd_zero_off(abd, 0, size); -} - -/* - * Module lifecycle - */ - -void abd_init(void); -void abd_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _ABD_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h deleted file mode 100644 index 2ae0835e55a2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_AGGSUM_H -#define _SYS_AGGSUM_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct aggsum_bucket { - kmutex_t asc_lock; - int64_t asc_delta; - uint64_t asc_borrowed; - uint64_t asc_pad[2]; /* pad out to cache line (64 bytes) */ -} aggsum_bucket_t __aligned(CACHE_LINE_SIZE); - -/* - * Fan out over FANOUT cpus. 
- */ -typedef struct aggsum { - kmutex_t as_lock; - int64_t as_lower_bound; - int64_t as_upper_bound; - uint_t as_numbuckets; - aggsum_bucket_t *as_buckets; -} aggsum_t; - -void aggsum_init(aggsum_t *, uint64_t); -void aggsum_fini(aggsum_t *); -int64_t aggsum_lower_bound(aggsum_t *); -int64_t aggsum_upper_bound(aggsum_t *); -int aggsum_compare(aggsum_t *, uint64_t); -uint64_t aggsum_value(aggsum_t *); -void aggsum_add(aggsum_t *, int64_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_AGGSUM_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h deleted file mode 100644 index 95513f0cba21..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ /dev/null @@ -1,290 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
- */ - -#ifndef _SYS_ARC_H -#define _SYS_ARC_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -/* - * Used by arc_flush() to inform arc_evict_state() that it should evict - * all available buffers from the arc state being passed in. - */ -#define ARC_EVICT_ALL -1ULL - -#define HDR_SET_LSIZE(hdr, x) do { \ - ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ - (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ -_NOTE(CONSTCOND) } while (0) - -#define HDR_SET_PSIZE(hdr, x) do { \ - ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ - (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ -_NOTE(CONSTCOND) } while (0) - -#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) -#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) - -typedef struct arc_buf_hdr arc_buf_hdr_t; -typedef struct arc_buf arc_buf_t; -typedef struct arc_prune arc_prune_t; - -/* - * Because the ARC can store encrypted data, errors (not due to bugs) may arise - * while transforming data into its desired format - specifically, when - * decrypting, the key may not be present, or the HMAC may not be correct - * which signifies deliberate tampering with the on-disk state - * (assuming that the checksum was correct). If any error occurs, the "buf" - * parameter will be NULL. 
- */ -typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, - const blkptr_t *bp, arc_buf_t *buf, void *priv); -typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef void arc_prune_func_t(int64_t bytes, void *priv); - -/* Shared module parameters */ -extern uint64_t zfs_arc_average_blocksize; - -/* generic arc_done_func_t's which you can use */ -arc_read_done_func_t arc_bcopy_func; -arc_read_done_func_t arc_getbuf_func; - -/* generic arc_prune_func_t wrapper for callbacks */ -struct arc_prune { - arc_prune_func_t *p_pfunc; - void *p_private; - uint64_t p_adjust; - list_node_t p_node; - zfs_refcount_t p_refcnt; -}; - -typedef enum arc_strategy { - ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ - ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ -} arc_strategy_t; - -typedef enum arc_flags -{ - /* - * Public flags that can be passed into the ARC by external consumers. - */ - ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ - ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ - - /* - * Private ARC flags. These flags are private ARC only flags that - * will show up in b_flags in the arc_hdr_buf_t. These flags should - * only be set by ARC code. - */ - ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ - ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ - /* Indicates that block was read with ASYNC priority. 
*/ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, - ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ - /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 15, - - /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 16, - ARC_FLAG_HAS_L2HDR = 1 << 17, - - /* - * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. - * This allows the l2arc to use the blkptr's checksum to verify - * the data without having to store the checksum in the hdr. - */ - ARC_FLAG_COMPRESSED_ARC = 1 << 18, - ARC_FLAG_SHARED_DATA = 1 << 19, - - /* - * The arc buffer's compression mode is stored in the top 7 bits of the - * flags field, so these dummy flags are included so that MDB can - * interpret the enum properly. - */ - ARC_FLAG_COMPRESS_0 = 1 << 24, - ARC_FLAG_COMPRESS_1 = 1 << 25, - ARC_FLAG_COMPRESS_2 = 1 << 26, - ARC_FLAG_COMPRESS_3 = 1 << 27, - ARC_FLAG_COMPRESS_4 = 1 << 28, - ARC_FLAG_COMPRESS_5 = 1 << 29, - ARC_FLAG_COMPRESS_6 = 1 << 30 - -} arc_flags_t; - -typedef enum arc_buf_flags { - ARC_BUF_FLAG_SHARED = 1 << 0, - ARC_BUF_FLAG_COMPRESSED = 1 << 1 -} arc_buf_flags_t; - -struct arc_buf { - arc_buf_hdr_t *b_hdr; - arc_buf_t *b_next; - kmutex_t b_evict_lock; - void *b_data; - arc_buf_flags_t b_flags; -}; - -typedef enum arc_buf_contents { - ARC_BUFC_INVALID, /* invalid type */ - ARC_BUFC_DATA, /* buffer contains data */ - ARC_BUFC_METADATA, /* buffer contains metadata */ - ARC_BUFC_NUMTYPES -} arc_buf_contents_t; - -/* - * The following breakdows of arc_size exist for kstat only. 
- */ -typedef enum arc_space_type { - ARC_SPACE_DATA, - ARC_SPACE_META, - ARC_SPACE_HDRS, - ARC_SPACE_L2HDRS, - ARC_SPACE_DBUF, - ARC_SPACE_DNODE, - ARC_SPACE_BONUS, - ARC_SPACE_NUMTYPES -} arc_space_type_t; - -typedef enum arc_state_type { - ARC_STATE_ANON, - ARC_STATE_MRU, - ARC_STATE_MRU_GHOST, - ARC_STATE_MFU, - ARC_STATE_MFU_GHOST, - ARC_STATE_L2C_ONLY, - ARC_STATE_NUMTYPES -} arc_state_type_t; - -typedef struct arc_buf_info { - arc_state_type_t abi_state_type; - arc_buf_contents_t abi_state_contents; - uint64_t abi_state_index; - uint32_t abi_flags; - uint32_t abi_bufcnt; - uint64_t abi_size; - uint64_t abi_spa; - uint64_t abi_access; - uint32_t abi_mru_hits; - uint32_t abi_mru_ghost_hits; - uint32_t abi_mfu_hits; - uint32_t abi_mfu_ghost_hits; - uint32_t abi_l2arc_hits; - uint32_t abi_holds; - uint64_t abi_l2arc_dattr; - uint64_t abi_l2arc_asize; - enum zio_compress abi_l2arc_compress; -} arc_buf_info_t; - -void arc_space_consume(uint64_t space, arc_space_type_t type); -void arc_space_return(uint64_t space, arc_space_type_t type); -boolean_t arc_is_metadata(arc_buf_t *buf); -enum zio_compress arc_get_compression(arc_buf_t *buf); -int arc_decompress(arc_buf_t *buf); -arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, - int32_t size); -arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag, - uint64_t psize, uint64_t lsize, enum zio_compress compression_type); -arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); -arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type); -void arc_return_buf(arc_buf_t *buf, void *tag); -void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_destroy(arc_buf_t *buf, void *tag); -void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); -int arc_buf_size(arc_buf_t *buf); -int arc_buf_lsize(arc_buf_t *buf); -void arc_buf_access(arc_buf_t *buf); -void arc_release(arc_buf_t *buf, void *tag); -int 
arc_released(arc_buf_t *buf); -void arc_buf_freeze(arc_buf_t *buf); -void arc_buf_thaw(arc_buf_t *buf); -#ifdef ZFS_DEBUG -int arc_referenced(arc_buf_t *buf); -#endif - -int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_read_done_func_t *done, void *priv, zio_priority_t priority, - int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, - arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, - arc_write_done_func_t *physdone, arc_write_done_func_t *done, - void *priv, zio_priority_t priority, int zio_flags, - const zbookmark_phys_t *zb); - -arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); -void arc_remove_prune_callback(arc_prune_t *p); -void arc_freed(spa_t *spa, const blkptr_t *bp); - -void arc_flush(spa_t *spa, boolean_t retry); -void arc_tempreserve_clear(uint64_t reserve); -int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); - -uint64_t arc_max_bytes(void); -void arc_init(void); -void arc_fini(void); - -/* - * Level 2 ARC - */ - -void l2arc_add_vdev(spa_t *spa, vdev_t *vd); -void l2arc_remove_vdev(vdev_t *vd); -boolean_t l2arc_vdev_present(vdev_t *vd); -void l2arc_init(void); -void l2arc_fini(void); -void l2arc_start(void); -void l2arc_stop(void); - -#ifdef illumos -#ifndef _KERNEL -extern boolean_t arc_watch; -extern int arc_procfd; -#endif -#endif /* illumos */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ARC_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h deleted file mode 100644 index 77b1b827ac37..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), 
version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_BLKPTR_H -#define _SYS_BLKPTR_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -void encode_embedded_bp_compressed(blkptr_t *, void *, - enum zio_compress, int, int); -void decode_embedded_bp_compressed(const blkptr_t *, void *); -int decode_embedded_bp(const blkptr_t *, void *, int); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BLKPTR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h deleted file mode 100644 index 471be9047ec2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#ifndef _SYS_BPLIST_H -#define _SYS_BPLIST_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct bplist_entry { - blkptr_t bpe_blk; - list_node_t bpe_node; -} bplist_entry_t; - -typedef struct bplist { - kmutex_t bpl_lock; - list_t bpl_list; -} bplist_t; - -typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); - -void bplist_create(bplist_t *bpl); -void bplist_destroy(bplist_t *bpl); -void bplist_append(bplist_t *bpl, const blkptr_t *bp); -void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, - void *arg, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BPLIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h deleted file mode 100644 index d425e239f6a6..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_BPOBJ_H -#define _SYS_BPOBJ_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct bpobj_phys { - /* - * This is the bonus buffer for the dead lists. The object's - * contents is an array of bpo_entries blkptr_t's, representing - * a total of bpo_bytes physical space. - */ - uint64_t bpo_num_blkptrs; - uint64_t bpo_bytes; - uint64_t bpo_comp; - uint64_t bpo_uncomp; - uint64_t bpo_subobjs; - uint64_t bpo_num_subobjs; -} bpobj_phys_t; - -#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) -#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) - -typedef struct bpobj { - kmutex_t bpo_lock; - objset_t *bpo_os; - uint64_t bpo_object; - int bpo_epb; - uint8_t bpo_havecomp; - uint8_t bpo_havesubobj; - bpobj_phys_t *bpo_phys; - dmu_buf_t *bpo_dbuf; - dmu_buf_t *bpo_cached_dbuf; -} bpobj_t; - -typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); - -uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); -uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); -void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); -void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); - -int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); -void bpobj_close(bpobj_t *bpo); -boolean_t bpobj_is_open(const bpobj_t *bpo); - -int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); -int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); - -void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); -void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); - -int bpobj_space(bpobj_t *bpo, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -boolean_t bpobj_is_empty(bpobj_t *bpo); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BPOBJ_H */ diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h deleted file mode 100644 index 327c128bf493..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_BPTREE_H -#define _SYS_BPTREE_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct bptree_phys { - uint64_t bt_begin; - uint64_t bt_end; - uint64_t bt_bytes; - uint64_t bt_comp; - uint64_t bt_uncomp; -} bptree_phys_t; - -typedef struct bptree_entry_phys { - blkptr_t be_bp; - uint64_t be_birth_txg; /* only delete blocks born after this txg */ - zbookmark_phys_t be_zb; /* holds traversal resume point if needed */ -} bptree_entry_phys_t; - -typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); - -uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); -int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); -boolean_t bptree_is_empty(objset_t *os, uint64_t obj); - -void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, - uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); - -int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, - bptree_itor_t func, void *arg, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BPTREE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h deleted file mode 100644 index 63722df1bbf3..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Delphix. All rights reserved. 
- */ - -#ifndef _BQUEUE_H -#define _BQUEUE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -typedef struct bqueue { - list_t bq_list; - kmutex_t bq_lock; - kcondvar_t bq_add_cv; - kcondvar_t bq_pop_cv; - uint64_t bq_size; - uint64_t bq_maxsize; - size_t bq_node_offset; -} bqueue_t; - -typedef struct bqueue_node { - list_node_t bqn_node; - uint64_t bqn_size; -} bqueue_node_t; - - -int bqueue_init(bqueue_t *, uint64_t, size_t); -void bqueue_destroy(bqueue_t *); -void bqueue_enqueue(bqueue_t *, void *, uint64_t); -void *bqueue_dequeue(bqueue_t *); -boolean_t bqueue_empty(bqueue_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _BQUEUE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h deleted file mode 100644 index 33c3b7bc2532..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2011 Google, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_CITYHASH_H -#define _SYS_CITYHASH_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_CITYHASH_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h deleted file mode 100644 index 4b1a9e11b165..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ /dev/null @@ -1,417 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- */ - -#ifndef _SYS_DBUF_H -#define _SYS_DBUF_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define IN_DMU_SYNC 2 - -/* - * define flags for dbuf_read - */ - -#define DB_RF_MUST_SUCCEED (1 << 0) -#define DB_RF_CANFAIL (1 << 1) -#define DB_RF_HAVESTRUCT (1 << 2) -#define DB_RF_NOPREFETCH (1 << 3) -#define DB_RF_NEVERWAIT (1 << 4) -#define DB_RF_CACHED (1 << 5) - -/* - * The simplified state transition diagram for dbufs looks like: - * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ ^ - * | | | - * +----> FILL ----+ | - * | | - * | | - * +--------> NOFILL -------+ - * - * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range - * to find all dbufs in a range of a dnode and must be less than any other - * dbuf_states_t (see comment on dn_dbufs in dnode.h). - */ -typedef enum dbuf_states { - DB_SEARCH = -1, - DB_UNCACHED, - DB_FILL, - DB_NOFILL, - DB_READ, - DB_CACHED, - DB_EVICTING -} dbuf_states_t; - -typedef enum dbuf_cached_state { - DB_NO_CACHE = -1, - DB_DBUF_CACHE, - DB_DBUF_METADATA_CACHE, - DB_CACHE_MAX -} dbuf_cached_state_t; - -struct dnode; -struct dmu_tx; - -/* - * level = 0 means the user data - * level = 1 means the single indirect block - * etc. 
- */ - -struct dmu_buf_impl; - -typedef enum override_states { - DR_NOT_OVERRIDDEN, - DR_IN_DMU_SYNC, - DR_OVERRIDDEN -} override_states_t; - -typedef struct dbuf_dirty_record { - /* link on our parents dirty list */ - list_node_t dr_dirty_node; - - /* transaction group this data will sync in */ - uint64_t dr_txg; - - /* zio of outstanding write IO */ - zio_t *dr_zio; - - /* pointer back to our dbuf */ - struct dmu_buf_impl *dr_dbuf; - - /* pointer to next dirty record */ - struct dbuf_dirty_record *dr_next; - - /* pointer to parent dirty record */ - struct dbuf_dirty_record *dr_parent; - - /* How much space was changed to dsl_pool_dirty_space() for this? */ - unsigned int dr_accounted; - - /* A copy of the bp that points to us */ - blkptr_t dr_bp_copy; - - union dirty_types { - struct dirty_indirect { - - /* protect access to list */ - kmutex_t dr_mtx; - - /* Our list of dirty children */ - list_t dr_children; - } di; - struct dirty_leaf { - - /* - * dr_data is set when we dirty the buffer - * so that we can retain the pointer even if it - * gets COW'd in a subsequent transaction group. - */ - arc_buf_t *dr_data; - blkptr_t dr_overridden_by; - override_states_t dr_override_state; - uint8_t dr_copies; - boolean_t dr_nopwrite; - } dl; - } dt; -} dbuf_dirty_record_t; - -typedef struct dmu_buf_impl { - /* - * The following members are immutable, with the exception of - * db.db_data, which is protected by db_mtx. - */ - - /* the publicly visible structure */ - dmu_buf_t db; - - /* the objset we belong to */ - struct objset *db_objset; - - /* - * handle to safely access the dnode we belong to (NULL when evicted) - */ - struct dnode_handle *db_dnode_handle; - - /* - * our parent buffer; if the dnode points to us directly, - * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf - * only accessed by sync thread ??? 
- * (NULL when evicted) - * May change from NULL to non-NULL under the protection of db_mtx - * (see dbuf_check_blkptr()) - */ - struct dmu_buf_impl *db_parent; - - /* - * link for hash table of all dmu_buf_impl_t's - */ - struct dmu_buf_impl *db_hash_next; - - /* - * Our link on the owner dnodes's dn_dbufs list. - * Protected by its dn_dbufs_mtx. Should be on the same cache line - * as db_level and db_blkid for the best avl_add() performance. - */ - avl_node_t db_link; - - /* our block number */ - uint64_t db_blkid; - - /* - * Pointer to the blkptr_t which points to us. May be NULL if we - * don't have one yet. (NULL when evicted) - */ - blkptr_t *db_blkptr; - - /* - * Our indirection level. Data buffers have db_level==0. - * Indirect buffers which point to data buffers have - * db_level==1. etc. Buffers which contain dnodes have - * db_level==0, since the dnodes are stored in a file. - */ - uint8_t db_level; - - /* db_mtx protects the members below */ - kmutex_t db_mtx; - - /* - * Current state of the buffer - */ - dbuf_states_t db_state; - - /* - * Refcount accessed by dmu_buf_{hold,rele}. - * If nonzero, the buffer can't be destroyed. - * Protected by db_mtx. - */ - zfs_refcount_t db_holds; - - /* buffer holding our data */ - arc_buf_t *db_buf; - - kcondvar_t db_changed; - dbuf_dirty_record_t *db_data_pending; - - /* pointer to most recent dirty record for this buffer */ - dbuf_dirty_record_t *db_last_dirty; - - /* Link in dbuf_cache or dbuf_metadata_cache */ - multilist_node_t db_cache_link; - - /* Tells us which dbuf cache this dbuf is in, if any */ - dbuf_cached_state_t db_caching_status; - - /* Data which is unique to data (leaf) blocks: */ - - /* User callback information. */ - dmu_buf_user_t *db_user; - - /* - * Evict user data as soon as the dirty and reference - * counts are equal. - */ - uint8_t db_user_immediate_evict; - - /* - * This block was freed while a read or write was - * active. 
- */ - uint8_t db_freed_in_flight; - - /* - * dnode_evict_dbufs() or dnode_evict_bonus() tried to - * evict this dbuf, but couldn't due to outstanding - * references. Evict once the refcount drops to 0. - */ - uint8_t db_pending_evict; - - uint8_t db_dirtycnt; -} dmu_buf_impl_t; - -/* Note: the dbuf hash table is exposed only for the mdb module */ -#define DBUF_MUTEXES 256 -#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) -typedef struct dbuf_hash_table { - uint64_t hash_table_mask; - dmu_buf_impl_t **hash_table; - kmutex_t hash_mutexes[DBUF_MUTEXES]; -} dbuf_hash_table_t; - -uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); - -dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); -void dbuf_create_bonus(struct dnode *dn); -int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); -void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); - -void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); - -dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); -dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, - void *tag); -int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp); - -void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, - zio_priority_t prio, arc_flags_t aflags); - -void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); -boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, - uint64_t blkid, void *tag); -uint64_t dbuf_refcount(dmu_buf_impl_t *db); - -void dbuf_rele(dmu_buf_impl_t *db, void *tag); -void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting); - -dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, - uint64_t blkid); - -int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); -void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); -void 
dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); -void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); -dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); -void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, - bp_embedded_type_t etype, enum zio_compress comp, - int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); - -void dbuf_destroy(dmu_buf_impl_t *db); - -void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dbuf_unoverride(dbuf_dirty_record_t *dr); -void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); -void dbuf_release_bp(dmu_buf_impl_t *db); - -boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); - -void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, - struct dmu_tx *); - -void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); - -void dbuf_stats_init(dbuf_hash_table_t *hash); -void dbuf_stats_destroy(void); - -#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) -#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) -#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) -#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) -#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) - -void dbuf_init(void); -void dbuf_fini(void); - -boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - -#define DBUF_GET_BUFC_TYPE(_db) \ - (dbuf_is_metadata(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) - -#define DBUF_IS_CACHEABLE(_db) \ - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) - -#define DBUF_IS_L2CACHEABLE(_db) \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (((_level) > 0 || \ - DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#ifdef ZFS_DEBUG - -/* - * There should be a ## between the string literal and fmt, to make it - * clear that we're joining two strings together, but gcc does not - * support that preprocessor token. - */ -#define dprintf_dbuf(dbuf, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char __db_buf[32]; \ - uint64_t __db_obj = (dbuf)->db.db_object; \ - if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ - else \ - (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ - (u_longlong_t)__db_obj); \ - dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ - "obj=%s lvl=%u blkid=%lld " fmt, \ - __db_buf, (dbuf)->db_level, \ - (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ - dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ - kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define DBUF_VERIFY(db) dbuf_verify(db) - -#else - -#define dprintf_dbuf(db, fmt, ...) -#define dprintf_dbuf_bp(db, bp, fmt, ...) 
-#define DBUF_VERIFY(db) - -#endif - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DBUF_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h deleted file mode 100644 index 2468a1485fd3..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DDT_H -#define _SYS_DDT_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct abd; - -/* - * On-disk DDT formats, in the desired search order (newest version first). - */ -enum ddt_type { - DDT_TYPE_ZAP = 0, - DDT_TYPES -}; - -/* - * DDT classes, in the desired search order (highest replication level first). 
- */ -enum ddt_class { - DDT_CLASS_DITTO = 0, - DDT_CLASS_DUPLICATE, - DDT_CLASS_UNIQUE, - DDT_CLASSES -}; - -#define DDT_TYPE_CURRENT 0 - -#define DDT_COMPRESS_BYTEORDER_MASK 0x80 -#define DDT_COMPRESS_FUNCTION_MASK 0x7f - -/* - * On-disk ddt entry: key (name) and physical storage (value). - */ -typedef struct ddt_key { - zio_cksum_t ddk_cksum; /* 256-bit block checksum */ - /* - * Encoded with logical & physical size, and compression, as follows: - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | 0 | 0 | 0 | comp | PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - */ - uint64_t ddk_prop; -} ddt_key_t; - -#define DDK_GET_LSIZE(ddk) \ - BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) -#define DDK_SET_LSIZE(ddk, x) \ - BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) - -#define DDK_GET_PSIZE(ddk) \ - BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) -#define DDK_SET_PSIZE(ddk, x) \ - BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) - -#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) -#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) - -#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) - -typedef struct ddt_phys { - dva_t ddp_dva[SPA_DVAS_PER_BP]; - uint64_t ddp_refcnt; - uint64_t ddp_phys_birth; -} ddt_phys_t; - -enum ddt_phys_type { - DDT_PHYS_DITTO = 0, - DDT_PHYS_SINGLE = 1, - DDT_PHYS_DOUBLE = 2, - DDT_PHYS_TRIPLE = 3, - DDT_PHYS_TYPES -}; - -/* - * In-core ddt entry - */ -struct ddt_entry { - ddt_key_t dde_key; - ddt_phys_t dde_phys[DDT_PHYS_TYPES]; - zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - struct abd *dde_repair_abd; - enum ddt_type dde_type; - enum ddt_class dde_class; - uint8_t dde_loading; - uint8_t dde_loaded; - kcondvar_t dde_cv; - avl_node_t dde_node; -}; - -/* - * In-core ddt - */ -struct ddt { - kmutex_t ddt_lock; - avl_tree_t ddt_tree; - avl_tree_t ddt_repair_tree; - enum zio_checksum ddt_checksum; - 
spa_t *ddt_spa; - objset_t *ddt_os; - uint64_t ddt_stat_object; - uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; - ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; - ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; - ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; - avl_node_t ddt_node; -}; - -/* - * In-core and on-disk bookmark for DDT walks - */ -typedef struct ddt_bookmark { - uint64_t ddb_class; - uint64_t ddb_type; - uint64_t ddb_checksum; - uint64_t ddb_cursor; -} ddt_bookmark_t; - -/* - * Ops vector to access a specific DDT object type. - */ -typedef struct ddt_ops { - char ddt_op_name[32]; - int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, - boolean_t prehash); - int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); - int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); - void (*ddt_op_prefetch)(objset_t *os, uint64_t object, - ddt_entry_t *dde); - int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, - uint64_t *walk); - int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); -} ddt_ops_t; - -#define DDT_NAMELEN 80 - -extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, char *name); -extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde); -extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, uint64_t *count); -extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, dmu_object_info_t *); -extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls); - -extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, - uint64_t txg); -extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, - const 
ddt_phys_t *ddp, blkptr_t *bp); - -extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); - -extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); -extern void ddt_phys_clear(ddt_phys_t *ddp); -extern void ddt_phys_addref(ddt_phys_t *ddp); -extern void ddt_phys_decref(ddt_phys_t *ddp); -extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, - uint64_t txg); -extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); -extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); - -extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); - -extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); -extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); -extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); -extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); -extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); -extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); - -extern uint64_t ddt_get_dedup_dspace(spa_t *spa); -extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); - -extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, - ddt_phys_t *ddp_willref); -extern int ddt_ditto_copies_present(ddt_entry_t *dde); - -extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); -extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); - -extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); -extern void ddt_enter(ddt_t *ddt); -extern void ddt_exit(ddt_t *ddt); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); -extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); -extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); - -extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, - const blkptr_t *bp); - -extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); -extern 
void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); - -extern int ddt_entry_compare(const void *x1, const void *x2); - -extern void ddt_create(spa_t *spa); -extern int ddt_load(spa_t *spa); -extern void ddt_unload(spa_t *spa); -extern void ddt_sync(spa_t *spa, uint64_t txg); -extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); -extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, ddt_entry_t *dde, dmu_tx_t *tx); - -extern const ddt_ops_t ddt_zap_ops; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DDT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h deleted file mode 100644 index 1f5a837cc717..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ /dev/null @@ -1,1028 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
- * Copyright 2013 DEY Storage Systems, Inc. - * Copyright 2014 HybridCluster. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_DMU_H -#define _SYS_DMU_H - -/* - * This file describes the interface that the DMU provides for its - * consumers. - * - * The DMU also interacts with the SPA. That interface is described in - * dmu_spa.h. - */ - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct uio; -struct xuio; -struct page; -struct vnode; -struct spa; -struct zilog; -struct zio; -struct blkptr; -struct zap_cursor; -struct dsl_dataset; -struct dsl_pool; -struct dnode; -struct drr_begin; -struct drr_end; -struct zbookmark_phys; -struct spa; -struct nvlist; -struct arc_buf; -struct zio_prop; -struct sa_handle; -struct file; -struct locked_range; - -typedef struct objset objset_t; -typedef struct dmu_tx dmu_tx_t; -typedef struct dsl_dir dsl_dir_t; -typedef struct dnode dnode_t; - -typedef enum dmu_object_byteswap { - DMU_BSWAP_UINT8, - DMU_BSWAP_UINT16, - DMU_BSWAP_UINT32, - DMU_BSWAP_UINT64, - DMU_BSWAP_ZAP, - DMU_BSWAP_DNODE, - DMU_BSWAP_OBJSET, - DMU_BSWAP_ZNODE, - DMU_BSWAP_OLDACL, - DMU_BSWAP_ACL, - /* - * Allocating a new byteswap type number makes the on-disk format - * incompatible with any other format that uses the same number. - * - * Data can usually be structured to work with one of the - * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. - */ - DMU_BSWAP_NUMFUNCS -} dmu_object_byteswap_t; - -#define DMU_OT_NEWTYPE 0x80 -#define DMU_OT_METADATA 0x40 -#define DMU_OT_BYTESWAP_MASK 0x3f - -/* - * Defines a uint8_t object type. Object types specify if the data - * in the object is metadata (boolean) and how to byteswap the data - * (dmu_object_byteswap_t). 
All of the types created by this method - * are cached in the dbuf metadata cache. - */ -#define DMU_OT(byteswap, metadata) \ - (DMU_OT_NEWTYPE | \ - ((metadata) ? DMU_OT_METADATA : 0) | \ - ((byteswap) & DMU_OT_BYTESWAP_MASK)) - -#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ - (ot) < DMU_OT_NUMTYPES) - -#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_METADATA) : \ - dmu_ot[(ot)].ot_metadata) - -#define DMU_OT_IS_DDT(ot) \ - ((ot) == DMU_OT_DDT_ZAP) - -#define DMU_OT_IS_ZIL(ot) \ - ((ot) == DMU_OT_INTENT_LOG) - -/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ -#define DMU_OT_IS_FILE(ot) \ - ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) - -#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) - -/* - * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't - * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill - * is repurposed for embedded BPs. - */ -#define DMU_OT_HAS_FILL(ot) \ - ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) - -#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? 
\ - ((ot) & DMU_OT_BYTESWAP_MASK) : \ - dmu_ot[(ot)].ot_byteswap) - -typedef enum dmu_object_type { - DMU_OT_NONE, - /* general: */ - DMU_OT_OBJECT_DIRECTORY, /* ZAP */ - DMU_OT_OBJECT_ARRAY, /* UINT64 */ - DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ - DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ - DMU_OT_BPOBJ, /* UINT64 */ - DMU_OT_BPOBJ_HDR, /* UINT64 */ - /* spa: */ - DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ - DMU_OT_SPACE_MAP, /* UINT64 */ - /* zil: */ - DMU_OT_INTENT_LOG, /* UINT64 */ - /* dmu: */ - DMU_OT_DNODE, /* DNODE */ - DMU_OT_OBJSET, /* OBJSET */ - /* dsl: */ - DMU_OT_DSL_DIR, /* UINT64 */ - DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ - DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ - DMU_OT_DSL_PROPS, /* ZAP */ - DMU_OT_DSL_DATASET, /* UINT64 */ - /* zpl: */ - DMU_OT_ZNODE, /* ZNODE */ - DMU_OT_OLDACL, /* Old ACL */ - DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ - DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ - DMU_OT_MASTER_NODE, /* ZAP */ - DMU_OT_UNLINKED_SET, /* ZAP */ - /* zvol: */ - DMU_OT_ZVOL, /* UINT8 */ - DMU_OT_ZVOL_PROP, /* ZAP */ - /* other; for testing only! 
*/ - DMU_OT_PLAIN_OTHER, /* UINT8 */ - DMU_OT_UINT64_OTHER, /* UINT64 */ - DMU_OT_ZAP_OTHER, /* ZAP */ - /* new object types: */ - DMU_OT_ERROR_LOG, /* ZAP */ - DMU_OT_SPA_HISTORY, /* UINT8 */ - DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ - DMU_OT_POOL_PROPS, /* ZAP */ - DMU_OT_DSL_PERMS, /* ZAP */ - DMU_OT_ACL, /* ACL */ - DMU_OT_SYSACL, /* SYSACL */ - DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ - DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ - DMU_OT_NEXT_CLONES, /* ZAP */ - DMU_OT_SCAN_QUEUE, /* ZAP */ - DMU_OT_USERGROUP_USED, /* ZAP */ - DMU_OT_USERGROUP_QUOTA, /* ZAP */ - DMU_OT_USERREFS, /* ZAP */ - DMU_OT_DDT_ZAP, /* ZAP */ - DMU_OT_DDT_STATS, /* ZAP */ - DMU_OT_SA, /* System attr */ - DMU_OT_SA_MASTER_NODE, /* ZAP */ - DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ - DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ - DMU_OT_SCAN_XLATE, /* ZAP */ - DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ - DMU_OT_DEADLIST, /* ZAP */ - DMU_OT_DEADLIST_HDR, /* UINT64 */ - DMU_OT_DSL_CLONES, /* ZAP */ - DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ - /* - * Do not allocate new object types here. Doing so makes the on-disk - * format incompatible with any other format that uses the same object - * type number. - * - * When creating an object which does not have one of the above types - * use the DMU_OTN_* type with the correct byteswap and metadata - * values. - * - * The DMU_OTN_* types do not have entries in the dmu_ot table, - * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead - * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead - * of indexing into dmu_ot directly (this works for both DMU_OT_* types - * and DMU_OTN_* types). - */ - DMU_OT_NUMTYPES, - - /* - * Names for valid types declared with DMU_OT(). 
- */ - DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), - DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), - DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), - DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), - DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), - DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), - DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), - DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), - DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), - DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), -} dmu_object_type_t; - -/* - * These flags are intended to be used to specify the "txg_how" - * parameter when calling the dmu_tx_assign() function. See the comment - * above dmu_tx_assign() for more details on the meaning of these flags. - */ -#define TXG_NOWAIT (0ULL) -#define TXG_WAIT (1ULL<<0) -#define TXG_NOTHROTTLE (1ULL<<1) - -void byteswap_uint64_array(void *buf, size_t size); -void byteswap_uint32_array(void *buf, size_t size); -void byteswap_uint16_array(void *buf, size_t size); -void byteswap_uint8_array(void *buf, size_t size); -void zap_byteswap(void *buf, size_t size); -void zfs_oldacl_byteswap(void *buf, size_t size); -void zfs_acl_byteswap(void *buf, size_t size); -void zfs_znode_byteswap(void *buf, size_t size); - -#define DS_FIND_SNAPSHOTS (1<<0) -#define DS_FIND_CHILDREN (1<<1) -#define DS_FIND_SERIALIZE (1<<2) - -/* - * The maximum number of bytes that can be accessed as part of one - * operation, including metadata. - */ -#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */ -#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ - -#define DMU_USERUSED_OBJECT (-1ULL) -#define DMU_GROUPUSED_OBJECT (-2ULL) - -/* - * artificial blkids for bonus buffer and spill blocks - */ -#define DMU_BONUS_BLKID (-1ULL) -#define DMU_SPILL_BLKID (-2ULL) -/* - * Public routines to create, destroy, open, and close objsets. 
- */ -int dmu_objset_hold(const char *name, void *tag, objset_t **osp); -int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); -void dmu_objset_rele(objset_t *os, void *tag); -void dmu_objset_disown(objset_t *os, void *tag); -int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); - -void dmu_objset_evict_dbufs(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, - struct nvlist *snaps); -int dmu_objset_clone(const char *name, const char *origin); -int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, - struct nvlist *errlist); -int dmu_objset_snapshot_one(const char *fsname, const char *snapname); -int dmu_objset_snapshot_tmp(const char *, const char *, int); -int dmu_objset_find(char *name, int func(const char *, void *), void *arg, - int flags); -void dmu_objset_byteswap(void *buf, size_t size); -int dsl_dataset_rename_snapshot(const char *fsname, - const char *oldsnapname, const char *newsnapname, boolean_t recursive); -int dmu_objset_remap_indirects(const char *fsname); - -typedef struct dmu_buf { - uint64_t db_object; /* object that this buffer is part of */ - uint64_t db_offset; /* byte offset in this object */ - uint64_t db_size; /* size of buffer in bytes */ - void *db_data; /* data in buffer */ -} dmu_buf_t; - -/* - * The names of zap entries in the DIRECTORY_OBJECT of the MOS. 
- */ -#define DMU_POOL_DIRECTORY_OBJECT 1 -#define DMU_POOL_CONFIG "config" -#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" -#define DMU_POOL_FEATURES_FOR_READ "features_for_read" -#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" -#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" -#define DMU_POOL_ROOT_DATASET "root_dataset" -#define DMU_POOL_SYNC_BPOBJ "sync_bplist" -#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" -#define DMU_POOL_ERRLOG_LAST "errlog_last" -#define DMU_POOL_SPARES "spares" -#define DMU_POOL_DEFLATE "deflate" -#define DMU_POOL_HISTORY "history" -#define DMU_POOL_PROPS "pool_props" -#define DMU_POOL_L2CACHE "l2cache" -#define DMU_POOL_TMP_USERREFS "tmp_userrefs" -#define DMU_POOL_DDT "DDT-%s-%s-%s" -#define DMU_POOL_DDT_STATS "DDT-statistics" -#define DMU_POOL_CREATION_VERSION "creation_version" -#define DMU_POOL_SCAN "scan" -#define DMU_POOL_FREE_BPOBJ "free_bpobj" -#define DMU_POOL_BPTREE_OBJ "bptree_obj" -#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" -#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" -#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" -#define DMU_POOL_REMOVING "com.delphix:removing" -#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" -#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" -#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" - -/* - * Allocate an object from this objset. The range of object numbers - * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. - * - * The transaction must be assigned to a txg. The newly allocated - * object will be "held" in the transaction (ie. you can modify the - * newly allocated object in this transaction). - * - * dmu_object_alloc() chooses an object and returns it in *objectp. - * - * dmu_object_claim() allocates a specific object number. If that - * number is already allocated, it fails and returns EEXIST. - * - * Return 0 on success, or ENOSPC or EEXIST as specified above. 
- */ -uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); -uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, - int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, - int dnodesize, dmu_tx_t *tx); -int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, - int dnodesize, dmu_tx_t *tx); -int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, - dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, - int bonuslen, int dnodesize, dmu_tx_t *txp); -int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); -int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); - -/* - * Free an object from this objset. - * - * The object's data will be freed as well (ie. you don't need to call - * dmu_free(object, 0, -1, tx)). - * - * The object need not be held in the transaction. - * - * If there are any holds on this object's buffers (via dmu_buf_hold()), - * or tx holds on the object (via dmu_tx_hold_object()), you can not - * free it; it fails and returns EBUSY. - * - * If the object is not allocated, it fails and returns ENOENT. - * - * Return 0 on success, or EBUSY or ENOENT as specified above. - */ -int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); - -/* - * Find the next allocated or free object. - * - * The objectp parameter is in-out. It will be updated to be the next - * object which is allocated. Ignore objects which have not been - * modified since txg. 
- * - * XXX Can only be called on a objset with no dirty data. - * - * Returns 0 on success, or ENOENT if there are no more objects. - */ -int dmu_object_next(objset_t *os, uint64_t *objectp, - boolean_t hole, uint64_t txg); - -/* - * Set the data blocksize for an object. - * - * The object cannot have any blocks allcated beyond the first. If - * the first block is allocated already, the new size must be greater - * than the current block size. If these conditions are not met, - * ENOTSUP will be returned. - * - * Returns 0 on success, or EBUSY if there are any holds on the object - * contents, or ENOTSUP as described above. - */ -int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, - int ibs, dmu_tx_t *tx); - -/* - * Set the checksum property on a dnode. The new checksum algorithm will - * apply to all newly written blocks; existing blocks will not be affected. - */ -void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx); - -/* - * Set the compress property on a dnode. The new compression algorithm will - * apply to all newly written blocks; existing blocks will not be affected. - */ -void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx); - -int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg); - -void -dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, - void *data, uint8_t etype, uint8_t comp, int uncompressed_size, - int compressed_size, int byteorder, dmu_tx_t *tx); - -/* - * Decide how to write a block: checksum, compression, number of copies, etc. - */ -#define WP_NOFILL 0x1 -#define WP_DMU_SYNC 0x2 -#define WP_SPILL 0x4 - -void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, - struct zio_prop *zp); -/* - * The bonus data is accessed more or less like a regular buffer. 
- * You must dmu_bonus_hold() to get the buffer, which will give you a - * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus - * data. As with any normal buffer, you must call dmu_buf_will_dirty() - * before modifying it, and the - * object must be held in an assigned transaction before calling - * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus - * buffer as well. You must release your hold with dmu_buf_rele(). - * - * Returns ENOENT, EIO, or 0. - */ -int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); -int dmu_bonus_max(void); -int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); -int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); -dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); -int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); - -/* - * Special spill buffer support used by "SA" framework - */ - -int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); -int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, - void *tag, dmu_buf_t **dbp); -int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); - -/* - * Obtain the DMU buffer from the specified object which contains the - * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so - * that it will remain in memory. You must release the hold with - * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your - * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. - * - * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill - * on the returned buffer before reading or writing the buffer's - * db_data. The comments for those routines describe what particular - * operations are valid after calling them. - * - * The object number must be a valid, allocated object number. 
- */ -int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **, int flags); -int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags); - -/* - * Add a reference to a dmu buffer that has already been held via - * dmu_buf_hold() in the current context. - */ -void dmu_buf_add_ref(dmu_buf_t *db, void* tag); - -/* - * Attempt to add a reference to a dmu buffer that is in an unknown state, - * using a pointer that may have been invalidated by eviction processing. - * The request will succeed if the passed in dbuf still represents the - * same os/object/blkid, is ineligible for eviction, and has at least - * one hold by a user other than the syncer. - */ -boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, - uint64_t blkid, void *tag); - -void dmu_buf_rele(dmu_buf_t *db, void *tag); -uint64_t dmu_buf_refcount(dmu_buf_t *db); - -/* - * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a - * range of an object. A pointer to an array of dmu_buf_t*'s is - * returned (in *dbpp). - * - * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and - * frees the array. The hold on the array of buffers MUST be released - * with dmu_buf_rele_array. You can NOT release the hold on each buffer - * individually with dmu_buf_rele. - */ -int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, boolean_t read, void *tag, - int *numbufsp, dmu_buf_t ***dbpp); -int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, - uint32_t flags); -void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); - -typedef void dmu_buf_evict_func_t(void *user_ptr); - -/* - * A DMU buffer user object may be associated with a dbuf for the - * duration of its lifetime. This allows the user of a dbuf (client) - * to attach private data to a dbuf (e.g. 
in-core only data such as a - * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified - * when that dbuf has been evicted. Clients typically respond to the - * eviction notification by freeing their private data, thus ensuring - * the same lifetime for both dbuf and private data. - * - * The mapping from a dmu_buf_user_t to any client private data is the - * client's responsibility. All current consumers of the API with private - * data embed a dmu_buf_user_t as the first member of the structure for - * their private data. This allows conversions between the two types - * with a simple cast. Since the DMU buf user API never needs access - * to the private data, other strategies can be employed if necessary - * or convenient for the client (e.g. using container_of() to do the - * conversion for private data that cannot have the dmu_buf_user_t as - * its first member). - * - * Eviction callbacks are executed without the dbuf mutex held or any - * other type of mechanism to guarantee that the dbuf is still available. - * For this reason, users must assume the dbuf has already been freed - * and not reference the dbuf from the callback context. - * - * Users requesting "immediate eviction" are notified as soon as the dbuf - * is only referenced by dirty records (dirties == holds). Otherwise the - * notification occurs after eviction processing for the dbuf begins. - */ -typedef struct dmu_buf_user { - /* - * Asynchronous user eviction callback state. - */ - taskq_ent_t dbu_tqent; - - /* - * This instance's eviction function pointers. - * - * dbu_evict_func_sync is called synchronously and then - * dbu_evict_func_async is executed asynchronously on a taskq. - */ - dmu_buf_evict_func_t *dbu_evict_func_sync; - dmu_buf_evict_func_t *dbu_evict_func_async; -#ifdef ZFS_DEBUG - /* - * Pointer to user's dbuf pointer. NULL for clients that do - * not associate a dbuf with their user data. 
- * - * The dbuf pointer is cleared upon eviction so as to catch - * use-after-evict bugs in clients. - */ - dmu_buf_t **dbu_clear_on_evict_dbufp; -#endif -} dmu_buf_user_t; - -/* - * Initialize the given dmu_buf_user_t instance with the eviction function - * evict_func, to be called when the user is evicted. - * - * NOTE: This function should only be called once on a given dmu_buf_user_t. - * To allow enforcement of this, dbu must already be zeroed on entry. - */ -/*ARGSUSED*/ -static inline void -dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, - dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp) -{ - ASSERT(dbu->dbu_evict_func_sync == NULL); - ASSERT(dbu->dbu_evict_func_async == NULL); - - /* must have at least one evict func */ - IMPLY(evict_func_sync == NULL, evict_func_async != NULL); - dbu->dbu_evict_func_sync = evict_func_sync; - dbu->dbu_evict_func_async = evict_func_async; -#ifdef ZFS_DEBUG - dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; -#endif -} - -/* - * Attach user data to a dbuf and mark it for normal (when the dbuf's - * data is cleared or its reference count goes to zero) eviction processing. - * - * Returns NULL on success, or the existing user if another user currently - * owns the buffer. - */ -void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Attach user data to a dbuf and mark it for immediate (its dirty and - * reference counts are equal) eviction processing. - * - * Returns NULL on success, or the existing user if another user currently - * owns the buffer. - */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Replace the current user of a dbuf. - * - * If given the current user of a dbuf, replaces the dbuf's user with - * "new_user" and returns the user data pointer that was replaced. - * Otherwise returns the current, and unmodified, dbuf user pointer. 
- */ -void *dmu_buf_replace_user(dmu_buf_t *db, - dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); - -/* - * Remove the specified user data for a DMU buffer. - * - * Returns the user that was removed on success, or the current user if - * another user currently owns the buffer. - */ -void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Returns the user data (dmu_buf_user_t *) associated with this dbuf. - */ -void *dmu_buf_get_user(dmu_buf_t *db); - -objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); - -/* Block until any in-progress dmu buf user evictions complete. */ -void dmu_buf_user_evict_wait(void); - -/* - * Returns the blkptr associated with this dbuf, or NULL if not set. - */ -struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); - -/* - * Indicate that you are going to modify the buffer's data (db_data). - * - * The transaction (tx) must be assigned to a txg (ie. you've called - * dmu_tx_assign()). The buffer's object must be held in the tx - * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). - */ -void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); - -/* - * You must create a transaction, then hold the objects which you will - * (or might) modify as part of this transaction. Then you must assign - * the transaction to a transaction group. Once the transaction has - * been assigned, you can modify buffers which belong to held objects as - * part of this transaction. You can't modify buffers before the - * transaction has been assigned; you can't modify buffers which don't - * belong to objects which this transaction holds; you can't hold - * objects once the transaction has been assigned. You may hold an - * object which you are going to free (with dmu_object_free()), but you - * don't have to. - * - * You can abort the transaction before it has been assigned. 
- * - * Note that you may hold buffers (with dmu_buf_hold) at any time, - * regardless of transaction state. - */ - -#define DMU_NEW_OBJECT (-1ULL) -#define DMU_OBJECT_END (-1ULL) - -dmu_tx_t *dmu_tx_create(objset_t *os); -void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); -void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, - int len); -void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, - uint64_t len); -void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, - uint64_t len); -void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); -void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, - const char *name); -void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); -void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); -void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); -void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); -void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); -void dmu_tx_abort(dmu_tx_t *tx); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -void dmu_tx_wait(dmu_tx_t *tx); -void dmu_tx_commit(dmu_tx_t *tx); -void dmu_tx_mark_netfree(dmu_tx_t *tx); - -/* - * To register a commit callback, dmu_tx_callback_register() must be called. - * - * dcb_data is a pointer to caller private data that is passed on as a - * callback parameter. The caller is responsible for properly allocating and - * freeing it. - * - * When registering a callback, the transaction must be already created, but - * it cannot be committed or aborted. It can be assigned to a txg or not. - * - * The callback will be called after the transaction has been safely written - * to stable storage and will also be called if the dmu_tx is aborted. 
- * If there is any error which prevents the transaction from being committed to - * disk, the callback will be called with a value of error != 0. - */ -typedef void dmu_tx_callback_func_t(void *dcb_data, int error); - -void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, - void *dcb_data); - -/* - * Free up the data blocks for a defined range of a file. If size is - * -1, the range from offset to end-of-file is freed. - */ -int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, dmu_tx_t *tx); -int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size); -int dmu_free_long_object(objset_t *os, uint64_t object); - -/* - * Convenience functions. - * - * Canfail routines will return 0 on success, or an errno if there is a - * nonrecoverable I/O error. - */ -#define DMU_READ_PREFETCH 0 /* prefetch */ -#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ -int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags); -int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, - uint32_t flags); -void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx); -int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); -int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); -int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); -int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, - dmu_tx_t *tx); -int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, - dmu_tx_t *tx); -int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, - dmu_tx_t *tx); -#ifdef _KERNEL -#ifdef illumos 
-int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, struct page *pp, dmu_tx_t *tx); -#else -int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); -int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, - int *rbehind, int *rahead, int last_size); -#endif -#endif -struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); -void dmu_return_arcbuf(struct arc_buf *buf); -void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, - struct arc_buf *buf, dmu_tx_t *tx); -void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, - dmu_tx_t *tx); -int dmu_xuio_init(struct xuio *uio, int niov); -void dmu_xuio_fini(struct xuio *uio); -int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, - size_t n); -int dmu_xuio_cnt(struct xuio *uio); -struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); -void dmu_xuio_clear(struct xuio *uio, int i); -void xuio_stat_wbuf_copied(void); -void xuio_stat_wbuf_nocopy(void); - -extern boolean_t zfs_prefetch_disable; -extern int zfs_max_recordsize; - -/* - * Asynchronously try to read in the data. - */ -void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, - uint64_t len, enum zio_priority pri); - -typedef struct dmu_object_info { - /* All sizes are in bytes unless otherwise indicated. 
*/ - uint32_t doi_data_block_size; - uint32_t doi_metadata_block_size; - dmu_object_type_t doi_type; - dmu_object_type_t doi_bonus_type; - uint64_t doi_bonus_size; - uint8_t doi_indirection; /* 2 = dnode->indirect->data */ - uint8_t doi_checksum; - uint8_t doi_compress; - uint8_t doi_nblkptr; - int8_t doi_pad[4]; - uint64_t doi_dnodesize; - uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ - uint64_t doi_max_offset; - uint64_t doi_fill_count; /* number of non-empty blocks */ -} dmu_object_info_t; - -typedef void arc_byteswap_func_t(void *buf, size_t size); - -typedef struct dmu_object_type_info { - dmu_object_byteswap_t ot_byteswap; - boolean_t ot_metadata; - boolean_t ot_dbuf_metadata_cache; - char *ot_name; -} dmu_object_type_info_t; - -typedef struct dmu_object_byteswap_info { - arc_byteswap_func_t *ob_func; - char *ob_name; -} dmu_object_byteswap_info_t; - -extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; -extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; - -/* - * Get information on a DMU object. - * - * Return 0 on success or ENOENT if object is not allocated. - * - * If doi is NULL, just indicates whether the object exists. - */ -int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); -void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); -/* Like dmu_object_info, but faster if you have a held dnode in hand. */ -void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); -/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ -void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); -/* - * Like dmu_object_info_from_db, but faster still when you only care about - * the size. This is specifically optimized for zfs_getattr(). 
- */ -void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, - u_longlong_t *nblk512); - -void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); - -typedef struct dmu_objset_stats { - uint64_t dds_num_clones; /* number of clones of this */ - uint64_t dds_creation_txg; - uint64_t dds_guid; - dmu_objset_type_t dds_type; - uint8_t dds_is_snapshot; - uint8_t dds_inconsistent; - char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; -} dmu_objset_stats_t; - -/* - * Get stats on a dataset. - */ -void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); - -/* - * Add entries to the nvlist for all the objset's properties. See - * zfs_prop_table[] and zfs(1m) for details on the properties. - */ -void dmu_objset_stats(objset_t *os, struct nvlist *nv); - -/* - * Get the space usage statistics for statvfs(). - * - * refdbytes is the amount of space "referenced" by this objset. - * availbytes is the amount of space available to this objset, taking - * into account quotas & reservations, assuming that no other objsets - * use the space first. These values correspond to the 'referenced' and - * 'available' properties, described in the zfs(1m) manpage. - * - * usedobjs and availobjs are the number of objects currently allocated, - * and available. - */ -void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); - -/* - * The fsid_guid is a 56-bit ID that can change to avoid collisions. - * (Contrast with the ds_guid which is a 64-bit ID that will never - * change, so there is a small probability that it will collide.) 
- */ -uint64_t dmu_objset_fsid_guid(objset_t *os); - -/* - * Get the [cm]time for an objset's snapshot dir - */ -timestruc_t dmu_objset_snap_cmtime(objset_t *os); - -int dmu_objset_is_snapshot(objset_t *os); - -extern struct spa *dmu_objset_spa(objset_t *os); -extern struct zilog *dmu_objset_zil(objset_t *os); -extern struct dsl_pool *dmu_objset_pool(objset_t *os); -extern struct dsl_dataset *dmu_objset_ds(objset_t *os); -extern void dmu_objset_name(objset_t *os, char *buf); -extern dmu_objset_type_t dmu_objset_type(objset_t *os); -extern uint64_t dmu_objset_id(objset_t *os); -extern uint64_t dmu_objset_dnodesize(objset_t *os); -extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); -extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); -extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *id, uint64_t *offp, boolean_t *case_conflict); -extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, - int maxlen, boolean_t *conflict); -extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp); - -typedef int objset_used_cb_t(dmu_object_type_t bonustype, - void *bonus, uint64_t *userp, uint64_t *groupp); -extern void dmu_objset_register_type(dmu_objset_type_t ost, - objset_used_cb_t *cb); -extern void dmu_objset_set_user(objset_t *os, void *user_ptr); -extern void *dmu_objset_get_user(objset_t *os); - -/* - * Return the txg number for the given assigned transaction. - */ -uint64_t dmu_tx_get_txg(dmu_tx_t *tx); - -/* - * Synchronous write. - * If a parent zio is provided this function initiates a write on the - * provided buffer as a child of the parent zio. - * In the absence of a parent zio, the write is completed synchronously. - * At write completion, blk is filled with the bp of the written block. 
- * Note that while the data covered by this function will be on stable - * storage when the write completes this new data does not become a - * permanent part of the file until the associated transaction commits. - */ - -/* - * {zfs,zvol,ztest}_get_done() args - */ -typedef struct zgd { - struct lwb *zgd_lwb; - struct blkptr *zgd_bp; - dmu_buf_t *zgd_db; - struct locked_range *zgd_lr; - void *zgd_private; -} zgd_t; - -typedef void dmu_sync_cb_t(zgd_t *arg, int error); -int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); - -/* - * Find the next hole or data block in file starting at *off - * Return found offset in *off. Return ESRCH for end of file. - */ -int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, - uint64_t *off); - -/* - * Check if a DMU object has any dirty blocks. If so, sync out - * all pending transaction groups. Otherwise, this function - * does not alter DMU state. This could be improved to only sync - * out the necessary transaction groups for this particular - * object. - */ -int dmu_object_wait_synced(objset_t *os, uint64_t object); - -/* - * Initial setup and final teardown. 
- */ -extern void dmu_init(void); -extern void dmu_fini(void); - -typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, - uint64_t object, uint64_t offset, int len); -void dmu_traverse_objset(objset_t *os, uint64_t txg_start, - dmu_traverse_cb_t cb, void *arg); -int dmu_diff(const char *tosnap_name, const char *fromsnap_name, - struct file *fp, offset_t *offp); - -/* CRC64 table */ -#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ -extern uint64_t zfs_crc64_table[256]; - -extern int zfs_mdcomp_disable; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h deleted file mode 100644 index 5cf7aea4711f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ /dev/null @@ -1,315 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2012, Martin Matuska . All rights reserved. 
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DMU_IMPL_H -#define _SYS_DMU_IMPL_H - -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This is the locking strategy for the DMU. Numbers in parenthesis are - * cases that use that lock order, referenced below: - * - * ARC is self-contained - * bplist is self-contained - * refcount is self-contained - * txg is self-contained (hopefully!) - * zst_lock - * zf_rwlock - * - * XXX try to improve evicting path? - * - * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > - * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs - * - * dp_config_rwlock - * must be held before: everything - * protects dd namespace changes - * protects property changes globally - * held from: - * dsl_dir_open/r: - * dsl_dir_create_sync/w: - * dsl_dir_sync_destroy/w: - * dsl_dir_rename_sync/w: - * dsl_prop_changed_notify/r: - * - * os_obj_lock - * must be held before: - * everything except dp_config_rwlock - * protects os_obj_next - * held from: - * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock - * - * dn_struct_rwlock - * must be held before: - * everything except dp_config_rwlock and os_obj_lock - * protects structure of dnode (eg. nlevels) - * db_blkptr can change when syncing out change to nlevels - * dn_maxblkid - * dn_nlevels - * dn_*blksz* - * phys nlevels, maxblkid, physical blkptr_t's (?) - * held from: - * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch - * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) - * dbuf_read_impl: db_mtx, dmu_zfetch() - * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() - * dbuf_new_size: db_mtx - * dbuf_dirty: db_mtx - * dbuf_findbp: (callers, phys? - the real need) - * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) 
- * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx - * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() - * dnode_sync/w (increase_indirection): db_mtx (phys) - * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) - * dnode_new_blkid/w: (dn_maxblkid) - * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) - * dnode_next_offset: (phys) - * - * dn_dbufs_mtx - * must be held before: - * db_mtx, hash_mutexes - * protects: - * dn_dbufs - * dn_evicted - * held from: - * dmu_evict_user: db_mtx (dn_dbufs) - * dbuf_free_range: db_mtx (dn_dbufs) - * dbuf_remove_ref: db_mtx, callees: - * dbuf_hash_remove: hash_mutexes, db_mtx - * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) - * dnode_set_blksz: (dn_dbufs) - * - * hash_mutexes (global) - * must be held before: - * db_mtx - * protects dbuf_hash_table (global) and db_hash_next - * held from: - * dbuf_find: db_mtx - * dbuf_hash_insert: db_mtx - * dbuf_hash_remove: db_mtx - * - * db_mtx (meta-leaf) - * must be held before: - * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) - * protects: - * db_state - * db_holds - * db_buf - * db_changed - * db_data_pending - * db_dirtied - * db_link - * db_dirty_node (??) 
- * db_dirtycnt - * db_d.* - * db.* - * held from: - * dbuf_dirty: dn_mtx, dn_dirty_mtx - * dbuf_dirty->dsl_dir_willuse_space: dd_lock - * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock - * dbuf_undirty: dn_dirty_mtx (db_d) - * dbuf_write_done: dn_dirty_mtx (db_state) - * dbuf_* - * dmu_buf_update_user: none (db_d) - * dmu_evict_user: none (db_d) (maybe can eliminate) - * dbuf_find: none (db_holds) - * dbuf_hash_insert: none (db_holds) - * dmu_buf_read_array_impl: none (db_state, db_changed) - * dmu_sync: none (db_dirty_node, db_d) - * dnode_reallocate: none (db) - * - * dn_mtx (leaf) - * protects: - * dn_dirty_dbufs - * dn_ranges - * phys accounting - * dn_allocated_txg - * dn_free_txg - * dn_assigned_txg - * dn_dirty_txg - * dn_notxholds - * dn_dirtyctx - * dn_dirtyctx_firstset - * (dn_phys copy fields?) - * (dn_phys contents?) - * held from: - * dnode_* - * dbuf_dirty: none - * dbuf_sync: none (phys accounting) - * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) - * dbuf_write_done: none (phys accounting) - * dmu_object_info_from_dnode: none (accounting) - * dmu_tx_commit: none - * dmu_tx_hold_object_impl: none - * dmu_tx_try_assign: dn_notxholds(cv) - * dmu_tx_unassign: none - * - * dd_lock - * must be held before: - * ds_lock - * ancestors' dd_lock - * protects: - * dd_prop_cbs - * dd_sync_* - * dd_used_bytes - * dd_tempreserved - * dd_space_towrite - * dd_myname - * dd_phys accounting? 
- * held from: - * dsl_dir_* - * dsl_prop_changed_notify: none (dd_prop_cbs) - * dsl_prop_register: none (dd_prop_cbs) - * dsl_prop_unregister: none (dd_prop_cbs) - * - * os_lock (leaf) - * protects: - * os_dirty_dnodes - * os_free_dnodes - * os_dnodes - * os_downgraded_dbufs - * dn_dirtyblksz - * dn_dirty_link - * held from: - * dnode_create: none (os_dnodes) - * dnode_destroy: none (os_dnodes) - * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) - * dnode_free: none (dn_dirtyblksz, os_*_dnodes) - * - * ds_lock - * protects: - * ds_objset - * ds_open_refcount - * ds_snapname - * ds_phys accounting - * ds_phys userrefs zapobj - * ds_reserved - * held from: - * dsl_dataset_* - * - * dr_mtx (leaf) - * protects: - * dr_children - * held from: - * dbuf_dirty - * dbuf_undirty - * dbuf_sync_indirect - * dnode_new_blkid - */ - -struct objset; -struct dmu_pool; - -typedef struct dmu_xuio { - int next; - int cnt; - struct arc_buf **bufs; - iovec_t *iovp; -} dmu_xuio_t; - -typedef struct xuio_stats { - /* loaned yet not returned arc_buf */ - kstat_named_t xuiostat_onloan_rbuf; - kstat_named_t xuiostat_onloan_wbuf; - /* whether a copy is made when loaning out a read buffer */ - kstat_named_t xuiostat_rbuf_copied; - kstat_named_t xuiostat_rbuf_nocopy; - /* whether a copy is made when assigning a write buffer */ - kstat_named_t xuiostat_wbuf_copied; - kstat_named_t xuiostat_wbuf_nocopy; -} xuio_stats_t; - -static xuio_stats_t xuio_stats = { - { "onloan_read_buf", KSTAT_DATA_UINT64 }, - { "onloan_write_buf", KSTAT_DATA_UINT64 }, - { "read_buf_copied", KSTAT_DATA_UINT64 }, - { "read_buf_nocopy", KSTAT_DATA_UINT64 }, - { "write_buf_copied", KSTAT_DATA_UINT64 }, - { "write_buf_nocopy", KSTAT_DATA_UINT64 } -}; - -#define XUIOSTAT_INCR(stat, val) \ - atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -/* - * The list of data whose inclusion in a send stream can be pending from - * one call to backup_cb to another. 
Multiple calls to dump_free() and - * dump_freeobjects() can be aggregated into a single DRR_FREE or - * DRR_FREEOBJECTS replay record. - */ -typedef enum { - PENDING_NONE, - PENDING_FREE, - PENDING_FREEOBJECTS -} dmu_pendop_t; - -typedef struct dmu_sendarg { - list_node_t dsa_link; - dmu_replay_record_t *dsa_drr; - kthread_t *dsa_td; - struct file *dsa_fp; - int dsa_outfd; - struct proc *dsa_proc; - offset_t *dsa_off; - objset_t *dsa_os; - zio_cksum_t dsa_zc; - uint64_t dsa_toguid; - int dsa_err; - dmu_pendop_t dsa_pending_op; - uint64_t dsa_featureflags; - uint64_t dsa_last_data_object; - uint64_t dsa_last_data_offset; - uint64_t dsa_resume_object; - uint64_t dsa_resume_offset; - boolean_t dsa_sent_begin; - boolean_t dsa_sent_end; -} dmu_sendarg_t; - -void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); -void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); -int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, - void *, dmu_buf_t **); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h deleted file mode 100644 index cae1c7719a83..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ /dev/null @@ -1,221 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_DMU_OBJSET_H -#define _SYS_DMU_OBJSET_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern krwlock_t os_lock; - -struct dsl_pool; -struct dsl_dataset; -struct dmu_tx; - -#define OBJSET_PHYS_SIZE 2048 -#define OBJSET_OLD_PHYS_SIZE 1024 - -#define OBJSET_BUF_HAS_USERUSED(buf) \ - (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) - -#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) - -typedef struct objset_phys { - dnode_phys_t os_meta_dnode; - zil_header_t os_zil_header; - uint64_t os_type; - uint64_t os_flags; - char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - - sizeof (zil_header_t) - sizeof (uint64_t)*2]; - dnode_phys_t os_userused_dnode; - dnode_phys_t os_groupused_dnode; -} objset_phys_t; - -#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) -struct objset { - /* Immutable: */ - struct dsl_dataset *os_dsl_dataset; - spa_t *os_spa; - arc_buf_t *os_phys_buf; - objset_phys_t *os_phys; - /* - * The following "special" dnodes have no parent, are exempt - * from dnode_move(), and are not recorded in os_dnodes, but they - * root their descendents in this objset using handles anyway, so - * that all access to dnodes from dbufs consistently uses handles. 
- */ - dnode_handle_t os_meta_dnode; - dnode_handle_t os_userused_dnode; - dnode_handle_t os_groupused_dnode; - zilog_t *os_zil; - - list_node_t os_evicting_node; - - /* can change, under dsl_dir's locks: */ - uint64_t os_dnodesize; /* default dnode size for new objects */ - enum zio_checksum os_checksum; - enum zio_compress os_compress; - uint8_t os_copies; - enum zio_checksum os_dedup_checksum; - boolean_t os_dedup_verify; - zfs_logbias_op_t os_logbias; - zfs_cache_type_t os_primary_cache; - zfs_cache_type_t os_secondary_cache; - zfs_sync_type_t os_sync; - zfs_redundant_metadata_type_t os_redundant_metadata; - int os_recordsize; - /* - * The next four values are used as a cache of whatever's on disk, and - * are initialized the first time these properties are queried. Before - * being initialized with their real values, their values are - * OBJSET_PROP_UNINITIALIZED. - */ - uint64_t os_version; - uint64_t os_normalization; - uint64_t os_utf8only; - uint64_t os_casesensitivity; - /* - * The largest zpl file block allowed in special class. - * cached here instead of zfsvfs for easier access. - */ - int os_zpl_special_smallblock; - - /* - * Pointer is constant; the blkptr it points to is protected by - * os_dsl_dataset->ds_bp_rwlock - */ - blkptr_t *os_rootbp; - - /* no lock needed: */ - struct dmu_tx *os_synctx; /* XXX sketchy */ - zil_header_t os_zil_header; - multilist_t *os_synced_dnodes; - uint64_t os_flags; - uint64_t os_freed_dnodes; - boolean_t os_rescan_dnodes; - - /* Protected by os_obj_lock */ - kmutex_t os_obj_lock; - uint64_t os_obj_next_chunk; - - /* Per-CPU next object to allocate, protected by atomic ops. 
*/ - uint64_t *os_obj_next_percpu; - int os_obj_next_percpu_len; - - /* Protected by os_lock */ - kmutex_t os_lock; - multilist_t *os_dirty_dnodes[TXG_SIZE]; - list_t os_dnodes; - list_t os_downgraded_dbufs; - - /* Protects changes to DMU_{USER,GROUP}USED_OBJECT */ - kmutex_t os_userused_lock; - - /* stuff we store for the user */ - kmutex_t os_user_ptr_lock; - void *os_user_ptr; - sa_os_t *os_sa; -}; - -#define DMU_META_OBJSET 0 -#define DMU_META_DNODE_OBJECT 0 -#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) -#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) -#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) -#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) - -#define DMU_OS_IS_L2CACHEABLE(os) \ - ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ - (os)->os_secondary_cache == ZFS_CACHE_METADATA) - -#define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE) - -/* called from zpl */ -int dmu_objset_hold(const char *name, void *tag, objset_t **osp); -int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); -int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj, - dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); -void dmu_objset_refresh_ownership(struct dsl_dataset *ds, - struct dsl_dataset **newds, void *tag); -void dmu_objset_rele(objset_t *os, void *tag); -void dmu_objset_disown(objset_t *os, void *tag); -int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); - -void dmu_objset_stats(objset_t *os, nvlist_t *nv); -void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); -void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); -uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, - int func(struct dsl_pool *, struct dsl_dataset *, void *), - void *arg, int flags); -int 
dmu_objset_prefetch(const char *name, void *arg); -void dmu_objset_evict_dbufs(objset_t *os); -timestruc_t dmu_objset_snap_cmtime(objset_t *os); - -/* called from dsl */ -void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); -boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); -objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, - blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); -int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, - objset_t **osp); -void dmu_objset_evict(objset_t *os); -void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); -void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); -boolean_t dmu_objset_userused_enabled(objset_t *os); -int dmu_objset_userspace_upgrade(objset_t *os); -boolean_t dmu_objset_userspace_present(objset_t *os); -int dmu_fsname(const char *snapname, char *buf); - -void dmu_objset_evict_done(objset_t *os); -void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); - -void dmu_objset_init(void); -void dmu_objset_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_OBJSET_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h deleted file mode 100644 index 1f4b1f2cde9f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#ifndef _DMU_SEND_H -#define _DMU_SEND_H - -#include - -struct vnode; -struct dsl_dataset; -struct drr_begin; -struct avl_tree; -struct dmu_replay_record; - -extern const char *recv_clone_name; - -int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, -#ifdef illumos - struct vnode *vp, offset_t *off); -#else - struct file *fp, offset_t *off); -#endif -int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, - boolean_t stream_compressed, uint64_t *sizep); -int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, - boolean_t stream_compressed, uint64_t *sizep); -int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, -#ifdef illumos - int outfd, struct vnode *vp, offset_t *off); -#else - int outfd, struct file *fp, offset_t *off); -#endif - -typedef struct dmu_recv_cookie { - struct dsl_dataset *drc_ds; - struct dmu_replay_record *drc_drr_begin; - struct drr_begin *drc_drrb; - const char *drc_tofs; - const char *drc_tosnap; - boolean_t drc_newfs; - boolean_t drc_byteswap; - boolean_t drc_force; - boolean_t drc_resumable; - boolean_t 
drc_clone; - struct avl_tree *drc_guid_to_ds_map; - zio_cksum_t drc_cksum; - uint64_t drc_newsnapobj; - void *drc_owner; - cred_t *drc_cred; -} dmu_recv_cookie_t; - -int dmu_recv_begin(char *tofs, char *tosnap, - struct dmu_replay_record *drr_begin, - boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc); -#ifdef illumos -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, -#else -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, -#endif - int cleanup_fd, uint64_t *action_handlep); -int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); -boolean_t dmu_objset_is_receiving(objset_t *os); - -#endif /* _DMU_SEND_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h deleted file mode 100644 index c010edd440d9..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_DMU_TRAVERSE_H -#define _SYS_DMU_TRAVERSE_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dnode_phys; -struct dsl_dataset; -struct zilog; -struct arc_buf; - -typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg); - -#define TRAVERSE_PRE (1<<0) -#define TRAVERSE_POST (1<<1) -#define TRAVERSE_PREFETCH_METADATA (1<<2) -#define TRAVERSE_PREFETCH_DATA (1<<3) -#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) -#define TRAVERSE_HARD (1<<4) - -/* Special traverse error return value to indicate skipping of children */ -#define TRAVERSE_VISIT_NO_CHILDREN -1 - -int traverse_dataset(struct dsl_dataset *ds, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); -int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start, - zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); -int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, - uint64_t txg_start, zbookmark_phys_t *resume, int flags, - blkptr_cb_t func, void *arg); -int traverse_pool(spa_t *spa, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h deleted file mode 100644 index 82b8946e5f6d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DMU_TX_H -#define _SYS_DMU_TX_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dmu_buf_impl; -struct dmu_tx_hold; -struct dnode_link; -struct dsl_pool; -struct dnode; -struct dsl_dir; - -struct dmu_tx { - /* - * No synchronization is needed because a tx can only be handled - * by one thread. - */ - list_t tx_holds; /* list of dmu_tx_hold_t */ - objset_t *tx_objset; - struct dsl_dir *tx_dir; - struct dsl_pool *tx_pool; - uint64_t tx_txg; - uint64_t tx_lastsnap_txg; - uint64_t tx_lasttried_txg; - txg_handle_t tx_txgh; - void *tx_tempreserve_cookie; - struct dmu_tx_hold *tx_needassign_txh; - - /* list of dmu_tx_callback_t on this dmu_tx */ - list_t tx_callbacks; - - /* placeholder for syncing context, doesn't need specific holds */ - boolean_t tx_anyobj; - - /* transaction is marked as being a "net free" of space */ - boolean_t tx_netfree; - - /* time this transaction was created */ - hrtime_t tx_start; - - /* need to wait for sufficient dirty space */ - boolean_t tx_wait_dirty; - - /* has this transaction already been delayed? 
*/ - boolean_t tx_dirty_delayed; - - int tx_err; -}; - -enum dmu_tx_hold_type { - THT_NEWOBJECT, - THT_WRITE, - THT_BONUS, - THT_FREE, - THT_ZAP, - THT_SPACE, - THT_SPILL, - THT_NUMTYPES -}; - -typedef struct dmu_tx_hold { - dmu_tx_t *txh_tx; - list_node_t txh_node; - struct dnode *txh_dnode; - zfs_refcount_t txh_space_towrite; - zfs_refcount_t txh_memory_tohold; - enum dmu_tx_hold_type txh_type; - uint64_t txh_arg1; - uint64_t txh_arg2; -} dmu_tx_hold_t; - -typedef struct dmu_tx_callback { - list_node_t dcb_node; /* linked to tx_callbacks list */ - dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ - void *dcb_data; /* caller private data */ -} dmu_tx_callback_t; - -/* - * These routines are defined in dmu.h, and are called by the user. - */ -dmu_tx_t *dmu_tx_create(objset_t *dd); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -void dmu_tx_commit(dmu_tx_t *tx); -void dmu_tx_abort(dmu_tx_t *tx); -uint64_t dmu_tx_get_txg(dmu_tx_t *tx); -struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); -void dmu_tx_wait(dmu_tx_t *tx); - -void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, - void *dcb_data); -void dmu_tx_do_callbacks(list_t *cb_list, int error); - -/* - * These routines are defined in dmu_spa.h, and are called by the SPA. - */ -extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); - -/* - * These routines are only called by the DMU. 
- */ -dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); -int dmu_tx_is_syncing(dmu_tx_t *tx); -int dmu_tx_private_ok(dmu_tx_t *tx); -void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn); -void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); -void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); - -#ifdef ZFS_DEBUG -#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) -#else -#define DMU_TX_DIRTY_BUF(tx, db) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_TX_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h deleted file mode 100644 index 21a3ff3a2032..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2014 by Delphix. All rights reserved. 
- */ - -#ifndef _DMU_ZFETCH_H -#define _DMU_ZFETCH_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern uint64_t zfetch_array_rd_sz; - -struct dnode; /* so we can reference dnode */ - -typedef struct zstream { - uint64_t zs_blkid; /* expect next access at this blkid */ - uint64_t zs_pf_blkid; /* next block to prefetch */ - - /* - * We will next prefetch the L1 indirect block of this level-0 - * block id. - */ - uint64_t zs_ipf_blkid; - - kmutex_t zs_lock; /* protects stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - list_node_t zs_node; /* link for zf_stream */ -} zstream_t; - -typedef struct zfetch { - krwlock_t zf_rwlock; /* protects zfetch structure */ - list_t zf_stream; /* list of zstream_t's */ - struct dnode *zf_dnode; /* dnode that owns this zfetch */ -} zfetch_t; - -void zfetch_init(void); -void zfetch_fini(void); - -void dmu_zfetch_init(zfetch_t *, struct dnode *); -void dmu_zfetch_fini(zfetch_t *); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t); - - -#ifdef __cplusplus -} -#endif - -#endif /* _DMU_ZFETCH_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h deleted file mode 100644 index b1a8547013c5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ /dev/null @@ -1,599 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -#ifndef _SYS_DNODE_H -#define _SYS_DNODE_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * dnode_hold() flags. - */ -#define DNODE_MUST_BE_ALLOCATED 1 -#define DNODE_MUST_BE_FREE 2 - -/* - * dnode_next_offset() flags. - */ -#define DNODE_FIND_HOLE 1 -#define DNODE_FIND_BACKWARDS 2 -#define DNODE_FIND_HAVELOCK 4 - -/* - * Fixed constants. - */ -#define DNODE_SHIFT 9 /* 512 bytes */ -#define DN_MIN_INDBLKSHIFT 12 /* 4k */ -/* - * If we ever increase this value beyond 20, we need to revisit all logic that - * does x << level * ebps to handle overflow. With a 1M indirect block size, - * 4 levels of indirect blocks would not be able to guarantee addressing an - * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65. - */ -#define DN_MAX_INDBLKSHIFT 17 /* 128k */ -#define DNODE_BLOCK_SHIFT 14 /* 16k */ -#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ -#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ -#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ - -/* - * dnode id flags - * - * Note: a file will never ever have its - * ids moved from bonus->spill - * and only in a crypto environment would it be on spill - */ -#define DN_ID_CHKED_BONUS 0x1 -#define DN_ID_CHKED_SPILL 0x2 -#define DN_ID_OLD_EXIST 0x4 -#define DN_ID_NEW_EXIST 0x8 - -/* - * Derived constants. 
- */ -#define DNODE_MIN_SIZE (1 << DNODE_SHIFT) -#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT) -#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT) -#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT) -#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT) -#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \ - (1 << SPA_BLKPTRSHIFT)) -#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) -#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) -#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) -#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) -#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) -#define DN_KILL_SPILLBLK (1) - -#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */ -#define DN_SLOT_FREE ((void *)1UL) /* Free slot */ -#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */ -#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */ -#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR) -#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL) - -#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) -#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) - -/* - * This is inaccurate if the indblkshift of the particular object is not the - * max. But it's only used by userland to calculate the zvol reservation. - */ -#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) -#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) - -/* The +2 here is a cheesy way to round up */ -#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ - (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) - -#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ - (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) -#define DN_MAX_BONUS_LEN(dnp) \ - ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 
\ - (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \ - (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp)) - -#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ - (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) - -#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) - -struct dmu_buf_impl; -struct objset; -struct zio; - -enum dnode_dirtycontext { - DN_UNDIRTIED, - DN_DIRTY_OPEN, - DN_DIRTY_SYNC -}; - -/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ -#define DNODE_FLAG_USED_BYTES (1<<0) -#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) - -/* Does dnode have a SA spill blkptr in bonus? */ -#define DNODE_FLAG_SPILL_BLKPTR (1<<2) - -/* - * VARIABLE-LENGTH (LARGE) DNODES - * - * The motivation for variable-length dnodes is to eliminate the overhead - * associated with using spill blocks. Spill blocks are used to store - * system attribute data (i.e. file metadata) that does not fit in the - * dnode's bonus buffer. By allowing a larger bonus buffer area the use of - * a spill block can be avoided. Spill blocks potentially incur an - * additional read I/O for every dnode in a dnode block. As a worst case - * example, reading 32 dnodes from a 16k dnode block and all of the spill - * blocks could issue 33 separate reads. Now suppose those dnodes have size - * 1024 and therefore don't need spill blocks. Then the worst case number - * of blocks read is reduced to from 33 to two--one per dnode block. - * - * ZFS-on-Linux systems that make heavy use of extended attributes benefit - * from this feature. In particular, ZFS-on-Linux supports the xattr=sa - * dataset property which allows file extended attribute data to be stored - * in the dnode bonus buffer as an alternative to the traditional - * directory-based format. Workloads such as SELinux and the Lustre - * distributed filesystem often store enough xattr data to force spill - * blocks when xattr=sa is in effect. 
Large dnodes may therefore provide a - * performance benefit to such systems. Other use cases that benefit from - * this feature include files with large ACLs and symbolic links with long - * target names. - * - * The size of a dnode may be a multiple of 512 bytes up to the size of a - * dnode block (currently 16384 bytes). The dn_extra_slots field of the - * on-disk dnode_phys_t structure describes the size of the physical dnode - * on disk. The field represents how many "extra" dnode_phys_t slots a - * dnode consumes in its dnode block. This convention results in a value of - * 0 for 512 byte dnodes which preserves on-disk format compatibility with - * older software which doesn't support large dnodes. - * - * Similarly, the in-memory dnode_t structure has a dn_num_slots field - * to represent the total number of dnode_phys_t slots consumed on disk. - * Thus dn->dn_num_slots is 1 greater than the corresponding - * dnp->dn_extra_slots. This difference in convention was adopted - * because, unlike on-disk structures, backward compatibility is not a - * concern for in-memory objects, so we used a more natural way to - * represent size for a dnode_t. - * - * The default size for newly created dnodes is determined by the value of - * the "dnodesize" dataset property. By default the property is set to - * "legacy" which is compatible with older software. Setting the property - * to "auto" will allow the filesystem to choose the most suitable dnode - * size. Currently this just sets the default dnode size to 1k, but future - * code improvements could dynamically choose a size based on observed - * workload patterns. Dnodes of varying sizes can coexist within the same - * dataset and even within the same dnode block. 
- */ - -typedef struct dnode_phys { - uint8_t dn_type; /* dmu_object_type_t */ - uint8_t dn_indblkshift; /* ln2(indirect block size) */ - uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ - uint8_t dn_nblkptr; /* length of dn_blkptr */ - uint8_t dn_bonustype; /* type of data in bonus buffer */ - uint8_t dn_checksum; /* ZIO_CHECKSUM type */ - uint8_t dn_compress; /* ZIO_COMPRESS type */ - uint8_t dn_flags; /* DNODE_FLAG_* */ - uint16_t dn_datablkszsec; /* data block size in 512b sectors */ - uint16_t dn_bonuslen; /* length of dn_bonus */ - uint8_t dn_extra_slots; /* # of subsequent slots consumed */ - uint8_t dn_pad2[3]; - - /* accounting is protected by dn_dirty_mtx */ - uint64_t dn_maxblkid; /* largest allocated block ID */ - uint64_t dn_used; /* bytes (or sectors) of disk space */ - - /* - * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This - * allows us to protect any fields that might be added here in the - * future. In either case, developers will want to check - * zio_crypt_init_uios_dnode() to ensure the new field is being - * protected properly. - */ - uint64_t dn_pad3[4]; - union { - blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; - struct { - blkptr_t __dn_ignore1; - uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN]; - }; - struct { - blkptr_t __dn_ignore2; - uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN - - sizeof (blkptr_t)]; - blkptr_t dn_spill; - }; - }; -} dnode_phys_t; - -#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \ - (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)) - -struct dnode { - /* - * Protects the structure of the dnode, including the number of levels - * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* - */ - krwlock_t dn_struct_rwlock; - - /* Our link on dn_objset->os_dnodes list; protected by os_lock. 
*/ - list_node_t dn_link; - - /* immutable: */ - struct objset *dn_objset; - uint64_t dn_object; - struct dmu_buf_impl *dn_dbuf; - struct dnode_handle *dn_handle; - dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ - - /* - * Copies of stuff in dn_phys. They're valid in the open - * context (eg. even before the dnode is first synced). - * Where necessary, these are protected by dn_struct_rwlock. - */ - dmu_object_type_t dn_type; /* object type */ - uint16_t dn_bonuslen; /* bonus length */ - uint8_t dn_bonustype; /* bonus type */ - uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ - uint8_t dn_checksum; /* ZIO_CHECKSUM type */ - uint8_t dn_compress; /* ZIO_COMPRESS type */ - uint8_t dn_nlevels; - uint8_t dn_indblkshift; - uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ - uint8_t dn_moved; /* Has this dnode been moved? */ - uint16_t dn_datablkszsec; /* in 512b sectors */ - uint32_t dn_datablksz; /* in bytes */ - uint64_t dn_maxblkid; - uint8_t dn_next_type[TXG_SIZE]; - uint8_t dn_num_slots; /* metadnode slots consumed on disk */ - uint8_t dn_next_nblkptr[TXG_SIZE]; - uint8_t dn_next_nlevels[TXG_SIZE]; - uint8_t dn_next_indblkshift[TXG_SIZE]; - uint8_t dn_next_bonustype[TXG_SIZE]; - uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ - uint16_t dn_next_bonuslen[TXG_SIZE]; - uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ - - /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ - uint32_t dn_dbufs_count; /* count of dn_dbufs */ - - /* protected by os_lock: */ - multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ - - /* protected by dn_mtx: */ - kmutex_t dn_mtx; - list_t dn_dirty_records[TXG_SIZE]; - struct range_tree *dn_free_ranges[TXG_SIZE]; - uint64_t dn_allocated_txg; - uint64_t dn_free_txg; - uint64_t dn_assigned_txg; - uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ - kcondvar_t dn_notxholds; - enum dnode_dirtycontext dn_dirtyctx; - uint8_t 
*dn_dirtyctx_firstset; /* dbg: contents meaningless */ - - /* protected by own devices */ - zfs_refcount_t dn_tx_holds; - zfs_refcount_t dn_holds; - - kmutex_t dn_dbufs_mtx; - /* - * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs - * can contain multiple dbufs of the same (level, blkid) when a - * dbuf is marked DB_EVICTING without being removed from - * dn_dbufs. To maintain the avl invariant that there cannot be - * duplicate entries, we order the dbufs by an arbitrary value - - * their address in memory. This means that dn_dbufs cannot be used to - * directly look up a dbuf. Instead, callers must use avl_walk, have - * a reference to the dbuf, or look up a non-existant node with - * db_state = DB_SEARCH (see dbuf_free_range for an example). - */ - avl_tree_t dn_dbufs; - - /* protected by dn_struct_rwlock */ - struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ - - boolean_t dn_have_spill; /* have spill or are spilling */ - - /* parent IO for current sync write */ - zio_t *dn_zio; - - /* used in syncing context */ - uint64_t dn_oldused; /* old phys used bytes */ - uint64_t dn_oldflags; /* old phys dn_flags */ - uint64_t dn_olduid, dn_oldgid; - uint64_t dn_newuid, dn_newgid; - int dn_id_flags; - - /* holds prefetch structure */ - struct zfetch dn_zfetch; -}; - -/* - * Since AVL already has embedded element counter, use dn_dbufs_count - * only for dbufs not counted there (bonus buffers) and just add them. - */ -#define DN_DBUFS_COUNT(dn) ((dn)->dn_dbufs_count + \ - avl_numnodes(&(dn)->dn_dbufs)) - -/* - * Adds a level of indirection between the dbuf and the dnode to avoid - * iterating descendent dbufs in dnode_move(). Handles are not allocated - * individually, but as an array of child dnodes in dnode_hold_impl(). - */ -typedef struct dnode_handle { - /* Protects dnh_dnode from modification by dnode_move(). 
*/ - zrlock_t dnh_zrlock; - dnode_t *dnh_dnode; -} dnode_handle_t; - -typedef struct dnode_children { - dmu_buf_user_t dnc_dbu; /* User evict data */ - size_t dnc_count; /* number of children */ - dnode_handle_t dnc_children[]; /* sized dynamically */ -} dnode_children_t; - -typedef struct free_range { - avl_node_t fr_node; - uint64_t fr_blkid; - uint64_t fr_nblks; -} free_range_t; - -void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, - uint64_t object, dnode_handle_t *dnh); -void dnode_special_close(dnode_handle_t *dnh); - -void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); -void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); -void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); - -int dnode_hold(struct objset *dd, uint64_t object, - void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, - void *ref, dnode_t **dnp); -boolean_t dnode_add_ref(dnode_t *dn, void *ref); -void dnode_rele(dnode_t *dn, void *ref); -void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); -void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); -void dnode_sync(dnode_t *dn, dmu_tx_t *tx); -void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); -void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); -void dnode_free(dnode_t *dn, dmu_tx_t *tx); -void dnode_byteswap(dnode_phys_t *dnp); -void dnode_buf_byteswap(void *buf, size_t size); -void dnode_verify(dnode_t *dn); -int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); -void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); -void dnode_diduse_space(dnode_t *dn, int64_t space); -void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); -uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); -void 
dnode_init(void); -void dnode_fini(void); -int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, - int minlvl, uint64_t blkfill, uint64_t txg); -void dnode_evict_dbufs(dnode_t *dn); -void dnode_evict_bonus(dnode_t *dn); -void dnode_free_interior_slots(dnode_t *dn); -boolean_t dnode_needs_remap(const dnode_t *dn); - -#define DNODE_IS_DIRTY(_dn) \ - ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) - -#define DNODE_IS_CACHEABLE(_dn) \ - ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (DMU_OT_IS_METADATA((_dn)->dn_type) && \ - (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)) - -#define DNODE_META_IS_CACHEABLE(_dn) \ - ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) - -/* - * Used for dnodestats kstat. - */ -typedef struct dnode_stats { - /* - * Number of failed attempts to hold a meta dnode dbuf. - */ - kstat_named_t dnode_hold_dbuf_hold; - /* - * Number of failed attempts to read a meta dnode dbuf. - */ - kstat_named_t dnode_hold_dbuf_read; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able - * to hold the requested object number which was allocated. This is - * the common case when looking up any allocated object number. - */ - kstat_named_t dnode_hold_alloc_hits; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not - * able to hold the request object number because it was not allocated. - */ - kstat_named_t dnode_hold_alloc_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not - * able to hold the request object number because the object number - * refers to an interior large dnode slot. - */ - kstat_named_t dnode_hold_alloc_interior; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed - * to retry acquiring slot zrl locks due to contention. 
- */ - kstat_named_t dnode_hold_alloc_lock_retry; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not - * need to create the dnode because another thread did so after - * dropping the read lock but before acquiring the write lock. - */ - kstat_named_t dnode_hold_alloc_lock_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found - * a free dnode instantiated by dnode_create() but not yet allocated - * by dnode_allocate(). - */ - kstat_named_t dnode_hold_alloc_type_none; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able - * to hold the requested range of free dnode slots. - */ - kstat_named_t dnode_hold_free_hits; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not - * able to hold the requested range of free dnode slots because - * at least one slot was allocated. - */ - kstat_named_t dnode_hold_free_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not - * able to hold the requested range of free dnode slots because - * after acquiring the zrl lock at least one slot was allocated. - */ - kstat_named_t dnode_hold_free_lock_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed - * to retry acquiring slot zrl locks due to contention. - */ - kstat_named_t dnode_hold_free_lock_retry; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested - * a range of dnode slots which were held by another thread. - */ - kstat_named_t dnode_hold_free_refcount; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested - * a range of dnode slots which would overflow the dnode_phys_t. - */ - kstat_named_t dnode_hold_free_overflow; - /* - * Number of times a dnode_hold(...) was attempted on a dnode - * which had already been unlinked in an earlier txg. - */ - kstat_named_t dnode_hold_free_txg; - /* - * Number of times dnode_free_interior_slots() needed to retry - * acquiring a slot zrl lock due to contention. 
- */ - kstat_named_t dnode_free_interior_lock_retry; - /* - * Number of new dnodes allocated by dnode_allocate(). - */ - kstat_named_t dnode_allocate; - /* - * Number of dnodes re-allocated by dnode_reallocate(). - */ - kstat_named_t dnode_reallocate; - /* - * Number of meta dnode dbufs evicted. - */ - kstat_named_t dnode_buf_evict; - /* - * Number of times dmu_object_alloc*() reached the end of the existing - * object ID chunk and advanced to a new one. - */ - kstat_named_t dnode_alloc_next_chunk; - /* - * Number of times multiple threads attempted to allocate a dnode - * from the same block of free dnodes. - */ - kstat_named_t dnode_alloc_race; - /* - * Number of times dmu_object_alloc*() was forced to advance to the - * next meta dnode dbuf due to an error from dmu_object_next(). - */ - kstat_named_t dnode_alloc_next_block; - /* - * Statistics for tracking dnodes which have been moved. - */ - kstat_named_t dnode_move_invalid; - kstat_named_t dnode_move_recheck1; - kstat_named_t dnode_move_recheck2; - kstat_named_t dnode_move_special; - kstat_named_t dnode_move_handle; - kstat_named_t dnode_move_rwlock; - kstat_named_t dnode_move_active; -} dnode_stats_t; - -extern dnode_stats_t dnode_stats; - -#define DNODE_STAT_INCR(stat, val) \ - atomic_add_64(&dnode_stats.stat.value.ui64, (val)); -#define DNODE_STAT_BUMP(stat) \ - DNODE_STAT_INCR(stat, 1); - -#ifdef ZFS_DEBUG - -/* - * There should be a ## between the string literal and fmt, to make it - * clear that we're joining two strings together, but that piece of shit - * gcc doesn't support that preprocessor token. - */ -#define dprintf_dnode(dn, fmt, ...) 
do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char __db_buf[32]; \ - uint64_t __db_obj = (dn)->dn_object; \ - if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ - else \ - (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ - (u_longlong_t)__db_obj);\ - dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ - __db_buf, __VA_ARGS__); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define DNODE_VERIFY(dn) dnode_verify(dn) -#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) - -#else - -#define dprintf_dnode(db, fmt, ...) -#define DNODE_VERIFY(dn) -#define FREE_VERIFY(db, start, end, tx) - -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DNODE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h deleted file mode 100644 index e4d9ec2be033..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DSL_BOOKMARK_H -#define _SYS_DSL_BOOKMARK_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; -struct dsl_dataset; - -/* - * On disk zap object. 
- */ -typedef struct zfs_bookmark_phys { - uint64_t zbm_guid; /* guid of bookmarked dataset */ - uint64_t zbm_creation_txg; /* birth transaction group */ - uint64_t zbm_creation_time; /* bookmark creation time */ -} zfs_bookmark_phys_t; - -int dsl_bookmark_create(nvlist_t *, nvlist_t *); -int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *); -int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *); -int dsl_bookmark_destroy(nvlist_t *, nvlist_t *); -int dsl_bookmark_rename(const char *fs, const char *from, const char *to); -int dsl_bookmark_lookup(struct dsl_pool *, const char *, - struct dsl_dataset *, zfs_bookmark_phys_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_BOOKMARK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h deleted file mode 100644 index 064ff617fd2e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ /dev/null @@ -1,457 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. 
All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#ifndef _SYS_DSL_DATASET_H -#define _SYS_DSL_DATASET_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; -struct dsl_dir; -struct dsl_pool; - -#define DS_FLAG_INCONSISTENT (1ULL<<0) -#define DS_IS_INCONSISTENT(ds) \ - (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) - -/* - * Do not allow this dataset to be promoted. - */ -#define DS_FLAG_NOPROMOTE (1ULL<<1) - -/* - * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly - * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, - * refquota/refreservations). - */ -#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) - -/* - * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called - * on a dataset. This allows the dataset to be destroyed using 'zfs release'. - */ -#define DS_FLAG_DEFER_DESTROY (1ULL<<3) -#define DS_IS_DEFER_DESTROY(ds) \ - (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY) - -/* - * DS_FIELD_* are strings that are used in the "extensified" dataset zap object. - * They should be of the format :. - */ - -/* - * This field's value is the object ID of a zap object which contains the - * bookmarks of this dataset. If it is present, then this dataset is counted - * in the refcount of the SPA_FEATURES_BOOKMARKS feature. - */ -#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks" - -/* - * This field is present (with value=0) if this dataset may contain large - * dnodes (>512B). If it is present, then this dataset is counted in the - * refcount of the SPA_FEATURE_LARGE_DNODE feature. 
- */ -#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode" - -/* - * These fields are set on datasets that are in the middle of a resumable - * receive, and allow the sender to resume the send if it is interrupted. - */ -#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid" -#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname" -#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid" -#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" -#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" -#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" -#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok" -#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" -#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" - -/* - * This field is set to the object number of the remap deadlist if one exists. - */ -#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" - -/* - * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose - * name lookups should be performed case-insensitively. - */ -#define DS_FLAG_CI_DATASET (1ULL<<16) - -#define DS_CREATE_FLAG_NODIRTY (1ULL<<24) - -typedef struct dsl_dataset_phys { - uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ - uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ - uint64_t ds_prev_snap_txg; - uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ - uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ - uint64_t ds_num_children; /* clone/snap children; ==0 for head */ - uint64_t ds_creation_time; /* seconds since 1970 */ - uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ - /* - * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes - * include all blocks referenced by this dataset, including those - * shared with any other datasets. 
- */ - uint64_t ds_referenced_bytes; - uint64_t ds_compressed_bytes; - uint64_t ds_uncompressed_bytes; - uint64_t ds_unique_bytes; /* only relevant to snapshots */ - /* - * The ds_fsid_guid is a 56-bit ID that can change to avoid - * collisions. The ds_guid is a 64-bit ID that will never - * change, so there is a small probability that it will collide. - */ - uint64_t ds_fsid_guid; - uint64_t ds_guid; - uint64_t ds_flags; /* DS_FLAG_* */ - blkptr_t ds_bp; - uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ - uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ - uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ - uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ -} dsl_dataset_phys_t; - -typedef struct dsl_dataset { - dmu_buf_user_t ds_dbu; - rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */ - - /* Immutable: */ - struct dsl_dir *ds_dir; - dmu_buf_t *ds_dbuf; - uint64_t ds_object; - uint64_t ds_fsid_guid; - boolean_t ds_is_snapshot; - - /* only used in syncing context, only valid for non-snapshots: */ - struct dsl_dataset *ds_prev; - uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */ - - /* has internal locking: */ - dsl_deadlist_t ds_deadlist; - bplist_t ds_pending_deadlist; - - /* - * The remap deadlist contains blocks (DVA's, really) that are - * referenced by the previous snapshot and point to indirect vdevs, - * but in this dataset they have been remapped to point to concrete - * (or at least, less-indirect) vdevs. In other words, the - * physical DVA is referenced by the previous snapshot but not by - * this dataset. Logically, the DVA continues to be referenced, - * but we are using a different (less indirect) physical DVA. - * This deadlist is used to determine when physical DVAs that - * point to indirect vdevs are no longer referenced anywhere, - * and thus should be marked obsolete. - * - * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled. 
- */ - dsl_deadlist_t ds_remap_deadlist; - /* protects creation of the ds_remap_deadlist */ - kmutex_t ds_remap_deadlist_lock; - - /* protected by lock on pool's dp_dirty_datasets list */ - txg_node_t ds_dirty_link; - list_node_t ds_synced_link; - - /* - * ds_phys->ds_ is also protected by ds_lock. - * Protected by ds_lock: - */ - kmutex_t ds_lock; - objset_t *ds_objset; - uint64_t ds_userrefs; - void *ds_owner; - - /* - * Long holds prevent the ds from being destroyed; they allow the - * ds to remain held even after dropping the dp_config_rwlock. - * Owning counts as a long hold. See the comments above - * dsl_pool_hold() for details. - */ - zfs_refcount_t ds_longholds; - - /* no locking; only for making guesses */ - uint64_t ds_trysnap_txg; - - /* for objset_open() */ - kmutex_t ds_opening_lock; - - uint64_t ds_reserved; /* cached refreservation */ - uint64_t ds_quota; /* cached refquota */ - - kmutex_t ds_sendstream_lock; - list_t ds_sendstreams; - - /* - * When in the middle of a resumable receive, tracks how much - * progress we have made. - */ - uint64_t ds_resume_object[TXG_SIZE]; - uint64_t ds_resume_offset[TXG_SIZE]; - uint64_t ds_resume_bytes[TXG_SIZE]; - - /* Protected by our dsl_dir's dd_lock */ - list_t ds_prop_cbs; - - /* - * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset - * uses this feature. - */ - uint8_t ds_feature_inuse[SPA_FEATURES]; - - /* - * Set if we need to activate the feature on this dataset this txg - * (used only in syncing context). 
- */ - uint8_t ds_feature_activation_needed[SPA_FEATURES]; - - /* Protected by ds_lock; keep at end of struct for better locality */ - char ds_snapname[ZFS_MAX_DATASET_NAME_LEN]; -} dsl_dataset_t; - -inline dsl_dataset_phys_t * -dsl_dataset_phys(dsl_dataset_t *ds) -{ - return (ds->ds_dbuf->db_data); -} - -typedef struct dsl_dataset_promote_arg { - const char *ddpa_clonename; - dsl_dataset_t *ddpa_clone; - list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin; /* origin of the origin */ - uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; - nvlist_t *err_ds; - cred_t *cr; -} dsl_dataset_promote_arg_t; - -typedef struct dsl_dataset_rollback_arg { - const char *ddra_fsname; - const char *ddra_tosnap; - void *ddra_owner; - nvlist_t *ddra_result; -} dsl_dataset_rollback_arg_t; - -typedef struct dsl_dataset_snapshot_arg { - nvlist_t *ddsa_snaps; - nvlist_t *ddsa_props; - nvlist_t *ddsa_errors; - cred_t *ddsa_cr; -} dsl_dataset_snapshot_arg_t; - -/* - * The max length of a temporary tag prefix is the number of hex digits - * required to express UINT64_MAX plus one for the hyphen. 
- */ -#define MAX_TAG_PREFIX_LEN 17 - -#define dsl_dataset_is_snapshot(ds) \ - (dsl_dataset_phys(ds)->ds_num_children != 0) - -#define DS_UNIQUE_IS_ACCURATE(ds) \ - ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) - -int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, - dsl_dataset_t **dsp); -boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, - void *tag); -int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **); -void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -int dsl_dataset_own(struct dsl_pool *dp, const char *name, - void *tag, dsl_dataset_t **dsp); -int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **dsp); -void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); -void dsl_dataset_name(dsl_dataset_t *ds, char *name); -boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); -int dsl_dataset_namelen(dsl_dataset_t *ds); -boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); -uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, - dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); -uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx); -void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx); -int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx); -int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); -void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx); -int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx); -int dsl_dataset_promote(const char *name, char *conflsnap); -int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, - boolean_t force); -int dsl_dataset_rename_snapshot(const char *fsname, - const char *oldsnapname, const char *newsnapname, boolean_t recursive); -int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, - minor_t cleanup_minor, const char *htag); - -blkptr_t 
*dsl_dataset_get_blkptr(dsl_dataset_t *ds); - -spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); - -boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, - dsl_dataset_t *snap); - -void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); -void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx); - -void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, - dmu_tx_t *tx); -int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, - dmu_tx_t *tx, boolean_t async); -void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, - uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx); - -void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); - -int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val); -char *get_receive_resume_stats_impl(dsl_dataset_t *ds); -char *get_child_receive_stats(dsl_dataset_t *ds); -uint64_t dsl_get_refratio(dsl_dataset_t *ds); -uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds); -uint64_t dsl_get_compressratio(dsl_dataset_t *ds); -uint64_t dsl_get_used(dsl_dataset_t *ds); -uint64_t dsl_get_creation(dsl_dataset_t *ds); -uint64_t dsl_get_creationtxg(dsl_dataset_t *ds); -uint64_t dsl_get_refquota(dsl_dataset_t *ds); -uint64_t dsl_get_refreservation(dsl_dataset_t *ds); -uint64_t dsl_get_guid(dsl_dataset_t *ds); -uint64_t dsl_get_unique(dsl_dataset_t *ds); -uint64_t dsl_get_objsetid(dsl_dataset_t *ds); -uint64_t dsl_get_userrefs(dsl_dataset_t *ds); -uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds); -uint64_t dsl_get_referenced(dsl_dataset_t *ds); -uint64_t dsl_get_numclones(dsl_dataset_t *ds); -uint64_t dsl_get_inconsistent(dsl_dataset_t *ds); -uint64_t dsl_get_available(dsl_dataset_t *ds); -int dsl_get_written(dsl_dataset_t *ds, uint64_t *written); -int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap); -int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, - char *source); - -void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv); - -void dsl_dataset_stats(dsl_dataset_t 
*os, nvlist_t *nv); - -void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); -void dsl_dataset_space(dsl_dataset_t *ds, - uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); -uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); -int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds); - -int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); - -int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t asize, uint64_t inflight, uint64_t *used, - uint64_t *ref_rsrv); -int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, - uint64_t quota); -int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, - uint64_t reservation); - -boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, - uint64_t earlier_txg); -void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); -void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); -boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); - -int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); -void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, dmu_tx_t *tx); -int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); -void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx); - -void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, - dmu_tx_t *tx); -void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); -int dsl_dataset_get_snapname(dsl_dataset_t *ds); -int 
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, - uint64_t *value); -int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, - boolean_t adj_cnt); -void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, - zprop_source_t source, uint64_t value, dmu_tx_t *tx); -void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); -boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds); -boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds); - -int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx); -void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); -int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, - nvlist_t *result); - -uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); -void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); -boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); -void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); - -void dsl_dataset_deactivate_feature(uint64_t dsobj, - spa_feature_t f, dmu_tx_t *tx); - -#ifdef ZFS_DEBUG -#define dprintf_ds(ds, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \ - dsl_dataset_name(ds, __ds_name); \ - dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ - kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_ds(dd, fmt, ...) 
-#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DATASET_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h deleted file mode 100644 index 08f38233d7ab..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_DSL_DEADLIST_H -#define _SYS_DSL_DEADLIST_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dmu_buf; -struct dsl_dataset; - -typedef struct dsl_deadlist_phys { - uint64_t dl_used; - uint64_t dl_comp; - uint64_t dl_uncomp; - uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ -} dsl_deadlist_phys_t; - -typedef struct dsl_deadlist { - objset_t *dl_os; - uint64_t dl_object; - avl_tree_t dl_tree; - boolean_t dl_havetree; - struct dmu_buf *dl_dbuf; - dsl_deadlist_phys_t *dl_phys; - kmutex_t dl_lock; - - /* if it's the old on-disk format: */ - bpobj_t dl_bpobj; - boolean_t dl_oldfmt; -} dsl_deadlist_t; - -typedef struct dsl_deadlist_entry { - avl_node_t dle_node; - uint64_t dle_mintxg; - bpobj_t dle_bpobj; -} dsl_deadlist_entry_t; - -void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); -void dsl_deadlist_close(dsl_deadlist_t *dl); -uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); -void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); -void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); -void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); -void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); -uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, - uint64_t mrs_obj, dmu_tx_t *tx); -void dsl_deadlist_space(dsl_deadlist_t *dl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -void dsl_deadlist_space_range(dsl_deadlist_t *dl, - uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); -void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, - dmu_tx_t *tx); -boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DEADLIST_H */ diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h deleted file mode 100644 index 6fb6a121ade6..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_DSL_DELEG_H -#define _SYS_DSL_DELEG_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_DELEG_PERM_NONE "" -#define ZFS_DELEG_PERM_CREATE "create" -#define ZFS_DELEG_PERM_DESTROY "destroy" -#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" -#define ZFS_DELEG_PERM_ROLLBACK "rollback" -#define ZFS_DELEG_PERM_CLONE "clone" -#define ZFS_DELEG_PERM_PROMOTE "promote" -#define ZFS_DELEG_PERM_RENAME "rename" -#define ZFS_DELEG_PERM_MOUNT "mount" -#define ZFS_DELEG_PERM_SHARE "share" -#define ZFS_DELEG_PERM_SEND "send" -#define ZFS_DELEG_PERM_RECEIVE "receive" -#define ZFS_DELEG_PERM_ALLOW "allow" -#define ZFS_DELEG_PERM_USERPROP "userprop" -#define ZFS_DELEG_PERM_VSCAN "vscan" -#define ZFS_DELEG_PERM_USERQUOTA "userquota" -#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" -#define ZFS_DELEG_PERM_USERUSED "userused" -#define ZFS_DELEG_PERM_GROUPUSED "groupused" -#define ZFS_DELEG_PERM_HOLD "hold" -#define ZFS_DELEG_PERM_RELEASE "release" -#define ZFS_DELEG_PERM_DIFF "diff" -#define ZFS_DELEG_PERM_BOOKMARK "bookmark" -#define ZFS_DELEG_PERM_REMAP "remap" - -/* - * Note: the names of properties that are marked delegatable are also - * valid delegated permissions - */ - -int dsl_deleg_get(const char *ddname, nvlist_t **nvp); -int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); -int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); -int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); -void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); -int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); -int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); -int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); -boolean_t dsl_delegation_on(objset_t *os); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DELEG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h deleted file mode 100644 index ae3ca0cfbd5e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
- */ - -#ifndef _SYS_DSL_DESTROY_H -#define _SYS_DSL_DESTROY_H - -#ifdef __cplusplus -extern "C" { -#endif - -struct nvlist; -struct dsl_dataset; -struct dmu_tx; - -int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, - struct nvlist *); -int dsl_destroy_snapshot(const char *, boolean_t); -int dsl_destroy_head(const char *); -int dsl_destroy_head_check_impl(struct dsl_dataset *, int); -void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *); -int dsl_destroy_inconsistent(const char *, void *); -int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t); -void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *, - boolean_t, struct dmu_tx *); - -typedef struct dsl_destroy_snapshot_arg { - const char *ddsa_name; - boolean_t ddsa_defer; -} dsl_destroy_snapshot_arg_t; - -int dsl_destroy_snapshot_check(void *, dmu_tx_t *); -void dsl_destroy_snapshot_sync(void *, dmu_tx_t *); - -typedef struct dsl_destroy_head_arg { - const char *ddha_name; -} dsl_destroy_head_arg_t; - -int dsl_destroy_head_check(void *, dmu_tx_t *); -void dsl_destroy_head_sync(void *, dmu_tx_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DESTROY_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h deleted file mode 100644 index 21d953cb6013..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -#ifndef _SYS_DSL_DIR_H -#define _SYS_DSL_DIR_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; - -/* - * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. - * They should be of the format :. - */ - -#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" -#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" -#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" - -typedef enum dd_used { - DD_USED_HEAD, - DD_USED_SNAP, - DD_USED_CHILD, - DD_USED_CHILD_RSRV, - DD_USED_REFRSRV, - DD_USED_NUM -} dd_used_t; - -#define DD_FLAG_USED_BREAKDOWN (1<<0) - -typedef struct dsl_dir_phys { - uint64_t dd_creation_time; /* not actually used */ - uint64_t dd_head_dataset_obj; - uint64_t dd_parent_obj; - uint64_t dd_origin_obj; - uint64_t dd_child_dir_zapobj; - /* - * how much space our children are accounting for; for leaf - * datasets, == physical space used by fs + snaps - */ - uint64_t dd_used_bytes; - uint64_t dd_compressed_bytes; - uint64_t dd_uncompressed_bytes; - /* Administrative quota setting */ - uint64_t dd_quota; - /* Administrative reservation setting */ - uint64_t dd_reserved; - uint64_t dd_props_zapobj; - uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ - uint64_t dd_flags; - 
uint64_t dd_used_breakdown[DD_USED_NUM]; - uint64_t dd_clones; /* dsl_dir objects */ - uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ -} dsl_dir_phys_t; - -struct dsl_dir { - dmu_buf_user_t dd_dbu; - - /* These are immutable; no lock needed: */ - uint64_t dd_object; - dsl_pool_t *dd_pool; - - /* Stable until user eviction; no lock needed: */ - dmu_buf_t *dd_dbuf; - - /* protected by lock on pool's dp_dirty_dirs list */ - txg_node_t dd_dirty_link; - - /* protected by dp_config_rwlock */ - dsl_dir_t *dd_parent; - - /* Protected by dd_lock */ - kmutex_t dd_lock; - list_t dd_props; /* list of dsl_prop_record_t's */ - timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ - uint64_t dd_origin_txg; - - /* gross estimate of space used by in-flight tx's */ - uint64_t dd_tempreserved[TXG_SIZE]; - /* amount of space we expect to write; == amount of dirty data */ - int64_t dd_space_towrite[TXG_SIZE]; - - /* protected by dd_lock; keep at end of struct for better locality */ - char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; -}; - -inline dsl_dir_phys_t * -dsl_dir_phys(dsl_dir_t *dd) -{ - return (dd->dd_dbuf->db_data); -} - -void dsl_dir_rele(dsl_dir_t *dd, void *tag); -void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); -int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, - dsl_dir_t **, const char **tail); -int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **); -void dsl_dir_name(dsl_dir_t *dd, char *buf); -int dsl_dir_namelen(dsl_dir_t *dd); -uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, - const char *name, dmu_tx_t *tx); - -uint64_t dsl_dir_get_used(dsl_dir_t *dd); -uint64_t dsl_dir_get_compressed(dsl_dir_t *dd); -uint64_t dsl_dir_get_quota(dsl_dir_t *dd); -uint64_t dsl_dir_get_reservation(dsl_dir_t *dd); -uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd); -uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd); -uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd); -uint64_t 
dsl_dir_get_usedds(dsl_dir_t *dd); -uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd); -uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd); -void dsl_dir_get_origin(dsl_dir_t *dd, char *buf); -int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count); -int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count); -int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count); - -void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); -uint64_t dsl_dir_space_available(dsl_dir_t *dd, - dsl_dir_t *ancestor, int64_t delta, int ondiskonly); -void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); -void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); -int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); -void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); -void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); -void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); -void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); -int dsl_dir_set_quota(const char *ddname, zprop_source_t source, - uint64_t quota); -int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, - uint64_t reservation); -int dsl_dir_activate_fs_ss_limit(const char *); -int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, - cred_t *); -void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); -int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t); -int dsl_dir_rename(const char *oldname, const char *newname); -int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, - uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); -boolean_t dsl_dir_is_clone(dsl_dir_t *dd); -void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, - uint64_t reservation, cred_t *cr, dmu_tx_t *tx); -void 
dsl_dir_snap_cmtime_update(dsl_dir_t *dd); -timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); -void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, - dmu_tx_t *tx); -void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); -boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); - -/* internal reserved dir name */ -#define MOS_DIR_NAME "$MOS" -#define ORIGIN_DIR_NAME "$ORIGIN" -#define FREE_DIR_NAME "$FREE" -#define LEAK_DIR_NAME "$LEAK" - -#ifdef ZFS_DEBUG -#define dprintf_dd(dd, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \ - dsl_dir_name(dd, __ds_name); \ - dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ - kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_dd(dd, fmt, ...) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DIR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h deleted file mode 100644 index 7dce64bfd40b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - */ - -#ifndef _SYS_DSL_POOL_H -#define _SYS_DSL_POOL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct objset; -struct dsl_dir; -struct dsl_dataset; -struct dsl_pool; -struct dmu_tx; -struct dsl_scan; - -extern uint64_t zfs_dirty_data_max; -extern uint64_t zfs_dirty_data_max_max; -extern uint64_t zfs_dirty_data_sync_pct; -extern int zfs_dirty_data_max_percent; -extern int zfs_delay_min_dirty_percent; -extern uint64_t zfs_delay_scale; - -/* These macros are for indexing into the zfs_all_blkstats_t. 
*/ -#define DMU_OT_DEFERRED DMU_OT_NONE -#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */ -#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1) - -typedef struct zfs_blkstat { - uint64_t zb_count; - uint64_t zb_asize; - uint64_t zb_lsize; - uint64_t zb_psize; - uint64_t zb_gangs; - uint64_t zb_ditto_2_of_2_samevdev; - uint64_t zb_ditto_2_of_3_samevdev; - uint64_t zb_ditto_3_of_3_samevdev; -} zfs_blkstat_t; - -typedef struct zfs_all_blkstats { - zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; - kmutex_t zab_lock; -} zfs_all_blkstats_t; - - -typedef struct dsl_pool { - /* Immutable */ - spa_t *dp_spa; - struct objset *dp_meta_objset; - struct dsl_dir *dp_root_dir; - struct dsl_dir *dp_mos_dir; - struct dsl_dir *dp_free_dir; - struct dsl_dir *dp_leak_dir; - struct dsl_dataset *dp_origin_snap; - uint64_t dp_root_dir_obj; - struct taskq *dp_vnrele_taskq; - - /* No lock needed - sync context only */ - blkptr_t dp_meta_rootbp; - uint64_t dp_tmp_userrefs_obj; - bpobj_t dp_free_bpobj; - uint64_t dp_bptree_obj; - uint64_t dp_empty_bpobj; - bpobj_t dp_obsolete_bpobj; - - struct dsl_scan *dp_scan; - - /* Uses dp_lock */ - kmutex_t dp_lock; - kcondvar_t dp_spaceavail_cv; - uint64_t dp_dirty_pertxg[TXG_SIZE]; - uint64_t dp_dirty_total; - uint64_t dp_long_free_dirty_pertxg[TXG_SIZE]; - uint64_t dp_mos_used_delta; - uint64_t dp_mos_compressed_delta; - uint64_t dp_mos_uncompressed_delta; - - /* - * Time of most recently scheduled (furthest in the future) - * wakeup for delayed transactions. - */ - hrtime_t dp_last_wakeup; - - /* Has its own locking */ - tx_state_t dp_tx; - txg_list_t dp_dirty_datasets; - txg_list_t dp_dirty_zilogs; - txg_list_t dp_dirty_dirs; - txg_list_t dp_sync_tasks; - txg_list_t dp_early_sync_tasks; - taskq_t *dp_sync_taskq; - taskq_t *dp_zil_clean_taskq; - - /* - * Protects administrative changes (properties, namespace) - * - * It is only held for write in syncing context. 
Therefore - * syncing context does not need to ever have it for read, since - * nobody else could possibly have it for write. - */ - rrwlock_t dp_config_rwlock; - - zfs_all_blkstats_t *dp_blkstats; -} dsl_pool_t; - -int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); -int dsl_pool_open(dsl_pool_t *dp); -void dsl_pool_close(dsl_pool_t *dp); -dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); -void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); -int dsl_pool_sync_context(dsl_pool_t *dp); -uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); -uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, - zfs_space_check_t slop_policy); -void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); -void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); -void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, - const blkptr_t *bpp); -void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_mos_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp); -void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp); -void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); -void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag); -void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); -boolean_t dsl_pool_config_held(dsl_pool_t *dp); -boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp); -boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); - -taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); - -int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t now, dmu_tx_t *tx); -int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, 
dmu_tx_t *tx); -void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); -int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); -int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); -void dsl_pool_rele(dsl_pool_t *dp, void *tag); - -void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_POOL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h deleted file mode 100644 index 21e6f4674be9..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DSL_PROP_H -#define _SYS_DSL_PROP_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; -struct dsl_dir; - -/* The callback func may not call into the DMU or DSL! 
*/ -typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); - -typedef struct dsl_prop_record { - list_node_t pr_node; /* link on dd_props */ - const char *pr_propname; - list_t pr_cbs; -} dsl_prop_record_t; - -typedef struct dsl_prop_cb_record { - list_node_t cbr_pr_node; /* link on pr_cbs */ - list_node_t cbr_ds_node; /* link on ds_prop_cbs */ - dsl_prop_record_t *cbr_pr; - struct dsl_dataset *cbr_ds; - dsl_prop_changed_cb_t *cbr_func; - void *cbr_arg; -} dsl_prop_cb_record_t; - -typedef struct dsl_props_arg { - nvlist_t *pa_props; - zprop_source_t pa_source; -} dsl_props_arg_t; - -void dsl_prop_init(dsl_dir_t *dd); -void dsl_prop_fini(dsl_dir_t *dd); -int dsl_prop_register(struct dsl_dataset *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg); -void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg); -void dsl_prop_notify_all(struct dsl_dir *dd); -boolean_t dsl_prop_hascb(struct dsl_dataset *ds); - -int dsl_prop_get(const char *ddname, const char *propname, - int intsz, int numints, void *buf, char *setpoint); -int dsl_prop_get_integer(const char *ddname, const char *propname, - uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); -int dsl_prop_get_received(const char *dsname, nvlist_t **nvp); -int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, - int intsz, int numints, void *buf, char *setpoint); -int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname, - uint64_t *valuep); -int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint, - boolean_t snapshot); - -void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, - nvlist_t *props, dmu_tx_t *tx); -void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, - zprop_source_t source, int intsz, int numints, const void *value, - dmu_tx_t *tx); -int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); 
-int dsl_prop_set_int(const char *dsname, const char *propname, - zprop_source_t source, uint64_t value); -int dsl_prop_set_string(const char *dsname, const char *propname, - zprop_source_t source, const char *value); -int dsl_prop_inherit(const char *dsname, const char *propname, - zprop_source_t source); - -int dsl_prop_predict(dsl_dir_t *dd, const char *propname, - zprop_source_t source, uint64_t value, uint64_t *newvalp); - -/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ -boolean_t dsl_prop_get_hasrecvd(const char *dsname); -int dsl_prop_set_hasrecvd(const char *dsname); -void dsl_prop_unset_hasrecvd(const char *dsname); - -void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); -void dsl_prop_nvlist_add_string(nvlist_t *nv, - zfs_prop_t prop, const char *value); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_PROP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h deleted file mode 100644 index 5ddffe57bf97..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2017 Datto Inc. - */ - -#ifndef _SYS_DSL_SCAN_H -#define _SYS_DSL_SCAN_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct objset; -struct dsl_dir; -struct dsl_dataset; -struct dsl_pool; -struct dmu_tx; - -/* - * All members of this structure must be uint64_t, for byteswap - * purposes. - */ -typedef struct dsl_scan_phys { - uint64_t scn_func; /* pool_scan_func_t */ - uint64_t scn_state; /* dsl_scan_state_t */ - uint64_t scn_queue_obj; - uint64_t scn_min_txg; - uint64_t scn_max_txg; - uint64_t scn_cur_min_txg; - uint64_t scn_cur_max_txg; - uint64_t scn_start_time; - uint64_t scn_end_time; - uint64_t scn_to_examine; /* total bytes to be scanned */ - uint64_t scn_examined; /* bytes scanned so far */ - uint64_t scn_to_process; - uint64_t scn_processed; - uint64_t scn_errors; /* scan I/O error count */ - uint64_t scn_ddt_class_max; - ddt_bookmark_t scn_ddt_bookmark; - zbookmark_phys_t scn_bookmark; - uint64_t scn_flags; /* dsl_scan_flags_t */ -} dsl_scan_phys_t; - -#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) - -typedef enum dsl_scan_flags { - DSF_VISIT_DS_AGAIN = 1<<0, - DSF_SCRUB_PAUSED = 1<<1, -} dsl_scan_flags_t; - -/* - * Every pool will have one dsl_scan_t and this structure will contain - * in-memory information about the scan and a pointer to the on-disk - * representation (i.e. dsl_scan_phys_t). Most of the state of the scan - * is contained on-disk to allow the scan to resume in the event of a reboot - * or panic. 
This structure maintains information about the behavior of a - * running scan, some caching information, and how it should traverse the pool. - * - * The following members of this structure direct the behavior of the scan: - * - * scn_suspending - a scan that cannot be completed in a single txg or - * has exceeded its allotted time will need to suspend. - * When this flag is set the scanner will stop traversing - * the pool and write out the current state to disk. - * - * scn_restart_txg - directs the scanner to either restart or start a - * a scan at the specified txg value. - * - * scn_done_txg - when a scan completes its traversal it will set - * the completion txg to the next txg. This is necessary - * to ensure that any blocks that were freed during - * the scan but have not yet been processed (i.e deferred - * frees) are accounted for. - * - * This structure also maintains information about deferred frees which are - * a special kind of traversal. Deferred free can exist in either a bptree or - * a bpobj structure. The scn_is_bptree flag will indicate the type of - * deferred free that is in progress. If the deferred free is part of an - * asynchronous destroy then the scn_async_destroying flag will be set. 
- */ -typedef struct dsl_scan { - struct dsl_pool *scn_dp; - - uint64_t scn_restart_txg; - uint64_t scn_done_txg; - uint64_t scn_sync_start_time; - uint64_t scn_issued_before_pass; - - /* for freeing blocks */ - boolean_t scn_is_bptree; - boolean_t scn_async_destroying; - boolean_t scn_async_stalled; - uint64_t scn_async_block_min_time_ms; - /* flags and stats for controlling scan state */ - boolean_t scn_is_sorted; /* doing sequential scan */ - boolean_t scn_clearing; /* scan is issuing sequential extents */ - boolean_t scn_checkpointing; /* scan is issuing all queued extents */ - boolean_t scn_suspending; /* scan is suspending until next txg */ - uint64_t scn_last_checkpoint; /* time of last checkpoint */ - - /* members for thread synchronization */ - zio_t *scn_zio_root; /* root zio for waiting on IO */ - taskq_t *scn_taskq; /* task queue for issuing extents */ - - /* for controlling scan prefetch, protected by spa_scrub_lock */ - boolean_t scn_prefetch_stop; /* prefetch should stop */ - zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */ - avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */ - uint64_t scn_maxinflight_bytes; /* max bytes in flight for poool */ - - /* per txg statistics */ - uint64_t scn_visited_this_txg; /* total bps visited this txg */ - uint64_t scn_holes_this_txg; - uint64_t scn_lt_min_this_txg; - uint64_t scn_gt_max_this_txg; - uint64_t scn_ddt_contained_this_txg; - uint64_t scn_objsets_visited_this_txg; - uint64_t scn_avg_seg_size_this_txg; - uint64_t scn_segs_this_txg; - uint64_t scn_avg_zio_size_this_txg; - uint64_t scn_zios_this_txg; - - /* members needed for syncing scan status to disk */ - dsl_scan_phys_t scn_phys; /* on disk representation of scan */ - dsl_scan_phys_t scn_phys_cached; - avl_tree_t scn_queue; /* queue of datasets to scan */ - uint64_t scn_bytes_pending; /* outstanding data to issue */ -} dsl_scan_t; - -typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; - -void 
dsl_scan_global_init(void); - -void scan_init(void); -void scan_fini(void); -int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); -void dsl_scan_fini(struct dsl_pool *dp); -void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); -int dsl_scan_cancel(struct dsl_pool *); -int dsl_scan(struct dsl_pool *, pool_scan_func_t); -boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); -int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); -void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); -boolean_t dsl_scan_resilvering(struct dsl_pool *dp); -boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); -void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx); -void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, - struct dmu_tx *tx); -boolean_t dsl_scan_active(dsl_scan_t *scn); -boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); -void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); -void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); -void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_SCAN_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h deleted file mode 100644 index 957963ffe553..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DSL_SYNCTASK_H -#define _SYS_DSL_SYNCTASK_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; - -typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); -typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *); - -typedef enum zfs_space_check { - /* - * Normal space check: if there is less than 3.2% free space, - * the operation will fail. Operations which are logically - * creating things should use this (e.g. "zfs create", "zfs snapshot"). - * User writes (via the ZPL / ZVOL) also fail at this point. - */ - ZFS_SPACE_CHECK_NORMAL, - - /* - * Space check allows use of half the slop space. If there - * is less than 1.6% free space, the operation will fail. Most - * operations should use this (e.g. "zfs set", "zfs rename"), - * because we want them to succeed even after user writes are failing, - * so that they can be used as part of the space recovery process. - */ - ZFS_SPACE_CHECK_RESERVED, - - /* - * Space check allows use of three quarters of the slop space. - * If there is less than 0.8% free space, the operation will - * fail. - */ - ZFS_SPACE_CHECK_EXTRA_RESERVED, - - /* - * In all cases "zfs destroy" is expected to result in an net - * reduction of space, except one. 
When the pool has a - * checkpoint, space freed by "zfs destroy" will not actually - * free anything internally. Thus, it starts failing after - * three quarters of the slop space is exceeded. - */ - ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED, - - /* - * A channel program can run a "zfs destroy" as part of its - * script and therefore has the same space_check policy when - * being evaluated. - */ - ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY, - - /* - * No space check is performed. This level of space check should - * be used cautiously as operations that use it can even run when - * 0.8% capacity is left for use. In this scenario, if there is a - * checkpoint, async destroys are suspended and any kind of freeing - * can potentially add space instead of freeing it. - * - * See also the comments above spa_slop_shift. - */ - ZFS_SPACE_CHECK_NONE, - - ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE, - -} zfs_space_check_t; - -typedef struct dsl_sync_task { - txg_node_t dst_node; - struct dsl_pool *dst_pool; - uint64_t dst_txg; - int dst_space; - zfs_space_check_t dst_space_check; - dsl_checkfunc_t *dst_checkfunc; - dsl_syncfunc_t *dst_syncfunc; - void *dst_arg; - int dst_error; - boolean_t dst_nowaiter; -} dsl_sync_task_t; - -void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *); -int dsl_sync_task(const char *, dsl_checkfunc_t *, - dsl_syncfunc_t *, void *, int, zfs_space_check_t); -void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); -int dsl_early_sync_task(const char *, dsl_checkfunc_t *, - dsl_syncfunc_t *, void *, int, zfs_space_check_t); -void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); -int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, - dsl_sigfunc_t *, void *, int, zfs_space_check_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_SYNCTASK_H */ diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h deleted file mode 100644 index 071aeb86d1f1..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h +++ /dev/null @@ -1,57 +0,0 @@ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. 
- */ - -#ifndef _SYS_DSL_USERHOLD_H -#define _SYS_DSL_USERHOLD_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; -struct dsl_dataset; -struct dmu_tx; - -int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, - nvlist_t *errlist); -int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); -int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); -void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds); -int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, - boolean_t temphold, struct dmu_tx *tx); -void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, - minor_t minor, uint64_t now, struct dmu_tx *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_USERHOLD_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h deleted file mode 100644 index 7219dc967427..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _SYS_METASLAB_H -#define _SYS_METASLAB_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -typedef struct metaslab_ops { - uint64_t (*msop_alloc)(metaslab_t *, uint64_t); -} metaslab_ops_t; - - -extern metaslab_ops_t *zfs_metaslab_ops; - -int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, - metaslab_t **); -void metaslab_fini(metaslab_t *); - -int metaslab_load(metaslab_t *); -void metaslab_unload(metaslab_t *); - -uint64_t metaslab_allocated_space(metaslab_t *); - -void metaslab_sync(metaslab_t *, uint64_t); -void metaslab_sync_done(metaslab_t *, uint64_t); -void metaslab_sync_reassess(metaslab_group_t *); -uint64_t metaslab_block_maxsize(metaslab_t *); - -/* - * metaslab alloc flags - */ -#define METASLAB_HINTBP_FAVOR 0x0 -#define METASLAB_HINTBP_AVOID 0x1 -#define METASLAB_GANG_HEADER 0x2 -#define METASLAB_GANG_CHILD 0x4 -#define METASLAB_ASYNC_ALLOC 0x8 -#define METASLAB_DONT_THROTTLE 0x10 -#define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_FASTWRITE 0x40 - -int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, - int); -int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, - dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); -void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); -void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); -void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); -void 
metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); -void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); -int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); -int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); -void metaslab_check_free(spa_t *, const blkptr_t *); - -void metaslab_alloc_trace_init(void); -void metaslab_alloc_trace_fini(void); -void metaslab_trace_init(zio_alloc_list_t *); -void metaslab_trace_fini(zio_alloc_list_t *); - -metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); -void metaslab_class_destroy(metaslab_class_t *); -int metaslab_class_validate(metaslab_class_t *); -void metaslab_class_histogram_verify(metaslab_class_t *); -uint64_t metaslab_class_fragmentation(metaslab_class_t *); -uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, - zio_t *, int); -void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); - -uint64_t metaslab_class_get_alloc(metaslab_class_t *); -uint64_t metaslab_class_get_space(metaslab_class_t *); -uint64_t metaslab_class_get_dspace(metaslab_class_t *); -uint64_t metaslab_class_get_deferred(metaslab_class_t *); -uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc); - -metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); -void metaslab_group_destroy(metaslab_group_t *); -void metaslab_group_activate(metaslab_group_t *); -void metaslab_group_passivate(metaslab_group_t *); -boolean_t metaslab_group_initialized(metaslab_group_t *); -uint64_t metaslab_group_get_space(metaslab_group_t *); -void metaslab_group_histogram_verify(metaslab_group_t *); -uint64_t metaslab_group_fragmentation(metaslab_group_t *); -void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); -void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, - boolean_t); -void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, 
int); -void metaslab_recalculate_weight_and_sort(metaslab_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_METASLAB_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h deleted file mode 100644 index ae49795fec1a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ /dev/null @@ -1,501 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - */ - -#ifndef _SYS_METASLAB_IMPL_H -#define _SYS_METASLAB_IMPL_H - -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Metaslab allocation tracing record. 
- */ -typedef struct metaslab_alloc_trace { - list_node_t mat_list_node; - metaslab_group_t *mat_mg; - metaslab_t *mat_msp; - uint64_t mat_size; - uint64_t mat_weight; - uint32_t mat_dva_id; - uint64_t mat_offset; - int mat_allocator; -} metaslab_alloc_trace_t; - -/* - * Used by the metaslab allocation tracing facility to indicate - * error conditions. These errors are stored to the offset member - * of the metaslab_alloc_trace_t record and displayed by mdb. - */ -typedef enum trace_alloc_type { - TRACE_ALLOC_FAILURE = -1ULL, - TRACE_TOO_SMALL = -2ULL, - TRACE_FORCE_GANG = -3ULL, - TRACE_NOT_ALLOCATABLE = -4ULL, - TRACE_GROUP_FAILURE = -5ULL, - TRACE_ENOSPC = -6ULL, - TRACE_CONDENSING = -7ULL, - TRACE_VDEV_ERROR = -8ULL, - TRACE_INITIALIZING = -9ULL -} trace_alloc_type_t; - -#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) -#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_WEIGHT_CLAIM (1ULL << 61) -#define METASLAB_WEIGHT_TYPE (1ULL << 60) -#define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ - METASLAB_WEIGHT_CLAIM) - -/* - * The metaslab weight is used to encode the amount of free space in a - * metaslab, such that the "best" metaslab appears first when sorting the - * metaslabs by weight. The weight (and therefore the "best" metaslab) can - * be determined in two different ways: by computing a weighted sum of all - * the free space in the metaslab (a space based weight) or by counting only - * the free segments of the largest size (a segment based weight). We prefer - * the segment based weight because it reflects how the free space is - * comprised, but we cannot always use it -- legacy pools do not have the - * space map histogram information necessary to determine the largest - * contiguous regions. 
Pools that have the space map histogram determine - * the segment weight by looking at each bucket in the histogram and - * determining the free space whose size in bytes is in the range: - * [2^i, 2^(i+1)) - * We then encode the largest index, i, that contains regions into the - * segment-weighted value. - * - * Space-based weight: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC1| weighted-free space | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * PS - indicates primary and secondary activation - * C - indicates activation for claimed block zio - * space - the fragmentation-weighted space - * - * Segment-based weight: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC0| idx| count of segments in region | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * PS - indicates primary and secondary activation - * C - indicates activation for claimed block zio - * idx - index for the highest bucket in the histogram - * count - number of segments in the specified bucket - */ -#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) -#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) - -#define WEIGHT_IS_SPACEBASED(weight) \ - ((weight) == 0 || BF64_GET((weight), 60, 1)) -#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) - -/* - * These macros are only applicable to segment-based weighting. - */ -#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) -#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) -#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) -#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) - -/* - * A metaslab class encompasses a category of allocatable top-level vdevs. - * Each top-level vdev is associated with a metaslab group which defines - * the allocatable region for that vdev. 
Examples of these categories include - * "normal" for data block allocations (i.e. main pool allocations) or "log" - * for allocations designated for intent log devices (i.e. slog devices). - * When a block allocation is requested from the SPA it is associated with a - * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging - * to the class can be used to satisfy that request. Allocations are done - * by traversing the metaslab groups that are linked off of the mc_rotor field. - * This rotor points to the next metaslab group where allocations will be - * attempted. Allocating a block is a 3 step process -- select the metaslab - * group, select the metaslab, and then allocate the block. The metaslab - * class defines the low-level block allocator that will be used as the - * final step in allocation. These allocators are pluggable allowing each class - * to use a block allocator that best suits that class. - */ -struct metaslab_class { - kmutex_t mc_lock; - spa_t *mc_spa; - metaslab_group_t *mc_rotor; - metaslab_ops_t *mc_ops; - uint64_t mc_aliquot; - - /* - * Track the number of metaslab groups that have been initialized - * and can accept allocations. An initialized metaslab group is - * one has been completely added to the config (i.e. we have - * updated the MOS config and the space has been added to the pool). - */ - uint64_t mc_groups; - - /* - * Toggle to enable/disable the allocation throttle. - */ - boolean_t mc_alloc_throttle_enabled; - - /* - * The allocation throttle works on a reservation system. Whenever - * an asynchronous zio wants to perform an allocation it must - * first reserve the number of blocks that it wants to allocate. - * If there aren't sufficient slots available for the pending zio - * then that I/O is throttled until more slots free up. The current - * number of reserved allocations is maintained by the mc_alloc_slots - * refcount. 
The mc_alloc_max_slots value determines the maximum - * number of allocations that the system allows. Gang blocks are - * allowed to reserve slots even if we've reached the maximum - * number of allocations allowed. - */ - uint64_t *mc_alloc_max_slots; - zfs_refcount_t *mc_alloc_slots; - - uint64_t mc_alloc_groups; /* # of allocatable groups */ - - uint64_t mc_alloc; /* total allocated space */ - uint64_t mc_deferred; /* total deferred frees */ - uint64_t mc_space; /* total space (alloc + free) */ - uint64_t mc_dspace; /* total deflated space */ - uint64_t mc_minblocksize; - uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; -}; - -/* - * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) - * of a top-level vdev. They are linked togther to form a circular linked - * list and can belong to only one metaslab class. Metaslab groups may become - * ineligible for allocations for a number of reasons such as limited free - * space, fragmentation, or going offline. When this happens the allocator will - * simply find the next metaslab group in the linked list and attempt - * to allocate from that group instead. - */ -struct metaslab_group { - kmutex_t mg_lock; - metaslab_t **mg_primaries; - metaslab_t **mg_secondaries; - avl_tree_t mg_metaslab_tree; - uint64_t mg_aliquot; - boolean_t mg_allocatable; /* can we allocate? */ - uint64_t mg_ms_ready; - - /* - * A metaslab group is considered to be initialized only after - * we have updated the MOS config and added the space to the pool. - * We only allow allocation attempts to a metaslab group if it - * has been initialized. 
- */ - boolean_t mg_initialized; - - uint64_t mg_free_capacity; /* percentage free */ - int64_t mg_bias; - int64_t mg_activation_count; - metaslab_class_t *mg_class; - vdev_t *mg_vd; - taskq_t *mg_taskq; - metaslab_group_t *mg_prev; - metaslab_group_t *mg_next; - - /* - * In order for the allocation throttle to function properly, we cannot - * have too many IOs going to each disk by default; the throttle - * operates by allocating more work to disks that finish quickly, so - * allocating larger chunks to each disk reduces its effectiveness. - * However, if the number of IOs going to each allocator is too small, - * we will not perform proper aggregation at the vdev_queue layer, - * also resulting in decreased performance. Therefore, we will use a - * ramp-up strategy. - * - * Each allocator in each metaslab group has a current queue depth - * (mg_alloc_queue_depth[allocator]) and a current max queue depth - * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group - * has an absolute max queue depth (mg_max_alloc_queue_depth). We - * add IOs to an allocator until the mg_alloc_queue_depth for that - * allocator hits the cur_max. Every time an IO completes for a given - * allocator on a given metaslab group, we increment its cur_max until - * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to - * help protect against disks that decrease in performance over time. - * - * It's possible for an allocator to handle more allocations than - * its max. This can occur when gang blocks are required or when other - * groups are unable to handle their share of allocations. - */ - uint64_t mg_max_alloc_queue_depth; - uint64_t *mg_cur_max_alloc_queue_depth; - zfs_refcount_t *mg_alloc_queue_depth; - int mg_allocators; - /* - * A metalab group that can no longer allocate the minimum block - * size will set mg_no_free_space. Once a metaslab group is out - * of space then its share of work must be distributed to other - * groups. 
- */ - boolean_t mg_no_free_space; - - uint64_t mg_allocations; - uint64_t mg_failed_allocations; - uint64_t mg_fragmentation; - uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; - - int mg_ms_initializing; - boolean_t mg_initialize_updating; - kmutex_t mg_ms_initialize_lock; - kcondvar_t mg_ms_initialize_cv; -}; - -/* - * This value defines the number of elements in the ms_lbas array. The value - * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. - * This is the equivalent of highbit(UINT64_MAX). - */ -#define MAX_LBAS 64 - -/* - * Each metaslab maintains a set of in-core trees to track metaslab - * operations. The in-core free tree (ms_allocatable) contains the list of - * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and - * added to a per txg allocation tree (ms_allocating). As blocks are - * freed, they are added to the free tree (ms_freeing). These trees - * allow us to process all allocations and frees in syncing context - * where it is safe to update the on-disk space maps. An additional set - * of in-core trees is maintained to track deferred frees - * (ms_defer). Once a block is freed it will move from the - * ms_freed to the ms_defer tree. A deferred free means that a block - * has been freed but cannot be used by the pool until TXG_DEFER_SIZE - * transactions groups later. For example, a block that is freed in txg - * 50 will not be available for reallocation until txg 52 (50 + - * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. - * A pool could be safely rolled back TXG_DEFERS_SIZE transactions - * groups and ensure that no block has been reallocated. 
- * - * The simplified transition diagram looks like this: - * - * - * ALLOCATE - * | - * V - * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) - * ^ - * | ms_freeing <--- FREE - * | | - * | v - * | ms_freed - * | | - * +-------- ms_defer[2] <-------+-------> (write to space map) - * - * - * Each metaslab's space is tracked in a single space map in the MOS, - * which is only updated in syncing context. Each time we sync a txg, - * we append the allocs and frees from that txg to the space map. The - * pool space is only updated once all metaslabs have finished syncing. - * - * To load the in-core free tree we read the space map from disk. This - * object contains a series of alloc and free records that are combined - * to make up the list of all free segments in this metaslab. These - * segments are represented in-core by the ms_allocatable and are stored - * in an AVL tree. - * - * As the space map grows (as a result of the appends) it will - * eventually become space-inefficient. When the metaslab's in-core - * free tree is zfs_condense_pct/100 times the size of the minimal - * on-disk representation, we rewrite it in its minimized form. If a - * metaslab needs to condense then we must set the ms_condensing flag to - * ensure that allocations are not performed on the metaslab that is - * being written. - */ -struct metaslab { - /* - * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), - * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. - * - * The lock is also used during some miscellaneous operations like - * using the metaslab's histogram for the metaslab group's histogram - * aggregation, or marking the metaslab for initialization. 
- */ - kmutex_t ms_lock; - - /* - * Acquired together with the ms_lock whenever we expect to - * write to metaslab data on-disk (i.e flushing entries to - * the metaslab's space map). It helps coordinate readers of - * the metaslab's space map [see spa_vdev_remove_thread()] - * with writers [see metaslab_sync()]. - * - * Note that metaslab_load(), even though a reader, uses - * a completely different mechanism to deal with the reading - * of the metaslab's space map based on ms_synced_length. That - * said, the function still uses the ms_sync_lock after it - * has read the ms_sm [see relevant comment in metaslab_load() - * as to why]. - */ - kmutex_t ms_sync_lock; - - kcondvar_t ms_load_cv; - space_map_t *ms_sm; - uint64_t ms_id; - uint64_t ms_start; - uint64_t ms_size; - uint64_t ms_fragmentation; - - range_tree_t *ms_allocating[TXG_SIZE]; - range_tree_t *ms_allocatable; - uint64_t ms_allocated_this_txg; - - /* - * The following range trees are accessed only from syncing context. - * ms_free*tree only have entries while syncing, and are empty - * between syncs. - */ - range_tree_t *ms_freeing; /* to free this syncing txg */ - range_tree_t *ms_freed; /* already freed this syncing txg */ - range_tree_t *ms_defer[TXG_DEFER_SIZE]; - range_tree_t *ms_checkpointing; /* to add to the checkpoint */ - - boolean_t ms_condensing; /* condensing? */ - boolean_t ms_condense_wanted; - uint64_t ms_condense_checked_txg; - - uint64_t ms_initializing; /* leaves initializing this ms */ - - /* - * We must always hold the ms_lock when modifying ms_loaded - * and ms_loading. - */ - boolean_t ms_loaded; - boolean_t ms_loading; - - /* - * The following histograms count entries that are in the - * metaslab's space map (and its histogram) but are not in - * ms_allocatable yet, because they are in ms_freed, ms_freeing, - * or ms_defer[]. - * - * When the metaslab is not loaded, its ms_weight needs to - * reflect what is allocatable (i.e. 
what will be part of - * ms_allocatable if it is loaded). The weight is computed from - * the spacemap histogram, but that includes ranges that are - * not yet allocatable (because they are in ms_freed, - * ms_freeing, or ms_defer[]). Therefore, when calculating the - * weight, we need to remove those ranges. - * - * The ranges in the ms_freed and ms_defer[] range trees are all - * present in the spacemap. However, the spacemap may have - * multiple entries to represent a contiguous range, because it - * is written across multiple sync passes, but the changes of - * all sync passes are consolidated into the range trees. - * Adjacent ranges that are freed in different sync passes of - * one txg will be represented separately (as 2 or more entries) - * in the space map (and its histogram), but these adjacent - * ranges will be consolidated (represented as one entry) in the - * ms_freed/ms_defer[] range trees (and their histograms). - * - * When calculating the weight, we can not simply subtract the - * range trees' histograms from the spacemap's histogram, - * because the range trees' histograms may have entries in - * higher buckets than the spacemap, due to consolidation. - * Instead we must subtract the exact entries that were added to - * the spacemap's histogram. ms_synchist and ms_deferhist[] - * represent these exact entries, so we can subtract them from - * the spacemap's histogram when calculating ms_weight. - * - * ms_synchist represents the same ranges as ms_freeing + - * ms_freed, but without consolidation across sync passes. - * - * ms_deferhist[i] represents the same ranges as ms_defer[i], - * but without consolidation across sync passes. - */ - uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; - uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; - - /* - * Tracks the exact amount of allocated space of this metaslab - * (and specifically the metaslab's space map) up to the most - * recently completed sync pass [see usage in metaslab_sync()]. 
- */ - uint64_t ms_allocated_space; - int64_t ms_deferspace; /* sum of ms_defermap[] space */ - uint64_t ms_weight; /* weight vs. others in group */ - uint64_t ms_activation_weight; /* activation weight */ - - /* - * Track of whenever a metaslab is selected for loading or allocation. - * We use this value to determine how long the metaslab should - * stay cached. - */ - uint64_t ms_selected_txg; - - uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ - uint64_t ms_max_size; /* maximum allocatable size */ - - /* - * -1 if it's not active in an allocator, otherwise set to the allocator - * this metaslab is active for. - */ - int ms_allocator; - boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ - - /* - * The metaslab block allocators can optionally use a size-ordered - * range tree and/or an array of LBAs. Not all allocators use - * this functionality. The ms_allocatable_by_size should always - * contain the same number of segments as the ms_allocatable. The - * only difference is that the ms_allocatable_by_size is ordered by - * segment sizes. - */ - avl_tree_t ms_allocatable_by_size; - uint64_t ms_lbas[MAX_LBAS]; - - metaslab_group_t *ms_group; /* metaslab group */ - avl_node_t ms_group_node; /* node in metaslab group tree */ - txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ - - /* updated every time we are done syncing the metaslab's space map */ - uint64_t ms_synced_length; - - boolean_t ms_new; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_METASLAB_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h deleted file mode 100644 index 527e3323b4b9..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (C) 2017 by Lawrence Livermore National Security, LLC. - */ - -#ifndef _SYS_MMP_H -#define _SYS_MMP_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define MMP_MIN_INTERVAL 100 /* ms */ -#define MMP_DEFAULT_INTERVAL 1000 /* ms */ -#define MMP_DEFAULT_IMPORT_INTERVALS 20 -#define MMP_DEFAULT_FAIL_INTERVALS 10 -#define MMP_MIN_FAIL_INTERVALS 2 /* min if != 0 */ -#define MMP_IMPORT_SAFETY_FACTOR 200 /* pct */ -#define MMP_INTERVAL_OK(interval) MAX(interval, MMP_MIN_INTERVAL) -#define MMP_FAIL_INTVS_OK(fails) (fails == 0 ? 0 : MAX(fails, \ - MMP_MIN_FAIL_INTERVALS)) - -typedef struct mmp_thread { - kmutex_t mmp_thread_lock; /* protect thread mgmt fields */ - kcondvar_t mmp_thread_cv; - kthread_t *mmp_thread; - uint8_t mmp_thread_exiting; - kmutex_t mmp_io_lock; /* protect below */ - hrtime_t mmp_last_write; /* last successful MMP write */ - uint64_t mmp_delay; /* decaying avg ns between MMP writes */ - uberblock_t mmp_ub; /* last ub written by sync */ - zio_t *mmp_zio_root; /* root of mmp write zios */ - uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */ - int mmp_skip_error; /* reason for last skipped write */ - vdev_t *mmp_last_leaf; /* last mmp write sent here */ - uint64_t mmp_leaf_last_gen; /* last mmp write sent here */ - uint32_t mmp_seq; /* intra-second update counter */ -} mmp_thread_t; - - -extern void mmp_init(struct spa *spa); -extern void mmp_fini(struct spa *spa); -extern void mmp_thread_start(struct spa *spa); -extern void mmp_thread_stop(struct spa *spa); -extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub); -extern void mmp_signal_all_threads(void); - -/* Global tuning */ 
-extern ulong_t zfs_multihost_interval; -extern uint_t zfs_multihost_fail_intervals; -extern uint_t zfs_multihost_import_intervals; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_MMP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h deleted file mode 100644 index a3b44e60eb97..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_MULTILIST_H -#define _SYS_MULTILIST_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef list_node_t multilist_node_t; -typedef struct multilist multilist_t; -typedef struct multilist_sublist multilist_sublist_t; -typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); - -struct multilist_sublist { - /* - * The mutex used internally to implement thread safe insertions - * and removals to this individual sublist. It can also be locked - * by a consumer using multilist_sublist_{lock,unlock}, which is - * useful if a consumer needs to traverse the list in a thread - * safe manner. - */ - kmutex_t mls_lock; - /* - * The actual list object containing all objects in this sublist. - */ - list_t mls_list; - /* - * Pad to cache line, in an effort to try and prevent cache line - * contention. 
- */ -} __aligned(CACHE_LINE_SIZE); - -struct multilist { - /* - * This is used to get to the multilist_node_t structure given - * the void *object contained on the list. - */ - size_t ml_offset; - /* - * The number of sublists used internally by this multilist. - */ - uint64_t ml_num_sublists; - /* - * The array of pointers to the actual sublists. - */ - multilist_sublist_t *ml_sublists; - /* - * Pointer to function which determines the sublist to use - * when inserting and removing objects from this multilist. - * Please see the comment above multilist_create for details. - */ - multilist_sublist_index_func_t *ml_index_func; -}; - -void multilist_destroy(multilist_t *); -multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *); - -void multilist_insert(multilist_t *, void *); -void multilist_remove(multilist_t *, void *); -int multilist_is_empty(multilist_t *); - -unsigned int multilist_get_num_sublists(multilist_t *); -unsigned int multilist_get_random_index(multilist_t *); - -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); -multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); -void multilist_sublist_unlock(multilist_sublist_t *); - -void multilist_sublist_insert_head(multilist_sublist_t *, void *); -void multilist_sublist_insert_tail(multilist_sublist_t *, void *); -void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); -void multilist_sublist_remove(multilist_sublist_t *, void *); -int multilist_sublist_is_empty(multilist_sublist_t *); -int multilist_sublist_is_empty_idx(multilist_t *, unsigned int); - -void *multilist_sublist_head(multilist_sublist_t *); -void *multilist_sublist_tail(multilist_sublist_t *); -void *multilist_sublist_next(multilist_sublist_t *, void *); -void *multilist_sublist_prev(multilist_sublist_t *, void *); - -void multilist_link_init(multilist_node_t *); -int multilist_link_active(multilist_node_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* 
_SYS_MULTILIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h deleted file mode 100644 index bbdf66cade63..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_RANGE_TREE_H -#define _SYS_RANGE_TREE_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define RANGE_TREE_HISTOGRAM_SIZE 64 - -typedef struct range_tree_ops range_tree_ops_t; - -/* - * Note: the range_tree may not be accessed concurrently; consumers - * must provide external locking if required. 
- */ -typedef struct range_tree { - avl_tree_t rt_root; /* offset-ordered segment AVL tree */ - uint64_t rt_space; /* sum of all segments in the map */ - range_tree_ops_t *rt_ops; - void *rt_arg; - - /* rt_avl_compare should only be set it rt_arg is an AVL tree */ - uint64_t rt_gap; /* allowable inter-segment gap */ - int (*rt_avl_compare)(const void *, const void *); - /* - * The rt_histogram maintains a histogram of ranges. Each bucket, - * rt_histogram[i], contains the number of ranges whose size is: - * 2^i <= size of range in bytes < 2^(i+1) - */ - uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; -} range_tree_t; - -typedef struct range_seg { - avl_node_t rs_node; /* AVL node */ - avl_node_t rs_pp_node; /* AVL picker-private node */ - uint64_t rs_start; /* starting offset of this segment */ - uint64_t rs_end; /* ending offset (non-inclusive) */ - uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_t; - -struct range_tree_ops { - void (*rtop_create)(range_tree_t *rt, void *arg); - void (*rtop_destroy)(range_tree_t *rt, void *arg); - void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_vacate)(range_tree_t *rt, void *arg); -}; - -typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); - -void range_tree_init(void); -void range_tree_fini(void); -range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare)(const void*, const void*), uint64_t gap); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); -void range_tree_destroy(range_tree_t *rt); -boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_verify_not_present(range_tree_t *rt, - uint64_t start, uint64_t size); -range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, - uint64_t newstart, uint64_t newsize); 
-uint64_t range_tree_space(range_tree_t *rt); -boolean_t range_tree_is_empty(range_tree_t *rt); -void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); -void range_tree_stat_verify(range_tree_t *rt); -uint64_t range_tree_min(range_tree_t *rt); -uint64_t range_tree_max(range_tree_t *rt); -uint64_t range_tree_span(range_tree_t *rt); - -void range_tree_add(void *arg, uint64_t start, uint64_t size); -void range_tree_remove(void *arg, uint64_t start, uint64_t size); -void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); -void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); - -void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); -void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); -range_seg_t *range_tree_first(range_tree_t *rt); - -void rt_avl_create(range_tree_t *rt, void *arg); -void rt_avl_destroy(range_tree_t *rt, void *arg); -void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_vacate(range_tree_t *rt, void *arg); -extern struct range_tree_ops rt_avl_ops; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_RANGE_TREE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h deleted file mode 100644 index f1fd04792fef..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_REFCOUNT_H -#define _SYS_REFCOUNT_H - -#include -#include -/* For FreeBSD refcount(9). */ -#include_next -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * If the reference is held only by the calling function and not any - * particular object, use FTAG (which is a string) for the holder_tag. - * Otherwise, use the object that holds the reference. 
- */ -#define FTAG ((char *)(uintptr_t)__func__) - -#ifdef ZFS_DEBUG -typedef struct reference { - list_node_t ref_link; - void *ref_holder; - uint64_t ref_number; - uint8_t *ref_removed; -} reference_t; - -typedef struct refcount { - kmutex_t rc_mtx; - boolean_t rc_tracked; - list_t rc_list; - list_t rc_removed; - uint64_t rc_count; - uint64_t rc_removed_count; -} zfs_refcount_t; - -/* - * Note: zfs_refcount_t must be initialized with - * refcount_create[_untracked]() - */ - -void zfs_refcount_create(zfs_refcount_t *); -void zfs_refcount_create_untracked(zfs_refcount_t *); -void zfs_refcount_create_tracked(zfs_refcount_t *); -void zfs_refcount_destroy(zfs_refcount_t *); -void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t); -int zfs_refcount_is_zero(zfs_refcount_t *); -int64_t zfs_refcount_count(zfs_refcount_t *); -int64_t zfs_refcount_add(zfs_refcount_t *, void *); -int64_t zfs_refcount_remove(zfs_refcount_t *, void *); -int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, void *); -int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, void *); -void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); -void zfs_refcount_transfer_ownership(zfs_refcount_t *, void *, void *); -boolean_t zfs_refcount_held(zfs_refcount_t *, void *); -boolean_t zfs_refcount_not_held(zfs_refcount_t *, void *); - -void zfs_refcount_init(void); -void zfs_refcount_fini(void); - -#else /* ZFS_DEBUG */ - -typedef struct refcount { - uint64_t rc_count; -} zfs_refcount_t; - -#define zfs_refcount_create(rc) ((rc)->rc_count = 0) -#define zfs_refcount_create_untracked(rc) ((rc)->rc_count = 0) -#define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0) -#define zfs_refcount_destroy(rc) ((rc)->rc_count = 0) -#define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0) -#define zfs_refcount_is_zero(rc) ((rc)->rc_count == 0) -#define zfs_refcount_count(rc) ((rc)->rc_count) -#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) -#define 
zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) -#define zfs_refcount_add_many(rc, number, holder) \ - atomic_add_64_nv(&(rc)->rc_count, number) -#define zfs_refcount_remove_many(rc, number, holder) \ - atomic_add_64_nv(&(rc)->rc_count, -number) -#define zfs_refcount_transfer(dst, src) { \ - uint64_t __tmp = (src)->rc_count; \ - atomic_add_64(&(src)->rc_count, -__tmp); \ - atomic_add_64(&(dst)->rc_count, __tmp); \ -} -#define zfs_refcount_transfer_ownership(rc, current_holder, new_holder) (void)0 -#define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0) -#define zfs_refcount_not_held(rc, holder) (B_TRUE) - -#define zfs_refcount_init() -#define zfs_refcount_fini() - -#endif /* ZFS_DEBUG */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_REFCOUNT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h deleted file mode 100644 index e0898dfe0ae8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. - */ -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#ifndef _SYS_RR_RW_LOCK_H -#define _SYS_RR_RW_LOCK_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/* - * A reader-writer lock implementation that allows re-entrant reads, but - * still gives writers priority on "new" reads. - * - * See rrwlock.c for more details about the implementation. - * - * Fields of the rrwlock_t structure: - * - rr_lock: protects modification and reading of rrwlock_t fields - * - rr_cv: cv for waking up readers or waiting writers - * - rr_writer: thread id of the current writer - * - rr_anon_rount: number of active anonymous readers - * - rr_linked_rcount: total number of non-anonymous active readers - * - rr_writer_wanted: a writer wants the lock - */ -typedef struct rrwlock { - kmutex_t rr_lock; - kcondvar_t rr_cv; - kthread_t *rr_writer; - zfs_refcount_t rr_anon_rcount; - zfs_refcount_t rr_linked_rcount; - boolean_t rr_writer_wanted; - boolean_t rr_track_all; -} rrwlock_t; - -/* - * 'tag' is used in reference counting tracking. The - * 'tag' must be the same in a rrw_enter() as in its - * corresponding rrw_exit(). - */ -void rrw_init(rrwlock_t *rrl, boolean_t track_all); -void rrw_destroy(rrwlock_t *rrl); -void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); -void rrw_enter_read(rrwlock_t *rrl, void *tag); -void rrw_enter_read_prio(rrwlock_t *rrl, void *tag); -void rrw_enter_write(rrwlock_t *rrl); -void rrw_exit(rrwlock_t *rrl, void *tag); -boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); -void rrw_tsd_destroy(void *arg); - -#define RRW_READ_HELD(x) rrw_held(x, RW_READER) -#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) -#define RRW_LOCK_HELD(x) \ - (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) - -/* - * A reader-mostly lock implementation, tuning above reader-writer locks - * for hightly parallel read acquisitions, pessimizing write acquisitions. - * - * This should be a prime number. 
See comment in rrwlock.c near - * RRM_TD_LOCK() for details. - */ -#define RRM_NUM_LOCKS 17 -typedef struct rrmlock { - rrwlock_t locks[RRM_NUM_LOCKS]; -} rrmlock_t; - -void rrm_init(rrmlock_t *rrl, boolean_t track_all); -void rrm_destroy(rrmlock_t *rrl); -void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag); -void rrm_enter_read(rrmlock_t *rrl, void *tag); -void rrm_enter_write(rrmlock_t *rrl); -void rrm_exit(rrmlock_t *rrl, void *tag); -boolean_t rrm_held(rrmlock_t *rrl, krw_t rw); - -#define RRM_READ_HELD(x) rrm_held(x, RW_READER) -#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER) -#define RRM_LOCK_HELD(x) \ - (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER)) - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_RR_RW_LOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h deleted file mode 100644 index 62332ea126a0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#ifndef _SYS_SA_H -#define _SYS_SA_H - -#include -#include - -/* - * Currently available byteswap functions. - * If it all possible new attributes should used - * one of the already defined byteswap functions. - * If a new byteswap function is added then the - * ZPL/Pool version will need to be bumped. - */ - -typedef enum sa_bswap_type { - SA_UINT64_ARRAY, - SA_UINT32_ARRAY, - SA_UINT16_ARRAY, - SA_UINT8_ARRAY, - SA_ACL, -} sa_bswap_type_t; - -typedef uint16_t sa_attr_type_t; - -/* - * Attribute to register support for. - */ -typedef struct sa_attr_reg { - char *sa_name; /* attribute name */ - uint16_t sa_length; - sa_bswap_type_t sa_byteswap; /* bswap functon enum */ - sa_attr_type_t sa_attr; /* filled in during registration */ -} sa_attr_reg_t; - - -typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, - boolean_t, void *userptr); - -/* - * array of attributes to store. - * - * This array should be treated as opaque/private data. - * The SA_BULK_ADD_ATTR() macro should be used for manipulating - * the array. - * - * When sa_replace_all_by_template() is used the attributes - * will be stored in the order defined in the array, except that - * the attributes may be split between the bonus and the spill buffer - * - */ -typedef struct sa_bulk_attr { - void *sa_data; - sa_data_locator_t *sa_data_func; - uint16_t sa_length; - sa_attr_type_t sa_attr; - /* the following are private to the sa framework */ - void *sa_addr; - uint16_t sa_buftype; - uint16_t sa_size; -} sa_bulk_attr_t; - - -/* - * special macro for adding entries for bulk attr support - * bulk - sa_bulk_attr_t - * count - integer that will be incremented during each add - * attr - attribute to manipulate - * func - function for accessing data. - * data - pointer to data. 
- * len - length of data - */ - -#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ -{ \ - b[idx].sa_attr = attr;\ - b[idx].sa_data_func = func; \ - b[idx].sa_data = data; \ - b[idx++].sa_length = len; \ -} - -typedef struct sa_os sa_os_t; - -typedef enum sa_handle_type { - SA_HDL_SHARED, - SA_HDL_PRIVATE -} sa_handle_type_t; - -struct sa_handle; -typedef void *sa_lookup_tab_t; -typedef struct sa_handle sa_handle_t; - -typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); - -int sa_handle_get(objset_t *, uint64_t, void *userp, - sa_handle_type_t, sa_handle_t **); -int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, - sa_handle_type_t, sa_handle_t **); -void sa_handle_destroy(sa_handle_t *); -int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); -void sa_buf_rele(dmu_buf_t *, void *); -int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); -int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, - uint32_t buflen, dmu_tx_t *); -int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); -int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); -int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); -int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); -int sa_size(sa_handle_t *, sa_attr_type_t, int *); -int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, - uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); -void sa_object_info(sa_handle_t *, dmu_object_info_t *); -void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); -void *sa_get_userdata(sa_handle_t *); -void sa_set_userp(sa_handle_t *, void *); -dmu_buf_t *sa_get_db(sa_handle_t *); -uint64_t sa_handle_object(sa_handle_t *); -boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); -void sa_register_update_callback(objset_t *, sa_update_cb_t *); -int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); -void sa_tear_down(objset_t *); -int 
sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, - int, dmu_tx_t *); -int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, - int, dmu_tx_t *); -boolean_t sa_enabled(objset_t *); -void sa_cache_init(void); -void sa_cache_fini(void); -int sa_set_sa_object(objset_t *, uint64_t); -int sa_hdrsize(void *); -void sa_handle_lock(sa_handle_t *); -void sa_handle_unlock(sa_handle_t *); - -#ifdef _KERNEL -int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h deleted file mode 100644 index 50430125b253..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h +++ /dev/null @@ -1,291 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- */ - -#ifndef _SYS_SA_IMPL_H -#define _SYS_SA_IMPL_H - -#include -#include -#include - -/* - * Array of known attributes and their - * various characteristics. - */ -typedef struct sa_attr_table { - sa_attr_type_t sa_attr; - uint8_t sa_registered; - uint16_t sa_length; - sa_bswap_type_t sa_byteswap; - char *sa_name; -} sa_attr_table_t; - -/* - * Zap attribute format for attribute registration - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | unused | len | bswap | attr num | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Zap attribute format for layout information. - * - * layout information is stored as an array of attribute numbers - * The name of the attribute is the layout number (0, 1, 2, ...) - * - * 16 0 - * +---- ---+ - * | attr # | - * +--------+ - * | attr # | - * +--- ----+ - * ...... - * - */ - -#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) -#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) -#define ATTR_NUM(x) BF32_GET(x, 0, 16) -#define ATTR_ENCODE(x, attr, length, bswap) \ -{ \ - BF64_SET(x, 24, 16, length); \ - BF64_SET(x, 16, 8, bswap); \ - BF64_SET(x, 0, 16, attr); \ -} - -#define TOC_OFF(x) BF32_GET(x, 0, 23) -#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) -#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) -#define TOC_ATTR_ENCODE(x, len_idx, offset) \ -{ \ - BF32_SET(x, 31, 1, 1); \ - BF32_SET(x, 24, 7, len_idx); \ - BF32_SET(x, 0, 24, offset); \ -} - -#define SA_LAYOUTS "LAYOUTS" -#define SA_REGISTRY "REGISTRY" - -/* - * Each unique layout will have their own table - * sa_lot (layout_table) - */ -typedef struct sa_lot { - avl_node_t lot_num_node; - avl_node_t lot_hash_node; - uint64_t lot_num; - uint64_t lot_hash; - sa_attr_type_t *lot_attrs; /* array of attr #'s */ - uint32_t lot_var_sizes; /* how many aren't fixed size */ - uint32_t lot_attr_count; /* total attr count */ - list_t lot_idx_tab; /* should be only a couple of entries */ - int lot_instance; /* used with 
lot_hash to identify entry */ -} sa_lot_t; - -/* index table of offsets */ -typedef struct sa_idx_tab { - list_node_t sa_next; - sa_lot_t *sa_layout; - uint16_t *sa_variable_lengths; - zfs_refcount_t sa_refcount; - uint32_t *sa_idx_tab; /* array of offsets */ -} sa_idx_tab_t; - -/* - * Since the offset/index information into the actual data - * will usually be identical we can share that information with - * all handles that have the exact same offsets. - * - * You would typically only have a large number of different table of - * contents if you had a several variable sized attributes. - * - * Two AVL trees are used to track the attribute layout numbers. - * one is keyed by number and will be consulted when a DMU_OT_SA - * object is first read. The second tree is keyed by the hash signature - * of the attributes and will be consulted when an attribute is added - * to determine if we already have an instance of that layout. Both - * of these tree's are interconnected. The only difference is that - * when an entry is found in the "hash" tree the list of attributes will - * need to be compared against the list of attributes you have in hand. - * The assumption is that typically attributes will just be updated and - * adding a completely new attribute is a very rare operation. - */ -struct sa_os { - kmutex_t sa_lock; - boolean_t sa_need_attr_registration; - boolean_t sa_force_spill; - uint64_t sa_master_obj; - uint64_t sa_reg_attr_obj; - uint64_t sa_layout_attr_obj; - int sa_num_attrs; - sa_attr_table_t *sa_attr_table; /* private attr table */ - sa_update_cb_t *sa_update_cb; - avl_tree_t sa_layout_num_tree; /* keyed by layout number */ - avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ - int sa_user_table_sz; - sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ -}; - -/* - * header for all bonus and spill buffers. 
- * - * The header has a fixed portion with a variable number - * of "lengths" depending on the number of variable sized - * attributes which are determined by the "layout number" - */ - -#define SA_MAGIC 0x2F505A /* ZFS SA */ -typedef struct sa_hdr_phys { - uint32_t sa_magic; - /* BEGIN CSTYLED */ - /* - * Encoded with hdrsize and layout number as follows: - * 16 10 0 - * +--------+-------+ - * | hdrsz |layout | - * +--------+-------+ - * - * Bits 0-10 are the layout number - * Bits 11-16 are the size of the header. - * The hdrsize is the number * 8 - * - * For example. - * hdrsz of 1 ==> 8 byte header - * 2 ==> 16 byte header - * - */ - /* END CSTYLED */ - uint16_t sa_layout_info; - uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ - /* ... Data follows the lengths. */ -} sa_hdr_phys_t; - -#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) -#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) -#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ -{ \ - BF32_SET_SB(x, 10, 6, 3, 0, size); \ - BF32_SET(x, 0, 10, num); \ -} - -typedef enum sa_buf_type { - SA_BONUS = 1, - SA_SPILL = 2 -} sa_buf_type_t; - -typedef enum sa_data_op { - SA_LOOKUP, - SA_UPDATE, - SA_ADD, - SA_REPLACE, - SA_REMOVE -} sa_data_op_t; - -/* - * Opaque handle used for most sa functions - * - * This needs to be kept as small as possible. - */ - -struct sa_handle { - dmu_buf_user_t sa_dbu; - kmutex_t sa_lock; - dmu_buf_t *sa_bonus; - dmu_buf_t *sa_spill; - objset_t *sa_os; - void *sa_userp; - sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ - sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ -}; - -#define SA_GET_DB(hdl, type) \ - (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) - -#define SA_GET_HDR(hdl, type) \ - ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ - type))->db.db_data)) - -#define SA_IDX_TAB_GET(hdl, type) \ - (type == SA_BONUS ? 
hdl->sa_bonus_tab : hdl->sa_spill_tab) - -#define IS_SA_BONUSTYPE(a) \ - ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) - -#define SA_BONUSTYPE_FROM_DB(db) \ - (dmu_get_bonustype((dmu_buf_t *)db)) - -#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)) - -#define SA_LAYOUT_NUM(x, type) \ - ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ - ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x)))) - - -#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length - -#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ - hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ - SA_REGISTERED_LEN(sa, attr)) - -#define SA_SET_HDR(hdr, num, size) \ - { \ - hdr->sa_magic = SA_MAGIC; \ - SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ - } - -#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ - { \ - bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ - bulk.sa_buftype = type; \ - bulk.sa_addr = \ - (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ - (uintptr_t)hdr); \ -} - -#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ - (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ - (tb->lot_var_sizes > 1 ? 
P2ROUNDUP((tb->lot_var_sizes - 1) * \ - sizeof (uint16_t), 8) : 0))) - -int sa_add_impl(sa_handle_t *, sa_attr_type_t, - uint32_t, sa_data_locator_t, void *, dmu_tx_t *); - -void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); -int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); - -void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); -int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, - uint16_t *, sa_hdr_phys_t *); - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SA_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h deleted file mode 100644 index 5bdc4feb3d5e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ /dev/null @@ -1,969 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _SYS_SPA_H -#define _SYS_SPA_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Forward references that lots of things need. - */ -typedef struct spa spa_t; -typedef struct vdev vdev_t; -typedef struct metaslab metaslab_t; -typedef struct metaslab_group metaslab_group_t; -typedef struct metaslab_class metaslab_class_t; -typedef struct zio zio_t; -typedef struct zilog zilog_t; -typedef struct spa_aux_vdev spa_aux_vdev_t; -typedef struct ddt ddt_t; -typedef struct ddt_entry ddt_entry_t; -struct dsl_pool; -struct dsl_dataset; - -/* - * General-purpose 32-bit and 64-bit bitfield encodings. - */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1U << (len)); \ - ASSERT3U(low + len, <=, 32); \ - (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ -_NOTE(CONSTCOND) } while (0) - -#define BF64_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1ULL << (len)); \ - ASSERT3U(low + len, <=, 64); \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ -_NOTE(CONSTCOND) } while (0) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1U << 
shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) -#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) - -/* - * We currently support block sizes from 512 bytes to 16MB. - * The benefits of larger blocks, and thus larger IO, need to be weighed - * against the cost of COWing a giant block to modify one byte, and the - * large latency of reading or writing a large block. - * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. - * - * Note that although the LSIZE field of the blkptr_t can store sizes up - * to 32MB, the dnode's dn_datablkszsec can only store sizes up to - * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_OLD_MAXBLOCKSHIFT 17 -#define SPA_MAXBLOCKSHIFT 24 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - -/* - * Default maximum supported logical ashift. - * - * The current 8k allocation block size limit is due to the 8k - * aligned/sized operations performed by vdev_probe() on - * vdev_label->vl_pad2. Using another "safe region" for these tests - * would allow the limit to be raised to 16k, at the expense of - * only having 8 available uberblocks in the label area. - */ -#define SPA_MAXASHIFT 13 - -/* - * Default minimum supported logical ashift. 
- */ -#define SPA_MINASHIFT SPA_MINBLOCKSHIFT - -/* - * Size of block to hold the configuration data (a packed nvlist) - */ -#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) - -/* - * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. - * The ASIZE encoding should be at least 64 times larger (6 more bits) - * to support up to 4-way RAID-Z mirror mode with worst-case gang block - * overhead, three DVAs per bp, plus one more bit in case we do anything - * else that expands the ASIZE. - */ -#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ -#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ -#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ - -#define SPA_COMPRESSBITS 7 -#define SPA_VDEVBITS 24 - -/* - * All SPA data is represented by 128-bit data virtual addresses (DVAs). - * The members of the dva_t should be considered opaque outside the SPA. - */ -typedef struct dva { - uint64_t dva_word[2]; -} dva_t; - -/* - * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. - */ -typedef struct zio_cksum { - uint64_t zc_word[4]; -} zio_cksum_t; - -/* - * Some checksums/hashes need a 256-bit initialization salt. This salt is kept - * secret and is suitable for use in MAC algorithms as the key. - */ -typedef struct zio_cksum_salt { - uint8_t zcs_bytes[32]; -} zio_cksum_salt_t; - -/* - * Each block is described by its DVAs, time of birth, checksum, etc. 
- * The word-by-word, bit-by-bit layout of the blkptr is as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | pad | vdev1 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 1 |G| offset1 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | pad | vdev2 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 3 |G| offset2 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | pad | vdev3 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 5 |G| offset3 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 8 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | physical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | logical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | fill count | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * c | checksum[0] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * d | checksum[1] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * e | checksum[2] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * f | checksum[3] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * vdev virtual device ID - * offset offset into virtual device - * LSIZE logical size - * PSIZE physical size (after compression) - * ASIZE allocated size (including RAID-Z parity and gang block headers) - * GRID RAID-Z layout information (reserved for 
future use) - * cksum checksum function - * comp compression function - * G gang block indicator - * B byteorder (endianness) - * D dedup - * X encryption (on version 30, which is not supported) - * E blkptr_t contains embedded data (see below) - * lvl level of indirection - * type DMU object type - * phys birth txg when dva[0] was written; zero if same as logical birth txg - * note that typically all the dva's would be written in this - * txg, but they could be different if they were moved by - * device removal. - * log. birth transaction group in which the block was logically born - * fill count number of non-zero blocks under this bp - * checksum[4] 256-bit checksum of the data this bp describes - */ - -/* - * "Embedded" blkptr_t's don't actually point to a block, instead they - * have a data payload embedded in the blkptr_t itself. See the comment - * in blkptr.c for more details. - * - * The blkptr_t is laid out as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | payload | - * 1 | payload | - * 2 | payload | - * 3 | payload | - * 4 | payload | - * 5 | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | payload | - * 8 | payload | - * 9 | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | logical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | payload | - * c | payload | - * d | payload | - * e | payload | - * f | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * payload contains the embedded data - * B (byteorder) byteorder (endianness) - * D (dedup) padding (set to zero) - * X encryption (set to zero; see above) - * E (embedded) set to one - * lvl indirection level - * type DMU object type - * etype 
how to interpret embedded data (BP_EMBEDDED_TYPE_*) - * comp compression function of payload - * PSIZE size of payload after compression, in bytes - * LSIZE logical size of payload, in bytes - * note that 25 bits is enough to store the largest - * "normal" BP's LSIZE (2^16 * 2^9) in bytes - * log. birth transaction group in which the block was logically born - * - * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded - * bp's they are stored in units of SPA_MINBLOCKSHIFT. - * Generally, the generic BP_GET_*() macros can be used on embedded BP's. - * The B, D, X, lvl, type, and comp fields are stored the same as with normal - * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must - * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before - * other macros, as they assert that they are only used on BP's of the correct - * "embedded-ness". - */ - -#define BPE_GET_ETYPE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET((bp)->blk_prop, 40, 8)) -#define BPE_SET_ETYPE(bp, t) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET((bp)->blk_prop, 40, 8, t); \ -_NOTE(CONSTCOND) } while (0) - -#define BPE_GET_LSIZE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) -#define BPE_SET_LSIZE(bp, x) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BPE_GET_PSIZE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) -#define BPE_SET_PSIZE(bp, x) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -typedef enum bp_embedded_type { - BP_EMBEDDED_TYPE_DATA, - BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. 
*/ - NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED -} bp_embedded_type_t; - -#define BPE_NUM_WORDS 14 -#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) -#define BPE_IS_PAYLOADWORD(bp, wp) \ - ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) - -#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ -#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ -#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ - -/* - * A block is a hole when it has either 1) never been written to, or - * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads - * without physically allocating disk space. Holes are represented in the - * blkptr_t structure by zeroed blk_dva. Correct checking for holes is - * done through the BP_IS_HOLE macro. For holes, the logical size, level, - * DMU object type, and birth times are all also stored for holes that - * were written to at some point (i.e. were punched after having been filled). - */ -typedef struct blkptr { - dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[2]; /* Extra space for the future */ - uint64_t blk_phys_birth; /* txg when block was allocated */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - -/* - * Macros to get and set fields in a bp or DVA. 
- */ -#define DVA_GET_ASIZE(dva) \ - BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_ASIZE(dva, x) \ - BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ - SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) -#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) - -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) -#define DVA_SET_VDEV(dva, x) \ - BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) - -#define DVA_GET_OFFSET(dva) \ - BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_OFFSET(dva, x) \ - BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) -#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) - -#define BP_GET_LSIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? \ - (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ - BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_LSIZE(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, \ - 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_PSIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_PSIZE(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, \ - 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_COMPRESS(bp) \ - BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) -#define BP_SET_COMPRESS(bp, x) \ - BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) - -#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) -#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) - -#define BP_GET_CHECKSUM(bp) \ - (BP_IS_EMBEDDED(bp) ? 
ZIO_CHECKSUM_OFF : \ - BF64_GET((bp)->blk_prop, 40, 8)) -#define BP_SET_CHECKSUM(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET((bp)->blk_prop, 40, 8, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) - -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) - -#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) -#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) - -#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) - -#define BP_PHYSICAL_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) - -#define BP_SET_BIRTH(bp, logical, physical) \ -{ \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - (bp)->blk_birth = (logical); \ - (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ -} - -#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) - -#define BP_IS_METADATA(bp) \ - (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) - -#define BP_GET_ASIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[2])) - -#define BP_GET_UCSIZE(bp) \ - (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) - -#define BP_GET_NDVAS(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) - -#define BP_COUNT_GANG(bp) \ - (BP_IS_EMBEDDED(bp) ? 
0 : \ - (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ - DVA_GET_GANG(&(bp)->blk_dva[1]) + \ - DVA_GET_GANG(&(bp)->blk_dva[2]))) - -#define DVA_EQUAL(dva1, dva2) \ - ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ - (dva1)->dva_word[0] == (dva2)->dva_word[0]) - -#define BP_EQUAL(bp1, bp2) \ - (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ - (bp1)->blk_birth == (bp2)->blk_birth && \ - DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ - DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ - DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) - -#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ - (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ - ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ - ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ - ((zc1).zc_word[3] - (zc2).zc_word[3]))) - -#define ZIO_CHECKSUM_IS_ZERO(zc) \ - (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ - (zc)->zc_word[2] | (zc)->zc_word[3])) - -#define ZIO_CHECKSUM_BSWAP(zcp) \ -{ \ - (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ - (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ - (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ - (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ -} - - -#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) - -#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ -{ \ - (zcp)->zc_word[0] = w0; \ - (zcp)->zc_word[1] = w1; \ - (zcp)->zc_word[2] = w2; \ - (zcp)->zc_word[3] = w3; \ -} - -#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) -#define BP_IS_GANG(bp) \ - (BP_IS_EMBEDDED(bp) ? 
B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) -#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ - (dva)->dva_word[1] == 0ULL) -#define BP_IS_HOLE(bp) \ - (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) - -/* BP_IS_RAIDZ(bp) assumes no block compression */ -#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ - BP_GET_PSIZE(bp)) - -#define BP_ZERO(bp) \ -{ \ - (bp)->blk_dva[0].dva_word[0] = 0; \ - (bp)->blk_dva[0].dva_word[1] = 0; \ - (bp)->blk_dva[1].dva_word[0] = 0; \ - (bp)->blk_dva[1].dva_word[1] = 0; \ - (bp)->blk_dva[2].dva_word[0] = 0; \ - (bp)->blk_dva[2].dva_word[1] = 0; \ - (bp)->blk_prop = 0; \ - (bp)->blk_pad[0] = 0; \ - (bp)->blk_pad[1] = 0; \ - (bp)->blk_phys_birth = 0; \ - (bp)->blk_birth = 0; \ - (bp)->blk_fill = 0; \ - ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ -} - -#if BYTE_ORDER == _BIG_ENDIAN -#define ZFS_HOST_BYTEORDER (0ULL) -#else -#define ZFS_HOST_BYTEORDER (1ULL) -#endif - -#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) - -#define BP_SPRINTF_LEN 320 - -/* - * This macro allows code sharing between zfs, libzpool, and mdb. - * 'func' is either snprintf() or mdb_snprintf(). - * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
- */ -#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ -{ \ - static const char *copyname[] = \ - { "zero", "single", "double", "triple" }; \ - int len = 0; \ - int copies = 0; \ - \ - if (bp == NULL) { \ - len += func(buf + len, size - len, ""); \ - } else if (BP_IS_HOLE(bp)) { \ - len += func(buf + len, size - len, \ - "HOLE [L%llu %s] " \ - "size=%llxL birth=%lluL", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ - } else if (BP_IS_EMBEDDED(bp)) { \ - len = func(buf + len, size - len, \ - "EMBEDDED [L%llu %s] et=%u %s " \ - "size=%llxL/%llxP birth=%lluL", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - (int)BPE_GET_ETYPE(bp), \ - compress, \ - (u_longlong_t)BPE_GET_LSIZE(bp), \ - (u_longlong_t)BPE_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ - } else { \ - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ - const dva_t *dva = &bp->blk_dva[d]; \ - if (DVA_IS_VALID(dva)) \ - copies++; \ - len += func(buf + len, size - len, \ - "DVA[%d]=<%llu:%llx:%llx>%c", d, \ - (u_longlong_t)DVA_GET_VDEV(dva), \ - (u_longlong_t)DVA_GET_OFFSET(dva), \ - (u_longlong_t)DVA_GET_ASIZE(dva), \ - ws); \ - } \ - if (BP_IS_GANG(bp) && \ - DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ - DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ - copies--; \ - len += func(buf + len, size - len, \ - "[L%llu %s] %s %s %s %s %s %s%c" \ - "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ - "cksum=%llx:%llx:%llx:%llx", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - checksum, \ - compress, \ - BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ - BP_IS_GANG(bp) ? "gang" : "contiguous", \ - BP_GET_DEDUP(bp) ? 
"dedup" : "unique", \ - copyname[copies], \ - ws, \ - (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)BP_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth, \ - (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ - (u_longlong_t)BP_GET_FILL(bp), \ - ws, \ - (u_longlong_t)bp->blk_cksum.zc_word[0], \ - (u_longlong_t)bp->blk_cksum.zc_word[1], \ - (u_longlong_t)bp->blk_cksum.zc_word[2], \ - (u_longlong_t)bp->blk_cksum.zc_word[3]); \ - } \ - ASSERT(len < size); \ -} - -#define BP_GET_BUFC_TYPE(bp) \ - (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) - -typedef enum spa_import_type { - SPA_IMPORT_EXISTING, - SPA_IMPORT_ASSEMBLE -} spa_import_type_t; - -/* state manipulation functions */ -extern int spa_open(const char *pool, spa_t **, void *tag); -extern int spa_open_rewind(const char *pool, spa_t **, void *tag, - nvlist_t *policy, nvlist_t **config); -extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - size_t buflen); -extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, - nvlist_t *zplprops); -#ifdef illumos -extern int spa_import_rootpool(char *devpath, char *devid); -#else -extern int spa_import_rootpool(const char *name, bool checkpointrewind); -#endif -extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, - uint64_t flags); -extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); -extern int spa_destroy(char *pool); -extern int spa_checkpoint(const char *pool); -extern int spa_checkpoint_discard(const char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, - boolean_t hardforce); -extern int spa_reset(char *pool); -extern void spa_async_request(spa_t *spa, int flag); -extern void spa_async_unrequest(spa_t *spa, int flag); -extern void spa_async_suspend(spa_t *spa); -extern void spa_async_resume(spa_t *spa); -extern spa_t *spa_inject_addref(char *pool); -extern void spa_inject_delref(spa_t *spa); -extern void spa_scan_stat_init(spa_t *spa); -extern int spa_scan_get_stats(spa_t 
*spa, pool_scan_stat_t *ps); - -#define SPA_ASYNC_CONFIG_UPDATE 0x01 -#define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 -#define SPA_ASYNC_RESILVER_DONE 0x08 -#define SPA_ASYNC_RESILVER 0x10 -#define SPA_ASYNC_AUTOEXPAND 0x20 -#define SPA_ASYNC_REMOVE_DONE 0x40 -#define SPA_ASYNC_REMOVE_STOP 0x80 -#define SPA_ASYNC_INITIALIZE_RESTART 0x100 - -/* - * Controls the behavior of spa_vdev_remove(). - */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 - -/* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, - int replace_done); -extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); -extern boolean_t spa_vdev_remove_active(spa_t *spa); -extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type); -extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); -extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); -extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - nvlist_t *props, boolean_t exp); - -/* spare state (which is global across all pools) */ -extern void spa_spare_add(vdev_t *vd); -extern void spa_spare_remove(vdev_t *vd); -extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); -extern void spa_spare_activate(vdev_t *vd); - -/* L2ARC state (which is global across all pools) */ -extern void spa_l2cache_add(vdev_t *vd); -extern void spa_l2cache_remove(vdev_t *vd); -extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); -extern void spa_l2cache_activate(vdev_t *vd); -extern void spa_l2cache_drop(spa_t *spa); - -/* scanning */ -extern int spa_scan(spa_t *spa, pool_scan_func_t func); -extern int spa_scan_stop(spa_t *spa); -extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); - -/* spa syncing */ -extern 
void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ -extern void spa_sync_allpools(void); - -/* spa namespace global mutex */ -extern kmutex_t spa_namespace_lock; - -/* - * SPA configuration functions in spa_config.c - */ - -#define SPA_CONFIG_UPDATE_POOL 0 -#define SPA_CONFIG_UPDATE_VDEVS 1 - -extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); -extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); -extern void spa_config_set(spa_t *spa, nvlist_t *config); -extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, - int getstats); -extern void spa_config_update(spa_t *spa, int what); - -/* - * Miscellaneous SPA routines in spa_misc.c - */ - -/* Namespace manipulation */ -extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); -extern void spa_remove(spa_t *spa); -extern spa_t *spa_next(spa_t *prev); - -/* Refcount functions */ -extern void spa_open_ref(spa_t *spa, void *tag); -extern void spa_close(spa_t *spa, void *tag); -extern void spa_async_close(spa_t *spa, void *tag); -extern boolean_t spa_refcount_zero(spa_t *spa); - -#define SCL_NONE 0x00 -#define SCL_CONFIG 0x01 -#define SCL_STATE 0x02 -#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ -#define SCL_ALLOC 0x08 -#define SCL_ZIO 0x10 -#define SCL_FREE 0x20 -#define SCL_VDEV 0x40 -#define SCL_LOCKS 7 -#define SCL_ALL ((1 << SCL_LOCKS) - 1) -#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) - -/* Pool configuration locks */ -extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_exit(spa_t *spa, int locks, void *tag); -extern int spa_config_held(spa_t *spa, int locks, krw_t rw); - -/* Pool vdev add/remove lock */ -extern uint64_t spa_vdev_enter(spa_t *spa); -extern uint64_t spa_vdev_config_enter(spa_t *spa); -extern void spa_vdev_config_exit(spa_t 
*spa, vdev_t *vd, uint64_t txg, - int error, char *tag); -extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); - -/* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa, int oplock); -extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); - -/* Log state */ -typedef enum spa_log_state { - SPA_LOG_UNKNOWN = 0, /* unknown log state */ - SPA_LOG_MISSING, /* missing log(s) */ - SPA_LOG_CLEAR, /* clear the log(s) */ - SPA_LOG_GOOD, /* log(s) are good */ -} spa_log_state_t; - -extern spa_log_state_t spa_get_log_state(spa_t *spa); -extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); -extern int spa_reset_logs(spa_t *spa); - -/* Log claim callback */ -extern void spa_claim_notify(zio_t *zio); - -/* Accessor functions */ -extern boolean_t spa_shutting_down(spa_t *spa); -extern struct dsl_pool *spa_get_dsl(spa_t *spa); -extern boolean_t spa_is_initializing(spa_t *spa); -extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); -extern blkptr_t *spa_get_rootblkptr(spa_t *spa); -extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); -extern void spa_altroot(spa_t *, char *, size_t); -extern int spa_sync_pass(spa_t *spa); -extern char *spa_name(spa_t *spa); -extern uint64_t spa_guid(spa_t *spa); -extern uint64_t spa_load_guid(spa_t *spa); -extern uint64_t spa_last_synced_txg(spa_t *spa); -extern uint64_t spa_first_txg(spa_t *spa); -extern uint64_t spa_syncing_txg(spa_t *spa); -extern uint64_t spa_final_dirty_txg(spa_t *spa); -extern uint64_t spa_version(spa_t *spa); -extern pool_state_t spa_state(spa_t *spa); -extern spa_load_state_t spa_load_state(spa_t *spa); -extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); -extern uint64_t spa_get_dspace(spa_t *spa); -extern uint64_t spa_get_checkpoint_space(spa_t *spa); -extern uint64_t spa_get_slop_space(spa_t *spa); -extern void spa_update_dspace(spa_t *spa); -extern uint64_t spa_version(spa_t 
*spa); -extern boolean_t spa_deflate(spa_t *spa); -extern metaslab_class_t *spa_normal_class(spa_t *spa); -extern metaslab_class_t *spa_log_class(spa_t *spa); -extern metaslab_class_t *spa_special_class(spa_t *spa); -extern metaslab_class_t *spa_dedup_class(spa_t *spa); -extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, - dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); - -extern void spa_evicting_os_register(spa_t *, objset_t *os); -extern void spa_evicting_os_deregister(spa_t *, objset_t *os); -extern void spa_evicting_os_wait(spa_t *spa); -extern int spa_max_replication(spa_t *spa); -extern int spa_prev_software_version(spa_t *spa); -extern int spa_busy(void); -extern uint8_t spa_get_failmode(spa_t *spa); -extern boolean_t spa_suspended(spa_t *spa); -extern uint64_t spa_bootfs(spa_t *spa); -extern uint64_t spa_delegation(spa_t *spa); -extern objset_t *spa_meta_objset(spa_t *spa); -extern uint64_t spa_deadman_synctime(spa_t *spa); -extern struct proc *spa_proc(spa_t *spa); -extern uint64_t spa_dirty_data(spa_t *spa); - -/* Miscellaneous support routines */ -extern void spa_load_failed(spa_t *spa, const char *fmt, ...); -extern void spa_load_note(spa_t *spa, const char *fmt, ...); -extern void spa_activate_mos_feature(spa_t *spa, const char *feature, - dmu_tx_t *tx); -extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); -extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); -extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); -extern char *spa_strdup(const char *); -extern void spa_strfree(char *); -extern uint64_t spa_get_random(uint64_t range); -extern uint64_t spa_generate_guid(spa_t *spa); -extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); -extern void spa_freeze(spa_t *spa); -extern int spa_change_guid(spa_t *spa); -extern void spa_upgrade(spa_t *spa, uint64_t version); -extern void spa_evict_all(void); -extern vdev_t 
*spa_lookup_by_guid(spa_t *spa, uint64_t guid, - boolean_t l2cache); -extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); -extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); -extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); -extern boolean_t spa_has_slogs(spa_t *spa); -extern boolean_t spa_is_root(spa_t *spa); -extern boolean_t spa_writeable(spa_t *spa); -extern boolean_t spa_has_pending_synctask(spa_t *spa); -extern int spa_maxblocksize(spa_t *spa); -extern int spa_maxdnodesize(spa_t *spa); -extern boolean_t spa_multihost(spa_t *spa); -extern unsigned long spa_get_hostid(void); -extern boolean_t spa_has_checkpoint(spa_t *spa); -extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); -extern boolean_t spa_suspend_async_destroy(spa_t *spa); -extern uint64_t spa_min_claim_txg(spa_t *spa); -extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); -extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, - const blkptr_t *bp); -typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, - void *arg); -extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, - spa_remap_cb_t callback, void *arg); -extern uint64_t spa_get_last_removal_txg(spa_t *spa); -extern boolean_t spa_trust_config(spa_t *spa); -extern uint64_t spa_missing_tvds_allowed(spa_t *spa); -extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); -extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); -extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); - -extern int spa_mode(spa_t *spa); -extern uint64_t zfs_strtonum(const char *str, char **nptr); - -extern char *spa_his_ievent_table[]; - -extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); -extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, - char *his_buf); -extern int spa_history_log(spa_t *spa, const char *his_buf); -extern int 
spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); -extern void spa_history_log_version(spa_t *spa, const char *operation); -extern void spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); -extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, - dmu_tx_t *tx, const char *fmt, ...); -extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); - -/* error handling */ -struct zbookmark_phys; -extern void spa_log_error(spa_t *spa, zio_t *zio); -extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd, - zio_t *zio, uint64_t stateoroffset, uint64_t length); -extern void zfs_post_remove(spa_t *spa, vdev_t *vd); -extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); -extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); -extern uint64_t spa_get_errlog_size(spa_t *spa); -extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); -extern void spa_errlog_rotate(spa_t *spa); -extern void spa_errlog_drain(spa_t *spa); -extern void spa_errlog_sync(spa_t *spa, uint64_t txg); -extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); - -/* vdev cache */ -extern void vdev_cache_stat_init(void); -extern void vdev_cache_stat_fini(void); - -/* Initialization and termination */ -extern void spa_init(int flags); -extern void spa_fini(void); -extern void spa_boot_init(void); - -/* properties */ -extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); -extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); -extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); -extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); - -/* asynchronous event notification */ -extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, - const char *name); -extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, - const char *name); -extern void spa_event_post(sysevent_t 
*ev); -extern void spa_event_discard(sysevent_t *ev); - -#ifdef ZFS_DEBUG -#define dprintf_bp(bp, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ - dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ - kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_bp(bp, fmt, ...) -#endif - -extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h deleted file mode 100644 index 8df5072a55ef..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_SPA_BOOT_H -#define _SYS_SPA_BOOT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern char *spa_get_bootprop(char *prop); -extern void spa_free_bootprop(char *prop); - -extern void spa_arch_init(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_BOOT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h deleted file mode 100644 index 9be2b6eeab3c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_SPA_CHECKPOINT_H -#define _SYS_SPA_CHECKPOINT_H - -#include - -typedef struct spa_checkpoint_info { - uint64_t sci_timestamp; /* when checkpointed uberblock was synced */ - uint64_t sci_dspace; /* disk space used by checkpoint in bytes */ -} spa_checkpoint_info_t; - -int spa_checkpoint(const char *); -int spa_checkpoint_discard(const char *); - -boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *); -void spa_checkpoint_discard_thread(void *, zthr_t *); - -int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *); - -#endif /* _SYS_SPA_CHECKPOINT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h deleted file mode 100644 index 11b6982798e8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ /dev/null @@ -1,435 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. 
- * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2016 Actifio, Inc. All rights reserved. - */ - -#ifndef _SYS_SPA_IMPL_H -#define _SYS_SPA_IMPL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct spa_error_entry { - zbookmark_phys_t se_bookmark; - char *se_name; - avl_node_t se_avl; -} spa_error_entry_t; - -typedef struct spa_history_phys { - uint64_t sh_pool_create_len; /* ending offset of zpool create */ - uint64_t sh_phys_max_off; /* physical EOF */ - uint64_t sh_bof; /* logical BOF */ - uint64_t sh_eof; /* logical EOF */ - uint64_t sh_records_lost; /* num of records overwritten */ -} spa_history_phys_t; - -/* - * All members must be uint64_t, for byteswap purposes. - */ -typedef struct spa_removing_phys { - uint64_t sr_state; /* dsl_scan_state_t */ - - /* - * The vdev ID that we most recently attempted to remove, - * or -1 if no removal has been attempted. - */ - uint64_t sr_removing_vdev; - - /* - * The vdev ID that we most recently successfully removed, - * or -1 if no devices have been removed. - */ - uint64_t sr_prev_indirect_vdev; - - uint64_t sr_start_time; - uint64_t sr_end_time; - - /* - * Note that we can not use the space map's or indirect mapping's - * accounting as a substitute for these values, because we need to - * count frees of not-yet-copied data as though it did the copy. - * Otherwise, we could get into a situation where copied > to_copy, - * or we complete before copied == to_copy. 
- */ - uint64_t sr_to_copy; /* bytes that need to be copied */ - uint64_t sr_copied; /* bytes that have been copied or freed */ -} spa_removing_phys_t; - -/* - * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT - * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense - * of an indirect vdev's mapping object is in progress. - */ -typedef struct spa_condensing_indirect_phys { - /* - * The vdev ID of the indirect vdev whose indirect mapping is - * being condensed. - */ - uint64_t scip_vdev; - - /* - * The vdev's old obsolete spacemap. This spacemap's contents are - * being integrated into the new mapping. - */ - uint64_t scip_prev_obsolete_sm_object; - - /* - * The new mapping object that is being created. - */ - uint64_t scip_next_mapping_object; -} spa_condensing_indirect_phys_t; - -struct spa_aux_vdev { - uint64_t sav_object; /* MOS object for device list */ - nvlist_t *sav_config; /* cached device config */ - vdev_t **sav_vdevs; /* devices */ - int sav_count; /* number devices */ - boolean_t sav_sync; /* sync the device list */ - nvlist_t **sav_pending; /* pending device additions */ - uint_t sav_npending; /* # pending devices */ -}; - -typedef struct spa_config_lock { - kmutex_t scl_lock; - kthread_t *scl_writer; - int scl_write_wanted; - kcondvar_t scl_cv; - zfs_refcount_t scl_count; -} spa_config_lock_t; - -typedef struct spa_config_dirent { - list_node_t scd_link; - char *scd_path; -} spa_config_dirent_t; - -typedef enum zio_taskq_type { - ZIO_TASKQ_ISSUE = 0, - ZIO_TASKQ_ISSUE_HIGH, - ZIO_TASKQ_INTERRUPT, - ZIO_TASKQ_INTERRUPT_HIGH, - ZIO_TASKQ_TYPES -} zio_taskq_type_t; - -/* - * State machine for the zpool-poolname process. 
The states transitions - * are done as follows: - * - * From To Routine - * PROC_NONE -> PROC_CREATED spa_activate() - * PROC_CREATED -> PROC_ACTIVE spa_thread() - * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() - * PROC_DEACTIVATE -> PROC_GONE spa_thread() - * PROC_GONE -> PROC_NONE spa_deactivate() - */ -typedef enum spa_proc_state { - SPA_PROC_NONE, /* spa_proc = &p0, no process created */ - SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ - SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ - SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ - SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ -} spa_proc_state_t; - -typedef struct spa_taskqs { - uint_t stqs_count; - taskq_t **stqs_taskq; -} spa_taskqs_t; - -typedef enum spa_all_vdev_zap_action { - AVZ_ACTION_NONE = 0, - AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ - AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ - AVZ_ACTION_INITIALIZE -} spa_avz_action_t; - -typedef enum spa_config_source { - SPA_CONFIG_SRC_NONE = 0, - SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ - SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ - SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ - SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ - SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ -} spa_config_source_t; - -struct spa { - /* - * Fields protected by spa_namespace_lock. 
- */ - char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */ - char *spa_comment; /* comment */ - avl_node_t spa_avl; /* node in spa_namespace_avl */ - nvlist_t *spa_config; /* last synced config */ - nvlist_t *spa_config_syncing; /* currently syncing config */ - nvlist_t *spa_config_splitting; /* config for splitting */ - nvlist_t *spa_load_info; /* info and errors from load */ - uint64_t spa_config_txg; /* txg of last config change */ - int spa_sync_pass; /* iterate-to-convergence */ - pool_state_t spa_state; /* pool state */ - int spa_inject_ref; /* injection references */ - uint8_t spa_sync_on; /* sync threads are running */ - spa_load_state_t spa_load_state; /* current load operation */ - boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ - boolean_t spa_trust_config; /* do we trust vdev tree? */ - spa_config_source_t spa_config_source; /* where config comes from? */ - uint64_t spa_import_flags; /* import specific flags */ - spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; - dsl_pool_t *spa_dsl_pool; - boolean_t spa_is_initializing; /* true while opening pool */ - metaslab_class_t *spa_normal_class; /* normal data class */ - metaslab_class_t *spa_log_class; /* intent log data class */ - metaslab_class_t *spa_special_class; /* special allocation class */ - metaslab_class_t *spa_dedup_class; /* dedup allocation class */ - uint64_t spa_first_txg; /* first txg after spa_open() */ - uint64_t spa_final_txg; /* txg of export/destroy */ - uint64_t spa_freeze_txg; /* freeze pool at this txg */ - uint64_t spa_load_max_txg; /* best initial ub_txg */ - uint64_t spa_claim_max_txg; /* highest claimed birth txg */ - timespec_t spa_loaded_ts; /* 1st successful open time */ - objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ - kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ - list_t spa_evicting_os_list; /* Objsets being evicted. 
*/ - kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ - txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ - vdev_t *spa_root_vdev; /* top-level vdev container */ - int spa_min_ashift; /* of vdevs in normal class */ - int spa_max_ashift; /* of vdevs in normal class */ - uint64_t spa_config_guid; /* config pool guid */ - uint64_t spa_load_guid; /* spa_load initialized guid */ - uint64_t spa_last_synced_guid; /* last synced guid */ - list_t spa_config_dirty_list; /* vdevs with dirty config */ - list_t spa_state_dirty_list; /* vdevs with dirty state */ - /* - * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are - * stored in spa_alloc_count. There is one tree and one lock for each - * allocator, to help improve allocation performance in write-heavy - * workloads. - */ - kmutex_t *spa_alloc_locks; - avl_tree_t *spa_alloc_trees; - int spa_alloc_count; - - spa_aux_vdev_t spa_spares; /* hot spares */ - spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ - nvlist_t *spa_label_features; /* Features for reading MOS */ - uint64_t spa_config_object; /* MOS object for pool config */ - uint64_t spa_config_generation; /* config generation number */ - uint64_t spa_syncing_txg; /* txg currently syncing */ - bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ - bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ - zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ - /* checksum context templates */ - kmutex_t spa_cksum_tmpls_lock; - void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; - uberblock_t spa_ubsync; /* last synced uberblock */ - uberblock_t spa_uberblock; /* current uberblock */ - boolean_t spa_extreme_rewind; /* rewind past deferred frees */ - uint64_t spa_last_io; /* lbolt of last non-scan I/O */ - kmutex_t spa_scrub_lock; /* resilver/scrub lock */ - uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ - uint64_t spa_load_verify_ios; /* in-flight verifications IOs */ - kcondvar_t spa_scrub_io_cv; /* scrub 
I/O completion */ - uint8_t spa_scrub_active; /* active or suspended? */ - uint8_t spa_scrub_type; /* type of scrub we're doing */ - uint8_t spa_scrub_finished; /* indicator to rotate logs */ - uint8_t spa_scrub_started; /* started since last boot */ - uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ - uint64_t spa_scan_pass_start; /* start time per pass/reboot */ - uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ - uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ - uint64_t spa_scan_pass_exam; /* examined bytes per pass */ - uint64_t spa_scan_pass_issued; /* issued bytes per pass */ - kmutex_t spa_async_lock; /* protect async state */ - kthread_t *spa_async_thread; /* thread doing async task */ - kthread_t *spa_async_thread_vd; /* thread doing vd async task */ - int spa_async_suspended; /* async tasks suspended */ - kcondvar_t spa_async_cv; /* wait for thread_exit() */ - uint16_t spa_async_tasks; /* async task mask */ - uint64_t spa_missing_tvds; /* unopenable tvds on load */ - uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ - - spa_removing_phys_t spa_removing_phys; - spa_vdev_removal_t *spa_vdev_removal; - - spa_condensing_indirect_phys_t spa_condensing_indirect_phys; - spa_condensing_indirect_t *spa_condensing_indirect; - zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ - - uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ - spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ - zthr_t *spa_checkpoint_discard_zthr; - - char *spa_root; /* alternate root directory */ - uint64_t spa_ena; /* spa-wide ereport ENA */ - int spa_last_open_failed; /* error if last open failed */ - uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ - uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ - uint64_t spa_load_txg; /* ub txg that loaded */ - uint64_t spa_load_txg_ts; /* timestamp from that ub */ - uint64_t spa_load_meta_errors; /* verify metadata err count */ - uint64_t spa_load_data_errors; /* verify data err count */ - uint64_t spa_verify_min_txg; /* start txg of verify scrub */ - kmutex_t spa_errlog_lock; /* error log lock */ - uint64_t spa_errlog_last; /* last error log object */ - uint64_t spa_errlog_scrub; /* scrub error log object */ - kmutex_t spa_errlist_lock; /* error list/ereport lock */ - avl_tree_t spa_errlist_last; /* last error list */ - avl_tree_t spa_errlist_scrub; /* scrub error list */ - uint64_t spa_deflate; /* should we deflate? 
*/ - uint64_t spa_history; /* history object */ - kmutex_t spa_history_lock; /* history lock */ - vdev_t *spa_pending_vdev; /* pending vdev additions */ - kmutex_t spa_props_lock; /* property lock */ - uint64_t spa_pool_props_object; /* object for properties */ - uint64_t spa_bootfs; /* default boot filesystem */ - uint64_t spa_failmode; /* failure mode for the pool */ - uint64_t spa_delegation; /* delegation on/off */ - list_t spa_config_list; /* previous cache file(s) */ - /* per-CPU array of root of async I/O: */ - zio_t **spa_async_zio_root; - zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ - zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ - kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ - kcondvar_t spa_suspend_cv; /* notification of resume */ - zio_suspend_reason_t spa_suspended; /* pool is suspended */ - uint8_t spa_claiming; /* pool is doing zil_claim() */ - boolean_t spa_is_root; /* pool is root */ - int spa_minref; /* num refs when first opened */ - int spa_mode; /* FREAD | FWRITE */ - spa_log_state_t spa_log_state; /* log state */ - uint64_t spa_autoexpand; /* lun expansion on/off */ - uint64_t spa_bootsize; /* efi system partition size */ - ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ - uint64_t spa_ddt_stat_object; /* DDT statistics */ - uint64_t spa_dedup_ditto; /* dedup ditto threshold */ - uint64_t spa_dedup_checksum; /* default dedup checksum */ - uint64_t spa_dspace; /* dspace in normal class */ - kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ - kmutex_t spa_proc_lock; /* protects spa_proc* */ - kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ - spa_proc_state_t spa_proc_state; /* see definition */ - struct proc *spa_proc; /* "zpool-poolname" process */ - uint64_t spa_did; /* if procp != p0, did of t1 */ - kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */ - kmutex_t spa_trim_lock; /* protects spa_trim_cv */ - kcondvar_t spa_trim_cv; /* used to notify TRIM thread */ - 
boolean_t spa_autoreplace; /* autoreplace set in open */ - int spa_vdev_locks; /* locks grabbed */ - uint64_t spa_creation_version; /* version at pool creation */ - uint64_t spa_prev_software_version; /* See ub_software_version */ - uint64_t spa_feat_for_write_obj; /* required to write to pool */ - uint64_t spa_feat_for_read_obj; /* required to read from pool */ - uint64_t spa_feat_desc_obj; /* Feature descriptions */ - uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ - kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ - nvlist_t *spa_feat_stats; /* Cache of enabled features */ - /* cache feature refcounts */ - uint64_t spa_feat_refcount_cache[SPA_FEATURES]; -#ifdef illumos - cyclic_id_t spa_deadman_cycid; /* cyclic id */ -#else /* !illumos */ -#ifdef _KERNEL - struct callout spa_deadman_cycid; /* callout id */ - struct task spa_deadman_task; -#endif -#endif /* illumos */ - uint64_t spa_deadman_calls; /* number of deadman calls */ - hrtime_t spa_sync_starttime; /* starting time fo spa_sync */ - uint64_t spa_deadman_synctime; /* deadman expiration timer */ - uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ - spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ - -#ifdef illumos - /* - * spa_iokstat_lock protects spa_iokstat and - * spa_queue_stats[]. 
- */ - kmutex_t spa_iokstat_lock; - struct kstat *spa_iokstat; /* kstat of io to this pool */ - struct { - int spa_active; - int spa_queued; - } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE]; -#endif - /* arc_memory_throttle() parameters during low memory condition */ - uint64_t spa_lowmem_page_load; /* memory load during txg */ - uint64_t spa_lowmem_last_txg; /* txg window start */ - - hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ - - taskq_t *spa_zvol_taskq; /* Taskq for minor management */ - - uint64_t spa_multihost; /* multihost aware (mmp) */ - mmp_thread_t spa_mmp; /* multihost mmp thread */ - list_t spa_leaf_list; /* list of leaf vdevs */ - uint64_t spa_leaf_list_gen; /* track leaf_list changes */ - - /* - * spa_refcount & spa_config_lock must be the last elements - * because refcount_t changes size based on compilation options. - * because zfs_refcount_t changes size based on compilation options. - * In order for the MDB module to function correctly, the other - * fields must remain in the same location. 
- */ - spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ - zfs_refcount_t spa_refcount; /* number of opens */ -#ifndef illumos - boolean_t spa_splitting_newspa; /* creating new spa in split */ -#endif -}; - -extern const char *spa_config_path; - -extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); -extern void spa_load_spares(spa_t *spa); -extern void spa_load_l2cache(spa_t *spa); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h deleted file mode 100644 index 2bce20b48ba5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_SPACE_MAP_H -#define _SYS_SPACE_MAP_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The size of the space map object has increased to include a histogram. - * The SPACE_MAP_SIZE_V0 designates the original size and is used to - * maintain backward compatibility. - */ -#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) -#define SPACE_MAP_HISTOGRAM_SIZE 32 - -/* - * The space_map_phys is the on-disk representation of the space map. - * Consumers of space maps should never reference any of the members of this - * structure directly. These members may only be updated in syncing context. - * - * Note the smp_object is no longer used but remains in the structure - * for backward compatibility. - */ -typedef struct space_map_phys { - /* object number: not needed but kept for backwards compatibility */ - uint64_t smp_object; - - /* length of the object in bytes */ - uint64_t smp_length; - - /* space allocated from the map */ - int64_t smp_alloc; - - /* reserved */ - uint64_t smp_pad[5]; - - /* - * The smp_histogram maintains a histogram of free regions. Each - * bucket, smp_histogram[i], contains the number of free regions - * whose size is: - * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) - */ - uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; -} space_map_phys_t; - -/* - * The space map object defines a region of space, its size, how much is - * allocated, and the on-disk object that stores this information. - * Consumers of space maps may only access the members of this structure. - * - * Note: the space_map may not be accessed concurrently; consumers - * must provide external locking if required. 
- */ -typedef struct space_map { - uint64_t sm_start; /* start of map */ - uint64_t sm_size; /* size of map */ - uint8_t sm_shift; /* unit shift */ - objset_t *sm_os; /* objset for this map */ - uint64_t sm_object; /* object id for this map */ - uint32_t sm_blksz; /* block size for space map */ - dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */ - space_map_phys_t *sm_phys; /* on-disk space map */ -} space_map_t; - -/* - * debug entry - * - * 2 2 10 50 - * +-----+-----+------------+----------------------------------+ - * | 1 0 | act | syncpass | txg (lower bits) | - * +-----+-----+------------+----------------------------------+ - * 63 62 61 60 59 50 49 0 - * - * - * one-word entry - * - * 1 47 1 15 - * +-----------------------------------------------------------+ - * | 0 | offset (sm_shift units) | type | run | - * +-----------------------------------------------------------+ - * 63 62 16 15 14 0 - * - * - * two-word entry - * - * 2 2 36 24 - * +-----+-----+---------------------------+-------------------+ - * | 1 1 | pad | run | vdev | - * +-----+-----+---------------------------+-------------------+ - * 63 62 61 60 59 24 23 0 - * - * 1 63 - * +------+----------------------------------------------------+ - * | type | offset | - * +------+----------------------------------------------------+ - * 63 62 0 - * - * Note that a two-word entry will not strandle a block boundary. - * If necessary, the last word of a block will be padded with a - * debug entry (with act = syncpass = txg = 0). 
- */ - -typedef enum { - SM_ALLOC, - SM_FREE -} maptype_t; - -typedef struct space_map_entry { - maptype_t sme_type; - uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ - uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ - uint64_t sme_run; /* max is 2^36; units of sm_shift */ -} space_map_entry_t; - -#define SM_NO_VDEVID (1 << SPA_VDEVBITS) - -/* one-word entry constants */ -#define SM_DEBUG_PREFIX 2 -#define SM_OFFSET_BITS 47 -#define SM_RUN_BITS 15 - -/* two-word entry constants */ -#define SM2_PREFIX 3 -#define SM2_OFFSET_BITS 63 -#define SM2_RUN_BITS 36 - -#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) -#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) - -#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) -#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) -#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) -#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) -#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) -#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) - -#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) -#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) -#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) -#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) -#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) -#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) -#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) -#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) - -#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) -#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) -#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) -#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) -#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) -#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) -#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) -#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, 
SM2_OFFSET_BITS) -#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) -#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) - -boolean_t sm_entry_is_debug(uint64_t e); -boolean_t sm_entry_is_single_word(uint64_t e); -boolean_t sm_entry_is_double_word(uint64_t e); - -typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); - -int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); -int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t length); -int space_map_iterate(space_map_t *sm, uint64_t length, - sm_cb_t callback, void *arg); -int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, - dmu_tx_t *tx); - -boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt); -void space_map_histogram_clear(space_map_t *sm); -void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, - dmu_tx_t *tx); - -uint64_t space_map_object(space_map_t *sm); -int64_t space_map_allocated(space_map_t *sm); -uint64_t space_map_length(space_map_t *sm); - -void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t vdev_id, dmu_tx_t *tx); -uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, - uint64_t vdev_id); -void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); -uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); -void space_map_free(space_map_t *sm, dmu_tx_t *tx); -void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); - -int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift); -void space_map_close(space_map_t *sm); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPACE_MAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h deleted file mode 100644 index 249b15be6729..000000000000 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_SPACE_REFTREE_H -#define _SYS_SPACE_REFTREE_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct space_ref { - avl_node_t sr_node; /* AVL node */ - uint64_t sr_offset; /* range offset (start or end) */ - int64_t sr_refcnt; /* associated reference count */ -} space_ref_t; - -void space_reftree_create(avl_tree_t *t); -void space_reftree_destroy(avl_tree_t *t); -void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, - int64_t refcnt); -void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt); -void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, - int64_t minref); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPACE_REFTREE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h deleted file mode 100644 index f228d0766631..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2012 Pawel Jakub Dawidek . - * All rights reserved. 
- */ - -#ifndef _SYS_TRIM_MAP_H -#define _SYS_TRIM_MAP_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern void trim_map_create(vdev_t *vd); -extern void trim_map_destroy(vdev_t *vd); -extern void trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg); -extern boolean_t trim_map_write_start(zio_t *zio); -extern void trim_map_write_done(zio_t *zio); - -extern void trim_thread_create(spa_t *spa); -extern void trim_thread_destroy(spa_t *spa); -extern void trim_thread_wakeup(spa_t *spa); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TRIM_MAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h deleted file mode 100644 index d5c22e41478e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_TXG_H -#define _SYS_TXG_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ -#define TXG_SIZE 4 /* next power of 2 */ -#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ -#define TXG_INITIAL TXG_SIZE /* initial txg */ -#define TXG_IDX (txg & TXG_MASK) - -/* Number of txgs worth of frees we defer adding to in-core spacemaps */ -#define TXG_DEFER_SIZE 2 - -typedef struct tx_cpu tx_cpu_t; - -typedef struct txg_handle { - tx_cpu_t *th_cpu; - uint64_t th_txg; -} txg_handle_t; - -typedef struct txg_node { - struct txg_node *tn_next[TXG_SIZE]; - uint8_t tn_member[TXG_SIZE]; -} txg_node_t; - -typedef struct txg_list { - kmutex_t tl_lock; - size_t tl_offset; - spa_t *tl_spa; - txg_node_t *tl_head[TXG_SIZE]; -} txg_list_t; - -struct dsl_pool; - -extern void txg_init(struct dsl_pool *dp, uint64_t txg); -extern void txg_fini(struct dsl_pool *dp); -extern void txg_sync_start(struct dsl_pool *dp); -extern void txg_sync_stop(struct dsl_pool *dp); -extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); -extern void txg_rele_to_quiesce(txg_handle_t *txghp); -extern void txg_rele_to_sync(txg_handle_t *txghp); -extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); - -extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, - hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); - -/* - * Wait until the given transaction group has finished syncing. - * Try to make this happen as soon as possible (eg. kick off any - * necessary syncs immediately). If txg==0, wait for the currently open - * txg to finish syncing. - */ -extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); - -/* - * Wait as above. Returns true if the thread was signaled while waiting. 
- */ -extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); - -/* - * Wait until the given transaction group, or one after it, is - * the open transaction group. Try to make this happen as soon - * as possible (eg. kick off any necessary syncs immediately). - * If txg == 0, wait for the next open txg. - */ -extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); - -/* - * Returns TRUE if we are "backed up" waiting for the syncing - * transaction to complete; otherwise returns FALSE. - */ -extern boolean_t txg_stalled(struct dsl_pool *dp); - -/* returns TRUE if someone is waiting for the next txg to sync */ -extern boolean_t txg_sync_waiting(struct dsl_pool *dp); - -extern void txg_verify(spa_t *spa, uint64_t txg); - -/* - * Per-txg object lists. - */ - -#define TXG_CLEAN(txg) ((txg) - 1) - -extern void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset); -extern void txg_list_destroy(txg_list_t *tl); -extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); -extern boolean_t txg_all_lists_empty(txg_list_t *tl); -extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); -extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); -extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); -extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); -extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg); -extern void *txg_list_head(txg_list_t *tl, uint64_t txg); -extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TXG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h deleted file mode 100644 index bf3b269d707d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - 
* Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_TXG_IMPL_H -#define _SYS_TXG_IMPL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The tx_cpu structure is a per-cpu structure that is used to track - * the number of active transaction holds (tc_count). As transactions - * are assigned into a transaction group the appropriate tc_count is - * incremented to indicate that there are pending changes that have yet - * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement - * the tc_count. A transaction group is not considered quiesced until all - * tx_cpu structures have reached a tc_count of zero. - * - * This structure is a per-cpu structure by design. Updates to this structure - * are frequent and concurrent. Having a single structure would result in - * heavy lock contention so a per-cpu design was implemented. With the fanned - * out mutex design, consumers only need to lock the mutex associated with - * thread's cpu. - * - * The tx_cpu contains two locks, the tc_lock and tc_open_lock. 
- * The tc_lock is used to protect all members of the tx_cpu structure with - * the exception of the tc_open_lock. This lock should only be held for a - * short period of time, typically when updating the value of tc_count. - * - * The tc_open_lock protects the tx_open_txg member of the tx_state structure. - * This lock is used to ensure that transactions are only assigned into - * the current open transaction group. In order to move the current open - * transaction group to the quiesce phase, the txg_quiesce thread must - * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. - * The tc_open_lock is held until the transaction is assigned into the - * transaction group. Typically, this is a short operation but if throttling - * is occuring it may be held for longer periods of time. - */ -struct tx_cpu { - kmutex_t tc_open_lock; /* protects tx_open_txg */ - kmutex_t tc_lock; /* protects the rest of this struct */ - kcondvar_t tc_cv[TXG_SIZE]; - uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ - list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[8]; /* pad to fill 3 cache lines */ -}; - -/* - * The tx_state structure maintains the state information about the different - * stages of the pool's transcation groups. A per pool tx_state structure - * is used to track this information. The tx_state structure also points to - * an array of tx_cpu structures (described above). Although the tx_sync_lock - * is used to protect the members of this structure, it is not used to - * protect the tx_open_txg. Instead a special lock in the tx_cpu structure - * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. - * Any thread wishing to update tx_open_txg must grab the tc_open_lock on - * every cpu (see txg_quiesce()). 
- */ -typedef struct tx_state { - tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ - kmutex_t tx_sync_lock; /* protects the rest of this struct */ - - uint64_t tx_open_txg; /* currently open txg id */ - uint64_t tx_quiescing_txg; /* currently quiescing txg id */ - uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ - uint64_t tx_syncing_txg; /* currently syncing txg id */ - uint64_t tx_synced_txg; /* last synced txg id */ - - hrtime_t tx_open_time; /* start time of tx_open_txg */ - - uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ - uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ - - kcondvar_t tx_sync_more_cv; - kcondvar_t tx_sync_done_cv; - kcondvar_t tx_quiesce_more_cv; - kcondvar_t tx_quiesce_done_cv; - kcondvar_t tx_timeout_cv; - kcondvar_t tx_exit_cv; /* wait for all threads to exit */ - - uint8_t tx_threads; /* number of threads */ - uint8_t tx_exiting; /* set when we're exiting */ - - kthread_t *tx_sync_thread; - kthread_t *tx_quiesce_thread; - - taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ -} tx_state_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TXG_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h deleted file mode 100644 index 044e438387c0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - */ - -#ifndef _SYS_UBERBLOCK_H -#define _SYS_UBERBLOCK_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct uberblock uberblock_t; - -extern int uberblock_verify(uberblock_t *); -extern boolean_t uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, - uint64_t mmp_delay); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UBERBLOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h deleted file mode 100644 index caf43957dfe4..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_UBERBLOCK_IMPL_H -#define _SYS_UBERBLOCK_IMPL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The uberblock version is incremented whenever an incompatible on-disk - * format change is made to the SPA, DMU, or ZAP. - * - * Note: the first two fields should never be moved. When a storage pool - * is opened, the uberblock must be read off the disk before the version - * can be checked. If the ub_version field is moved, we may not detect - * version mismatch. If the ub_magic field is moved, applications that - * expect the magic number in the first word won't work. - */ -#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! 
*/ -#define UBERBLOCK_SHIFT 10 /* up to 1K */ -#define MMP_MAGIC 0xa11cea11 /* all-see-all */ - -#define MMP_INTERVAL_VALID_BIT 0x01 -#define MMP_SEQ_VALID_BIT 0x02 -#define MMP_FAIL_INT_VALID_BIT 0x04 - -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ - MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ - MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ - MMP_FAIL_INT_VALID_BIT)) - -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ - >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ - >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ - >> 48) - -#define MMP_INTERVAL_SET(write) \ - (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT) - -#define MMP_SEQ_SET(seq) \ - (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT) - -#define MMP_FAIL_INT_SET(fail) \ - (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) - -struct uberblock { - uint64_t ub_magic; /* UBERBLOCK_MAGIC */ - uint64_t ub_version; /* SPA_VERSION */ - uint64_t ub_txg; /* txg of last sync */ - uint64_t ub_guid_sum; /* sum of all vdev guids */ - uint64_t ub_timestamp; /* UTC time of last sync */ - blkptr_t ub_rootbp; /* MOS objset_phys_t */ - - /* highest SPA_VERSION supported by software that wrote this txg */ - uint64_t ub_software_version; - - /* Maybe missing in uberblocks we read, but always written */ - uint64_t ub_mmp_magic; - /* - * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off. - * Otherwise, nanosec since last MMP write. - */ - uint64_t ub_mmp_delay; - - /* - * The ub_mmp_config contains the multihost write interval, multihost - * fail intervals, sequence number for sub-second granularity, and - * valid bit mask. 
This layout is as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * This allows a write_interval of (2^24/1000)s, over 4.5 hours - * - * VALID Bits: - * - 0x01 - Write Interval (ms) - * - 0x02 - Sequence number exists - * - 0x04 - Fail Intervals - * - 0xf8 - Reserved - */ - uint64_t ub_mmp_config; - - /* - * ub_checkpoint_txg indicates two things about the current uberblock: - * - * 1] If it is not zero then this uberblock is a checkpoint. If it is - * zero, then this uberblock is not a checkpoint. - * - * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is - * the ub_txg that the uberblock had at the time we moved it to - * the MOS config. - * - * The field is set when we checkpoint the uberblock and continues to - * hold that value even after we've rewound (unlike the ub_txg that - * is reset to a higher value). - * - * Besides checks used to determine whether we are reopening the - * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], - * the value of the field is used to determine which ZIL blocks have - * been allocated according to the ms_sm when we are rewinding to a - * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then - * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. 
- */ - uint64_t ub_checkpoint_txg; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h deleted file mode 100644 index d4ba32e5c642..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_UNIQUE_H -#define _SYS_UNIQUE_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* The number of significant bits in each unique value. */ -#define UNIQUE_BITS 56 - -void unique_init(void); -void unique_fini(void); - -/* - * Return a new unique value (which will not be uniquified against until - * it is unique_insert()-ed). - */ -uint64_t unique_create(void); - -/* Return a unique value, which equals the one passed in if possible. */ -uint64_t unique_insert(uint64_t value); - -/* Indicate that this value no longer needs to be uniquified against. 
*/ -void unique_remove(uint64_t value); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UNIQUE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h deleted file mode 100644 index 0bb266873c6c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. 
- */ - -#ifndef _SYS_VDEV_H -#define _SYS_VDEV_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum vdev_dtl_type { - DTL_MISSING, /* 0% replication: no copies of the data */ - DTL_PARTIAL, /* less than 100% replication: some copies missing */ - DTL_SCRUB, /* unable to fully repair during scrub/resilver */ - DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ - DTL_TYPES -} vdev_dtl_type_t; - -extern boolean_t zfs_nocacheflush; -extern boolean_t zfs_trim_enabled; - -extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); -extern void vdev_dbgmsg_print_tree(vdev_t *, int); -extern int vdev_open(vdev_t *); -extern void vdev_open_children(vdev_t *); -extern boolean_t vdev_uses_zvols(vdev_t *); -extern int vdev_validate(vdev_t *); -extern int vdev_copy_path_strict(vdev_t *, vdev_t *); -extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); -extern void vdev_close(vdev_t *); -extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); -extern void vdev_reopen(vdev_t *); -extern int vdev_validate_aux(vdev_t *vd); -extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); -extern boolean_t vdev_is_concrete(vdev_t *vd); -extern boolean_t vdev_is_bootable(vdev_t *vd); -extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); -extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); -extern int vdev_count_leaves(spa_t *spa); -extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, - uint64_t txg, uint64_t size); -extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, - uint64_t txg, uint64_t size); -extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); -extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); -extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, - int scrub_done); -extern boolean_t vdev_dtl_required(vdev_t *vd); -extern boolean_t vdev_resilver_needed(vdev_t *vd, - uint64_t *minp, uint64_t *maxp); -extern 
void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, - dmu_tx_t *tx); -extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); -extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); -extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); -extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, - uint64_t size); -extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, - uint64_t offset, uint64_t size, dmu_tx_t *tx); - -extern void vdev_hold(vdev_t *); -extern void vdev_rele(vdev_t *); - -extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); -extern void vdev_metaslab_fini(vdev_t *vd); -extern void vdev_metaslab_set_size(vdev_t *); -extern void vdev_ashift_optimize(vdev_t *); -extern void vdev_expand(vdev_t *vd, uint64_t txg); -extern void vdev_split(vdev_t *vd); -extern void vdev_deadman(vdev_t *vd); - -extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); -extern void vdev_clear_stats(vdev_t *vd); -extern void vdev_stat_update(zio_t *zio, uint64_t psize); -extern void vdev_scan_stat_init(vdev_t *vd); -extern void vdev_propagate_state(vdev_t *vd); -extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, - vdev_aux_t aux); -extern boolean_t vdev_children_are_offline(vdev_t *vd); - -extern void vdev_space_update(vdev_t *vd, - int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); - -extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); - -extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); - -extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); -extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); -extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, - vdev_state_t *); -extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); -extern void vdev_clear(spa_t *spa, vdev_t *vd); - -extern boolean_t vdev_is_dead(vdev_t *vd); -extern boolean_t vdev_readable(vdev_t *vd); -extern boolean_t vdev_writeable(vdev_t *vd); 
-extern boolean_t vdev_allocatable(vdev_t *vd); -extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); -extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); - -extern void vdev_cache_init(vdev_t *vd); -extern void vdev_cache_fini(vdev_t *vd); -extern boolean_t vdev_cache_read(zio_t *zio); -extern void vdev_cache_write(zio_t *zio); -extern void vdev_cache_purge(vdev_t *vd); - -extern void vdev_queue_init(vdev_t *vd); -extern void vdev_queue_fini(vdev_t *vd); -extern zio_t *vdev_queue_io(zio_t *zio); -extern void vdev_queue_io_done(zio_t *zio); -extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); -extern int vdev_queue_length(vdev_t *vd); -extern uint64_t vdev_queue_lastoffset(vdev_t *vd); -extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio); - -extern void vdev_config_dirty(vdev_t *vd); -extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); - -extern void vdev_state_dirty(vdev_t *vd); -extern void vdev_state_clean(vdev_t *vd); - -typedef enum vdev_config_flag { - VDEV_CONFIG_SPARE = 1 << 0, - VDEV_CONFIG_L2CACHE = 1 << 1, - VDEV_CONFIG_REMOVING = 1 << 2, - VDEV_CONFIG_MOS = 1 << 3, - VDEV_CONFIG_MISSING = 1 << 4 -} vdev_config_flag_t; - -extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); -extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, vdev_config_flag_t flags); - -/* - * Label routines - */ -struct uberblock; -extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); -extern int vdev_label_number(uint64_t psise, uint64_t offset); -extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); -extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); -extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t - offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); -extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); 
-extern int vdev_label_write_bootenv(vdev_t *, char *); - -typedef enum { - VDEV_LABEL_CREATE, /* create/add a new device */ - VDEV_LABEL_REPLACE, /* replace an existing device */ - VDEV_LABEL_SPARE, /* add a new hot spare */ - VDEV_LABEL_REMOVE, /* remove an existing device */ - VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ - VDEV_LABEL_SPLIT /* generating new label for split-off dev */ -} vdev_labeltype_t; - -extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); - -extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h deleted file mode 100644 index 61e2f273f0a0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright (c) 2013 Joyent, Inc. All rights reserved. - * Copyright 2012 Nexenta Systems, Inc. 
All rights reserved. - */ - -#ifndef _SYS_VDEV_DISK_H -#define _SYS_VDEV_DISK_H - -#include -#ifdef _KERNEL -#include -#include -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL -typedef struct vdev_disk { - ddi_devid_t vd_devid; - char *vd_minor; - ldi_handle_t vd_lh; - list_t vd_ldi_cbs; - boolean_t vd_ldi_offline; -} vdev_disk_t; -#endif - -extern int vdev_disk_physio(vdev_t *, - caddr_t, size_t, uint64_t, int, boolean_t); - -/* - * Since vdev_disk.c is not compiled into libzpool, this function should only be - * defined in the zfs kernel module. - */ -#ifdef _KERNEL -extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); -#endif -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_DISK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h deleted file mode 100644 index 0260b4ab4f79..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. - */ - -#ifndef _SYS_VDEV_FILE_H -#define _SYS_VDEV_FILE_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_file { - vnode_t *vf_vnode; -} vdev_file_t; - -extern void vdev_file_init(void); -extern void vdev_file_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_FILE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h deleted file mode 100644 index e40335fc73ae..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ /dev/null @@ -1,571 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _SYS_VDEV_IMPL_H -#define _SYS_VDEV_IMPL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Virtual device descriptors. 
- * - * All storage pool operations go through the virtual device framework, - * which provides data replication and I/O scheduling. - */ - -/* - * Forward declarations that lots of things need. - */ -typedef struct vdev_queue vdev_queue_t; -typedef struct vdev_cache vdev_cache_t; -typedef struct vdev_cache_entry vdev_cache_entry_t; -struct abd; - -extern int zfs_vdev_queue_depth_pct; -extern int zfs_vdev_def_queue_depth; -extern uint32_t zfs_vdev_async_write_max_active; - -/* - * Virtual device operations - */ -typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, - uint64_t *logical_ashift, uint64_t *physical_ashift); -typedef void vdev_close_func_t(vdev_t *vd); -typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); -typedef void vdev_io_start_func_t(zio_t *zio); -typedef void vdev_io_done_func_t(zio_t *zio); -typedef void vdev_state_change_func_t(vdev_t *vd, int, int); -typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); -typedef void vdev_hold_func_t(vdev_t *vd); -typedef void vdev_rele_func_t(vdev_t *vd); - -typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, - uint64_t offset, uint64_t size, void *arg); -typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, - vdev_remap_cb_t callback, void *arg); -/* - * Given a target vdev, translates the logical range "in" to the physical - * range "res" - */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, - range_seg_t *res); - -typedef struct vdev_ops { - vdev_open_func_t *vdev_op_open; - vdev_close_func_t *vdev_op_close; - vdev_asize_func_t *vdev_op_asize; - vdev_io_start_func_t *vdev_op_io_start; - vdev_io_done_func_t *vdev_op_io_done; - vdev_state_change_func_t *vdev_op_state_change; - vdev_need_resilver_func_t *vdev_op_need_resilver; - vdev_hold_func_t *vdev_op_hold; - vdev_rele_func_t *vdev_op_rele; - vdev_remap_func_t *vdev_op_remap; - /* - * For translating ranges from non-leaf vdevs (e.g. 
raidz) to leaves. - * Used when initializing vdevs. Isn't used by leaf ops. - */ - vdev_xlation_func_t *vdev_op_xlate; - char vdev_op_type[16]; - boolean_t vdev_op_leaf; -} vdev_ops_t; - -/* - * Virtual device properties - */ -struct vdev_cache_entry { - struct abd *ve_abd; - uint64_t ve_offset; - uint64_t ve_lastused; - avl_node_t ve_offset_node; - avl_node_t ve_lastused_node; - uint32_t ve_hits; - uint16_t ve_missed_update; - zio_t *ve_fill_io; -}; - -struct vdev_cache { - avl_tree_t vc_offset_tree; - avl_tree_t vc_lastused_tree; - kmutex_t vc_lock; -}; - -typedef struct vdev_queue_class { - uint32_t vqc_active; - - /* - * Sorted by offset or timestamp, depending on if the queue is - * LBA-ordered vs FIFO. - */ - avl_tree_t vqc_queued_tree; -} vdev_queue_class_t; - -struct vdev_queue { - vdev_t *vq_vdev; - vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; - avl_tree_t vq_read_offset_tree; - avl_tree_t vq_write_offset_tree; - uint64_t vq_last_offset; - hrtime_t vq_io_complete_ts; /* time last i/o completed */ - kmutex_t vq_lock; - uint64_t vq_lastoffset; -}; - -typedef enum vdev_alloc_bias { - VDEV_BIAS_NONE, - VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ - VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ - VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ -} vdev_alloc_bias_t; - - -/* - * On-disk indirect vdev state. - * - * An indirect vdev is described exclusively in the MOS config of a pool. - * The config for an indirect vdev includes several fields, which are - * accessed in memory by a vdev_indirect_config_t. - */ -typedef struct vdev_indirect_config { - /* - * Object (in MOS) which contains the indirect mapping. This object - * contains an array of vdev_indirect_mapping_entry_phys_t ordered by - * vimep_src. The bonus buffer for this object is a - * vdev_indirect_mapping_phys_t. This object is allocated when a vdev - * removal is initiated. 
- * - * Note that this object can be empty if none of the data on the vdev - * has been copied yet. - */ - uint64_t vic_mapping_object; - - /* - * Object (in MOS) which contains the birth times for the mapping - * entries. This object contains an array of - * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus - * buffer for this object is a vdev_indirect_birth_phys_t. This object - * is allocated when a vdev removal is initiated. - * - * Note that this object can be empty if none of the vdev has yet been - * copied. - */ - uint64_t vic_births_object; - - /* - * This is the vdev ID which was removed previous to this vdev, or - * UINT64_MAX if there are no previously removed vdevs. - */ - uint64_t vic_prev_indirect_vdev; -} vdev_indirect_config_t; - -/* - * Virtual device descriptor - */ -struct vdev { - /* - * Common to all vdev types. - */ - uint64_t vdev_id; /* child number in vdev parent */ - uint64_t vdev_guid; /* unique ID for this vdev */ - uint64_t vdev_guid_sum; /* self guid + all child guids */ - uint64_t vdev_orig_guid; /* orig. guid prior to remove */ - uint64_t vdev_asize; /* allocatable device capacity */ - uint64_t vdev_min_asize; /* min acceptable asize */ - uint64_t vdev_max_asize; /* max acceptable asize */ - uint64_t vdev_ashift; /* block alignment shift */ - /* - * Logical block alignment shift - * - * The smallest sized/aligned I/O supported by the device. - */ - uint64_t vdev_logical_ashift; - /* - * Physical block alignment shift - * - * The device supports logical I/Os with vdev_logical_ashift - * size/alignment, but optimum performance will be achieved by - * aligning/sizing requests to vdev_physical_ashift. Smaller - * requests may be inflated or incur device level read-modify-write - * operations. - * - * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). 
- */ - uint64_t vdev_physical_ashift; - uint64_t vdev_state; /* see VDEV_STATE_* #defines */ - uint64_t vdev_prevstate; /* used when reopening a vdev */ - vdev_ops_t *vdev_ops; /* vdev operations */ - spa_t *vdev_spa; /* spa for this vdev */ - void *vdev_tsd; /* type-specific data */ - vnode_t *vdev_name_vp; /* vnode for pathname */ - vnode_t *vdev_devid_vp; /* vnode for devid */ - vdev_t *vdev_top; /* top-level vdev */ - vdev_t *vdev_parent; /* parent vdev */ - vdev_t **vdev_child; /* array of children */ - uint64_t vdev_children; /* number of children */ - vdev_stat_t vdev_stat; /* virtual device statistics */ - boolean_t vdev_expanding; /* expand the vdev? */ - boolean_t vdev_reopening; /* reopen in progress? */ - boolean_t vdev_nonrot; /* true if solid state */ - int vdev_open_error; /* error on last open */ - kthread_t *vdev_open_thread; /* thread opening children */ - uint64_t vdev_crtxg; /* txg when top-level was added */ - - /* - * Top-level vdev state. - */ - uint64_t vdev_ms_array; /* metaslab array object */ - uint64_t vdev_ms_shift; /* metaslab size shift */ - uint64_t vdev_ms_count; /* number of metaslabs */ - metaslab_group_t *vdev_mg; /* metaslab group */ - metaslab_t **vdev_ms; /* metaslab array */ - txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ - txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ - txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - boolean_t vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? */ - list_node_t vdev_config_dirty_node; /* config dirty list */ - list_node_t vdev_state_dirty_node; /* state dirty list */ - uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ - uint64_t vdev_islog; /* is an intent log device */ - uint64_t vdev_removing; /* device is being removed? 
*/ - boolean_t vdev_ishole; /* is a hole in the namespace */ - uint64_t vdev_top_zap; - vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ - - /* pool checkpoint related */ - space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ - - boolean_t vdev_initialize_exit_wanted; - vdev_initializing_state_t vdev_initialize_state; - kthread_t *vdev_initialize_thread; - /* Protects vdev_initialize_thread and vdev_initialize_state. */ - kmutex_t vdev_initialize_lock; - kcondvar_t vdev_initialize_cv; - uint64_t vdev_initialize_offset[TXG_SIZE]; - uint64_t vdev_initialize_last_offset; - range_tree_t *vdev_initialize_tree; /* valid while initializing */ - uint64_t vdev_initialize_bytes_est; - uint64_t vdev_initialize_bytes_done; - time_t vdev_initialize_action_time; /* start and end time */ - - /* for limiting outstanding I/Os */ - kmutex_t vdev_initialize_io_lock; - kcondvar_t vdev_initialize_io_cv; - uint64_t vdev_initialize_inflight; - - /* - * Values stored in the config for an indirect or removing vdev. - */ - vdev_indirect_config_t vdev_indirect_config; - - /* - * The vdev_indirect_rwlock protects the vdev_indirect_mapping - * pointer from changing on indirect vdevs (when it is condensed). - * Note that removing (not yet indirect) vdevs have different - * access patterns (the mapping is not accessed from open context, - * e.g. from zio_read) and locking strategy (e.g. svr_lock). - */ - krwlock_t vdev_indirect_rwlock; - vdev_indirect_mapping_t *vdev_indirect_mapping; - vdev_indirect_births_t *vdev_indirect_births; - - /* - * In memory data structures used to manage the obsolete sm, for - * indirect or removing vdevs. - * - * The vdev_obsolete_segments is the in-core record of the segments - * that are no longer referenced anywhere in the pool (due to - * being freed or remapped and not referenced by any snapshots). 
- * During a sync, segments are added to vdev_obsolete_segments - * via vdev_indirect_mark_obsolete(); at the end of each sync - * pass, this is appended to vdev_obsolete_sm via - * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock - * protects against concurrent modifications of vdev_obsolete_segments - * from multiple zio threads. - */ - kmutex_t vdev_obsolete_lock; - range_tree_t *vdev_obsolete_segments; - space_map_t *vdev_obsolete_sm; - - /* - * Protects the vdev_scan_io_queue field itself as well as the - * structure's contents (when present). - */ - kmutex_t vdev_scan_io_queue_lock; - struct dsl_scan_io_queue *vdev_scan_io_queue; - - /* - * Leaf vdev state. - */ - range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ - space_map_t *vdev_dtl_sm; /* dirty time log space map */ - txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ - uint64_t vdev_dtl_object; /* DTL object */ - uint64_t vdev_psize; /* physical device capacity */ - uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* persistent offline state */ - uint64_t vdev_faulted; /* persistent faulted state */ - uint64_t vdev_degraded; /* persistent degraded state */ - uint64_t vdev_removed; /* persistent removed state */ - uint64_t vdev_resilver_txg; /* persistent resilvering state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ - char *vdev_path; /* vdev path (if any) */ - char *vdev_devid; /* vdev devid (if any) */ - char *vdev_physpath; /* vdev device path (if any) */ - char *vdev_fru; /* physical FRU location */ - uint64_t vdev_not_present; /* not present during import */ - uint64_t vdev_unspare; /* unspare when resilvering done */ - boolean_t vdev_nowritecache; /* true if flushwritecache failed */ - boolean_t vdev_notrim; /* true if trim failed */ - boolean_t vdev_checkremove; /* temporary online test */ - boolean_t vdev_forcefault; /* force online fault */ - boolean_t vdev_splitting; /* split or repair in progress */ - boolean_t 
vdev_delayed_close; /* delayed device close? */ - boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ - boolean_t vdev_detached; /* device detached? */ - boolean_t vdev_cant_read; /* vdev is failing all reads */ - boolean_t vdev_cant_write; /* vdev is failing all writes */ - boolean_t vdev_isspare; /* was a hot spare */ - boolean_t vdev_isl2cache; /* was a l2cache device */ - vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ - vdev_cache_t vdev_cache; /* physical block cache */ - spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ - zio_t *vdev_probe_zio; /* root of current probe */ - vdev_aux_t vdev_label_aux; /* on-disk aux state */ - struct trim_map *vdev_trimmap; /* map on outstanding trims */ - uint64_t vdev_leaf_zap; - hrtime_t vdev_mmp_pending; /* 0 if write finished */ - uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ - list_node_t vdev_leaf_node; /* leaf vdev list */ - - /* - * For DTrace to work in userland (libzpool) context, these fields must - * remain at the end of the structure. DTrace will use the kernel's - * CTF definition for 'struct vdev', and since the size of a kmutex_t is - * larger in userland, the offsets for the rest of the fields would be - * incorrect. - */ - kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ - kmutex_t vdev_stat_lock; /* vdev_stat */ - kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ -}; - -#define VDEV_RAIDZ_MAXPARITY 3 - -#define VDEV_PAD_SIZE (8 << 10) -/* 2 padding areas (vl_pad1 and vl_be) to skip */ -#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 -#define VDEV_PHYS_SIZE (112 << 10) -#define VDEV_UBERBLOCK_RING (128 << 10) - -/* - * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock - * ring when MMP is enabled. - */ -#define MMP_BLOCKS_PER_LABEL 1 - -/* The largest uberblock we support is 8k. 
*/ -#define MAX_UBERBLOCK_SHIFT (13) -#define VDEV_UBERBLOCK_SHIFT(vd) \ - MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ - MAX_UBERBLOCK_SHIFT) -#define VDEV_UBERBLOCK_COUNT(vd) \ - (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) -#define VDEV_UBERBLOCK_OFFSET(vd, n) \ - offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) -#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) - -typedef struct vdev_phys { - char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; - zio_eck_t vp_zbt; -} vdev_phys_t; - -typedef enum vbe_vers { - /* The bootenv file is stored as ascii text in the envblock */ - VB_RAW = 0, - - /* - * The bootenv file is converted to an nvlist and then packed into the - * envblock. - */ - VB_NVLIST = 1 -} vbe_vers_t; - -typedef struct vdev_boot_envblock { - uint64_t vbe_version; - char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - - sizeof (zio_eck_t)]; - zio_eck_t vbe_zbt; -} vdev_boot_envblock_t; - -CTASSERT(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE); - -typedef struct vdev_label { - char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ - vdev_boot_envblock_t vl_be; /* 8K */ - vdev_phys_t vl_vdev_phys; /* 112K */ - char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ -} vdev_label_t; /* 256K total */ - -/* - * vdev_dirty() flags - */ -#define VDD_METASLAB 0x01 -#define VDD_DTL 0x02 - -/* Offset of embedded boot loader region on each label */ -#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) -/* - * Size of embedded boot loader region on each label. - * The total size of the first two labels plus the boot area is 4MB. - */ -#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ - -/* - * Size of label regions at the start and end of each leaf device. 
- */ -#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) -#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) -#define VDEV_LABELS 4 -#define VDEV_BEST_LABEL VDEV_LABELS - -#define VDEV_ALLOC_LOAD 0 -#define VDEV_ALLOC_ADD 1 -#define VDEV_ALLOC_SPARE 2 -#define VDEV_ALLOC_L2CACHE 3 -#define VDEV_ALLOC_ROOTPOOL 4 -#define VDEV_ALLOC_SPLIT 5 -#define VDEV_ALLOC_ATTACH 6 - -/* - * Allocate or free a vdev - */ -extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, - vdev_ops_t *ops); -extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, - vdev_t *parent, uint_t id, int alloctype); -extern void vdev_free(vdev_t *vd); - -/* - * Add or remove children and parents - */ -extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); -extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); -extern void vdev_compact_children(vdev_t *pvd); -extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); -extern void vdev_remove_parent(vdev_t *cvd); - -/* - * vdev sync load and sync - */ -extern boolean_t vdev_log_state_valid(vdev_t *vd); -extern int vdev_load(vdev_t *vd); -extern int vdev_dtl_load(vdev_t *vd); -extern void vdev_sync(vdev_t *vd, uint64_t txg); -extern void vdev_sync_done(vdev_t *vd, uint64_t txg); -extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); -extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); - -/* - * Available vdev types. 
- */ -extern vdev_ops_t vdev_root_ops; -extern vdev_ops_t vdev_mirror_ops; -extern vdev_ops_t vdev_replacing_ops; -extern vdev_ops_t vdev_raidz_ops; -#ifdef _KERNEL -extern vdev_ops_t vdev_geom_ops; -#else -extern vdev_ops_t vdev_disk_ops; -#endif -extern vdev_ops_t vdev_file_ops; -extern vdev_ops_t vdev_missing_ops; -extern vdev_ops_t vdev_hole_ops; -extern vdev_ops_t vdev_spare_ops; -extern vdev_ops_t vdev_indirect_ops; - -/* - * Common size functions - */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, - range_seg_t *out); -extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); -extern uint64_t vdev_get_min_asize(vdev_t *vd); -extern void vdev_set_min_asize(vdev_t *vd); - -/* - * Global variables - */ -extern int vdev_standard_sm_blksz; -/* zdb uses this tunable, so it must be declared here to make lint happy. */ -extern int zfs_vdev_cache_size; -extern uint_t zfs_geom_probe_vdev_key; - -/* - * Functions from vdev_indirect.c - */ -extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); -extern boolean_t vdev_indirect_should_condense(vdev_t *vd); -extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); -extern int vdev_obsolete_sm_object(vdev_t *vd); -extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); - -#ifdef illumos -/* - * Other miscellaneous functions - */ -int vdev_checkpoint_sm_object(vdev_t *vd); - -/* - * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. 
- */ -typedef struct vdev_buf { - buf_t vb_buf; /* buffer that describes the io */ - zio_t *vb_io; /* pointer back to the original zio_t */ -} vdev_buf_t; -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h deleted file mode 100644 index 987b14485d2b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H -#define _SYS_VDEV_INDIRECT_BIRTHS_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_indirect_birth_entry_phys { - uint64_t vibe_offset; - uint64_t vibe_phys_birth_txg; -} vdev_indirect_birth_entry_phys_t; - -typedef struct vdev_indirect_birth_phys { - uint64_t vib_count; /* count of v_i_b_entry_phys_t's */ -} vdev_indirect_birth_phys_t; - -typedef struct vdev_indirect_births { - uint64_t vib_object; - - /* - * Each entry indicates that everything up to but not including - * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted - * by increasing phys_birth, and also by increasing offset. See - * vdev_indirect_births_physbirth for usage. 
- */ - vdev_indirect_birth_entry_phys_t *vib_entries; - - objset_t *vib_objset; - - dmu_buf_t *vib_dbuf; - vdev_indirect_birth_phys_t *vib_phys; -} vdev_indirect_births_t; - -extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os, - uint64_t object); -extern void vdev_indirect_births_close(vdev_indirect_births_t *vib); -extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib); -extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx); -extern void vdev_indirect_births_free(objset_t *os, uint64_t object, - dmu_tx_t *tx); - -extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib); -extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib); - -extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, - uint64_t offset, uint64_t txg, dmu_tx_t *tx); - -extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, - uint64_t offset, uint64_t asize); - -extern uint64_t vdev_indirect_births_last_entry_txg( - vdev_indirect_births_t *vib); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h deleted file mode 100644 index 7e42c1019504..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_VDEV_INDIRECT_MAPPING_H -#define _SYS_VDEV_INDIRECT_MAPPING_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_indirect_mapping_entry_phys { - /* - * Decode with DVA_MAPPING_* macros. - * Contains: - * the source offset (low 63 bits) - * the one-bit "mark", used for garbage collection (by zdb) - */ - uint64_t vimep_src; - - /* - * Note: the DVA's asize is 24 bits, and can thus store ranges - * up to 8GB. - */ - dva_t vimep_dst; -} vdev_indirect_mapping_entry_phys_t; - -#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \ - BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0) -#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \ - BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x) - -typedef struct vdev_indirect_mapping_entry { - vdev_indirect_mapping_entry_phys_t vime_mapping; - uint32_t vime_obsolete_count; - list_node_t vime_node; -} vdev_indirect_mapping_entry_t; - -/* - * This is stored in the bonus buffer of the mapping object, see comment of - * vdev_indirect_config for more details. - */ -typedef struct vdev_indirect_mapping_phys { - uint64_t vimp_max_offset; - uint64_t vimp_bytes_mapped; - uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */ - - /* - * For each entry in the mapping object, this object contains an - * entry representing the number of bytes of that mapping entry - * that were no longer in use by the pool at the time this indirect - * vdev was last condensed. - */ - uint64_t vimp_counts_object; -} vdev_indirect_mapping_phys_t; - -#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t)) - -typedef struct vdev_indirect_mapping { - uint64_t vim_object; - boolean_t vim_havecounts; - - /* - * An ordered array of all mapping entries, sorted by source offset. - * Note that vim_entries is needed during a removal (and contains - * mappings that have been synced to disk so far) to handle frees - * from the removing device. 
- */ - vdev_indirect_mapping_entry_phys_t *vim_entries; - - objset_t *vim_objset; - - dmu_buf_t *vim_dbuf; - vdev_indirect_mapping_phys_t *vim_phys; -} vdev_indirect_mapping_t; - -extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os, - uint64_t object); -extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx); -extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj, - dmu_tx_t *tx); - -extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_bytes_mapped( - vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim); - -/* - * Writes the given list of vdev_indirect_mapping_entry_t to the mapping - * then updates internal state. 
- */ -extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, - list_t *vime_list, dmu_tx_t *tx); - -extern vdev_indirect_mapping_entry_phys_t * - vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, - uint64_t offset); - -extern vdev_indirect_mapping_entry_phys_t * - vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, - uint64_t offset); - -extern uint32_t *vdev_indirect_mapping_load_obsolete_counts( - vdev_indirect_mapping_t *vim); -extern void vdev_indirect_mapping_load_obsolete_spacemap( - vdev_indirect_mapping_t *vim, - uint32_t *counts, space_map_t *obsolete_space_sm); -extern void vdev_indirect_mapping_increment_obsolete_count( - vdev_indirect_mapping_t *vim, - uint64_t offset, uint64_t asize, uint32_t *counts); -extern void vdev_indirect_mapping_free_obsolete_counts( - vdev_indirect_mapping_t *vim, uint32_t *counts); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h deleted file mode 100644 index db4b0572cd60..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_VDEV_INITIALIZE_H -#define _SYS_VDEV_INITIALIZE_H - -#ifdef __cplusplus -extern "C" { -#endif - -extern void vdev_initialize(vdev_t *vd); -extern void vdev_initialize_stop(vdev_t *vd, - vdev_initializing_state_t tgt_state); -extern void vdev_initialize_stop_all(vdev_t *vd, - vdev_initializing_state_t tgt_state); -extern void vdev_initialize_restart(vdev_t *vd); -extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, - range_seg_t *physical_rs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_INITIALIZE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h deleted file mode 100644 index e771e668fda6..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - */ - -#ifndef _SYS_VDEV_RAIDZ_H -#define _SYS_VDEV_RAIDZ_H - -#include -#ifdef illumos -#include -#ifdef _KERNEL -#include -#include -#include -#endif -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL -extern int vdev_raidz_physio(vdev_t *, - caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t); -#endif -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h deleted file mode 100644 index 3962237afdab..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_VDEV_REMOVAL_H -#define _SYS_VDEV_REMOVAL_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct spa_vdev_removal { - uint64_t svr_vdev_id; - uint64_t svr_max_offset_to_sync[TXG_SIZE]; - /* Thread performing a vdev removal. */ - kthread_t *svr_thread; - /* Segments left to copy from the current metaslab. 
*/ - range_tree_t *svr_allocd_segs; - kmutex_t svr_lock; - kcondvar_t svr_cv; - boolean_t svr_thread_exit; - - /* - * New mappings to write out each txg. - */ - list_t svr_new_segments[TXG_SIZE]; - - /* - * Ranges that were freed while a mapping was in flight. This is - * a subset of the ranges covered by vdev_im_new_segments. - */ - range_tree_t *svr_frees[TXG_SIZE]; - - /* - * Number of bytes which we have finished our work for - * in each txg. This could be data copied (which will be part of - * the mappings in vdev_im_new_segments), or data freed before - * we got around to copying it. - */ - uint64_t svr_bytes_done[TXG_SIZE]; - - /* List of leaf zap objects to be unlinked */ - nvlist_t *svr_zaplist; -} spa_vdev_removal_t; - -typedef struct spa_condensing_indirect { - /* - * New mappings to write out each txg. - */ - list_t sci_new_mapping_entries[TXG_SIZE]; - - vdev_indirect_mapping_t *sci_new_mapping; -} spa_condensing_indirect_t; - -extern int spa_remove_init(spa_t *); -extern void spa_restart_removal(spa_t *); -extern int spa_condense_init(spa_t *); -extern void spa_condense_fini(spa_t *); -extern void spa_start_indirect_condensing_thread(spa_t *); -extern void spa_vdev_condense_suspend(spa_t *); -extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); -extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t); -extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); -extern void svr_sync(spa_t *spa, dmu_tx_t *tx); -extern void spa_vdev_remove_suspend(spa_t *); -extern int spa_vdev_remove_cancel(spa_t *); -extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); - -extern int vdev_removal_max_span; -extern int zfs_remove_max_segment; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_REMOVAL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h deleted file mode 100644 index e60233b4b103..000000000000 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ /dev/null @@ -1,514 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -#ifndef _SYS_ZAP_H -#define _SYS_ZAP_H - -/* - * ZAP - ZFS Attribute Processor - * - * The ZAP is a module which sits on top of the DMU (Data Management - * Unit) and implements a higher-level storage primitive using DMU - * objects. Its primary consumer is the ZPL (ZFS Posix Layer). - * - * A "zapobj" is a DMU object which the ZAP uses to stores attributes. - * Users should use only zap routines to access a zapobj - they should - * not access the DMU object directly using DMU routines. - * - * The attributes stored in a zapobj are name-value pairs. The name is - * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including - * terminating NULL). The value is an array of integers, which may be - * 1, 2, 4, or 8 bytes long. The total space used by the array (number - * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. 
- * Note that an 8-byte integer value can be used to store the location - * (object number) of another dmu object (which may be itself a zapobj). - * Note that you can use a zero-length attribute to store a single bit - * of information - the attribute is present or not. - * - * The ZAP routines are thread-safe. However, you must observe the - * DMU's restriction that a transaction may not be operated on - * concurrently. - * - * Any of the routines that return an int may return an I/O error (EIO - * or ECHECKSUM). - * - * - * Implementation / Performance Notes: - * - * The ZAP is intended to operate most efficiently on attributes with - * short (49 bytes or less) names and single 8-byte values, for which - * the microzap will be used. The ZAP should be efficient enough so - * that the user does not need to cache these attributes. - * - * The ZAP's locking scheme makes its routines thread-safe. Operations - * on different zapobjs will be processed concurrently. Operations on - * the same zapobj which only read data will be processed concurrently. - * Operations on the same zapobj which modify data will be processed - * concurrently when there are many attributes in the zapobj (because - * the ZAP uses per-block locking - more than 128 * (number of cpus) - * small attributes will suffice). - */ - -/* - * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C - * strings) for the names of attributes, rather than a byte string - * bounded by an explicit length. If some day we want to support names - * in character sets which have embedded zeros (eg. UTF-16, UTF-32), - * we'll have to add routines for using length-bounded strings. - */ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Specifies matching criteria for ZAP lookups. - * MT_NORMALIZE Use ZAP normalization flags, which can include both - * unicode normalization and case-insensitivity. 
- * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is - * specified and ZAP normalization flags include - * U8_TEXTPREP_TOUPPER. - */ -typedef enum matchtype { - MT_NORMALIZE = 1 << 0, - MT_MATCH_CASE = 1 << 1, -} matchtype_t; - -typedef enum zap_flags { - /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ - ZAP_FLAG_HASH64 = 1 << 0, - /* Key is binary, not string (zap_add_uint64() can be used) */ - ZAP_FLAG_UINT64_KEY = 1 << 1, - /* - * First word of key (which must be an array of uint64) is - * already randomly distributed. - */ - ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, -} zap_flags_t; - -/* - * Create a new zapobj with no attributes and return its object number. - * - * dnodesize specifies the on-disk size of the dnode for the new zapobj. - * Valid values are multiples of 512 up to DNODE_MAX_SIZE. - */ -uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, - dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, - zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, - int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, - uint64_t parent_obj, const char *name, dmu_tx_t *tx); -uint64_t zap_create_link_dnsize(objset_t *os, 
dmu_object_type_t ot, - uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, - uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); - -/* - * Initialize an already-allocated object. - */ -void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, - zap_flags_t flags, dmu_tx_t *tx); - -/* - * Create a new zapobj with no attributes from the given (unallocated) - * object number. - */ -int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); -int zap_create_claim_norm(objset_t *ds, uint64_t obj, - int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, - int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); - -/* - * The zapobj passed in must be a valid ZAP object for all of the - * following routines. - */ - -/* - * Destroy this zapobj and all its attributes. - * - * Frees the object number using dmu_object_free. - */ -int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - -/* - * Manipulate attributes. - * - * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. - */ - -/* - * Retrieve the contents of the attribute with the given name. - * - * If the requested attribute does not exist, the call will fail and - * return ENOENT. - * - * If 'integer_size' is smaller than the attribute's integer size, the - * call will fail and return EINVAL. - * - * If 'integer_size' is equal to or larger than the attribute's integer - * size, the call will succeed and return 0. - * - * When converting to a larger integer size, the integers will be treated as - * unsigned (ie. 
no sign-extension will be performed). - * - * 'num_integers' is the length (in integers) of 'buf'. - * - * If the attribute is longer than the buffer, as many integers as will - * fit will be transferred to 'buf'. If the entire attribute was not - * transferred, the call will return EOVERFLOW. - */ -int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); - -/* - * If rn_len is nonzero, realname will be set to the name of the found - * entry (which may be different from the requested name if matchtype is - * not MT_EXACT). - * - * If normalization_conflictp is not NULL, it will be set if there is - * another name with the same case/unicode normalized form. - */ -int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *normalization_conflictp); -int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); -int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); -int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints); -int zap_lookup_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); -int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp); - -int zap_count_write_by_dnode(dnode_t *dn, const char *name, - int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite); - -/* - * Create an attribute with the given name and value. - * - * If an attribute with the given name already exists, the call will - * fail and return EEXIST. 
- */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); -int zap_add_by_dnode(dnode_t *dn, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); -int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); - -/* - * Set the attribute with the given name to the given value. If an - * attribute with the given name does not exist, it will be created. If - * an attribute with the given name already exists, the previous value - * will be overwritten. The integer_size may be different from the - * existing attribute's integer size, in which case the attribute's - * integer size will be updated to the new value. - */ -int zap_update(objset_t *ds, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); - -/* - * Get the length (in integers) and the integer size of the specified - * attribute. - * - * If the requested attribute does not exist, the call will fail and - * return ENOENT. - */ -int zap_length(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers); -int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers); - -/* - * Remove the specified attribute. - * - * If the specified attribute does not exist, the call will fail and - * return ENOENT. 
- */ -int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); -int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, - matchtype_t mt, dmu_tx_t *tx); -int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); -int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, dmu_tx_t *tx); - -/* - * Returns (in *count) the number of attributes in the specified zap - * object. - */ -int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); - -/* - * Returns (in name) the name of the entry whose (value & mask) - * (za_first_integer) is value, or ENOENT if not found. The string - * pointed to by name must be at least 256 bytes long. If mask==0, the - * match must be exact (ie, same as mask=-1ULL). - */ -int zap_value_search(objset_t *os, uint64_t zapobj, - uint64_t value, uint64_t mask, char *name); - -/* - * Transfer all the entries from fromobj into intoobj. Only works on - * int_size=8 num_integers=1 values. Fails if there are any duplicated - * entries. - */ -int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); - -/* Same as zap_join, but set the values to 'value'. */ -int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - uint64_t value, dmu_tx_t *tx); - -/* Same as zap_join, but add together any duplicated entries. */ -int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - dmu_tx_t *tx); - -/* - * Manipulate entries where the name + value are the "same" (the name is - * a stringified version of the value). - */ -int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); -int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); -int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); -int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx); - -/* Here the key is an int and the value is a different int. 
*/ -int zap_add_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx); -int zap_update_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx); -int zap_lookup_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t *valuep); - -int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, - dmu_tx_t *tx); - -struct zap; -struct zap_leaf; -typedef struct zap_cursor { - /* This structure is opaque! */ - objset_t *zc_objset; - struct zap *zc_zap; - struct zap_leaf *zc_leaf; - uint64_t zc_zapobj; - uint64_t zc_serialized; - uint64_t zc_hash; - uint32_t zc_cd; - boolean_t zc_prefetch; -} zap_cursor_t; - -typedef struct { - int za_integer_length; - /* - * za_normalization_conflict will be set if there are additional - * entries with this normalized form (eg, "foo" and "Foo"). - */ - boolean_t za_normalization_conflict; - uint64_t za_num_integers; - uint64_t za_first_integer; /* no sign extension for <8byte ints */ - char za_name[ZAP_MAXNAMELEN]; -} zap_attribute_t; - -/* - * The interface for listing all the attributes of a zapobj can be - * thought of as cursor moving down a list of the attributes one by - * one. The cookie returned by the zap_cursor_serialize routine is - * persistent across system calls (and across reboot, even). - */ - -/* - * Initialize a zap cursor, pointing to the "first" attribute of the - * zapobj. You must _fini the cursor when you are done with it. - */ -void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); -void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, - uint64_t zapobj); -void zap_cursor_fini(zap_cursor_t *zc); - -/* - * Get the attribute currently pointed to by the cursor. Returns - * ENOENT if at the end of the attributes. - */ -int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); - -/* - * Advance the cursor to the next attribute. 
- */ -void zap_cursor_advance(zap_cursor_t *zc); - -/* - * Get a persistent cookie pointing to the current position of the zap - * cursor. The low 4 bits in the cookie are always zero, and thus can - * be used as to differentiate a serialized cookie from a different type - * of value. The cookie will be less than 2^32 as long as there are - * fewer than 2^22 (4.2 million) entries in the zap object. - */ -uint64_t zap_cursor_serialize(zap_cursor_t *zc); - -/* - * Advance the cursor to the attribute having the given key. - */ -int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); - -/* - * Initialize a zap cursor pointing to the position recorded by - * zap_cursor_serialize (in the "serialized" argument). You can also - * use a "serialized" argument of 0 to start at the beginning of the - * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to - * zap_cursor_init(...).) - */ -void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, - uint64_t zapobj, uint64_t serialized); - - -#define ZAP_HISTOGRAM_SIZE 10 - -typedef struct zap_stats { - /* - * Size of the pointer table (in number of entries). - * This is always a power of 2, or zero if it's a microzap. - * In general, it should be considerably greater than zs_num_leafs. - */ - uint64_t zs_ptrtbl_len; - - uint64_t zs_blocksize; /* size of zap blocks */ - - /* - * The number of blocks used. Note that some blocks may be - * wasted because old ptrtbl's and large name/value blocks are - * not reused. (Although their space is reclaimed, we don't - * reuse those offsets in the object.) 
- */ - uint64_t zs_num_blocks; - - /* - * Pointer table values from zap_ptrtbl in the zap_phys_t - */ - uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ - uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ - uint64_t zs_ptrtbl_zt_blk; /* starting block number */ - uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ - uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ - - /* - * Values of the other members of the zap_phys_t - */ - uint64_t zs_block_type; /* ZBT_HEADER */ - uint64_t zs_magic; /* ZAP_MAGIC */ - uint64_t zs_num_leafs; /* The number of leaf blocks */ - uint64_t zs_num_entries; /* The number of zap entries */ - uint64_t zs_salt; /* salt to stir into hash function */ - - /* - * Histograms. For all histograms, the last index - * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater - * than what can be represented. For example - * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number - * of leafs with more than 45 entries. - */ - - /* - * zs_leafs_with_n_pointers[n] is the number of leafs with - * 2^n pointers to it. - */ - uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_leafs_with_n_entries[n] is the number of leafs with - * [n*5, (n+1)*5) entries. In the current implementation, there - * can be at most 55 entries in any block, but there may be - * fewer if the name or value is large, or the block is not - * completely full. - */ - uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_leafs_n_tenths_full[n] is the number of leafs whose - * fullness is in the range [n/10, (n+1)/10). - */ - uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_entries_using_n_chunks[n] is the number of entries which - * consume n 24-byte chunks. (Note, large names/values only use - * one chunk, but contribute to zs_num_blocks_large.) 
- */ - uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_buckets_with_n_entries[n] is the number of buckets (each - * leaf has 64 buckets) with n entries. - * zs_buckets_with_n_entries[1] should be very close to - * zs_num_entries. - */ - uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; -} zap_stats_t; - -/* - * Get statistics about a ZAP object. Note: you need to be aware of the - * internal implementation of the ZAP to correctly interpret some of the - * statistics. This interface shouldn't be relied on unless you really - * know what you're doing. - */ -int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h deleted file mode 100644 index 912b8b219c4c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ /dev/null @@ -1,242 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. 
All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - */ - -#ifndef _SYS_ZAP_IMPL_H -#define _SYS_ZAP_IMPL_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern int fzap_default_block_shift; - -#define ZAP_MAGIC 0x2F52AB2ABULL - -#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) - -#define MZAP_ENT_LEN 64 -#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) -#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE - -#define ZAP_NEED_CD (-1U) - -typedef struct mzap_ent_phys { - uint64_t mze_value; - uint32_t mze_cd; - uint16_t mze_pad; /* in case we want to chain them someday */ - char mze_name[MZAP_NAME_LEN]; -} mzap_ent_phys_t; - -typedef struct mzap_phys { - uint64_t mz_block_type; /* ZBT_MICRO */ - uint64_t mz_salt; - uint64_t mz_normflags; - uint64_t mz_pad[5]; - mzap_ent_phys_t mz_chunk[1]; - /* actually variable size depending on block size */ -} mzap_phys_t; - -typedef struct mzap_ent { - avl_node_t mze_node; - int mze_chunkid; - uint64_t mze_hash; - uint32_t mze_cd; /* copy from mze_phys->mze_cd */ -} mzap_ent_t; - -#define MZE_PHYS(zap, mze) \ - (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) - -/* - * The (fat) zap is stored in one object. It is an array of - * 1<= 6] [zap_leaf_t] [ptrtbl] ... - * - */ - -struct dmu_buf; -struct zap_leaf; - -#define ZBT_LEAF ((1ULL << 63) + 0) -#define ZBT_HEADER ((1ULL << 63) + 1) -#define ZBT_MICRO ((1ULL << 63) + 3) -/* any other values are ptrtbl blocks */ - -/* - * the embedded pointer table takes up half a block: - * block size / entry size (2^3) / 2 - */ -#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) - -/* - * The embedded pointer table starts half-way through the block. 
Since - * the pointer table itself is half the block, it starts at (64-bit) - * word number (1<zap_dbuf->db_data); -} - -inline mzap_phys_t * -zap_m_phys(zap_t *zap) -{ - return (zap->zap_dbuf->db_data); -} - -typedef struct zap_name { - zap_t *zn_zap; - int zn_key_intlen; - const void *zn_key_orig; - int zn_key_orig_numints; - const void *zn_key_norm; - int zn_key_norm_numints; - uint64_t zn_hash; - matchtype_t zn_matchtype; - int zn_normflags; - char zn_normbuf[ZAP_MAXNAMELEN]; -} zap_name_t; - -#define zap_f zap_u.zap_fat -#define zap_m zap_u.zap_micro - -boolean_t zap_match(zap_name_t *zn, const char *matchname); -int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp); -void zap_unlockdir(zap_t *zap, void *tag); -void zap_evict_sync(void *dbu); -zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); -void zap_name_free(zap_name_t *zn); -int zap_hashbits(zap_t *zap); -uint32_t zap_maxcd(zap_t *zap); -uint64_t zap_getflags(zap_t *zap); - -#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 
0 : ((hash) >> (64 - (n)))) - -void fzap_byteswap(void *buf, size_t size); -int fzap_count(zap_t *zap, uint64_t *count); -int fzap_lookup(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, void *buf, - char *realname, int rn_len, boolean_t *normalization_conflictp); -void fzap_prefetch(zap_name_t *zn); -int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, void *tag, dmu_tx_t *tx); -int fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, - void *tag, dmu_tx_t *tx); -int fzap_length(zap_name_t *zn, - uint64_t *integer_size, uint64_t *num_integers); -int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); -int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); -void fzap_get_stats(zap_t *zap, zap_stats_t *zs); -void zap_put_leaf(struct zap_leaf *l); - -int fzap_add_cd(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, void *tag, dmu_tx_t *tx); -void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); -int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h deleted file mode 100644 index 76b3ecc72557..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -#ifndef _SYS_ZAP_LEAF_H -#define _SYS_ZAP_LEAF_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct zap; -struct zap_name; -struct zap_stats; - -#define ZAP_LEAF_MAGIC 0x2AB1EAF - -/* chunk size = 24 bytes */ -#define ZAP_LEAF_CHUNKSIZE 24 - -/* - * The amount of space available for chunks is: - * block size (1<l_bs) - hash entry size (2) * number of hash - * entries - header space (2*chunksize) - */ -#define ZAP_LEAF_NUMCHUNKS(l) \ - (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ - ZAP_LEAF_CHUNKSIZE - 2) - -/* - * The amount of space within the chunk available for the array is: - * chunk size - space for type (1) - space for next pointer (2) - */ -#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) - -#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ - (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) - -/* - * Low water mark: when there are only this many chunks free, start - * growing the ptrtbl. Ideally, this should be larger than a - * "reasonably-sized" entry. 20 chunks is more than enough for the - * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), - * while still being only around 3% for 16k blocks. - */ -#define ZAP_LEAF_LOW_WATER (20) - -/* - * The leaf hash table has block size / 2^5 (32) number of entries, - * which should be more than enough for the maximum number of entries, - * which is less than block size / CHUNKSIZE (24) / minimum number of - * chunks per entry (3). 
- */ -#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) -#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) - -/* - * The chunks start immediately after the hash table. The end of the - * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a - * chunk_t. - */ -#define ZAP_LEAF_CHUNK(l, idx) \ - ((zap_leaf_chunk_t *) \ - (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] -#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) - -typedef enum zap_chunk_type { - ZAP_CHUNK_FREE = 253, - ZAP_CHUNK_ENTRY = 252, - ZAP_CHUNK_ARRAY = 251, - ZAP_CHUNK_TYPE_MAX = 250 -} zap_chunk_type_t; - -#define ZLF_ENTRIES_CDSORTED (1<<0) - -/* - * TAKE NOTE: - * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. - */ -typedef struct zap_leaf_phys { - struct zap_leaf_header { - /* Public to ZAP */ - uint64_t lh_block_type; /* ZBT_LEAF */ - uint64_t lh_pad1; - uint64_t lh_prefix; /* hash prefix of this leaf */ - uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ - uint16_t lh_nfree; /* number free chunks */ - uint16_t lh_nentries; /* number of entries */ - uint16_t lh_prefix_len; /* num bits used to id this */ - - /* Private to zap_leaf */ - uint16_t lh_freelist; /* chunk head of free list */ - uint8_t lh_flags; /* ZLF_* flags */ - uint8_t lh_pad2[11]; - } l_hdr; /* 2 24-byte chunks */ - - /* - * The header is followed by a hash table with - * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is - * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) - * zap_leaf_chunk structures. These structures are accessed - * with the ZAP_LEAF_CHUNK() macro. 
- */ - - uint16_t l_hash[1]; -} zap_leaf_phys_t; - -typedef union zap_leaf_chunk { - struct zap_leaf_entry { - uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ - uint8_t le_value_intlen; /* size of value's ints */ - uint16_t le_next; /* next entry in hash chain */ - uint16_t le_name_chunk; /* first chunk of the name */ - uint16_t le_name_numints; /* ints in name (incl null) */ - uint16_t le_value_chunk; /* first chunk of the value */ - uint16_t le_value_numints; /* value length in ints */ - uint32_t le_cd; /* collision differentiator */ - uint64_t le_hash; /* hash value of the name */ - } l_entry; - struct zap_leaf_array { - uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ - uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; - uint16_t la_next; /* next blk or CHAIN_END */ - } l_array; - struct zap_leaf_free { - uint8_t lf_type; /* always ZAP_CHUNK_FREE */ - uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; - uint16_t lf_next; /* next in free list, or CHAIN_END */ - } l_free; -} zap_leaf_chunk_t; - -typedef struct zap_leaf { - dmu_buf_user_t l_dbu; - krwlock_t l_rwlock; - uint64_t l_blkid; /* 1<l_dbuf->db_data); -} - -typedef struct zap_entry_handle { - /* Set by zap_leaf and public to ZAP */ - uint64_t zeh_num_integers; - uint64_t zeh_hash; - uint32_t zeh_cd; - uint8_t zeh_integer_size; - - /* Private to zap_leaf */ - uint16_t zeh_fakechunk; - uint16_t *zeh_chunkp; - zap_leaf_t *zeh_leaf; -} zap_entry_handle_t; - -/* - * Return a handle to the named entry, or ENOENT if not found. The hash - * value must equal zap_hash(name). - */ -extern int zap_leaf_lookup(zap_leaf_t *l, - struct zap_name *zn, zap_entry_handle_t *zeh); - -/* - * Return a handle to the entry with this hash+cd, or the entry with the - * next closest hash+cd. - */ -extern int zap_leaf_lookup_closest(zap_leaf_t *l, - uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); - -/* - * Read the first num_integers in the attribute. Integer size - * conversion will be done without sign extension. 
Return EINVAL if - * integer_size is too small. Return EOVERFLOW if there are more than - * num_integers in the attribute. - */ -extern int zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf); - -extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, - uint16_t buflen, char *buf); - -/* - * Replace the value of an existing entry. - * - * May fail if it runs out of space (ENOSPC). - */ -extern int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf); - -/* - * Remove an entry. - */ -extern void zap_entry_remove(zap_entry_handle_t *zeh); - -/* - * Create an entry. An equal entry must not exist, and this entry must - * belong in this leaf (according to its hash value). Fills in the - * entry handle on success. Returns 0 on success or ENOSPC on failure. - */ -extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh); - -/* Determine whether there is another entry with the same normalized form. */ -extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, - struct zap_name *zn, const char *name, struct zap *zap); - -/* - * Other stuff. 
- */ - -extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); -extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); -extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); -extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, - struct zap_stats *zs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_LEAF_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h deleted file mode 100644 index a6cb575b5b62..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_ZCP_H -#define _SYS_ZCP_H - -#include -#include - -#include "lua.h" -#include "lualib.h" -#include "lauxlib.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZCP_RUN_INFO_KEY "runinfo" - -extern uint64_t zfs_lua_max_instrlimit; -extern uint64_t zfs_lua_max_memlimit; - -int zcp_argerror(lua_State *, int, const char *, ...); - -int zcp_eval(const char *, const char *, boolean_t, uint64_t, uint64_t, - nvpair_t *, nvlist_t *); - -int zcp_load_list_lib(lua_State *); - -int zcp_load_synctask_lib(lua_State *, boolean_t); - -typedef void (zcp_cleanup_t)(void *); -typedef struct zcp_cleanup_handler { - zcp_cleanup_t *zch_cleanup_func; - void *zch_cleanup_arg; - list_node_t zch_node; -} zcp_cleanup_handler_t; - -typedef struct zcp_alloc_arg { - boolean_t aa_must_succeed; - int64_t aa_alloc_remaining; - int64_t aa_alloc_limit; -} zcp_alloc_arg_t; - -typedef struct zcp_run_info { - dsl_pool_t *zri_pool; - - /* - * An estimate of the total amount of space consumed by all - * synctasks we have successfully performed so far in this - * channel program. Used to generate ENOSPC errors for syncfuncs. - */ - int zri_space_used; - - /* - * The credentials of the thread which originally invoked the channel - * program. Since channel programs are always invoked from the synctask - * thread they should always do permissions checks against this cred - * rather than the 'current' thread's. - */ - cred_t *zri_cred; - - /* - * The tx in which this channel program is running. - */ - dmu_tx_t *zri_tx; - - /* - * The maximum number of Lua instructions the channel program is allowed - * to execute. If it takes longer than this it will time out. A value - * of 0 indicates no instruction limit. - */ - uint64_t zri_maxinstrs; - - /* - * The number of Lua instructions the channel program has executed. - */ - uint64_t zri_curinstrs; - - /* - * Boolean indicating whether or not the channel program exited - * because it timed out. 
- */ - boolean_t zri_timed_out; - - /* - * Channel program was canceled by user - */ - boolean_t zri_canceled; - - /* - * Boolean indicating whether or not we are running in syncing - * context. - */ - boolean_t zri_sync; - - /* - * List of currently registered cleanup handlers, which will be - * triggered in the event of a fatal error. - */ - list_t zri_cleanup_handlers; - - /* - * The Lua state context of our channel program. - */ - lua_State *zri_state; - - /* - * Lua memory allocator arguments. - */ - zcp_alloc_arg_t *zri_allocargs; - - /* - * Contains output values from zcp script or error string. - */ - nvlist_t *zri_outnvl; - - /* - * The errno number returned to caller of zcp_eval(). - */ - int zri_result; -} zcp_run_info_t; - -zcp_run_info_t *zcp_run_info(lua_State *); -zcp_cleanup_handler_t *zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *); -void zcp_deregister_cleanup(lua_State *, zcp_cleanup_handler_t *); -void zcp_cleanup(lua_State *); - -/* - * Argument parsing routines for channel program callback functions. - */ -typedef struct zcp_arg { - /* - * The name of this argument. For keyword arguments this is the name - * functions will use to set the argument. For positional arguments - * the name has no programatic meaning, but will appear in error - * messages and help output. - */ - const char *za_name; - - /* - * The Lua type this argument should have (e.g. LUA_TSTRING, - * LUA_TBOOLEAN) see the lua_type() function documentation for a - * complete list. Calling a function with an argument that does - * not match the expected type will result in the program terminating. 
- */ - const int za_lua_type; -} zcp_arg_t; - -void zcp_parse_args(lua_State *, const char *, const zcp_arg_t *, - const zcp_arg_t *); -int zcp_nvlist_to_lua(lua_State *, nvlist_t *, char *, int); -int zcp_dataset_hold_error(lua_State *, dsl_pool_t *, const char *, int); -struct dsl_dataset *zcp_dataset_hold(lua_State *, dsl_pool_t *, - const char *, void *); - -typedef int (zcp_lib_func_t)(lua_State *); -typedef struct zcp_lib_info { - const char *name; - zcp_lib_func_t *func; - const zcp_arg_t pargs[4]; - const zcp_arg_t kwargs[2]; -} zcp_lib_info_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h deleted file mode 100644 index e227f2f4b7f5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_ZCP_GLOBALS_H -#define _SYS_ZCP_GLOBALS_H - -#include "lua.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void zcp_load_globals(lua_State *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_GLOBALS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h deleted file mode 100644 index a021e1ce8917..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_ZCP_LIST_H -#define _SYS_ZCP_LIST_H - -#include "lua.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void zcp_load_list_funcs(lua_State *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_LIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h deleted file mode 100644 index 97b17619565c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZCP_PROP_H -#define _SYS_ZCP_PROP_H - -#ifdef __cplusplus -extern "C" { -#endif - -int zcp_load_get_lib(lua_State *state); -boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_PROP_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h deleted file mode 100644 index 5abde149a615..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZFEATURE_H -#define _SYS_ZFEATURE_H - -#include -#include -#include "zfeature_common.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES) -#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \ - VALID_FEATURE_FID(fid)) - -struct spa; -struct dmu_tx; -struct objset; - -extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); -extern void spa_feature_enable(struct spa *, spa_feature_t, - struct dmu_tx *); -extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *); -extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *); -extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t); -extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t); -extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, - uint64_t *txg); -extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t); -extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *); - -/* - * These functions are only exported for zhack and zdb; normal callers should - * use the above interfaces. 
- */ -extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *); -extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, - uint64_t *res); -extern void feature_enable_sync(struct spa *, zfeature_info_t *, - struct dmu_tx *); -extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t, - struct dmu_tx *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFEATURE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h deleted file mode 100644 index b34360a3c821..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#ifndef _SYS_FS_ZFS_ACL_H -#define _SYS_FS_ZFS_ACL_H - -#ifdef _KERNEL -#include -#endif -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct znode_phys; - -#define ACE_SLOT_CNT 6 -#define ZFS_ACL_VERSION_INITIAL 0ULL -#define ZFS_ACL_VERSION_FUID 1ULL -#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID - -/* - * ZFS ACLs (Access Control Lists) are stored in various forms. - * - * Files created with ACL version ZFS_ACL_VERSION_INITIAL - * will all be created with fixed length ACEs of type - * zfs_oldace_t. - * - * Files with ACL version ZFS_ACL_VERSION_FUID will be created - * with various sized ACEs. The abstraction entries will utilize - * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t - * and some specialized CIFS ACEs will use zfs_object_ace_t. - */ - -/* - * All ACEs have a common hdr. For - * owner@, group@, and everyone@ this is all - * thats needed. - */ -typedef struct zfs_ace_hdr { - uint16_t z_type; - uint16_t z_flags; - uint32_t z_access_mask; -} zfs_ace_hdr_t; - -typedef zfs_ace_hdr_t zfs_ace_abstract_t; - -/* - * Standard ACE - */ -typedef struct zfs_ace { - zfs_ace_hdr_t z_hdr; - uint64_t z_fuid; -} zfs_ace_t; - -/* - * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE - * and will only be set/retrieved in a CIFS context. 
- */ - -typedef struct zfs_object_ace { - zfs_ace_t z_ace; - uint8_t z_object_type[16]; /* object type */ - uint8_t z_inherit_type[16]; /* inherited object type */ -} zfs_object_ace_t; - -typedef struct zfs_oldace { - uint32_t z_fuid; /* "who" */ - uint32_t z_access_mask; /* access mask */ - uint16_t z_flags; /* flags, i.e inheritance */ - uint16_t z_type; /* type of entry allow/deny */ -} zfs_oldace_t; - -typedef struct zfs_acl_phys_v0 { - uint64_t z_acl_extern_obj; /* ext acl pieces */ - uint32_t z_acl_count; /* Number of ACEs */ - uint16_t z_acl_version; /* acl version */ - uint16_t z_acl_pad; /* pad */ - zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ -} zfs_acl_phys_v0_t; - -#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) - -/* - * Size of ACL count is always 2 bytes. - * Necessary to for dealing with both V0 ACL and V1 ACL layout - */ -#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t)) - -typedef struct zfs_acl_phys { - uint64_t z_acl_extern_obj; /* ext acl pieces */ - uint32_t z_acl_size; /* Number of bytes in ACL */ - uint16_t z_acl_version; /* acl version */ - uint16_t z_acl_count; /* ace count */ - uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ -} zfs_acl_phys_t; - -typedef struct acl_ops { - uint32_t (*ace_mask_get) (void *acep); /* get access mask */ - void (*ace_mask_set) (void *acep, - uint32_t mask); /* set access mask */ - uint16_t (*ace_flags_get) (void *acep); /* get flags */ - void (*ace_flags_set) (void *acep, - uint16_t flags); /* set flags */ - uint16_t (*ace_type_get)(void *acep); /* get type */ - void (*ace_type_set)(void *acep, - uint16_t type); /* set type */ - uint64_t (*ace_who_get)(void *acep); /* get who/fuid */ - void (*ace_who_set)(void *acep, - uint64_t who); /* set who/fuid */ - size_t (*ace_size)(void *acep); /* how big is this ace */ - size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ - int (*ace_mask_off)(void); /* off of access mask in ace */ - /* ptr to data if any */ - int 
(*ace_data)(void *acep, void **datap); -} acl_ops_t; - -/* - * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's. - * Each node will have one or more ACEs associated with it. You will - * only have multiple nodes during a chmod operation. Normally only - * one node is required. - */ -typedef struct zfs_acl_node { - list_node_t z_next; /* Next chunk of ACEs */ - void *z_acldata; /* pointer into actual ACE(s) */ - void *z_allocdata; /* pointer to kmem allocated memory */ - size_t z_allocsize; /* Size of blob in bytes */ - size_t z_size; /* length of ACL data */ - uint64_t z_ace_count; /* number of ACEs in this acl node */ - int z_ace_idx; /* ace iterator positioned on */ -} zfs_acl_node_t; - -typedef struct zfs_acl { - uint64_t z_acl_count; /* Number of ACEs */ - size_t z_acl_bytes; /* Number of bytes in ACL */ - uint_t z_version; /* version of ACL */ - void *z_next_ace; /* pointer to next ACE */ - uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ - zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ - list_t z_acl; /* chunks of ACE data */ - acl_ops_t z_ops; /* ACL operations */ -} zfs_acl_t; - -typedef struct acl_locator_cb { - zfs_acl_t *cb_aclp; - zfs_acl_node_t *cb_acl_node; -} zfs_acl_locator_cb_t; - -#define ACL_DATA_ALLOCED 0x1 -#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) - -struct zfs_fuid_info; - -typedef struct zfs_acl_ids { - uint64_t z_fuid; /* file owner fuid */ - uint64_t z_fgid; /* file group owner fuid */ - uint64_t z_mode; /* mode to set on create */ - zfs_acl_t *z_aclp; /* ACL to create with file */ - struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ -} zfs_acl_ids_t; - -/* - * Property values for acl_mode and acl_inherit. - * - * acl_mode can take discard, noallow, groupmask and passthrough. - * whereas acl_inherit has secure instead of groupmask. 
- */ - -#define ZFS_ACL_DISCARD 0 -#define ZFS_ACL_NOALLOW 1 -#define ZFS_ACL_GROUPMASK 2 -#define ZFS_ACL_PASSTHROUGH 3 -#define ZFS_ACL_RESTRICTED 4 -#define ZFS_ACL_PASSTHROUGH_X 5 - -struct znode; -struct zfsvfs; - -#ifdef _KERNEL -int zfs_acl_ids_create(struct znode *, int, vattr_t *, - cred_t *, vsecattr_t *, zfs_acl_ids_t *); -void zfs_acl_ids_free(zfs_acl_ids_t *); -boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *); -int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); -int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); -void zfs_acl_rele(void *); -void zfs_oldace_byteswap(ace_t *, int); -void zfs_ace_byteswap(void *, size_t, boolean_t); -extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); -extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); -#ifdef illumos -int zfs_fastaccesschk_execute(struct znode *, cred_t *); -#endif -int zfs_freebsd_fastaccesschk_execute(struct vnode *, cred_t *); -extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); -extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); -extern int zfs_acl_access(struct znode *, int, cred_t *); -int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); -int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); -int zfs_zaccess_rename(struct znode *, struct znode *, - struct znode *, struct znode *, cred_t *cr); -void zfs_acl_free(zfs_acl_t *); -int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, - struct zfs_fuid_info **, zfs_acl_t **); -int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); -uint64_t zfs_external_acl(struct znode *); -int zfs_znode_acl_version(struct znode *); -int zfs_acl_size(struct znode *, int *); -zfs_acl_t *zfs_acl_alloc(int); -zfs_acl_node_t *zfs_acl_node_alloc(size_t); -void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *); -void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *); -uint64_t 
zfs_mode_compute(uint64_t, zfs_acl_t *, - uint64_t *, uint64_t, uint64_t); -int zfs_acl_chown_setattr(struct znode *); - -#endif - -#ifdef __cplusplus -} -#endif -#endif /* _SYS_FS_ZFS_ACL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h deleted file mode 100644 index 38fda1d40585..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_ZFS_CONTEXT_H -#define _SYS_ZFS_CONTEXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef illumos -#include -#endif -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define boot_ncpus (mp_ncpus) - -#define CPU_SEQID (curcpu) - -#define tsd_create(keyp, destructor) do { \ - *(keyp) = osd_thread_register((destructor)); \ - KASSERT(*(keyp) > 0, ("cannot register OSD")); \ -} while (0) -#define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) -#define tsd_get(key) osd_thread_get(curthread, (key)) -#define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) - -#ifdef __cplusplus -} -#endif - -extern int zfs_debug_level; -extern struct mtx zfs_debug_mtx; -#define ZFS_LOG(lvl, ...) 
do { \ - if (((lvl) & 0xff) <= zfs_debug_level) { \ - mtx_lock(&zfs_debug_mtx); \ - printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - if ((lvl) & 0x100) \ - kdb_backtrace(); \ - mtx_unlock(&zfs_debug_mtx); \ - } \ -} while (0) - -#define sys_shutdown rebooting - -#define noinline __attribute__((noinline)) -#define likely(x) __builtin_expect((x), 1) - -#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h deleted file mode 100644 index de770c52add0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#ifndef _ZFS_CTLDIR_H -#define _ZFS_CTLDIR_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_CTLDIR_NAME ".zfs" - -#define zfs_has_ctldir(zdp) \ - ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ - ((zdp)->z_zfsvfs->z_ctldir != NULL)) -#define zfs_show_ctldir(zdp) \ - (zfs_has_ctldir(zdp) && \ - ((zdp)->z_zfsvfs->z_show_ctldir)) - -void zfsctl_create(zfsvfs_t *); -void zfsctl_destroy(zfsvfs_t *); -int zfsctl_root(zfsvfs_t *, int, vnode_t **); -void zfsctl_init(void); -void zfsctl_fini(void); -boolean_t zfsctl_is_node(vnode_t *); - -int zfsctl_rename_snapshot(const char *from, const char *to); -int zfsctl_destroy_snapshot(const char *snapname, int force); -int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); - -int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); - -#define ZFSCTL_INO_ROOT 0x1 -#define ZFSCTL_INO_SNAPDIR 0x2 - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_CTLDIR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h deleted file mode 100644 index 9cbfc26b64e2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZFS_DEBUG_H -#define _SYS_ZFS_DEBUG_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -/* - * ZFS debugging - */ - -#if defined(DEBUG) || !defined(_KERNEL) -#if !defined(ZFS_DEBUG) -#define ZFS_DEBUG -#endif -#endif - -extern int zfs_flags; -extern boolean_t zfs_recover; -extern boolean_t zfs_free_leak_on_eio; - -#define ZFS_DEBUG_DPRINTF (1 << 0) -#define ZFS_DEBUG_DBUF_VERIFY (1 << 1) -#define ZFS_DEBUG_DNODE_VERIFY (1 << 2) -#define ZFS_DEBUG_SNAPNAMES (1 << 3) -#define ZFS_DEBUG_MODIFY (1 << 4) -/* 1<<5 was previously used, try not to reuse */ -#define ZFS_DEBUG_ZIO_FREE (1 << 6) -#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) -#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) -#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9) - -#ifdef ZFS_DEBUG -extern void __dprintf(const char *file, const char *func, - int line, const char *fmt, ...); -#define dprintf(...) \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) \ - __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) -#else -#define dprintf(...) 
((void)0) -#endif /* ZFS_DEBUG */ - -extern void zfs_panic_recover(const char *fmt, ...); - -typedef struct zfs_dbgmsg { - list_node_t zdm_node; - time_t zdm_timestamp; - char zdm_msg[1]; /* variable length allocation */ -} zfs_dbgmsg_t; - -extern void zfs_dbgmsg_init(void); -extern void zfs_dbgmsg_fini(void); -extern void zfs_dbgmsg(const char *fmt, ...); -extern void zfs_dbgmsg_print(const char *tag); - -#ifdef illumos -#ifndef _KERNEL -extern int dprintf_find_string(const char *string); -#endif -#endif /* illumos */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h deleted file mode 100644 index 22d8e603c433..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_FS_ZFS_DIR_H -#define _SYS_FS_ZFS_DIR_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* zfs_dirent_lock() flags */ -#define ZNEW 0x0001 /* entry should not exist */ -#define ZEXISTS 0x0002 /* entry should exist */ -#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ -#define ZXATTR 0x0008 /* we want the xattr dir */ -#define ZRENAMING 0x0010 /* znode is being renamed */ -#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ -#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ -#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ - -/* mknode flags */ -#define IS_ROOT_NODE 0x01 /* create a root node */ -#define IS_XATTR 0x02 /* create an extended attribute node */ - -extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); -extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); -extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, - boolean_t *); -#if 0 -extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int); -#else -extern int zfs_dirlook(znode_t *, const char *name, znode_t **); -#endif -extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, zfs_acl_ids_t *); -extern void zfs_rmnode(znode_t *); -extern boolean_t zfs_dirempty(znode_t *); -extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); -extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); -extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); -extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int); -extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h deleted file mode 100644 index b381bb98e734..000000000000 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_FS_ZFS_FUID_H -#define _SYS_FS_ZFS_FUID_H - -#include -#ifdef _KERNEL -#include -#include -#include -#endif -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - ZFS_OWNER, - ZFS_GROUP, - ZFS_ACE_USER, - ZFS_ACE_GROUP -} zfs_fuid_type_t; - -/* - * Estimate space needed for one more fuid table entry. - * for now assume its current size + 1K - */ -#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) - -#define FUID_INDEX(x) ((x) >> 32) -#define FUID_RID(x) ((x) & 0xffffffff) -#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) -/* - * FUIDs cause problems for the intent log - * we need to replay the creation of the FUID, - * but we can't count on the idmapper to be around - * and during replay the FUID index may be different than - * before. 
Also, if an ACL has 100 ACEs and 12 different - * domains we don't want to log 100 domain strings, but rather - * just the unique 12. - */ - -/* - * The FUIDs in the log will index into - * domain string table and the bottom half will be the rid. - * Used for mapping ephemeral uid/gid during ACL setting to FUIDs - */ -typedef struct zfs_fuid { - list_node_t z_next; - uint64_t z_id; /* uid/gid being converted to fuid */ - uint64_t z_domidx; /* index in AVL domain table */ - uint64_t z_logfuid; /* index for domain in log */ -} zfs_fuid_t; - -/* list of unique domains */ -typedef struct zfs_fuid_domain { - list_node_t z_next; - uint64_t z_domidx; /* AVL tree idx */ - const char *z_domain; /* domain string */ -} zfs_fuid_domain_t; - -/* - * FUID information necessary for logging create, setattr, and setacl. - */ -typedef struct zfs_fuid_info { - list_t z_fuids; - list_t z_domains; - uint64_t z_fuid_owner; - uint64_t z_fuid_group; - char **z_domain_table; /* Used during replay */ - uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ - uint32_t z_domain_cnt; /* How many domains */ - size_t z_domain_str_sz; /* len of domain strings z_domain list */ -} zfs_fuid_info_t; - -#ifdef _KERNEL -struct znode; -extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); -extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, - uint64_t, uint64_t, zfs_fuid_type_t); -extern void zfs_fuid_destroy(zfsvfs_t *); -extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, - cred_t *, zfs_fuid_info_t **); -extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, - zfs_fuid_info_t **); -extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, - uid_t *uid, uid_t *gid); -extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); -extern void zfs_fuid_info_free(zfs_fuid_info_t *); -extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); -void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *); -extern int 
zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, - char **retdomain, boolean_t addok); -extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); -extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); -#endif - -char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); -void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); -uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); -void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_FUID_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h deleted file mode 100644 index 756800f8afde..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ /dev/null @@ -1,466 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright 2016 RackTop Systems. 
- * Copyright (c) 2014 Integros [integros.com] - */ - -#ifndef _SYS_ZFS_IOCTL_H -#define _SYS_ZFS_IOCTL_H - -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif /* _KERNEL */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The structures in this file are passed between userland and the - * kernel. Userland may be running a 32-bit process, while the kernel - * is 64-bit. Therefore, these structures need to compile the same in - * 32-bit and 64-bit. This means not using type "long", and adding - * explicit padding so that the 32-bit structure will not be packed more - * tightly than the 64-bit structure (which requires 64-bit alignment). - */ - -/* - * Property values for snapdir - */ -#define ZFS_SNAPDIR_HIDDEN 0 -#define ZFS_SNAPDIR_VISIBLE 1 - -/* - * Field manipulation macros for the drr_versioninfo field of the - * send stream header. - */ - -/* - * Header types for zfs send streams. - */ -typedef enum drr_headertype { - DMU_SUBSTREAM = 0x1, - DMU_COMPOUNDSTREAM = 0x2 -} drr_headertype_t; - -#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) -#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) - -#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) -#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) - -/* - * Feature flags for zfs send streams (flags in drr_versioninfo) - */ - -#define DMU_BACKUP_FEATURE_DEDUP (1 << 0) -#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) -#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) -/* flags #3 - #15 are reserved for incompatible closed-source implementations */ -#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) -#define DMU_BACKUP_FEATURE_LZ4 (1 << 17) -/* flag #18 is reserved for a Delphix feature */ -#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) -#define DMU_BACKUP_FEATURE_RESUMING (1 << 20) -/* flag #21 is reserved for a Delphix feature */ -#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) -#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) -/* flag #24 is 
reserved for the raw send feature */ -/* flag #25 is reserved for the ZSTD compression feature */ - -/* - * Mask of all supported backup features - */ -#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ - DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ - DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ - DMU_BACKUP_FEATURE_RESUMING | \ - DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \ - DMU_BACKUP_FEATURE_COMPRESSED) - -/* Are all features in the given flag word currently supported? */ -#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) - -typedef enum dmu_send_resume_token_version { - ZFS_SEND_RESUME_TOKEN_VERSION = 1 -} dmu_send_resume_token_version_t; - -/* - * The drr_versioninfo field of the dmu_replay_record has the - * following layout: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | reserved | feature-flags |C|S| - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * The low order two bits indicate the header type: SUBSTREAM (0x1) - * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: - * this field used to be a version number, where the two version types - * were 1 and 2. Using two bits for this allows earlier versions of - * the code to be able to recognize send streams that don't use any - * of the features indicated by feature flags. - */ - -#define DMU_BACKUP_MAGIC 0x2F5bacbacULL - -/* - * Send stream flags. Bits 24-31 are reserved for vendor-specific - * implementations and should not be used. - */ -#define DRR_FLAG_CLONE (1<<0) -#define DRR_FLAG_CI_DATA (1<<1) -/* - * This send stream, if it is a full send, includes the FREE and FREEOBJECT - * records that are created by the sending process. This means that the send - * stream can be received as a clone, even though it is not an incremental. 
- * This is not implemented as a feature flag, because the receiving side does - * not need to have implemented it to receive this stream; it is fully backwards - * compatible. We need a flag, though, because full send streams without it - * cannot necessarily be received as a clone correctly. - */ -#define DRR_FLAG_FREERECORDS (1<<2) - -/* - * flags in the drr_checksumflags field in the DRR_WRITE and - * DRR_WRITE_BYREF blocks - */ -#define DRR_CHECKSUM_DEDUP (1<<0) - -#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) - -/* deal with compressed drr_write replay records */ -#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) -#define DRR_WRITE_PAYLOAD_SIZE(drrw) \ - (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \ - (drrw)->drr_logical_size) - -typedef struct dmu_replay_record { - enum { - DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, - DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, - DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES - } drr_type; - uint32_t drr_payloadlen; - union { - struct drr_begin { - uint64_t drr_magic; - uint64_t drr_versioninfo; /* was drr_version */ - uint64_t drr_creation_time; - dmu_objset_type_t drr_type; - uint32_t drr_flags; - uint64_t drr_toguid; - uint64_t drr_fromguid; - char drr_toname[MAXNAMELEN]; - } drr_begin; - struct drr_end { - zio_cksum_t drr_checksum; - uint64_t drr_toguid; - } drr_end; - struct drr_object { - uint64_t drr_object; - dmu_object_type_t drr_type; - dmu_object_type_t drr_bonustype; - uint32_t drr_blksz; - uint32_t drr_bonuslen; - uint8_t drr_checksumtype; - uint8_t drr_compress; - uint8_t drr_dn_slots; - uint8_t drr_pad[5]; - uint64_t drr_toguid; - /* bonus content follows */ - } drr_object; - struct drr_freeobjects { - uint64_t drr_firstobj; - uint64_t drr_numobjs; - uint64_t drr_toguid; - } drr_freeobjects; - struct drr_write { - uint64_t drr_object; - dmu_object_type_t drr_type; - uint32_t drr_pad; - uint64_t drr_offset; - uint64_t drr_logical_size; - uint64_t drr_toguid; 
- uint8_t drr_checksumtype; - uint8_t drr_checksumflags; - uint8_t drr_compressiontype; - uint8_t drr_pad2[5]; - /* deduplication key */ - ddt_key_t drr_key; - /* only nonzero if drr_compressiontype is not 0 */ - uint64_t drr_compressed_size; - /* content follows */ - } drr_write; - struct drr_free { - uint64_t drr_object; - uint64_t drr_offset; - uint64_t drr_length; - uint64_t drr_toguid; - } drr_free; - struct drr_write_byref { - /* where to put the data */ - uint64_t drr_object; - uint64_t drr_offset; - uint64_t drr_length; - uint64_t drr_toguid; - /* where to find the prior copy of the data */ - uint64_t drr_refguid; - uint64_t drr_refobject; - uint64_t drr_refoffset; - /* properties of the data */ - uint8_t drr_checksumtype; - uint8_t drr_checksumflags; - uint8_t drr_pad2[6]; - ddt_key_t drr_key; /* deduplication key */ - } drr_write_byref; - struct drr_spill { - uint64_t drr_object; - uint64_t drr_length; - uint64_t drr_toguid; - uint64_t drr_pad[4]; /* needed for crypto */ - /* spill data follows */ - } drr_spill; - struct drr_write_embedded { - uint64_t drr_object; - uint64_t drr_offset; - /* logical length, should equal blocksize */ - uint64_t drr_length; - uint64_t drr_toguid; - uint8_t drr_compression; - uint8_t drr_etype; - uint8_t drr_pad[6]; - uint32_t drr_lsize; /* uncompressed size of payload */ - uint32_t drr_psize; /* compr. (real) size of payload */ - /* (possibly compressed) content follows */ - } drr_write_embedded; - - /* - * Nore: drr_checksum is overlaid with all record types - * except DRR_BEGIN. Therefore its (non-pad) members - * must not overlap with members from the other structs. - * We accomplish this by putting its members at the very - * end of the struct. - */ - struct drr_checksum { - uint64_t drr_pad[34]; - /* - * fletcher-4 checksum of everything preceding the - * checksum. 
- */ - zio_cksum_t drr_checksum; - } drr_checksum; - } drr_u; -} dmu_replay_record_t; - -/* diff record range types */ -typedef enum diff_type { - DDR_NONE = 0x1, - DDR_INUSE = 0x2, - DDR_FREE = 0x4 -} diff_type_t; - -/* - * The diff reports back ranges of free or in-use objects. - */ -typedef struct dmu_diff_record { - uint64_t ddr_type; - uint64_t ddr_first; - uint64_t ddr_last; -} dmu_diff_record_t; - -typedef struct zinject_record { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; - char zi_func[MAXNAMELEN]; - uint32_t zi_iotype; - int32_t zi_duration; - uint64_t zi_timer; - uint64_t zi_nlanes; - uint32_t zi_cmd; - uint32_t zi_pad; -} zinject_record_t; - -#define ZINJECT_NULL 0x1 -#define ZINJECT_FLUSH_ARC 0x2 -#define ZINJECT_UNLOAD_SPA 0x4 - -typedef enum zinject_type { - ZINJECT_UNINITIALIZED, - ZINJECT_DATA_FAULT, - ZINJECT_DEVICE_FAULT, - ZINJECT_LABEL_FAULT, - ZINJECT_IGNORED_WRITES, - ZINJECT_PANIC, - ZINJECT_DELAY_IO, -} zinject_type_t; - -typedef struct zfs_share { - uint64_t z_exportdata; - uint64_t z_sharedata; - uint64_t z_sharetype; /* 0 = share, 1 = unshare */ - uint64_t z_sharemax; /* max length of share string */ -} zfs_share_t; - -/* - * ZFS file systems may behave the usual, POSIX-compliant way, where - * name lookups are case-sensitive. They may also be set up so that - * all the name lookups are case-insensitive, or so that only some - * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. - */ -typedef enum zfs_case { - ZFS_CASE_SENSITIVE, - ZFS_CASE_INSENSITIVE, - ZFS_CASE_MIXED -} zfs_case_t; - -/* - * Note: this struct must have the same layout in 32-bit and 64-bit, so - * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit - * kernel. 
Therefore, we add padding to it so that no "hidden" padding - * is automatically added on 64-bit (but not on 32-bit). - */ -typedef struct zfs_cmd { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. - */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - dmu_replay_record_t zc_begin_record; - zinject_record_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad3[3]; - boolean_t zc_resumable; - uint32_t zc_pad4; - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_t; - -typedef struct zfs_useracct { - char zu_domain[256]; - uid_t zu_rid; - uint32_t zu_pad; - uint64_t zu_space; -} zfs_useracct_t; - -#define ZFSDEV_MAX_MINOR (1 << 16) -#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) - -#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 - -#ifdef _KERNEL -struct objset; -struct zfsvfs; - -typedef struct zfs_creat { - nvlist_t *zct_zplprops; - nvlist_t *zct_props; -} zfs_creat_t; - -extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); -extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *); -extern int 
zfs_secpolicy_destroy_perms(const char *, cred_t *); -extern int zfs_busy(void); -extern void zfs_unmount_snap(const char *); -extern void zfs_destroy_unmount_origin(const char *); -#ifdef illumos -extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); -#else -extern int getzfsvfs_impl(struct objset *, vfs_t **); -#endif -extern int getzfsvfs(const char *, struct zfsvfs **); - -/* - * ZFS minor numbers can refer to either a control device instance or - * a zvol. Depending on the value of zss_type, zss_data points to either - * a zvol_state_t or a zfs_onexit_t. - */ -enum zfs_soft_state_type { - ZSST_ZVOL, - ZSST_CTLDEV -}; - -typedef struct zfs_soft_state { - enum zfs_soft_state_type zss_type; - void *zss_data; -} zfs_soft_state_t; - -extern void *zfsdev_get_soft_state(minor_t minor, - enum zfs_soft_state_type which); -extern minor_t zfsdev_minor_alloc(void); - -extern void *zfsdev_state; - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h deleted file mode 100644 index 4982bd4d0afc..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_ZFS_ONEXIT_H -#define _SYS_ZFS_ONEXIT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL - -typedef struct zfs_onexit { - kmutex_t zo_lock; - list_t zo_actions; -} zfs_onexit_t; - -typedef struct zfs_onexit_action_node { - list_node_t za_link; - void (*za_func)(void *); - void *za_data; -} zfs_onexit_action_node_t; - -extern void zfs_onexit_init(zfs_onexit_t **zo); -extern void zfs_onexit_destroy(zfs_onexit_t *zo); - -#endif - -extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); -extern void zfs_onexit_fd_rele(int fd); -extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle); -extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, - boolean_t fire); -extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, - void **data); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_ONEXIT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h deleted file mode 100644 index ffae1130fd88..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2018 by Delphix. All rights reserved. - */ - -#ifndef _SYS_FS_ZFS_RLOCK_H -#define _SYS_FS_ZFS_RLOCK_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __FreeBSD__ -#define rangelock_init zfs_rangelock_init -#define rangelock_fini zfs_rangelock_fini -#endif - -typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND -} rangelock_type_t; - -struct locked_range; - -typedef void (rangelock_cb_t)(struct locked_range *, void *); - -#ifdef __FreeBSD__ -typedef struct zfs_rangelock { -#else -typedef struct rangelock { -#endif - avl_tree_t rl_tree; /* contains locked_range_t */ - kmutex_t rl_lock; - rangelock_cb_t *rl_cb; - void *rl_arg; -} rangelock_t; - -typedef struct locked_range { - rangelock_t *lr_rangelock; /* rangelock that this lock applies to */ - avl_node_t lr_node; /* avl node link */ - uint64_t lr_offset; /* file range offset */ - uint64_t lr_length; /* file range length */ - uint_t lr_count; /* range reference count in tree */ - rangelock_type_t lr_type; /* range type */ - kcondvar_t lr_write_cv; /* cv for waiting writers */ - kcondvar_t lr_read_cv; /* cv for waiting readers */ - uint8_t lr_proxy; /* acting for original range */ - uint8_t lr_write_wanted; /* writer wants to lock this range */ - uint8_t lr_read_wanted; /* reader wants to lock this range */ -} locked_range_t; - -void rangelock_init(rangelock_t *, rangelock_cb_t *, void *); -void rangelock_fini(rangelock_t *); - -locked_range_t *rangelock_enter(rangelock_t *, - uint64_t, uint64_t, 
rangelock_type_t); -locked_range_t *rangelock_tryenter(rangelock_t *, - uint64_t, uint64_t, rangelock_type_t); -void rangelock_exit(locked_range_t *); -void rangelock_reduce(locked_range_t *, uint64_t, uint64_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_RLOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h deleted file mode 100644 index fc40b0e9517c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZFS_SA_H -#define _SYS_ZFS_SA_H - -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include - - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This is the list of known attributes - * to the ZPL. The values of the actual - * attributes are not defined by the order - * the enums. It is controlled by the attribute - * registration mechanism. 
Two different file system - * could have different numeric values for the same - * attributes. this list is only used for dereferencing - * into the table that will hold the actual numeric value. - */ -typedef enum zpl_attr { - ZPL_ATIME, - ZPL_MTIME, - ZPL_CTIME, - ZPL_CRTIME, - ZPL_GEN, - ZPL_MODE, - ZPL_SIZE, - ZPL_PARENT, - ZPL_LINKS, - ZPL_XATTR, - ZPL_RDEV, - ZPL_FLAGS, - ZPL_UID, - ZPL_GID, - ZPL_PAD, - ZPL_ZNODE_ACL, - ZPL_DACL_COUNT, - ZPL_SYMLINK, - ZPL_SCANSTAMP, - ZPL_DACL_ACES, - ZPL_END -} zpl_attr_t; - -#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108 -#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \ - sizeof (zfs_acl_phys_t)) - -#define SA_MODE_OFFSET 0 -#define SA_SIZE_OFFSET 8 -#define SA_GEN_OFFSET 16 -#define SA_UID_OFFSET 24 -#define SA_GID_OFFSET 32 -#define SA_PARENT_OFFSET 40 - -extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1]; -extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1]; - -/* - * This is a deprecated data structure that only exists for - * dealing with file systems create prior to ZPL version 5. 
- */ -typedef struct znode_phys { - uint64_t zp_atime[2]; /* 0 - last file access time */ - uint64_t zp_mtime[2]; /* 16 - last file modification time */ - uint64_t zp_ctime[2]; /* 32 - last file change time */ - uint64_t zp_crtime[2]; /* 48 - creation time */ - uint64_t zp_gen; /* 64 - generation (txg of creation) */ - uint64_t zp_mode; /* 72 - file mode bits */ - uint64_t zp_size; /* 80 - size of file */ - uint64_t zp_parent; /* 88 - directory parent (`..') */ - uint64_t zp_links; /* 96 - number of links to file */ - uint64_t zp_xattr; /* 104 - DMU object for xattrs */ - uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ - uint64_t zp_flags; /* 120 - persistent flags */ - uint64_t zp_uid; /* 128 - file owner */ - uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_zap; /* 144 - extra attributes */ - uint64_t zp_pad[3]; /* 152 - future */ - zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ - /* - * Data may pad out any remaining bytes in the znode buffer, eg: - * - * |<---------------------- dnode_phys (512) ------------------------>| - * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| - * |<---- znode (264) ---->|<---- data (56) ---->| - * - * At present, we use this space for the following: - * - symbolic links - * - 32-byte anti-virus scanstamp (regular files only) - */ -} znode_phys_t; - -#ifdef _KERNEL -int zfs_sa_readlink(struct znode *, uio_t *); -void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); -void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); -void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); -void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); -void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *); -void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *); -void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_SA_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h deleted file mode 100644 index a8af7ec61ba9..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_FS_ZFS_STAT_H -#define _SYS_FS_ZFS_STAT_H - -#ifdef _KERNEL -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * A limited number of zpl level stats are retrievable - * with an ioctl. zfs diff is the current consumer. 
- */ -typedef struct zfs_stat { - uint64_t zs_gen; - uint64_t zs_mode; - uint64_t zs_links; - uint64_t zs_ctime[2]; -} zfs_stat_t; - -extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, - char *buf, int len); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_STAT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h deleted file mode 100644 index 8fba5e735da6..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek . - * All rights reserved. 
- */ - -#ifndef _SYS_FS_ZFS_VFSOPS_H -#define _SYS_FS_ZFS_VFSOPS_H - -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct zfsvfs zfsvfs_t; -struct znode; - -struct zfsvfs { - vfs_t *z_vfs; /* generic fs struct */ - zfsvfs_t *z_parent; /* parent fs */ - objset_t *z_os; /* objset reference */ - uint64_t z_root; /* id of root znode */ - uint64_t z_unlinkedobj; /* id of unlinked zapobj */ - uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_fuid_obj; /* fuid table object number */ - uint64_t z_fuid_size; /* fuid table size */ - avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ - avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ - krwlock_t z_fuid_lock; /* fuid lock */ - boolean_t z_fuid_loaded; /* fuid tables are loaded */ - boolean_t z_fuid_dirty; /* need to sync fuid table ? */ - struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ - zilog_t *z_log; /* intent log pointer */ - uint_t z_acl_mode; /* acl chmod/mode behavior */ - uint_t z_acl_inherit; /* acl inheritance behavior */ - zfs_case_t z_case; /* case-sense */ - boolean_t z_utf8; /* utf8-only */ - int z_norm; /* normalization flags */ - boolean_t z_atime; /* enable atimes mount option */ - boolean_t z_unmounted; /* unmounted */ - rrmlock_t z_teardown_lock; - struct rmslock z_teardown_inactive_lock; - list_t z_all_znodes; /* all vnodes in the fs */ - kmutex_t z_znodes_lock; /* lock for z_all_znodes */ - struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ - boolean_t z_show_ctldir; /* expose .zfs in the root dir */ - boolean_t z_issnap; /* true if this is a snapshot */ - boolean_t z_vscan; /* virus scan on/off */ - boolean_t z_use_fuids; /* version allows fuids */ - boolean_t z_replay; /* set during ZIL replay */ - boolean_t z_use_sa; /* version allow system attributes */ - boolean_t z_use_namecache;/* make use of FreeBSD name cache */ - uint64_t z_version; /* ZPL version */ - uint64_t 
z_shares_dir; /* hidden shares dir */ - kmutex_t z_lock; - uint64_t z_userquota_obj; - uint64_t z_groupquota_obj; - uint64_t z_replay_eof; /* New end of file - replay only */ - sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ -#define ZFS_OBJ_MTX_SZ 64 - kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ -#if defined(__FreeBSD__) - struct task z_unlinked_drain_task; -#endif -}; - -#define ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_rlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_runlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_wlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs) \ - rms_wowned(&(zfsvfs)->z_teardown_inactive_lock) - -/* - * Normal filesystems (those not under .zfs/snapshot) have a total - * file ID size limited to 12 bytes (including the length field) due to - * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical - * reasons, this same limit is being imposed by the Solaris NFSv3 implementation - * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It - * is not possible to expand beyond 12 bytes without abandoning support - * of NFSv2. - * - * For normal filesystems, we partition up the available space as follows: - * 2 bytes fid length (required) - * 6 bytes object number (48 bits) - * 4 bytes generation number (32 bits) - * - * We reserve only 48 bits for the object number, as this is the limit - * currently defined and imposed by the DMU. 
- */ -typedef struct zfid_short { - uint16_t zf_len; - uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ -} zfid_short_t; - -/* - * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes - * (including the length field). This makes files under .zfs/snapshot - * accessible by NFSv3 and NFSv4, but not NFSv2. - * - * For files under .zfs/snapshot, we partition up the available space - * as follows: - * 2 bytes fid length (required) - * 6 bytes object number (48 bits) - * 4 bytes generation number (32 bits) - * 6 bytes objset id (48 bits) - * 4 bytes[**] currently just zero (32 bits) - * - * We reserve only 48 bits for the object number and objset id, as these are - * the limits currently defined and imposed by the DMU. - * - * [*] 20 bytes on FreeBSD to fit into the size of struct fid. - * [**] 2 bytes on FreeBSD for the above reason. - */ -typedef struct zfid_long { - zfid_short_t z_fid; - uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ -} zfid_long_t; - -#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) -#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) - -extern uint_t zfs_fsyncer_key; -extern int zfs_super_owner; - -extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); -extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); -extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valuep); -extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); -extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota); -extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, - boolean_t isgroup); -extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, - uint64_t fuid); -extern int 
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); -extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); -extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); -extern void zfsvfs_free(zfsvfs_t *zfsvfs); -extern int zfs_check_global_label(const char *dsname, const char *hexsl); - -#ifdef _KERNEL -extern void zfsvfs_update_fromname(const char *oldname, const char *newname); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h deleted file mode 100644 index a95545bda4e1..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ /dev/null @@ -1,374 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
- */ - -#ifndef _SYS_FS_ZFS_ZNODE_H -#define _SYS_FS_ZFS_ZNODE_H - -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include -#include -#include -#endif -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Additional file level attributes, that are stored - * in the upper half of zp_flags - */ -#define ZFS_READONLY 0x0000000100000000 -#define ZFS_HIDDEN 0x0000000200000000 -#define ZFS_SYSTEM 0x0000000400000000 -#define ZFS_ARCHIVE 0x0000000800000000 -#define ZFS_IMMUTABLE 0x0000001000000000 -#define ZFS_NOUNLINK 0x0000002000000000 -#define ZFS_APPENDONLY 0x0000004000000000 -#define ZFS_NODUMP 0x0000008000000000 -#define ZFS_OPAQUE 0x0000010000000000 -#define ZFS_AV_QUARANTINED 0x0000020000000000 -#define ZFS_AV_MODIFIED 0x0000040000000000 -#define ZFS_REPARSE 0x0000080000000000 -#define ZFS_OFFLINE 0x0000100000000000 -#define ZFS_SPARSE 0x0000200000000000 - -#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ -{ \ - if (value) \ - pflags |= attr; \ - else \ - pflags &= ~attr; \ - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \ - &pflags, sizeof (pflags), tx)); \ -} - -/* - * Define special zfs pflags - */ -#define ZFS_XATTR 0x1 /* is an extended attribute */ -#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ -#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ -#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ -#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ -#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ -#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ -#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ -#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ - -#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] -#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] -#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] -#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] -#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] -#define 
SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] -#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] -#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] -#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] -#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] -#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] -#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] -#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] -#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] -#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] -#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] -#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] -#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] -#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] -#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] - -/* - * Is ID ephemeral? - */ -#define IS_EPHEMERAL(x) (x > MAXUID) - -/* - * Should we use FUIDs? - */ -#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ - spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) -#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ - spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) - -#define MASTER_NODE_OBJ 1 - -/* - * Special attributes for master node. - * "userquota@" and "groupquota@" are also valid (from - * zfs_userquota_prop_prefixes[]). - */ -#define ZFS_FSID "FSID" -#define ZFS_UNLINKED_SET "DELETE_QUEUE" -#define ZFS_ROOT_OBJ "ROOT" -#define ZPL_VERSION_STR "VERSION" -#define ZFS_FUID_TABLES "FUID" -#define ZFS_SHARES_DIR "SHARES" -#define ZFS_SA_ATTRS "SA_ATTRS" - -/* - * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in - * the directory entries. - */ -#ifndef IFTODT -#define IFTODT(mode) (((mode) & S_IFMT) >> 12) -#endif - -/* - * The directory entry has the type (currently unused on Solaris) in the - * top 4 bits, and the object number in the low 48 bits. The "middle" - * 12 bits are unused. 
- */ -#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) -#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) - -/* - * Directory entry locks control access to directory entries. - * They are used to protect creates, deletes, and renames. - * Each directory znode has a mutex and a list of locked names. - */ -#ifdef _KERNEL -typedef struct zfs_dirlock { - char *dl_name; /* directory entry being locked */ - uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ - uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ - uint16_t dl_namesize; /* set if dl_name was allocated */ - kcondvar_t dl_cv; /* wait for entry to be unlocked */ - struct znode *dl_dzp; /* directory znode */ - struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ -} zfs_dirlock_t; - -typedef struct znode { - struct zfsvfs *z_zfsvfs; - vnode_t *z_vnode; - uint64_t z_id; /* object ID for this znode */ -#ifdef illumos - kmutex_t z_lock; /* znode modification lock */ - krwlock_t z_parent_lock; /* parent lock for directories */ - krwlock_t z_name_lock; /* "master" lock for dirent locks */ - zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ -#endif - rangelock_t z_rangelock; /* file range locks */ - uint8_t z_unlinked; /* file has been unlinked */ - uint8_t z_atime_dirty; /* atime needs to be synced */ - uint8_t z_zn_prefetch; /* Prefetch znodes? */ - uint8_t z_moved; /* Has this znode been moved? 
*/ - uint_t z_blksz; /* block size in bytes */ - uint_t z_seq; /* modification sequence number */ - uint64_t z_mapcnt; /* number of pages mapped to file */ - uint64_t z_dnodesize; /* dnode size */ - uint64_t z_gen; /* generation (cached) */ - uint64_t z_size; /* file size (cached) */ - uint64_t z_atime[2]; /* atime (cached) */ - uint64_t z_links; /* file links (cached) */ - uint64_t z_pflags; /* pflags (cached) */ - uint64_t z_uid; /* uid fuid (cached) */ - uint64_t z_gid; /* gid fuid (cached) */ - mode_t z_mode; /* mode (cached) */ - uint32_t z_sync_cnt; /* synchronous open count */ - kmutex_t z_acl_lock; /* acl data lock */ - zfs_acl_t *z_acl_cached; /* cached acl */ - list_node_t z_link_node; /* all znodes in fs link */ - sa_handle_t *z_sa_hdl; /* handle to sa data */ - boolean_t z_is_sa; /* are we native sa? */ -} znode_t; - -#define ZFS_LINK_MAX UINT64_MAX - -/* - * Range locking rules - * -------------------- - * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole - * file range needs to be locked as RL_WRITER. Only then can the pages be - * freed etc and zp_size reset. zp_size must be set within range lock. - * 2. For writes and punching holes (zfs_write & zfs_space) just the range - * being written or freed needs to be locked as RL_WRITER. - * Multiple writes at the end of the file must coordinate zp_size updates - * to ensure data isn't lost. A compare and swap loop is currently used - * to ensure the file size is at least the offset last written. - * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being - * read needs to be locked as RL_READER. A check against zp_size can then - * be made for reading beyond end of file. 
- */ - -/* - * Convert between znode pointers and vnode pointers - */ -#ifdef DEBUG -static __inline vnode_t * -ZTOV(znode_t *zp) -{ - vnode_t *vp = zp->z_vnode; - - ASSERT(vp != NULL && vp->v_data == zp); - return (vp); -} -static __inline znode_t * -VTOZ(vnode_t *vp) -{ - znode_t *zp = (znode_t *)vp->v_data; - - ASSERT(zp != NULL && zp->z_vnode == vp); - return (zp); -} -#else -#define ZTOV(ZP) ((ZP)->z_vnode) -#define VTOZ(VP) ((znode_t *)(VP)->v_data) -#endif - -#define VTOZ_SMR(VP) ((znode_t *)vn_load_v_data_smr(VP)) - -/* Called on entry to each ZFS vnode and vfs operation */ -#define ZFS_ENTER(zfsvfs) \ - { \ - rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ - if ((zfsvfs)->z_unmounted) { \ - ZFS_EXIT(zfsvfs); \ - return (EIO); \ - } \ - } - -/* Must be called before exiting the vop */ -#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) - -/* Verifies the znode is valid */ -#define ZFS_VERIFY_ZP(zp) \ - if ((zp)->z_sa_hdl == NULL) { \ - ZFS_EXIT((zp)->z_zfsvfs); \ - return (EIO); \ - } \ - -/* - * Macros for dealing with dmu_buf_hold - */ -#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) -#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ - (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) -#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ - mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) -#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ - mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) -#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ - mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) - -/* Encode ZFS stored time values from a struct timespec */ -#define ZFS_TIME_ENCODE(tp, stmp) \ -{ \ - (stmp)[0] = (uint64_t)(tp)->tv_sec; \ - (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ -} - -/* Decode ZFS stored time values to a struct timespec */ -#define ZFS_TIME_DECODE(tp, stmp) \ -{ \ - (tp)->tv_sec = (time_t)(stmp)[0]; \ - (tp)->tv_nsec = (long)(stmp)[1]; \ -} - -/* - * Timestamp defines - */ -#define ACCESSED (AT_ATIME) -#define STATE_CHANGED (AT_CTIME) 
-#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) - -#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ - if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ - zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); - -extern int zfs_init_fs(zfsvfs_t *, znode_t **); -extern void zfs_set_dataprop(objset_t *); -extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, - dmu_tx_t *tx); -extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], - uint64_t [2], boolean_t); -extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); -extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); -extern void zfs_znode_init(void); -extern void zfs_znode_fini(void); -extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); -extern int zfs_rezget(znode_t *); -extern void zfs_zinactive(znode_t *); -extern void zfs_znode_delete(znode_t *, dmu_tx_t *); -extern void zfs_znode_free(znode_t *); -extern void zfs_remove_op_tables(); -extern int zfs_create_op_tables(); -extern dev_t zfs_cmpldev(uint64_t); -extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); -extern int zfs_get_stats(objset_t *os, nvlist_t *nv); -extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); -extern void zfs_znode_dmu_fini(znode_t *); - -extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, - vattr_t *vap); -extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, - vattr_t *vap); -extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid); -#define ZFS_NO_OBJECT 0 /* no object id */ -extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, char *link); -extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, 
uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); -extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag); -extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len); -extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); -#ifndef ZFS_NO_ACL -extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, - vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); -#endif -extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); -extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); -extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); - -extern zil_get_data_t zfs_get_data; -extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; -extern int zfsfstype; - -extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf); - -#endif /* _KERNEL */ - -extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_ZNODE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h deleted file mode 100644 index a27a9547ac43..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ /dev/null @@ -1,464 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_ZIL_H -#define _SYS_ZIL_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; -struct dsl_dataset; -struct lwb; - -/* - * Intent log format: - * - * Each objset has its own intent log. The log header (zil_header_t) - * for objset N's intent log is kept in the Nth object of the SPA's - * intent_log objset. The log header points to a chain of log blocks, - * each of which contains log records (i.e., transactions) followed by - * a log block trailer (zil_trailer_t). The format of a log record - * depends on the record (or transaction) type, but all records begin - * with a common structure that defines the type, length, and txg. - */ - -/* - * Intent log header - this on disk structure holds fields to manage - * the log. All fields are 64 bit to easily handle cross architectures. 
- */ -typedef struct zil_header { - uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ - uint64_t zh_replay_seq; /* highest replayed sequence number */ - blkptr_t zh_log; /* log chain */ - uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ - uint64_t zh_flags; /* header flags */ - uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ - uint64_t zh_pad[3]; -} zil_header_t; - -/* - * zh_flags bit settings - */ -#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ -#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ - -/* - * Log block chaining. - * - * Log blocks are chained together. Originally they were chained at the - * end of the block. For performance reasons the chain was moved to the - * beginning of the block which allows writes for only the data being used. - * The older position is supported for backwards compatability. - * - * The zio_eck_t contains a zec_cksum which for the intent log is - * the sequence number of this log block. A seq of 0 is invalid. - * The zec_cksum is checked by the SPA against the sequence - * number passed in the blk_cksum field of the blkptr_t - */ -typedef struct zil_chain { - uint64_t zc_pad; - blkptr_t zc_next_blk; /* next block in chain */ - uint64_t zc_nused; /* bytes in log block used */ - zio_eck_t zc_eck; /* block trailer */ -} zil_chain_t; - -#define ZIL_MIN_BLKSZ 4096ULL - -/* - * ziltest is by and large an ugly hack, but very useful in - * checking replay without tedious work. - * When running ziltest we want to keep all itx's and so maintain - * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG - * We subtract TXG_CONCURRENT_STATES to allow for common code. - */ -#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) - -/* - * The words of a log block checksum. 
- */ -#define ZIL_ZC_GUID_0 0 -#define ZIL_ZC_GUID_1 1 -#define ZIL_ZC_OBJSET 2 -#define ZIL_ZC_SEQ 3 - -typedef enum zil_create { - Z_FILE, - Z_DIR, - Z_XATTRDIR, -} zil_create_t; - -/* - * size of xvattr log section. - * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps - * for create time and a single 64 bit integer for all of the attributes, - * and 4 64 bit integers (32 bytes) for the scanstamp. - * - */ - -#define ZIL_XVAT_SIZE(mapsize) \ - sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ - (sizeof (uint64_t) * 7) - -/* - * Size of ACL in log. The ACE data is padded out to properly align - * on 8 byte boundary. - */ - -#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) - -/* - * Intent log transaction types and record structures - */ -#define TX_COMMIT 0 /* Commit marker (no on-disk state) */ -#define TX_CREATE 1 /* Create file */ -#define TX_MKDIR 2 /* Make directory */ -#define TX_MKXATTR 3 /* Make XATTR directory */ -#define TX_SYMLINK 4 /* Create symbolic link to a file */ -#define TX_REMOVE 5 /* Remove file */ -#define TX_RMDIR 6 /* Remove directory */ -#define TX_LINK 7 /* Create hard link to a file */ -#define TX_RENAME 8 /* Rename a file */ -#define TX_WRITE 9 /* File write */ -#define TX_TRUNCATE 10 /* Truncate a file */ -#define TX_SETATTR 11 /* Set file attributes */ -#define TX_ACL_V0 12 /* Set old formatted ACL */ -#define TX_ACL 13 /* Set ACL */ -#define TX_CREATE_ACL 14 /* create with ACL */ -#define TX_CREATE_ATTR 15 /* create + attrs */ -#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ -#define TX_MKDIR_ACL 17 /* mkdir with ACL */ -#define TX_MKDIR_ATTR 18 /* mkdir with attr */ -#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ -#define TX_WRITE2 20 /* dmu_sync EALREADY write */ -#define TX_MAX_TYPE 21 /* Max transaction type */ - -/* - * The transactions for mkdir, symlink, remove, rmdir, link, and rename - * may have the following bit set, indicating the original request - * specified 
case-insensitive handling of names. - */ -#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ - -/* - * Transactions for write, truncate, setattr, acl_v0, and acl can be logged - * out of order. For convenience in the code, all such records must have - * lr_foid at the same offset. - */ -#define TX_OOO(txtype) \ - ((txtype) == TX_WRITE || \ - (txtype) == TX_TRUNCATE || \ - (txtype) == TX_SETATTR || \ - (txtype) == TX_ACL_V0 || \ - (txtype) == TX_ACL || \ - (txtype) == TX_WRITE2) - -/* - * The number of dnode slots consumed by the object is stored in the 8 - * unused upper bits of the object ID. We subtract 1 from the value - * stored on disk for compatibility with implementations that don't - * support large dnodes. The slot count for a single-slot dnode will - * contain 0 for those bits to preserve the log record format for - * "small" dnodes. - */ -#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1) -#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1) -#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT) -#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x)) - -/* - * Format of log records. - * The fields are carefully defined to allow them to be aligned - * and sized the same on sparc & intel architectures. - * Each log record has a common structure at the beginning. - * - * The log record on disk (lrc_seq) holds the sequence number of all log - * records which is used to ensure we don't replay the same record. - */ -typedef struct { /* common log record header */ - uint64_t lrc_txtype; /* intent log transaction type */ - uint64_t lrc_reclen; /* transaction record length */ - uint64_t lrc_txg; /* dmu transaction group number */ - uint64_t lrc_seq; /* see comment above */ -} lr_t; - -/* - * Common start of all out-of-order record types (TX_OOO() above). 
- */ -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* object id */ -} lr_ooo_t; - -/* - * Handle option extended vattr attributes. - * - * Whenever new attributes are added the version number - * will need to be updated as will code in - * zfs_log.c and zfs_replay.c - */ -typedef struct { - uint32_t lr_attr_masksize; /* number of elements in array */ - uint32_t lr_attr_bitmap; /* First entry of array */ - /* remainder of array and any additional fields */ -} lr_attr_t; - -/* - * log record for creates without optional ACL. - * This log record does support optional xvattr_t attributes. - */ -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* object id of directory */ - uint64_t lr_foid; /* object id of created file object */ - uint64_t lr_mode; /* mode of object */ - uint64_t lr_uid; /* uid of object */ - uint64_t lr_gid; /* gid of object */ - uint64_t lr_gen; /* generation (txg of creation) */ - uint64_t lr_crtime[2]; /* creation time */ - uint64_t lr_rdev; /* rdev of object to create */ - /* name of object to create follows this */ - /* for symlinks, link content follows name */ - /* for creates with xvattr data, the name follows the xvattr info */ -} lr_create_t; - -/* - * FUID ACL record will be an array of ACEs from the original ACL. - * If this array includes ephemeral IDs, the record will also include - * an array of log-specific FUIDs to replace the ephemeral IDs. - * Only one copy of each unique domain will be present, so the log-specific - * FUIDs will use an index into a compressed domain table. On replay this - * information will be used to construct real FUIDs (and bypass idmap, - * since it may not be available). - */ - -/* - * Log record for creates with optional ACL - * This log record is also used for recording any FUID - * information needed for replaying the create. If the - * file doesn't have any actual ACEs then the lr_aclcnt - * would be zero. 
- * - * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. - * If create is also setting xvattr's, then acl data follows xvattr. - * If ACE FUIDs are needed then they will follow the xvattr_t. Following - * the FUIDs will be the domain table information. The FUIDs for the owner - * and group will be in lr_create. Name follows ACL data. - */ -typedef struct { - lr_create_t lr_create; /* common create portion */ - uint64_t lr_aclcnt; /* number of ACEs in ACL */ - uint64_t lr_domcnt; /* number of unique domains */ - uint64_t lr_fuidcnt; /* number of real fuids */ - uint64_t lr_acl_bytes; /* number of bytes in ACL */ - uint64_t lr_acl_flags; /* ACL flags */ -} lr_acl_create_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* obj id of directory */ - /* name of object to remove follows this */ -} lr_remove_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* obj id of directory */ - uint64_t lr_link_obj; /* obj id of link */ - /* name of object to link follows this */ -} lr_link_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_sdoid; /* obj id of source directory */ - uint64_t lr_tdoid; /* obj id of target directory */ - /* 2 strings: names of source and destination follow this */ -} lr_rename_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* file object to write */ - uint64_t lr_offset; /* offset to write to */ - uint64_t lr_length; /* user data length to write */ - uint64_t lr_blkoff; /* no longer used */ - blkptr_t lr_blkptr; /* spa block pointer for replay */ - /* write data will follow for small writes */ -} lr_write_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* object id of file to truncate */ - uint64_t lr_offset; /* offset to truncate from */ - uint64_t lr_length; /* length to truncate */ -} 
lr_truncate_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* file object to change attributes */ - uint64_t lr_mask; /* mask of attributes to set */ - uint64_t lr_mode; /* mode to set */ - uint64_t lr_uid; /* uid to set */ - uint64_t lr_gid; /* gid to set */ - uint64_t lr_size; /* size to set */ - uint64_t lr_atime[2]; /* access time */ - uint64_t lr_mtime[2]; /* modification time */ - /* optional attribute lr_attr_t may be here */ -} lr_setattr_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* obj id of file */ - uint64_t lr_aclcnt; /* number of acl entries */ - /* lr_aclcnt number of ace_t entries follow this */ -} lr_acl_v0_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* obj id of file */ - uint64_t lr_aclcnt; /* number of ACEs in ACL */ - uint64_t lr_domcnt; /* number of unique domains */ - uint64_t lr_fuidcnt; /* number of real fuids */ - uint64_t lr_acl_bytes; /* number of bytes in ACL */ - uint64_t lr_acl_flags; /* ACL flags */ - /* lr_acl_bytes number of variable sized ace's follows */ -} lr_acl_t; - -/* - * ZIL structure definitions, interface function prototype and globals. - */ - -/* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * In this mode, if we need to commit the write later, then the block - * is immediately written into the file system (using dmu_sync), - * and a pointer to the block is put into the log record. - * When the txg commits the block is linked in. - * This saves additionally writing the data into the log record. 
- * There are a few requirements for this to occur: - * - write is greater than zfs/zvol_immediate_write_sz - * - not using slogs (as slogs are assumed to always be faster - * than writing into the main pool) - * - the write occupies only one block - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ -typedef enum { - WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ - /* and put blkptr in log, rather than actual data) */ - WR_COPIED, /* immediate - data is copied into lr_write_t */ - WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ - WR_NUM_STATES /* number of states */ -} itx_wr_state_t; - -typedef struct itx { - list_node_t itx_node; /* linkage on zl_itx_list */ - void *itx_private; /* type-specific opaque data */ - itx_wr_state_t itx_wr_state; /* write state */ - uint8_t itx_sync; /* synchronous transaction */ - uint64_t itx_oid; /* object id */ - lr_t itx_lr; /* common part of log record */ - /* followed by type-specific part of lr_xx_t and its immediate data */ -} itx_t; - -typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, - uint64_t txg); -typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, - uint64_t txg); -typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap); -typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, - struct lwb *lwb, zio_t *zio); - -extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); - -extern void zil_init(void); -extern void zil_fini(void); - -extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); -extern void zil_free(zilog_t *zilog); - 
-extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); -extern void zil_close(zilog_t *zilog); - -extern void zil_replay(objset_t *os, void *arg, - zil_replay_func_t *replay_func[TX_MAX_TYPE]); -extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); -extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); - -extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); -extern void zil_itx_destroy(itx_t *itx); -extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); - -extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); -extern void zil_commit(zilog_t *zilog, uint64_t oid); -extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); - -extern int zil_reset(const char *osname, void *txarg); -extern int zil_claim(struct dsl_pool *dp, - struct dsl_dataset *ds, void *txarg); -extern int zil_check_log_chain(struct dsl_pool *dp, - struct dsl_dataset *ds, void *tx); -extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); - -extern int zil_suspend(const char *osname, void **cookiep); -extern void zil_resume(void *cookie); - -extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp); -extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg); -extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); - -extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); - -extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); - -extern uint64_t zil_max_copied_data(zilog_t *zilog); -extern uint64_t zil_max_log_data(zilog_t *zilog); - -extern int zil_replay_disable; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h deleted file mode 100644 index a19ba970574f..000000000000 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ /dev/null @@ -1,229 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_ZIL_IMPL_H -#define _SYS_ZIL_IMPL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Possbile states for a given lwb structure. - * - * An lwb will start out in the "closed" state, and then transition to - * the "opened" state via a call to zil_lwb_write_open(). When - * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" - * must be held. - * - * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must - * be held when making this transition. - * - * After the lwb's write zio completes, it transitions into the "write - * done" state via zil_lwb_write_done(); and then into the "flush done" - * state via zil_lwb_flush_vdevs_done(). 
When transitioning from - * "issued" to "write done", and then from "write done" to "flush done", - * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock". - * - * The zilog's "zl_issuer_lock" can become heavily contended in certain - * workloads, so we specifically avoid acquiring that lock when - * transitioning an lwb from "issued" to "done". This allows us to avoid - * having to acquire the "zl_issuer_lock" for each lwb ZIO completion, - * which would have added more lock contention on an already heavily - * contended lock. - * - * Additionally, correctness when reading an lwb's state is often - * acheived by exploiting the fact that these state transitions occur in - * this specific order; i.e. "closed" to "opened" to "issued" to "done". - * - * Thus, if an lwb is in the "closed" or "opened" state, holding the - * "zl_issuer_lock" will prevent a concurrent thread from transitioning - * that lwb to the "issued" state. Likewise, if an lwb is already in the - * "issued" state, holding the "zl_lock" will prevent a concurrent - * thread from transitioning that lwb to the "write done" state. - */ -typedef enum { - LWB_STATE_CLOSED, - LWB_STATE_OPENED, - LWB_STATE_ISSUED, - LWB_STATE_WRITE_DONE, - LWB_STATE_FLUSH_DONE, - LWB_NUM_STATES -} lwb_state_t; - -/* - * Log write block (lwb) - * - * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it - * will be protected by the zilog's "zl_issuer_lock". Basically, prior - * to it being issued, it will only be accessed by the thread that's - * holding the "zl_issuer_lock". After the lwb is issued, the zilog's - * "zl_lock" is used to protect the lwb against concurrent access. 
- */ -typedef struct lwb { - zilog_t *lwb_zilog; /* back pointer to log struct */ - blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_slog; /* lwb_blk is on SLOG device */ - int lwb_nused; /* # used bytes in buffer */ - int lwb_sz; /* size of block and buffer */ - lwb_state_t lwb_state; /* the state of this lwb */ - char *lwb_buf; /* log write buffer */ - zio_t *lwb_write_zio; /* zio for the lwb buffer */ - zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ - dmu_tx_t *lwb_tx; /* tx for log block allocation */ - uint64_t lwb_max_txg; /* highest txg in this lwb */ - list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ - list_t lwb_waiters; /* list of zil_commit_waiter's */ - avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ - kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ - hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ -} lwb_t; - -/* - * ZIL commit waiter. - * - * This structure is allocated each time zil_commit() is called, and is - * used by zil_commit() to communicate with other parts of the ZIL, such - * that zil_commit() can know when it safe for it return. For more - * details, see the comment above zil_commit(). - * - * The "zcw_lock" field is used to protect the commit waiter against - * concurrent access. This lock is often acquired while already holding - * the zilog's "zl_issuer_lock" or "zl_lock"; see the functions - * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples - * of this. Thus, one must be careful not to acquire the - * "zl_issuer_lock" or "zl_lock" when already holding the "zcw_lock"; - * e.g. see the zil_commit_waiter_timeout() function. 
- */ -typedef struct zil_commit_waiter { - kcondvar_t zcw_cv; /* signalled when "done" */ - kmutex_t zcw_lock; /* protects fields of this struct */ - list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */ - lwb_t *zcw_lwb; /* back pointer to lwb when linked */ - boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */ - int zcw_zio_error; /* contains the zio io_error value */ -} zil_commit_waiter_t; - -/* - * Intent log transaction lists - */ -typedef struct itxs { - list_t i_sync_list; /* list of synchronous itxs */ - avl_tree_t i_async_tree; /* tree of foids for async itxs */ -} itxs_t; - -typedef struct itxg { - kmutex_t itxg_lock; /* lock for this structure */ - uint64_t itxg_txg; /* txg for this chain */ - itxs_t *itxg_itxs; /* sync and async itxs */ -} itxg_t; - -/* for async nodes we build up an AVL tree of lists of async itxs per file */ -typedef struct itx_async_node { - uint64_t ia_foid; /* file object id */ - list_t ia_list; /* list of async itxs for this foid */ - avl_node_t ia_node; /* AVL tree linkage */ -} itx_async_node_t; - -/* - * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs - * we've touched so we know which ones need a write cache flush at the end. - */ -typedef struct zil_vdev_node { - uint64_t zv_vdev; /* vdev to be flushed */ - avl_node_t zv_node; /* AVL tree linkage */ -} zil_vdev_node_t; - -#define ZIL_PREV_BLKS 16 - -/* - * Stable storage intent log management structure. One per dataset. 
- */ -struct zilog { - kmutex_t zl_lock; /* protects most zilog_t fields */ - struct dsl_pool *zl_dmu_pool; /* DSL pool */ - spa_t *zl_spa; /* handle for read/write log */ - const zil_header_t *zl_header; /* log header buffer */ - objset_t *zl_os; /* object set we're logging */ - zil_get_data_t *zl_get_data; /* callback to get object content */ - lwb_t *zl_last_lwb_opened; /* most recent lwb opened */ - hrtime_t zl_last_lwb_latency; /* zio latency of last lwb done */ - uint64_t zl_lr_seq; /* on-disk log record sequence number */ - uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ - uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ - uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ - uint64_t zl_replaying_seq; /* current replay seq number */ - uint32_t zl_suspend; /* log suspend count */ - kcondvar_t zl_cv_suspend; /* log suspend completion */ - uint8_t zl_suspending; /* log is currently suspending */ - uint8_t zl_keep_first; /* keep first log block in destroy */ - uint8_t zl_replay; /* replaying records while set */ - uint8_t zl_stop_sync; /* for debugging */ - kmutex_t zl_issuer_lock; /* single writer, per ZIL, at a time */ - uint8_t zl_logbias; /* latency or throughput */ - uint8_t zl_sync; /* synchronous or asynchronous */ - int zl_parse_error; /* last zil_parse() error */ - uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ - uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ - uint64_t zl_parse_blk_count; /* number of blocks parsed */ - uint64_t zl_parse_lr_count; /* number of log records parsed */ - itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ - list_t zl_itx_commit_list; /* itx list to be committed */ - uint64_t zl_cur_used; /* current commit log size used */ - list_t zl_lwb_list; /* in-flight log write list */ - avl_tree_t zl_bp_tree; /* track bps during log parse */ - clock_t zl_replay_time; /* lbolt of when replay started */ - uint64_t zl_replay_blks; /* number of log blocks replayed */ - 
zil_header_t zl_old_header; /* debugging aid */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ - uint_t zl_prev_rotor; /* rotor for zl_prev[] */ - txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ - uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ - /* - * Max block size for this ZIL. Note that this can not be changed - * while the ZIL is in use because consumers (ZPL/zvol) need to take - * this into account when deciding between WR_COPIED and WR_NEED_COPY - * (see zil_max_copied_data()). - */ - uint64_t zl_max_block_size; -}; - -typedef struct zil_bp_node { - dva_t zn_dva; - avl_node_t zn_node; -} zil_bp_node_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIL_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h deleted file mode 100644 index 99aecb67069b..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ /dev/null @@ -1,675 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. 
All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright 2016 Toomas Soome - */ - -#ifndef _ZIO_H -#define _ZIO_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Embedded checksum - */ -#define ZEC_MAGIC 0x210da7ab10c7a11ULL - -typedef struct zio_eck { - uint64_t zec_magic; /* for validation, endianness */ - zio_cksum_t zec_cksum; /* 256-bit checksum */ -} zio_eck_t; - -/* - * Gang block headers are self-checksumming and contain an array - * of block pointers. - */ -#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE -#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t)) / sizeof (blkptr_t)) -#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t) - \ - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ - sizeof (uint64_t)) - -typedef struct zio_gbh { - blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; - uint64_t zg_filler[SPA_GBH_FILLER]; - zio_eck_t zg_tail; -} zio_gbh_phys_t; - -enum zio_checksum { - ZIO_CHECKSUM_INHERIT = 0, - ZIO_CHECKSUM_ON, - ZIO_CHECKSUM_OFF, - ZIO_CHECKSUM_LABEL, - ZIO_CHECKSUM_GANG_HEADER, - ZIO_CHECKSUM_ZILOG, - ZIO_CHECKSUM_FLETCHER_2, - ZIO_CHECKSUM_FLETCHER_4, - ZIO_CHECKSUM_SHA256, - ZIO_CHECKSUM_ZILOG2, - ZIO_CHECKSUM_NOPARITY, - ZIO_CHECKSUM_SHA512, - ZIO_CHECKSUM_SKEIN, -#ifdef illumos - ZIO_CHECKSUM_EDONR, -#endif - ZIO_CHECKSUM_FUNCTIONS -}; - -/* - * The number of "legacy" compression functions which can be set on individual - * objects. 
- */ -#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 - -#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 -#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON - -#define ZIO_CHECKSUM_MASK 0xffULL -#define ZIO_CHECKSUM_VERIFY (1 << 8) - -#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 -#define ZIO_DEDUPDITTO_MIN 100 - -/* - * The number of "legacy" compression functions which can be set on individual - * objects. - */ -#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 - -/* - * The meaning of "compress = on" selected by the compression features enabled - * on a given pool. - */ -#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB -#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 - -#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF - -#define BOOTFS_COMPRESS_VALID(compress) \ - ((compress) == ZIO_COMPRESS_LZJB || \ - (compress) == ZIO_COMPRESS_LZ4 || \ - (compress) == ZIO_COMPRESS_ON || \ - (compress) == ZIO_COMPRESS_OFF) - -#define ZIO_FAILURE_MODE_WAIT 0 -#define ZIO_FAILURE_MODE_CONTINUE 1 -#define ZIO_FAILURE_MODE_PANIC 2 - -typedef enum zio_suspend_reason { - ZIO_SUSPEND_NONE = 0, - ZIO_SUSPEND_IOERR, - ZIO_SUSPEND_MMP, -} zio_suspend_reason_t; - -enum zio_flag { - /* - * Flags inherited by gang, ddt, and vdev children, - * and that must be equal for two zios to aggregate - */ - ZIO_FLAG_DONT_AGGREGATE = 1 << 0, - ZIO_FLAG_IO_REPAIR = 1 << 1, - ZIO_FLAG_SELF_HEAL = 1 << 2, - ZIO_FLAG_RESILVER = 1 << 3, - ZIO_FLAG_SCRUB = 1 << 4, - ZIO_FLAG_SCAN_THREAD = 1 << 5, - ZIO_FLAG_PHYSICAL = 1 << 6, - -#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) - - /* - * Flags inherited by ddt, gang, and vdev children. 
- */ - ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1 << 8, - ZIO_FLAG_CONFIG_WRITER = 1 << 9, - ZIO_FLAG_DONT_RETRY = 1 << 10, - ZIO_FLAG_DONT_CACHE = 1 << 11, - ZIO_FLAG_NODATA = 1 << 12, - ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, - ZIO_FLAG_IO_ALLOCATING = 1 << 14, - -#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) -#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) - - /* - * Flags inherited by vdev children. - */ - ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 16, - ZIO_FLAG_TRYHARD = 1 << 17, - ZIO_FLAG_OPTIONAL = 1 << 18, - -#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) - - /* - * Flags not inherited by any children. - */ - ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 20, - ZIO_FLAG_IO_BYPASS = 1 << 21, - ZIO_FLAG_IO_REWRITE = 1 << 22, - ZIO_FLAG_RAW = 1 << 23, - ZIO_FLAG_GANG_CHILD = 1 << 24, - ZIO_FLAG_DDT_CHILD = 1 << 25, - ZIO_FLAG_GODFATHER = 1 << 26, - ZIO_FLAG_NOPWRITE = 1 << 27, - ZIO_FLAG_REEXECUTED = 1 << 28, - ZIO_FLAG_DELEGATED = 1 << 29, -}; - -#define ZIO_FLAG_MUSTSUCCEED 0 - -#define ZIO_DDT_CHILD_FLAGS(zio) \ - (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ - ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) - -#define ZIO_GANG_CHILD_FLAGS(zio) \ - (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ - ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) - -#define ZIO_VDEV_CHILD_FLAGS(zio) \ - (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) - -#define ZIO_CHILD_BIT(x) (1 << (x)) -#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) - -enum zio_child { - ZIO_CHILD_VDEV = 0, - ZIO_CHILD_GANG, - ZIO_CHILD_DDT, - ZIO_CHILD_LOGICAL, - ZIO_CHILD_TYPES -}; - -#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) -#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) -#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) -#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) 
-#define ZIO_CHILD_ALL_BITS \ - (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ - ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) - -enum zio_wait_type { - ZIO_WAIT_READY = 0, - ZIO_WAIT_DONE, - ZIO_WAIT_TYPES -}; - -/* - * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD - * equivalents. This gives us more useful error messages from strerror(3). - */ -#define ECKSUM EINTEGRITY -#define EFRAGS ENOSPC - -typedef void zio_done_func_t(zio_t *zio); - -extern boolean_t zio_dva_throttle_enabled; -extern const char *zio_type_name[ZIO_TYPES]; - -/* - * A bookmark is a four-tuple that uniquely - * identifies any block in the pool. By convention, the meta-objset (MOS) - * is objset 0, and the meta-dnode is object 0. This covers all blocks - * except root blocks and ZIL blocks, which are defined as follows: - * - * Root blocks (objset_phys_t) are object 0, level -1: . - * ZIL blocks are bookmarked . - * dmu_sync()ed ZIL data blocks are bookmarked . - * dnode visit bookmarks are . - * - * Note: this structure is called a bookmark because its original purpose - * was to remember where to resume a pool-wide traverse. - * - * Note: this structure is passed between userland and the kernel, and is - * stored on disk (by virtue of being incorporated into other on-disk - * structures, e.g. dsl_scan_phys_t). 
- */ -typedef struct zbookmark_phys { - uint64_t zb_objset; - uint64_t zb_object; - int64_t zb_level; - uint64_t zb_blkid; -} zbookmark_phys_t; - -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -#define ZB_DESTROYED_OBJSET (-1ULL) - -#define ZB_ROOT_OBJECT (0ULL) -#define ZB_ROOT_LEVEL (-1LL) -#define ZB_ROOT_BLKID (0ULL) - -#define ZB_ZIL_OBJECT (0ULL) -#define ZB_ZIL_LEVEL (-2LL) - -#define ZB_DNODE_LEVEL (-3LL) -#define ZB_DNODE_BLKID (0ULL) - -#define ZB_IS_ZERO(zb) \ - ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ - (zb)->zb_level == 0 && (zb)->zb_blkid == 0) -#define ZB_IS_ROOT(zb) \ - ((zb)->zb_object == ZB_ROOT_OBJECT && \ - (zb)->zb_level == ZB_ROOT_LEVEL && \ - (zb)->zb_blkid == ZB_ROOT_BLKID) - -typedef struct zio_prop { - enum zio_checksum zp_checksum; - enum zio_compress zp_compress; - dmu_object_type_t zp_type; - uint8_t zp_level; - uint8_t zp_copies; - boolean_t zp_dedup; - boolean_t zp_dedup_verify; - boolean_t zp_nopwrite; - uint32_t zp_zpl_smallblk; -} zio_prop_t; - -typedef struct zio_cksum_report zio_cksum_report_t; - -typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, - const void *good_data); -typedef void zio_cksum_free_f(void *cbdata, size_t size); - -struct zio_bad_cksum; /* defined in zio_checksum.h */ -struct dnode_phys; -struct abd; - -struct zio_cksum_report { - struct zio_cksum_report *zcr_next; - nvlist_t *zcr_ereport; - nvlist_t *zcr_detector; - void *zcr_cbdata; - size_t zcr_cbinfo; /* passed to zcr_free() */ - uint64_t zcr_align; - uint64_t zcr_length; - zio_cksum_finish_f *zcr_finish; - zio_cksum_free_f *zcr_free; - - /* internal use only */ - struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ -}; - -typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, - void *arg); - -zio_vsd_cksum_report_f zio_vsd_default_cksum_report; - -typedef struct zio_vsd_ops { 
- zio_done_func_t *vsd_free; - zio_vsd_cksum_report_f *vsd_cksum_report; -} zio_vsd_ops_t; - -typedef struct zio_gang_node { - zio_gbh_phys_t *gn_gbh; - struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; -} zio_gang_node_t; - -typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, struct abd *data, uint64_t offset); - -typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); - -typedef struct zio_transform { - struct abd *zt_orig_abd; - uint64_t zt_orig_size; - uint64_t zt_bufsize; - zio_transform_func_t *zt_transform; - struct zio_transform *zt_next; -} zio_transform_t; - -typedef zio_t *zio_pipe_stage_t(zio_t *zio); - -/* - * The io_reexecute flags are distinct from io_flags because the child must - * be able to propagate them to the parent. The normal io_flags are local - * to the zio, not protected by any lock, and not modifiable by children; - * the reexecute flags are protected by io_lock, modifiable by children, - * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. - */ -#define ZIO_REEXECUTE_NOW 0x01 -#define ZIO_REEXECUTE_SUSPEND 0x02 - -typedef struct zio_alloc_list { - list_t zal_list; - uint64_t zal_size; -} zio_alloc_list_t; - -typedef struct zio_link { - zio_t *zl_parent; - zio_t *zl_child; - list_node_t zl_parent_node; - list_node_t zl_child_node; -} zio_link_t; - -/* - * Used for TRIM kstat. - */ -typedef struct zio_trim_stats { - /* - * Number of bytes successfully TRIMmed. - */ - kstat_named_t bytes; - - /* - * Number of successful TRIM requests. - */ - kstat_named_t success; - - /* - * Number of TRIM requests that failed because TRIM is not - * supported. - */ - kstat_named_t unsupported; - - /* - * Number of TRIM requests that failed for other reasons. 
- */ - kstat_named_t failed; -} zio_trim_stats_t; - -extern zio_trim_stats_t zio_trim_stats; - -#define ZIO_TRIM_STAT_INCR(stat, val) \ - atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); -#define ZIO_TRIM_STAT_BUMP(stat) \ - ZIO_TRIM_STAT_INCR(stat, 1); - -struct zio { - /* Core information about this I/O */ - zbookmark_phys_t io_bookmark; - zio_prop_t io_prop; - zio_type_t io_type; - enum zio_child io_child_type; - int io_cmd; - zio_priority_t io_priority; - uint8_t io_reexecute; - uint8_t io_state[ZIO_WAIT_TYPES]; - uint64_t io_txg; - spa_t *io_spa; - blkptr_t *io_bp; - blkptr_t *io_bp_override; - blkptr_t io_bp_copy; - list_t io_parent_list; - list_t io_child_list; - zio_t *io_logical; - zio_transform_t *io_transform_stack; - - /* Callback info */ - zio_done_func_t *io_ready; - zio_done_func_t *io_children_ready; - zio_done_func_t *io_physdone; - zio_done_func_t *io_done; - void *io_private; - int64_t io_prev_space_delta; /* DMU private */ - blkptr_t io_bp_orig; - - /* Data represented by this I/O */ - struct abd *io_abd; - struct abd *io_orig_abd; - uint64_t io_size; - uint64_t io_orig_size; - /* io_lsize != io_orig_size iff this is a raw write */ - uint64_t io_lsize; - - /* Stuff for the vdev stack */ - vdev_t *io_vd; - void *io_vsd; - const zio_vsd_ops_t *io_vsd_ops; - metaslab_class_t *io_metaslab_class; /* dva throttle class */ - - uint64_t io_offset; - hrtime_t io_timestamp; - hrtime_t io_queued_timestamp; - hrtime_t io_target_timestamp; - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; - zio_alloc_list_t io_alloc_list; - -#ifdef __FreeBSD__ - struct bio *io_bio; -#ifdef _KERNEL - struct callout io_timer; -#endif -#endif - - /* Internal pipeline state */ - enum zio_flag io_flags; - enum zio_stage io_stage; - enum zio_stage io_pipeline; - enum zio_flag io_orig_flags; - enum zio_stage io_orig_stage; - enum zio_stage io_orig_pipeline; - enum zio_stage io_pipeline_trace; - int io_error; - int 
io_child_error[ZIO_CHILD_TYPES]; - uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; - uint64_t io_child_count; - uint64_t io_phys_children; - uint64_t io_parent_count; - uint64_t *io_stall; - zio_t *io_gang_leader; - zio_gang_node_t *io_gang_tree; - void *io_executor; - void *io_waiter; - kmutex_t io_lock; - kcondvar_t io_cv; - int io_allocator; - - /* FMA state */ - zio_cksum_report_t *io_cksum_report; - uint64_t io_ena; - - /* Taskq dispatching state */ - taskq_ent_t io_tqent; - - avl_node_t io_trim_node; - list_node_t io_trim_link; -}; - -extern int zio_bookmark_compare(const void *, const void *); - -extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, - zio_done_func_t *done, void *priv, enum zio_flag flags); - -extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *priv, enum zio_flag flags); - -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); - -extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, - zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *priv, zio_priority_t priority, enum zio_flag flags, - const zbookmark_phys_t *zb); - -extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); - -extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, - boolean_t nopwrite); - -extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); - -extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, - zio_done_func_t *done, void *priv, enum zio_flag flags); - -extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t 
*vd, int cmd, - uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags); - -extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, struct abd *data, int checksum, - zio_done_func_t *done, void *priv, zio_priority_t priority, - enum zio_flag flags, boolean_t labels); - -extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, struct abd *data, int checksum, - zio_done_func_t *done, void *priv, zio_priority_t priority, - enum zio_flag flags, boolean_t labels); - -extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, uint64_t size, enum zio_flag flags); - -extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, - blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog); -extern void zio_flush(zio_t *zio, vdev_t *vd); -extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, - uint64_t size); -extern void zio_shrink(zio_t *zio, uint64_t size); - -extern int zio_wait(zio_t *zio); -extern void zio_nowait(zio_t *zio); -extern void zio_execute(zio_t *zio); -extern void zio_interrupt(zio_t *zio); -extern void zio_delay_init(zio_t *zio); -extern void zio_delay_interrupt(zio_t *zio); - -extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); -extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); -extern zio_t *zio_unique_parent(zio_t *cio); -extern void zio_add_child(zio_t *pio, zio_t *cio); - -extern void *zio_buf_alloc(size_t size); -extern void zio_buf_free(void *buf, size_t size); -extern void *zio_data_buf_alloc(size_t size); -extern void zio_data_buf_free(void *buf, size_t size); - -extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, - uint64_t bufsize, zio_transform_func_t *transform); -extern void zio_pop_transforms(zio_t *zio); - -extern void zio_resubmit_stage_async(void *); - -extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t 
*vd, - uint64_t offset, struct abd *data, uint64_t size, int type, - zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *priv); - -extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *priv); - -extern void zio_vdev_io_bypass(zio_t *zio); -extern void zio_vdev_io_reissue(zio_t *zio); -extern void zio_vdev_io_redone(zio_t *zio); - -extern void zio_change_priority(zio_t *pio, zio_priority_t priority); - -extern void zio_checksum_verified(zio_t *zio); -extern int zio_worst_error(int e1, int e2); - -extern enum zio_checksum zio_checksum_select(enum zio_checksum child, - enum zio_checksum parent); -extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, - enum zio_checksum child, enum zio_checksum parent); -extern enum zio_compress zio_compress_select(spa_t *spa, - enum zio_compress child, enum zio_compress parent); - -extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); -extern int zio_resume(spa_t *spa); -extern void zio_resume_wait(spa_t *spa); - -/* - * Initial setup and teardown. 
- */ -extern void zio_init(void); -extern void zio_fini(void); - -/* - * Fault injection - */ -struct zinject_record; -extern uint32_t zio_injection_enabled; -extern int zio_inject_fault(char *name, int flags, int *id, - struct zinject_record *record); -extern int zio_inject_list_next(int *id, char *name, size_t buflen, - struct zinject_record *record); -extern int zio_clear_fault(int id); -extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); -extern int zio_handle_fault_injection(zio_t *zio, int error); -extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); -extern int zio_handle_label_injection(zio_t *zio, int error); -extern void zio_handle_ignored_writes(zio_t *zio); -extern hrtime_t zio_handle_io_delay(zio_t *zio); - -/* - * Checksum ereport functions - */ -extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, - uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); -extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, - const void *good_data, const void *bad_data, boolean_t drop_if_identical); - -extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); -extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); - -/* If we have the good data in hand, this function can be used */ -extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t offset, uint64_t length, - const void *good_data, const void *bad_data, struct zio_bad_cksum *info); - -/* Called from spa_sync(), but primarily an injection handler */ -extern void spa_handle_ignored_writes(spa_t *spa); - -/* zbookmark_phys functions */ -boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, - const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); -int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, - uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); - -#ifdef __cplusplus -} 
-#endif - -#endif /* _ZIO_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h deleted file mode 100644 index 782df534c9a0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright Saso Kiselkov 2013, All rights reserved. - */ - -#ifndef _SYS_ZIO_CHECKSUM_H -#define _SYS_ZIO_CHECKSUM_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct abd; - -/* - * Signature for checksum functions. - */ -typedef void zio_checksum_t(struct abd *, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp); -typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); -typedef void zio_checksum_tmpl_free_t(void *ctx_template); - -typedef enum zio_checksum_flags { - /* Strong enough for metadata? 
*/ - ZCHECKSUM_FLAG_METADATA = (1 << 1), - /* ZIO embedded checksum */ - ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), - /* Strong enough for dedup (without verification)? */ - ZCHECKSUM_FLAG_DEDUP = (1 << 3), - /* Uses salt value */ - ZCHECKSUM_FLAG_SALTED = (1 << 4), - /* Strong enough for nopwrite? */ - ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) -} zio_checksum_flags_t; - -/* - * Information about each checksum function. - */ -typedef struct zio_checksum_info { - /* checksum function for each byteorder */ - zio_checksum_t *ci_func[2]; - zio_checksum_tmpl_init_t *ci_tmpl_init; - zio_checksum_tmpl_free_t *ci_tmpl_free; - zio_checksum_flags_t ci_flags; - char *ci_name; /* descriptive name */ -} zio_checksum_info_t; - -typedef struct zio_bad_cksum { - zio_cksum_t zbc_expected; - zio_cksum_t zbc_actual; - const char *zbc_checksum_name; - uint8_t zbc_byteswapped; - uint8_t zbc_injected; - uint8_t zbc_has_cksum; /* expected/actual valid */ -} zio_bad_cksum_t; - -extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; - -/* - * Checksum routines. 
- */ -extern zio_checksum_t abd_checksum_SHA256; -extern zio_checksum_t abd_checksum_SHA512_native; -extern zio_checksum_t abd_checksum_SHA512_byteswap; - -/* Skein */ -extern zio_checksum_t abd_checksum_skein_native; -extern zio_checksum_t abd_checksum_skein_byteswap; -extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init; -extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free; - -#ifdef illumos -/* Edon-R */ -extern zio_checksum_t abd_checksum_edonr_native; -extern zio_checksum_t abd_checksum_edonr_byteswap; -extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; -extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; -#endif - -extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, - void *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern void zio_checksum_compute(zio_t *, enum zio_checksum, - struct abd *, uint64_t); -extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, - struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); -extern enum zio_checksum spa_dedup_checksum(spa_t *spa); -extern void zio_checksum_templates_free(spa_t *spa); -extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h deleted file mode 100644 index aab0282c45be..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2015, 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZIO_COMPRESS_H -#define _SYS_ZIO_COMPRESS_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -enum zio_compress { - ZIO_COMPRESS_INHERIT = 0, - ZIO_COMPRESS_ON, - ZIO_COMPRESS_OFF, - ZIO_COMPRESS_LZJB, - ZIO_COMPRESS_EMPTY, - ZIO_COMPRESS_GZIP_1, - ZIO_COMPRESS_GZIP_2, - ZIO_COMPRESS_GZIP_3, - ZIO_COMPRESS_GZIP_4, - ZIO_COMPRESS_GZIP_5, - ZIO_COMPRESS_GZIP_6, - ZIO_COMPRESS_GZIP_7, - ZIO_COMPRESS_GZIP_8, - ZIO_COMPRESS_GZIP_9, - ZIO_COMPRESS_ZLE, - ZIO_COMPRESS_LZ4, - ZIO_COMPRESS_FUNCTIONS -}; - -/* Common signature for all zio compress functions. */ -typedef size_t zio_compress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); -/* Common signature for all zio decompress functions. */ -typedef int zio_decompress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); -/* - * Common signature for all zio decompress functions using an ABD as input. - * This is helpful if you have both compressed ARC and scatter ABDs enabled, - * but is not a requirement for all compression algorithms. - */ -typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, - size_t s_len, size_t d_len, int); - -/* - * Information about each compression function. 
- */ -typedef struct zio_compress_info { - char *ci_name; - int ci_level; - zio_compress_func_t *ci_compress; - zio_decompress_func_t *ci_decompress; -} zio_compress_info_t; - -extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; - -/* - * Compression routines. - */ -extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern void lz4_init(void); -extern void lz4_fini(void); -extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); - -/* - * Compress and decompress data if necessary. - */ -extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len); -extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len); -extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len); - -/* - * Module lifetime management. 
- */ -extern void zio_compress_init(void); -extern void zio_compress_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIO_COMPRESS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h deleted file mode 100644 index 96b3b0135813..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ /dev/null @@ -1,256 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -#ifndef _ZIO_IMPL_H -#define _ZIO_IMPL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * XXX -- Describe ZFS I/O pipeline here. Fill in as needed. - * - * The ZFS I/O pipeline is comprised of various stages which are defined - * in the zio_stage enum below. The individual stages are used to construct - * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. 
- * - * I/O operations: (XXX - provide detail for each of the operations) - * - * Read: - * Write: - * Free: - * Claim: - * Ioctl: - * - * Although the most common pipeline are used by the basic I/O operations - * above, there are some helper pipelines (one could consider them - * sub-pipelines) which are used internally by the ZIO module and are - * explained below: - * - * Interlock Pipeline: - * The interlock pipeline is the most basic pipeline and is used by all - * of the I/O operations. The interlock pipeline does not perform any I/O - * and is used to coordinate the dependencies between I/Os that are being - * issued (i.e. the parent/child relationship). - * - * Vdev child Pipeline: - * The vdev child pipeline is responsible for performing the physical I/O. - * It is in this pipeline where the I/O are queued and possibly cached. - * - * In addition to performing I/O, the pipeline is also responsible for - * data transformations. The transformations performed are based on the - * specific properties that user may have selected and modify the - * behavior of the pipeline. Examples of supported transformations are - * compression, dedup, and nop writes. Transformations will either modify - * the data or the pipeline. This list below further describes each of - * the supported transformations: - * - * Compression: - * ZFS supports three different flavors of compression -- gzip, lzjb, and - * zle. Compression occurs as part of the write pipeline and is performed - * in the ZIO_STAGE_WRITE_BP_INIT stage. - * - * Dedup: - * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and - * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing - * read pipeline if the dedup bit is set on the block pointer. - * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage - * and added to a write pipeline if a user has enabled dedup on that - * particular dataset. 
- * - * NOP Write: - * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage - * and is added to an existing write pipeline if a crypographically - * secure checksum (i.e. SHA256) is enabled and compression is turned on. - * The NOP write stage will compare the checksums of the current data - * on-disk (level-0 blocks only) and the data that is currently being written. - * If the checksum values are identical then the pipeline is converted to - * an interlock pipeline skipping block allocation and bypassing the - * physical I/O. The nop write feature can handle writes in either - * syncing or open context (i.e. zil writes) and as a result is mutually - * exclusive with dedup. - */ - -/* - * zio pipeline stage definitions - */ -enum zio_stage { - ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ - - ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ - ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ - ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ - ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ - - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */ - - ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */ - - ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */ - - ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */ - - ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */ - - ZIO_STAGE_READY = 1 << 18, /* RWFCI */ - - ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */ - - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */ - - ZIO_STAGE_DONE = 1 << 23 /* RWFCI */ -}; - -#define ZIO_INTERLOCK_STAGES \ - (ZIO_STAGE_READY | \ - ZIO_STAGE_DONE) - -#define 
ZIO_INTERLOCK_PIPELINE \ - ZIO_INTERLOCK_STAGES - -#define ZIO_VDEV_IO_STAGES \ - (ZIO_STAGE_VDEV_IO_START | \ - ZIO_STAGE_VDEV_IO_DONE | \ - ZIO_STAGE_VDEV_IO_ASSESS) - -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_DONE) - -#define ZIO_READ_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_CHECKSUM_VERIFY) - -#define ZIO_READ_PHYS_PIPELINE \ - ZIO_READ_COMMON_STAGES - -#define ZIO_READ_PIPELINE \ - (ZIO_READ_COMMON_STAGES | \ - ZIO_STAGE_READ_BP_INIT) - -#define ZIO_DDT_CHILD_READ_PIPELINE \ - ZIO_READ_COMMON_STAGES - -#define ZIO_DDT_READ_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_READ_BP_INIT | \ - ZIO_STAGE_DDT_READ_START | \ - ZIO_STAGE_DDT_READ_DONE) - -#define ZIO_WRITE_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_CHECKSUM_GENERATE) - -#define ZIO_WRITE_PHYS_PIPELINE \ - ZIO_WRITE_COMMON_STAGES - -#define ZIO_REWRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - ZIO_STAGE_WRITE_COMPRESS | \ - ZIO_STAGE_WRITE_BP_INIT) - -#define ZIO_WRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - ZIO_STAGE_WRITE_BP_INIT | \ - ZIO_STAGE_WRITE_COMPRESS | \ - ZIO_STAGE_DVA_THROTTLE | \ - ZIO_STAGE_DVA_ALLOCATE) - -#define ZIO_DDT_CHILD_WRITE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_DVA_THROTTLE | \ - ZIO_STAGE_DVA_ALLOCATE) - -#define ZIO_DDT_WRITE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_WRITE_BP_INIT | \ - ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_WRITE_COMPRESS | \ - ZIO_STAGE_CHECKSUM_GENERATE | \ - ZIO_STAGE_DDT_WRITE) - -#define ZIO_GANG_STAGES \ - (ZIO_STAGE_GANG_ASSEMBLE | \ - ZIO_STAGE_GANG_ISSUE) - -#define ZIO_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_FREE_BP_INIT | \ - ZIO_STAGE_DVA_FREE) - -#define ZIO_FREE_PHYS_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES) - -#define ZIO_DDT_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_FREE_BP_INIT | \ - 
ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_DDT_FREE) - -#define ZIO_CLAIM_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_DVA_CLAIM) - -#define ZIO_IOCTL_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_VDEV_IO_START | \ - ZIO_STAGE_VDEV_IO_ASSESS) - -#define ZIO_BLOCKING_STAGES \ - (ZIO_STAGE_DVA_ALLOCATE | \ - ZIO_STAGE_DVA_CLAIM | \ - ZIO_STAGE_VDEV_IO_START) - -extern void zio_inject_init(void); -extern void zio_inject_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZIO_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h deleted file mode 100644 index ebe05a09dc4e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - */ -#ifndef _ZIO_PRIORITY_H -#define _ZIO_PRIORITY_H - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum zio_priority { - ZIO_PRIORITY_SYNC_READ, - ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ - ZIO_PRIORITY_ASYNC_READ, /* prefetch */ - ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ - ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ - ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ - ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ - ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ - ZIO_PRIORITY_NUM_QUEUEABLE, - - ZIO_PRIORITY_NOW /* non-queued i/os (e.g. 
free) */ -} zio_priority_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZIO_PRIORITY_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h deleted file mode 100644 index b6eba1a18ff4..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_ZRLOCK_H -#define _SYS_ZRLOCK_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct zrlock { - kmutex_t zr_mtx; - volatile int32_t zr_refcount; - kcondvar_t zr_cv; - uint16_t zr_pad; -#ifdef ZFS_DEBUG - kthread_t *zr_owner; - const char *zr_caller; -#endif -} zrlock_t; - -extern void zrl_init(zrlock_t *); -extern void zrl_destroy(zrlock_t *); -#define zrl_add(_z) zrl_add_impl((_z), __func__) -extern void zrl_add_impl(zrlock_t *, const char *); -extern void zrl_remove(zrlock_t *); -extern int zrl_tryenter(zrlock_t *); -extern void zrl_exit(zrlock_t *); -extern int zrl_is_zero(zrlock_t *); -extern int zrl_is_locked(zrlock_t *); -#ifdef ZFS_DEBUG -extern kthread_t *zrl_owner(zrlock_t *); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZRLOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h deleted file mode 100644 index 33c218ec4c7d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017, 2018 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_ZTHR_H -#define _SYS_ZTHR_H - -typedef struct zthr zthr_t; -typedef void (zthr_func_t)(void *, zthr_t *); -typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *); - -extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc, - zthr_func_t *func, void *arg); -extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc, - zthr_func_t *func, void *arg, hrtime_t nano_wait); -extern void zthr_destroy(zthr_t *t); - -extern void zthr_wakeup(zthr_t *t); -extern void zthr_cancel(zthr_t *t); -extern void zthr_resume(zthr_t *t); - -extern boolean_t zthr_iscancelled(zthr_t *t); - -#endif /* _SYS_ZTHR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h deleted file mode 100644 index 6bd4d42b8c3f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
- */ - -#ifndef _SYS_ZVOL_H -#define _SYS_ZVOL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZVOL_OBJ 1ULL -#define ZVOL_ZAP_OBJ 2ULL - -#ifdef _KERNEL -extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); -extern int zvol_check_volblocksize(uint64_t volblocksize); -extern int zvol_get_stats(objset_t *os, nvlist_t *nv); -extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_set_volsize(const char *, uint64_t); - -#ifdef illumos -extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); -extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); -extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); -extern int zvol_strategy(buf_t *bp); -extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); -extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); -#endif /* illumos */ -extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, - int *rvalp); -extern int zvol_busy(void); -extern void zvol_init(void); -extern void zvol_fini(void); - -#ifdef illumos -extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize, - uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **bonus_hdl); -extern uint64_t zvol_get_volume_size(void *minor_hdl); -extern int zvol_get_volume_wce(void *minor_hdl); -extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, - ssize_t resid, boolean_t sync); -#endif /* illumos */ - -#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) -extern void zvol_create_minors(spa_t *spa, const char *name); -extern void zvol_remove_minors(spa_t *spa, const char *name); -extern void zvol_rename_minors(spa_t *spa, const char *oldname, - const char *newname); -#endif - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} 
-#endif - -#endif /* _SYS_ZVOL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c deleted file mode 100644 index e837320ce538..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c +++ /dev/null @@ -1,634 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2012 Pawel Jakub Dawidek . - * All rights reserved. - */ - -#include -#include -#include -#include -#include - -/* - * Calculate the zio end, upgrading based on ashift which would be - * done by zio_vdev_io_start. - * - * This makes free range consolidation much more effective - * than it would otherwise be as well as ensuring that entire - * blocks are invalidated by writes. - */ -#define TRIM_ZIO_END(vd, offset, size) (offset + \ - P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) - -/* Maximal segment size for ATA TRIM. 
*/ -#define TRIM_MAP_SIZE_FACTOR (512 << 16) - -#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR) - -#define TRIM_MAP_ADD(tm, ts) do { \ - list_insert_tail(&(tm)->tm_head, (ts)); \ - (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ -} while (0) - -#define TRIM_MAP_REM(tm, ts) do { \ - list_remove(&(tm)->tm_head, (ts)); \ - (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ -} while (0) - -typedef struct trim_map { - list_t tm_head; /* List of segments sorted by txg. */ - avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ - avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ - avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ - list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ - kmutex_t tm_lock; - uint64_t tm_pending; /* Count of pending TRIMs. */ -} trim_map_t; - -typedef struct trim_seg { - avl_node_t ts_node; /* AVL node. */ - list_node_t ts_next; /* List element. */ - uint64_t ts_start; /* Starting offset of this segment. */ - uint64_t ts_end; /* Ending offset (non-inclusive). */ - uint64_t ts_txg; /* Segment creation txg. */ - hrtime_t ts_time; /* Segment creation time. 
*/ -} trim_seg_t; - -extern boolean_t zfs_trim_enabled; - -static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */ -static u_int trim_timeout = 30; /* Keep deleted data up to 30s */ -static u_int trim_max_interval = 1; /* 1s delays between TRIMs */ -static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */ - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "ZFS TRIM"); - -SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, - 0, "Delay TRIMs by up to this many TXGs"); -SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, - "Delay TRIMs by up to this many seconds"); -SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, - &trim_max_interval, 0, - "Maximum interval between TRIM queue processing (seconds)"); - -SYSCTL_DECL(_vfs_zfs_vdev); -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, - &trim_vdev_max_pending, 0, - "Maximum pending TRIM segments for a vdev"); - -static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); - -static int -trim_map_seg_compare(const void *x1, const void *x2) -{ - const trim_seg_t *s1 = x1; - const trim_seg_t *s2 = x2; - - if (s1->ts_start < s2->ts_start) { - if (s1->ts_end > s2->ts_start) - return (0); - return (-1); - } - if (s1->ts_start > s2->ts_start) { - if (s1->ts_start < s2->ts_end) - return (0); - return (1); - } - return (0); -} - -static int -trim_map_zio_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = x1; - const zio_t *z2 = x2; - - if (z1->io_offset < z2->io_offset) { - if (z1->io_offset + z1->io_size > z2->io_offset) - return (0); - return (-1); - } - if (z1->io_offset > z2->io_offset) { - if (z1->io_offset < z2->io_offset + z2->io_size) - return (0); - return (1); - } - return (0); -} - -void -trim_map_create(vdev_t *vd) -{ - trim_map_t *tm; - - ASSERT(zfs_trim_enabled && !vd->vdev_notrim && - vd->vdev_ops->vdev_op_leaf); - - tm = 
kmem_zalloc(sizeof (*tm), KM_SLEEP); - mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&tm->tm_head, sizeof (trim_seg_t), - offsetof(trim_seg_t, ts_next)); - list_create(&tm->tm_pending_writes, sizeof (zio_t), - offsetof(zio_t, io_trim_link)); - avl_create(&tm->tm_queued_frees, trim_map_seg_compare, - sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); - avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, - sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); - avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, - sizeof (zio_t), offsetof(zio_t, io_trim_node)); - vd->vdev_trimmap = tm; -} - -void -trim_map_destroy(vdev_t *vd) -{ - trim_map_t *tm; - trim_seg_t *ts; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - if (!zfs_trim_enabled) - return; - - tm = vd->vdev_trimmap; - if (tm == NULL) - return; - - /* - * We may have been called before trim_map_vdev_commit_done() - * had a chance to run, so do it now to prune the remaining - * inflight frees. - */ - trim_map_vdev_commit_done(vd->vdev_spa, vd); - - mutex_enter(&tm->tm_lock); - while ((ts = list_head(&tm->tm_head)) != NULL) { - avl_remove(&tm->tm_queued_frees, ts); - TRIM_MAP_REM(tm, ts); - kmem_free(ts, sizeof (*ts)); - } - mutex_exit(&tm->tm_lock); - - avl_destroy(&tm->tm_queued_frees); - avl_destroy(&tm->tm_inflight_frees); - avl_destroy(&tm->tm_inflight_writes); - list_destroy(&tm->tm_pending_writes); - list_destroy(&tm->tm_head); - mutex_destroy(&tm->tm_lock); - kmem_free(tm, sizeof (*tm)); - vd->vdev_trimmap = NULL; -} - -static void -trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) -{ - avl_index_t where; - trim_seg_t tsearch, *ts_before, *ts_after, *ts; - boolean_t merge_before, merge_after; - hrtime_t time; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - VERIFY(start < end); - - time = gethrtime(); - tsearch.ts_start = start; - tsearch.ts_end = end; - - ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); - if (ts != NULL) { - if (start < ts->ts_start) 
- trim_map_segment_add(tm, start, ts->ts_start, txg); - if (end > ts->ts_end) - trim_map_segment_add(tm, ts->ts_end, end, txg); - return; - } - - ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); - ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); - - merge_before = (ts_before != NULL && ts_before->ts_end == start); - merge_after = (ts_after != NULL && ts_after->ts_start == end); - - if (merge_before && merge_after) { - avl_remove(&tm->tm_queued_frees, ts_before); - TRIM_MAP_REM(tm, ts_before); - TRIM_MAP_REM(tm, ts_after); - ts_after->ts_start = ts_before->ts_start; - ts_after->ts_txg = txg; - ts_after->ts_time = time; - TRIM_MAP_ADD(tm, ts_after); - kmem_free(ts_before, sizeof (*ts_before)); - } else if (merge_before) { - TRIM_MAP_REM(tm, ts_before); - ts_before->ts_end = end; - ts_before->ts_txg = txg; - ts_before->ts_time = time; - TRIM_MAP_ADD(tm, ts_before); - } else if (merge_after) { - TRIM_MAP_REM(tm, ts_after); - ts_after->ts_start = start; - ts_after->ts_txg = txg; - ts_after->ts_time = time; - TRIM_MAP_ADD(tm, ts_after); - } else { - ts = kmem_alloc(sizeof (*ts), KM_SLEEP); - ts->ts_start = start; - ts->ts_end = end; - ts->ts_txg = txg; - ts->ts_time = time; - avl_insert(&tm->tm_queued_frees, ts, where); - TRIM_MAP_ADD(tm, ts); - } -} - -static void -trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, - uint64_t end) -{ - trim_seg_t *nts; - boolean_t left_over, right_over; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - - left_over = (ts->ts_start < start); - right_over = (ts->ts_end > end); - - TRIM_MAP_REM(tm, ts); - if (left_over && right_over) { - nts = kmem_alloc(sizeof (*nts), KM_SLEEP); - nts->ts_start = end; - nts->ts_end = ts->ts_end; - nts->ts_txg = ts->ts_txg; - nts->ts_time = ts->ts_time; - ts->ts_end = start; - avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); - TRIM_MAP_ADD(tm, ts); - TRIM_MAP_ADD(tm, nts); - } else if (left_over) { - ts->ts_end = start; - TRIM_MAP_ADD(tm, ts); - } else if 
(right_over) { - ts->ts_start = end; - TRIM_MAP_ADD(tm, ts); - } else { - avl_remove(&tm->tm_queued_frees, ts); - kmem_free(ts, sizeof (*ts)); - } -} - -static void -trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) -{ - zio_t zsearch, *zs; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - - zsearch.io_offset = start; - zsearch.io_size = end - start; - - zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); - if (zs == NULL) { - trim_map_segment_add(tm, start, end, txg); - return; - } - if (start < zs->io_offset) - trim_map_free_locked(tm, start, zs->io_offset, txg); - if (zs->io_offset + zs->io_size < end) - trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); -} - -void -trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) -{ - trim_map_t *tm = vd->vdev_trimmap; - - if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) - return; - - mutex_enter(&tm->tm_lock); - trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); - mutex_exit(&tm->tm_lock); -} - -boolean_t -trim_map_write_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - trim_map_t *tm = vd->vdev_trimmap; - trim_seg_t tsearch, *ts; - boolean_t left_over, right_over; - uint64_t start, end; - - if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) - return (B_TRUE); - - start = zio->io_offset; - end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); - tsearch.ts_start = start; - tsearch.ts_end = end; - - mutex_enter(&tm->tm_lock); - - /* - * Checking for colliding in-flight frees. - */ - ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); - if (ts != NULL) { - list_insert_tail(&tm->tm_pending_writes, zio); - mutex_exit(&tm->tm_lock); - return (B_FALSE); - } - - /* - * Loop until all overlapping segments are removed. 
- */ - while ((ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL)) != NULL) { - trim_map_segment_remove(tm, ts, start, end); - } - - avl_add(&tm->tm_inflight_writes, zio); - - mutex_exit(&tm->tm_lock); - - return (B_TRUE); -} - -void -trim_map_write_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - trim_map_t *tm = vd->vdev_trimmap; - - /* - * Don't check for vdev_notrim, since the write could have - * started before vdev_notrim was set. - */ - if (!zfs_trim_enabled || tm == NULL) - return; - - mutex_enter(&tm->tm_lock); - /* - * Don't fail if the write isn't in the tree, since the write - * could have started after vdev_notrim was set. - */ - if (zio->io_trim_node.avl_child[0] || - zio->io_trim_node.avl_child[1] || - AVL_XPARENT(&zio->io_trim_node) || - tm->tm_inflight_writes.avl_root == &zio->io_trim_node) - avl_remove(&tm->tm_inflight_writes, zio); - mutex_exit(&tm->tm_lock); -} - -/* - * Return the oldest segment (the one with the lowest txg / time) or NULL if: - * 1. The list is empty - * 2. The first element's txg is greater than txgsafe - * 3. 
The first element's txg is not greater than the txg argument and the - * the first element's time is not greater than time argument - */ -static trim_seg_t * -trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time, - boolean_t force) -{ - trim_seg_t *ts; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - VERIFY(txgsafe >= txg); - - ts = list_head(&tm->tm_head); - if (ts != NULL && ts->ts_txg <= txgsafe && - (ts->ts_txg <= txg || ts->ts_time <= time || force)) - return (ts); - return (NULL); -} - -static void -trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) -{ - trim_map_t *tm = vd->vdev_trimmap; - trim_seg_t *ts; - uint64_t size, offset, txgtarget, txgsafe; - int64_t hard, soft; - hrtime_t timelimit; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - if (tm == NULL) - return; - - timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC; - if (vd->vdev_isl2cache) { - txgsafe = UINT64_MAX; - txgtarget = UINT64_MAX; - } else { - txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); - if (txgsafe > trim_txg_delay) - txgtarget = txgsafe - trim_txg_delay; - else - txgtarget = 0; - } - - mutex_enter(&tm->tm_lock); - hard = 0; - if (tm->tm_pending > trim_vdev_max_pending) - hard = (tm->tm_pending - trim_vdev_max_pending) / 4; - soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64); - /* Loop until we have sent all outstanding free's */ - while (soft > 0 && - (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0)) - != NULL) { - TRIM_MAP_REM(tm, ts); - avl_remove(&tm->tm_queued_frees, ts); - avl_add(&tm->tm_inflight_frees, ts); - size = ts->ts_end - ts->ts_start; - offset = ts->ts_start; - /* - * We drop the lock while we call zio_nowait as the IO - * scheduler can result in a different IO being run e.g. - * a write which would result in a recursive lock. 
- */ - mutex_exit(&tm->tm_lock); - - zio_nowait(zio_trim(zio, spa, vd, offset, size)); - - soft -= TRIM_MAP_SEGS(size); - hard -= TRIM_MAP_SEGS(size); - mutex_enter(&tm->tm_lock); - } - mutex_exit(&tm->tm_lock); -} - -static void -trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) -{ - trim_map_t *tm = vd->vdev_trimmap; - trim_seg_t *ts; - list_t pending_writes; - zio_t *zio; - uint64_t start, size; - void *cookie; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - if (tm == NULL) - return; - - mutex_enter(&tm->tm_lock); - if (!avl_is_empty(&tm->tm_inflight_frees)) { - cookie = NULL; - while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, - &cookie)) != NULL) { - kmem_free(ts, sizeof (*ts)); - } - } - list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, - io_trim_link)); - list_move_tail(&pending_writes, &tm->tm_pending_writes); - mutex_exit(&tm->tm_lock); - - while ((zio = list_remove_head(&pending_writes)) != NULL) { - zio_vdev_io_reissue(zio); - zio_execute(zio); - } - list_destroy(&pending_writes); -} - -static void -trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) -{ - int c; - - if (vd == NULL) - return; - - if (vd->vdev_ops->vdev_op_leaf) { - trim_map_vdev_commit(spa, zio, vd); - } else { - for (c = 0; c < vd->vdev_children; c++) - trim_map_commit(spa, zio, vd->vdev_child[c]); - } -} - -static void -trim_map_commit_done(spa_t *spa, vdev_t *vd) -{ - int c; - - if (vd == NULL) - return; - - if (vd->vdev_ops->vdev_op_leaf) { - trim_map_vdev_commit_done(spa, vd); - } else { - for (c = 0; c < vd->vdev_children; c++) - trim_map_commit_done(spa, vd->vdev_child[c]); - } -} - -static void -trim_thread(void *arg) -{ - spa_t *spa = arg; - zio_t *zio; - -#ifdef _KERNEL - (void) snprintf(curthread->td_name, sizeof(curthread->td_name), - "trim %s", spa_name(spa)); -#endif - - for (;;) { - mutex_enter(&spa->spa_trim_lock); - if (spa->spa_trim_thread == NULL) { - spa->spa_trim_thread = curthread; - cv_signal(&spa->spa_trim_cv); - mutex_exit(&spa->spa_trim_lock); - 
thread_exit(); - } - - (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, - hz * trim_max_interval); - mutex_exit(&spa->spa_trim_lock); - - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - trim_map_commit(spa, zio, spa->spa_root_vdev); - (void) zio_wait(zio); - trim_map_commit_done(spa, spa->spa_root_vdev); - spa_config_exit(spa, SCL_STATE, FTAG); - } -} - -void -trim_thread_create(spa_t *spa) -{ - - if (!zfs_trim_enabled) - return; - - mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); - mutex_enter(&spa->spa_trim_lock); - spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, - TS_RUN, minclsyspri); - mutex_exit(&spa->spa_trim_lock); -} - -void -trim_thread_destroy(spa_t *spa) -{ - - if (!zfs_trim_enabled) - return; - if (spa->spa_trim_thread == NULL) - return; - - mutex_enter(&spa->spa_trim_lock); - /* Setting spa_trim_thread to NULL tells the thread to stop. */ - spa->spa_trim_thread = NULL; - cv_signal(&spa->spa_trim_cv); - /* The thread will set it back to != NULL on exit. 
*/ - while (spa->spa_trim_thread == NULL) - cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); - spa->spa_trim_thread = NULL; - mutex_exit(&spa->spa_trim_lock); - - cv_destroy(&spa->spa_trim_cv); - mutex_destroy(&spa->spa_trim_lock); -} - -void -trim_thread_wakeup(spa_t *spa) -{ - - if (!zfs_trim_enabled) - return; - if (spa->spa_trim_thread == NULL) - return; - - mutex_enter(&spa->spa_trim_lock); - cv_signal(&spa->spa_trim_cv); - mutex_exit(&spa->spa_trim_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c deleted file mode 100644 index 64a5d0972a74..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ /dev/null @@ -1,977 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright 2011 Martin Matuska - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS Transaction Groups - * ---------------------- - * - * ZFS transaction groups are, as the name implies, groups of transactions - * that act on persistent state. ZFS asserts consistency at the granularity of - * these transaction groups. Each successive transaction group (txg) is - * assigned a 64-bit consecutive identifier. There are three active - * transaction group states: open, quiescing, or syncing. At any given time, - * there may be an active txg associated with each state; each active txg may - * either be processing, or blocked waiting to enter the next state. There may - * be up to three active txgs, and there is always a txg in the open state - * (though it may be blocked waiting to enter the quiescing state). In broad - * strokes, transactions -- operations that change in-memory structures -- are - * accepted into the txg in the open state, and are completed while the txg is - * in the open or quiescing states. The accumulated changes are written to - * disk in the syncing state. - * - * Open - * - * When a new txg becomes active, it first enters the open state. New - * transactions -- updates to in-memory structures -- are assigned to the - * currently open txg. There is always a txg in the open state so that ZFS can - * accept new changes (though the txg may refuse new changes if it has hit - * some limit). ZFS advances the open txg to the next state for a variety of - * reasons such as it hitting a time or size threshold, or the execution of an - * administrative action that must be completed in the syncing state. - * - * Quiescing - * - * After a txg exits the open state, it enters the quiescing state. The - * quiescing state is intended to provide a buffer between accepting new - * transactions in the open state and writing them out to stable storage in - * the syncing state. 
While quiescing, transactions can continue their - * operation without delaying either of the other states. Typically, a txg is - * in the quiescing state very briefly since the operations are bounded by - * software latencies rather than, say, slower I/O latencies. After all - * transactions complete, the txg is ready to enter the next state. - * - * Syncing - * - * In the syncing state, the in-memory state built up during the open and (to - * a lesser degree) the quiescing states is written to stable storage. The - * process of writing out modified data can, in turn modify more data. For - * example when we write new blocks, we need to allocate space for them; those - * allocations modify metadata (space maps)... which themselves must be - * written to stable storage. During the sync state, ZFS iterates, writing out - * data until it converges and all in-memory changes have been written out. - * The first such pass is the largest as it encompasses all the modified user - * data (as opposed to filesystem metadata). Subsequent passes typically have - * far less data to write as they consist exclusively of filesystem metadata. - * - * To ensure convergence, after a certain number of passes ZFS begins - * overwriting locations on stable storage that had been allocated earlier in - * the syncing state (and subsequently freed). ZFS usually allocates new - * blocks to optimize for large, continuous, writes. For the syncing state to - * converge however it must complete a pass where no new blocks are allocated - * since each allocation requires a modification of persistent metadata. - * Further, to hasten convergence, after a prescribed number of passes, ZFS - * also defers frees, and stops compressing. - * - * In addition to writing out user data, we must also execute synctasks during - * the syncing context. A synctask is the mechanism by which some - * administrative activities work such as creating and destroying snapshots or - * datasets. 
Note that when a synctask is initiated it enters the open txg, - * and ZFS then pushes that txg as quickly as possible to completion of the - * syncing state in order to reduce the latency of the administrative - * activity. To complete the syncing state, ZFS writes out a new uberblock, - * the root of the tree of blocks that comprise all state stored on the ZFS - * pool. Finally, if there is a quiesced txg waiting, we signal that it can - * now transition to the syncing state. - */ - -static void txg_sync_thread(void *arg); -static void txg_quiesce_thread(void *arg); - -int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS TXG"); -SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0, - "Maximum seconds worth of delta per txg"); - -/* - * Prepare the txg subsystem. - */ -void -txg_init(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - int c; - bzero(tx, sizeof (tx_state_t)); - - tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); - - for (c = 0; c < max_ncpus; c++) { - int i; - - mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, - NULL); - for (i = 0; i < TXG_SIZE; i++) { - cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, - NULL); - list_create(&tx->tx_cpu[c].tc_callbacks[i], - sizeof (dmu_tx_callback_t), - offsetof(dmu_tx_callback_t, dcb_node)); - } - } - - mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); - - cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); - - tx->tx_open_txg = txg; -} - -/* - * Close down the txg subsystem. 
- */ -void -txg_fini(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - int c; - - ASSERT0(tx->tx_threads); - - mutex_destroy(&tx->tx_sync_lock); - - cv_destroy(&tx->tx_sync_more_cv); - cv_destroy(&tx->tx_sync_done_cv); - cv_destroy(&tx->tx_quiesce_more_cv); - cv_destroy(&tx->tx_quiesce_done_cv); - cv_destroy(&tx->tx_exit_cv); - - for (c = 0; c < max_ncpus; c++) { - int i; - - mutex_destroy(&tx->tx_cpu[c].tc_open_lock); - mutex_destroy(&tx->tx_cpu[c].tc_lock); - for (i = 0; i < TXG_SIZE; i++) { - cv_destroy(&tx->tx_cpu[c].tc_cv[i]); - list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); - } - } - - if (tx->tx_commit_cb_taskq != NULL) - taskq_destroy(tx->tx_commit_cb_taskq); - - kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); - - bzero(tx, sizeof (tx_state_t)); -} - -/* - * Start syncing transaction groups. - */ -void -txg_sync_start(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - mutex_enter(&tx->tx_sync_lock); - - dprintf("pool %p\n", dp); - - ASSERT0(tx->tx_threads); - - tx->tx_threads = 2; - - tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, - dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri); - - /* - * The sync thread can need a larger-than-default stack size on - * 32-bit x86. This is due in part to nested pools and - * scrub_visitbp() recursion. 
- */ - tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, - dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri); - - mutex_exit(&tx->tx_sync_lock); -} - -static void -txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) -{ - CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); - mutex_enter(&tx->tx_sync_lock); -} - -static void -txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) -{ - ASSERT(*tpp != NULL); - *tpp = NULL; - tx->tx_threads--; - cv_broadcast(&tx->tx_exit_cv); - CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ - thread_exit(); -} - -static void -txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) -{ - CALLB_CPR_SAFE_BEGIN(cpr); - - if (time) - (void) cv_timedwait(cv, &tx->tx_sync_lock, time); - else - cv_wait(cv, &tx->tx_sync_lock); - - CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); -} - -/* - * Stop syncing transaction groups. - */ -void -txg_sync_stop(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - dprintf("pool %p\n", dp); - /* - * Finish off any work in progress. - */ - ASSERT3U(tx->tx_threads, ==, 2); - - /* - * We need to ensure that we've vacated the deferred space_maps. - */ - txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); - - /* - * Wake all sync threads and wait for them to die. 
- */ - mutex_enter(&tx->tx_sync_lock); - - ASSERT3U(tx->tx_threads, ==, 2); - - tx->tx_exiting = 1; - - cv_broadcast(&tx->tx_quiesce_more_cv); - cv_broadcast(&tx->tx_quiesce_done_cv); - cv_broadcast(&tx->tx_sync_more_cv); - - while (tx->tx_threads != 0) - cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); - - tx->tx_exiting = 0; - - mutex_exit(&tx->tx_sync_lock); -} - -uint64_t -txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) -{ - tx_state_t *tx = &dp->dp_tx; - tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; - uint64_t txg; - - mutex_enter(&tc->tc_open_lock); - txg = tx->tx_open_txg; - - mutex_enter(&tc->tc_lock); - tc->tc_count[txg & TXG_MASK]++; - mutex_exit(&tc->tc_lock); - - th->th_cpu = tc; - th->th_txg = txg; - - return (txg); -} - -void -txg_rele_to_quiesce(txg_handle_t *th) -{ - tx_cpu_t *tc = th->th_cpu; - - ASSERT(!MUTEX_HELD(&tc->tc_lock)); - mutex_exit(&tc->tc_open_lock); -} - -void -txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) -{ - tx_cpu_t *tc = th->th_cpu; - int g = th->th_txg & TXG_MASK; - - mutex_enter(&tc->tc_lock); - list_move_tail(&tc->tc_callbacks[g], tx_callbacks); - mutex_exit(&tc->tc_lock); -} - -void -txg_rele_to_sync(txg_handle_t *th) -{ - tx_cpu_t *tc = th->th_cpu; - int g = th->th_txg & TXG_MASK; - - mutex_enter(&tc->tc_lock); - ASSERT(tc->tc_count[g] != 0); - if (--tc->tc_count[g] == 0) - cv_broadcast(&tc->tc_cv[g]); - mutex_exit(&tc->tc_lock); - - th->th_cpu = NULL; /* defensive */ -} - -/* - * Blocks until all transactions in the group are committed. - * - * On return, the transaction group has reached a stable state in which it can - * then be passed off to the syncing context. - */ -static __noinline void -txg_quiesce(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - int g = txg & TXG_MASK; - int c; - - /* - * Grab all tc_open_locks so nobody else can get into this txg. 
- */ - for (c = 0; c < max_ncpus; c++) - mutex_enter(&tx->tx_cpu[c].tc_open_lock); - - ASSERT(txg == tx->tx_open_txg); - tx->tx_open_txg++; - tx->tx_open_time = gethrtime(); - - DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); - DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); - - /* - * Now that we've incremented tx_open_txg, we can let threads - * enter the next transaction group. - */ - for (c = 0; c < max_ncpus; c++) - mutex_exit(&tx->tx_cpu[c].tc_open_lock); - - /* - * Quiesce the transaction group by waiting for everyone to txg_exit(). - */ - for (c = 0; c < max_ncpus; c++) { - tx_cpu_t *tc = &tx->tx_cpu[c]; - mutex_enter(&tc->tc_lock); - while (tc->tc_count[g] != 0) - cv_wait(&tc->tc_cv[g], &tc->tc_lock); - mutex_exit(&tc->tc_lock); - } -} - -static void -txg_do_callbacks(void *arg) -{ - list_t *cb_list = arg; - - dmu_tx_do_callbacks(cb_list, 0); - - list_destroy(cb_list); - - kmem_free(cb_list, sizeof (list_t)); -} - -/* - * Dispatch the commit callbacks registered on this txg to worker threads. - * - * If no callbacks are registered for a given TXG, nothing happens. - * This function creates a taskq for the associated pool, if needed. - */ -static void -txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) -{ - int c; - tx_state_t *tx = &dp->dp_tx; - list_t *cb_list; - - for (c = 0; c < max_ncpus; c++) { - tx_cpu_t *tc = &tx->tx_cpu[c]; - /* - * No need to lock tx_cpu_t at this point, since this can - * only be called once a txg has been synced. - */ - - int g = txg & TXG_MASK; - - if (list_is_empty(&tc->tc_callbacks[g])) - continue; - - if (tx->tx_commit_cb_taskq == NULL) { - /* - * Commit callback taskq hasn't been created yet. 
- */ - tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, - TASKQ_PREPOPULATE); - } - - cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); - list_create(cb_list, sizeof (dmu_tx_callback_t), - offsetof(dmu_tx_callback_t, dcb_node)); - - list_move_tail(cb_list, &tc->tc_callbacks[g]); - - (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) - txg_do_callbacks, cb_list, TQ_SLEEP); - } -} - -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - -static boolean_t -txg_is_quiescing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_quiescing_txg != 0); -} - -static boolean_t -txg_has_quiesced_to_sync(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_quiesced_txg != 0); -} - -static void -txg_sync_thread(void *arg) -{ - dsl_pool_t *dp = arg; - spa_t *spa = dp->dp_spa; - tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - uint64_t start, delta; - - txg_thread_enter(tx, &cpr); - - start = delta = 0; - for (;;) { - uint64_t timeout = zfs_txg_timeout * hz; - uint64_t timer; - uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100; - - /* - * We sync when we're scanning, there's someone waiting - * on us, or the quiesce thread has handed off a txg to - * us, or we have reached our timeout. - */ - timer = (delta >= timeout ? 
0 : timeout - delta); - while (!dsl_scan_active(dp->dp_scan) && - !tx->tx_exiting && timer > 0 && - tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { - dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); - delta = ddi_get_lbolt() - start; - timer = (delta > timeout ? 0 : timeout - delta); - } - - /* - * Wait until the quiesce thread hands off a txg to us, - * prompting it to do so if necessary. - */ - while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { - if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) - tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; - cv_broadcast(&tx->tx_quiesce_more_cv); - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); - } - - if (tx->tx_exiting) - txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); - - /* - * Consume the quiesced txg which has been handed off to - * us. This may cause the quiescing thread to now be - * able to quiesce another txg, so we must signal it. - */ - ASSERT(tx->tx_quiesced_txg != 0); - txg = tx->tx_quiesced_txg; - tx->tx_quiesced_txg = 0; - tx->tx_syncing_txg = txg; - DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_quiesce_more_cv); - - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - mutex_exit(&tx->tx_sync_lock); - - start = ddi_get_lbolt(); - spa_sync(spa, txg); - delta = ddi_get_lbolt() - start; - - mutex_enter(&tx->tx_sync_lock); - tx->tx_synced_txg = txg; - tx->tx_syncing_txg = 0; - DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_done_cv); - - /* - * Dispatch commit callbacks to worker threads. 
- */ - txg_dispatch_callbacks(dp, txg); - } -} - -static void -txg_quiesce_thread(void *arg) -{ - dsl_pool_t *dp = arg; - tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - - txg_thread_enter(tx, &cpr); - - for (;;) { - uint64_t txg; - - /* - * We quiesce when there's someone waiting on us. - * However, we can only have one txg in "quiescing" or - * "quiesced, waiting to sync" state. So we wait until - * the "quiesced, waiting to sync" txg has been consumed - * by the sync thread. - */ - while (!tx->tx_exiting && - (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - txg_has_quiesced_to_sync(dp))) - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); - - if (tx->tx_exiting) - txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); - - txg = tx->tx_open_txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, - tx->tx_sync_txg_waiting); - tx->tx_quiescing_txg = txg; - - mutex_exit(&tx->tx_sync_lock); - txg_quiesce(dp, txg); - mutex_enter(&tx->tx_sync_lock); - - /* - * Hand this txg off to the sync thread. - */ - dprintf("quiesce done, handing off txg %llu\n", txg); - tx->tx_quiescing_txg = 0; - tx->tx_quiesced_txg = txg; - DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_more_cv); - cv_broadcast(&tx->tx_quiesce_done_cv); - } -} - -/* - * Delay this thread by delay nanoseconds if we are still in the open - * transaction group and there is already a waiting txg quiesing or quiesced. - * Abort the delay if this txg stalls or enters the quiesing state. 
- */ -void -txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) -{ - tx_state_t *tx = &dp->dp_tx; - hrtime_t start = gethrtime(); - - /* don't delay if this txg could transition to quiescing immediately */ - if (tx->tx_open_txg > txg || - tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) - return; - - mutex_enter(&tx->tx_sync_lock); - if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { - mutex_exit(&tx->tx_sync_lock); - return; - } - - while (gethrtime() - start < delay && - tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { - (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, - &tx->tx_sync_lock, delay, resolution, 0); - } - - mutex_exit(&tx->tx_sync_lock); -} - -static boolean_t -txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig) -{ - tx_state_t *tx = &dp->dp_tx; - - ASSERT(!dsl_pool_config_held(dp)); - - mutex_enter(&tx->tx_sync_lock); - ASSERT3U(tx->tx_threads, ==, 2); - if (txg == 0) - txg = tx->tx_open_txg + TXG_DEFER_SIZE; - if (tx->tx_sync_txg_waiting < txg) - tx->tx_sync_txg_waiting = txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - while (tx->tx_synced_txg < txg) { - dprintf("broadcasting sync more " - "tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - cv_broadcast(&tx->tx_sync_more_cv); - if (wait_sig) { - /* - * Condition wait here but stop if the thread receives a - * signal. The caller may call txg_wait_synced*() again - * to resume waiting for this txg. - */ -#ifdef __FreeBSD__ - /* - * FreeBSD returns EINTR or ERESTART if there is - * a pending signal, zero if the conditional variable - * is signaled. illumos returns zero in the former case - * and >0 in the latter. 
- */ - if (cv_wait_sig(&tx->tx_sync_done_cv, - &tx->tx_sync_lock) != 0) { -#else - if (cv_wait_sig(&tx->tx_sync_done_cv, - &tx->tx_sync_lock) == 0) { -#endif - - mutex_exit(&tx->tx_sync_lock); - return (B_TRUE); - } - } else { - cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); - } - } - mutex_exit(&tx->tx_sync_lock); - return (B_FALSE); -} - -void -txg_wait_synced(dsl_pool_t *dp, uint64_t txg) -{ - VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE)); -} - -/* - * Similar to a txg_wait_synced but it can be interrupted from a signal. - * Returns B_TRUE if the thread was signaled while waiting. - */ -boolean_t -txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg) -{ - return (txg_wait_synced_impl(dp, txg, B_TRUE)); -} - -void -txg_wait_open(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - - ASSERT(!dsl_pool_config_held(dp)); - - mutex_enter(&tx->tx_sync_lock); - ASSERT3U(tx->tx_threads, ==, 2); - if (txg == 0) - txg = tx->tx_open_txg + 1; - if (tx->tx_quiesce_txg_waiting < txg) - tx->tx_quiesce_txg_waiting = txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - while (tx->tx_open_txg < txg) { - cv_broadcast(&tx->tx_quiesce_more_cv); - cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); - } - mutex_exit(&tx->tx_sync_lock); -} - -/* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by queiscing the open txg. 
- */ -void -txg_kick(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - ASSERT(!dsl_pool_config_held(dp)); - - mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && - !txg_is_quiescing(dp) && - tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { - tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; - cv_broadcast(&tx->tx_quiesce_more_cv); - } - mutex_exit(&tx->tx_sync_lock); -} - -boolean_t -txg_stalled(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); -} - -boolean_t -txg_sync_waiting(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || - tx->tx_quiesced_txg != 0); -} - -/* - * Verify that this txg is active (open, quiescing, syncing). Non-active - * txg's should not be manipulated. - */ -void -txg_verify(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp = spa_get_dsl(spa); - if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) - return; - ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); - ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg); - ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES); -} - -/* - * Per-txg object lists. - */ -void -txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset) -{ - int t; - - mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); - - tl->tl_offset = offset; - tl->tl_spa = spa; - - for (t = 0; t < TXG_SIZE; t++) - tl->tl_head[t] = NULL; -} - -void -txg_list_destroy(txg_list_t *tl) -{ - int t; - - for (t = 0; t < TXG_SIZE; t++) - ASSERT(txg_list_empty(tl, t)); - - mutex_destroy(&tl->tl_lock); -} - -boolean_t -txg_list_empty(txg_list_t *tl, uint64_t txg) -{ - txg_verify(tl->tl_spa, txg); - return (tl->tl_head[txg & TXG_MASK] == NULL); -} - -/* - * Returns true if all txg lists are empty. - * - * Warning: this is inherently racy (an item could be added immediately - * after this function returns). 
We don't bother with the lock because - * it wouldn't change the semantics. - */ -boolean_t -txg_all_lists_empty(txg_list_t *tl) -{ - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty(tl, i)) { - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* - * Add an entry to the list (unless it's already on the list). - * Returns B_TRUE if it was actually added. - */ -boolean_t -txg_list_add(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - boolean_t add; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - add = (tn->tn_member[t] == 0); - if (add) { - tn->tn_member[t] = 1; - tn->tn_next[t] = tl->tl_head[t]; - tl->tl_head[t] = tn; - } - mutex_exit(&tl->tl_lock); - - return (add); -} - -/* - * Add an entry to the end of the list, unless it's already on the list. - * (walks list to find end) - * Returns B_TRUE if it was actually added. - */ -boolean_t -txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - boolean_t add; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - add = (tn->tn_member[t] == 0); - if (add) { - txg_node_t **tp; - - for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) - continue; - - tn->tn_member[t] = 1; - tn->tn_next[t] = NULL; - *tp = tn; - } - mutex_exit(&tl->tl_lock); - - return (add); -} - -/* - * Remove the head of the list and return it. 
- */ -void * -txg_list_remove(txg_list_t *tl, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn; - void *p = NULL; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - if ((tn = tl->tl_head[t]) != NULL) { - ASSERT(tn->tn_member[t]); - ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); - p = (char *)tn - tl->tl_offset; - tl->tl_head[t] = tn->tn_next[t]; - tn->tn_next[t] = NULL; - tn->tn_member[t] = 0; - } - mutex_exit(&tl->tl_lock); - - return (p); -} - -/* - * Remove a specific item from the list and return it. - */ -void * -txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn, **tp; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - - for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { - if ((char *)tn - tl->tl_offset == p) { - *tp = tn->tn_next[t]; - tn->tn_next[t] = NULL; - tn->tn_member[t] = 0; - mutex_exit(&tl->tl_lock); - return (p); - } - } - - mutex_exit(&tl->tl_lock); - - return (NULL); -} - -boolean_t -txg_list_member(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - - txg_verify(tl->tl_spa, txg); - return (tn->tn_member[t] != 0); -} - -/* - * Walk a txg list -- only safe if you know it's not changing. - */ -void * -txg_list_head(txg_list_t *tl, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = tl->tl_head[t]; - - txg_verify(tl->tl_spa, txg); - return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); -} - -void * -txg_list_next(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - - txg_verify(tl->tl_spa, txg); - tn = tn->tn_next[t]; - - return (tn == NULL ? 
NULL : (char *)tn - tl->tl_offset); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c deleted file mode 100644 index b8857d74d810..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include - -int -uberblock_verify(uberblock_t *ub) -{ - if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) - byteswap_uint64_array(ub, sizeof (uberblock_t)); - - if (ub->ub_magic != UBERBLOCK_MAGIC) - return (SET_ERROR(EINVAL)); - - return (0); -} - -/* - * Update the uberblock and return TRUE if anything changed in this - * transaction group. - */ -boolean_t -uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) -{ - ASSERT(ub->ub_txg < txg); - - /* - * We explicitly do not set ub_version here, so that older versions - * continue to be written with the previous uberblock version. 
- */ - ub->ub_magic = UBERBLOCK_MAGIC; - ub->ub_txg = txg; - ub->ub_guid_sum = rvd->vdev_guid_sum; - ub->ub_timestamp = gethrestime_sec(); - ub->ub_software_version = SPA_VERSION; - ub->ub_mmp_magic = MMP_MAGIC; - if (spa_multihost(rvd->vdev_spa)) { - ub->ub_mmp_delay = mmp_delay; - ub->ub_mmp_config = MMP_SEQ_SET(0) | - MMP_INTERVAL_SET(zfs_multihost_interval) | - MMP_FAIL_INT_SET(zfs_multihost_fail_intervals); - } else { - ub->ub_mmp_delay = 0; - ub->ub_mmp_config = 0; - } - ub->ub_checkpoint_txg = 0; - - return (ub->ub_rootbp.blk_birth == txg); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c deleted file mode 100644 index d33f451938b8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#include - -static avl_tree_t unique_avl; -static kmutex_t unique_mtx; - -typedef struct unique { - avl_node_t un_link; - uint64_t un_value; -} unique_t; - -#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) - -static int -unique_compare(const void *a, const void *b) -{ - const unique_t *una = (const unique_t *)a; - const unique_t *unb = (const unique_t *)b; - - return (AVL_CMP(una->un_value, unb->un_value)); -} - -void -unique_init(void) -{ - avl_create(&unique_avl, unique_compare, - sizeof (unique_t), offsetof(unique_t, un_link)); - mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL); -} - -void -unique_fini(void) -{ - avl_destroy(&unique_avl); - mutex_destroy(&unique_mtx); -} - -uint64_t -unique_create(void) -{ - uint64_t value = unique_insert(0); - unique_remove(value); - return (value); -} - -uint64_t -unique_insert(uint64_t value) -{ - avl_index_t idx; - unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP); - - un->un_value = value; - - mutex_enter(&unique_mtx); - while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || - avl_find(&unique_avl, un, &idx)) { - mutex_exit(&unique_mtx); - (void) random_get_pseudo_bytes((void*)&un->un_value, - sizeof (un->un_value)); - un->un_value &= UNIQUE_MASK; - mutex_enter(&unique_mtx); - } - - avl_insert(&unique_avl, un, idx); - mutex_exit(&unique_mtx); - - return (un->un_value); -} - -void -unique_remove(uint64_t value) -{ - unique_t un_tofind; - unique_t *un; - - un_tofind.un_value = value; - mutex_enter(&unique_mtx); - un = avl_find(&unique_avl, &un_tofind, NULL); - if (un != NULL) { - avl_remove(&unique_avl, un); - kmem_free(un, sizeof (unique_t)); - } - mutex_exit(&unique_mtx); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c deleted file mode 100644 index 6043adee0241..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ /dev/null @@ -1,4520 +0,0 @@ -/* - 
* CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - * Copyright 2013 Martin Matuska . All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Toomas Soome - * Copyright 2019 Joyent, Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS VDEV"); - -/* - * Virtual device management. - */ - -/* - * The limit for ZFS to automatically increase a top-level vdev's ashift - * from logical ashift to physical ashift. 
- * - * Example: one or more 512B emulation child vdevs - * child->vdev_ashift = 9 (512 bytes) - * child->vdev_physical_ashift = 12 (4096 bytes) - * zfs_max_auto_ashift = 11 (2048 bytes) - * zfs_min_auto_ashift = 9 (512 bytes) - * - * On pool creation or the addition of a new top-level vdev, ZFS will - * increase the ashift of the top-level vdev to 2048 as limited by - * zfs_max_auto_ashift. - * - * Example: one or more 512B emulation child vdevs - * child->vdev_ashift = 9 (512 bytes) - * child->vdev_physical_ashift = 12 (4096 bytes) - * zfs_max_auto_ashift = 13 (8192 bytes) - * zfs_min_auto_ashift = 9 (512 bytes) - * - * On pool creation or the addition of a new top-level vdev, ZFS will - * increase the ashift of the top-level vdev to 4096 to match the - * max vdev_physical_ashift. - * - * Example: one or more 512B emulation child vdevs - * child->vdev_ashift = 9 (512 bytes) - * child->vdev_physical_ashift = 9 (512 bytes) - * zfs_max_auto_ashift = 13 (8192 bytes) - * zfs_min_auto_ashift = 12 (4096 bytes) - * - * On pool creation or the addition of a new top-level vdev, ZFS will - * increase the ashift of the top-level vdev to 4096 to match the - * zfs_min_auto_ashift. 
- */ -static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; -static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; - -static int -sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_max_auto_ashift; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) - return (EINVAL); - - zfs_max_auto_ashift = val; - - return (0); -} -SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_vfs_zfs_max_auto_ashift, "QU", - "Max ashift used when optimising for logical -> physical sectors size on " - "new top-level vdevs."); - -static int -sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_min_auto_ashift; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) - return (EINVAL); - - zfs_min_auto_ashift = val; - - return (0); -} -SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_vfs_zfs_min_auto_ashift, "QU", - "Min ashift used when creating new top-level vdevs."); - -static vdev_ops_t *vdev_ops_table[] = { - &vdev_root_ops, - &vdev_raidz_ops, - &vdev_mirror_ops, - &vdev_replacing_ops, - &vdev_spare_ops, -#ifdef _KERNEL - &vdev_geom_ops, -#else - &vdev_disk_ops, -#endif - &vdev_file_ops, - &vdev_missing_ops, - &vdev_hole_ops, - &vdev_indirect_ops, - NULL -}; - - -/* default target for number of metaslabs per top-level vdev */ -int zfs_vdev_default_ms_count = 200; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN, - &zfs_vdev_default_ms_count, 0, - "Target number of metaslabs per top-level vdev"); - -/* minimum number of metaslabs per top-level vdev */ -int zfs_vdev_min_ms_count = 16; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, 
CTLFLAG_RWTUN, - &zfs_vdev_min_ms_count, 0, - "Minimum number of metaslabs per top-level vdev"); - -/* practical upper limit of total metaslabs per top-level vdev */ -int zfs_vdev_ms_count_limit = 1ULL << 17; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN, - &zfs_vdev_ms_count_limit, 0, - "Maximum number of metaslabs per top-level vdev"); - -/* lower limit for metaslab size (512M) */ -int zfs_vdev_default_ms_shift = 29; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN, - &zfs_vdev_default_ms_shift, 0, - "Default shift between vdev size and number of metaslabs"); - -/* upper limit for metaslab size (16G) */ -int zfs_vdev_max_ms_shift = 34; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN, - &zfs_vdev_max_ms_shift, 0, - "Maximum shift between vdev size and number of metaslabs"); - -boolean_t vdev_validate_skip = B_FALSE; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, validate_skip, CTLFLAG_RWTUN, - &vdev_validate_skip, 0, - "Bypass vdev validation"); - -/* - * Since the DTL space map of a vdev is not expected to have a lot of - * entries, we default its block size to 4K. - */ -int vdev_dtl_sm_blksz = (1 << 12); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, - &vdev_dtl_sm_blksz, 0, - "Block size for DTL space map. Power of 2 and greater than 4096."); - -/* - * vdev-wide space maps that have lots of entries written to them at - * the end of each transaction can benefit from a higher I/O bandwidth - * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. - */ -int vdev_standard_sm_blksz = (1 << 17); -SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, - &vdev_standard_sm_blksz, 0, - "Block size for standard space map. Power of 2 and greater than 4096."); - -/* - * Tunable parameter for debugging or performance analysis. Setting this - * will cause pool corruption on power loss if a volatile out-of-order - * write cache is enabled. 
- */ -boolean_t zfs_nocacheflush = B_FALSE; -SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RWTUN, - &zfs_nocacheflush, 0, "Disable cache flush"); - -/*PRINTFLIKE2*/ -void -vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) -{ - va_list adx; - char buf[256]; - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - if (vd->vdev_path != NULL) { - zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, - vd->vdev_path, buf); - } else { - zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", - vd->vdev_ops->vdev_op_type, - (u_longlong_t)vd->vdev_id, - (u_longlong_t)vd->vdev_guid, buf); - } -} - -void -vdev_dbgmsg_print_tree(vdev_t *vd, int indent) -{ - char state[20]; - - if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { - zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, - vd->vdev_ops->vdev_op_type); - return; - } - - switch (vd->vdev_state) { - case VDEV_STATE_UNKNOWN: - (void) snprintf(state, sizeof (state), "unknown"); - break; - case VDEV_STATE_CLOSED: - (void) snprintf(state, sizeof (state), "closed"); - break; - case VDEV_STATE_OFFLINE: - (void) snprintf(state, sizeof (state), "offline"); - break; - case VDEV_STATE_REMOVED: - (void) snprintf(state, sizeof (state), "removed"); - break; - case VDEV_STATE_CANT_OPEN: - (void) snprintf(state, sizeof (state), "can't open"); - break; - case VDEV_STATE_FAULTED: - (void) snprintf(state, sizeof (state), "faulted"); - break; - case VDEV_STATE_DEGRADED: - (void) snprintf(state, sizeof (state), "degraded"); - break; - case VDEV_STATE_HEALTHY: - (void) snprintf(state, sizeof (state), "healthy"); - break; - default: - (void) snprintf(state, sizeof (state), "", - (uint_t)vd->vdev_state); - } - - zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, - "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, - vd->vdev_islog ? " (log)" : "", - (u_longlong_t)vd->vdev_guid, - vd->vdev_path ? 
vd->vdev_path : "N/A", state); - - for (uint64_t i = 0; i < vd->vdev_children; i++) - vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); -} - -/* - * Given a vdev type, return the appropriate ops vector. - */ -static vdev_ops_t * -vdev_getops(const char *type) -{ - vdev_ops_t *ops, **opspp; - - for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) - if (strcmp(ops->vdev_op_type, type) == 0) - break; - - return (ops); -} - -/* - * Derive the enumerated alloction bias from string input. - * String origin is either the per-vdev zap or zpool(1M). - */ -static vdev_alloc_bias_t -vdev_derive_alloc_bias(const char *bias) -{ - vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; - - if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) - alloc_bias = VDEV_BIAS_LOG; - else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) - alloc_bias = VDEV_BIAS_SPECIAL; - else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) - alloc_bias = VDEV_BIAS_DEDUP; - - return (alloc_bias); -} - -/* ARGSUSED */ -void -vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) -{ - res->rs_start = in->rs_start; - res->rs_end = in->rs_end; -} - -/* - * Default asize function: return the MAX of psize with the asize of - * all children. This is what's used by anything other than RAID-Z. - */ -uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) -{ - uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); - uint64_t csize; - - for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); - asize = MAX(asize, csize); - } - - return (asize); -} - -/* - * Get the minimum allocatable size. We define the allocatable size as - * the vdev's asize rounded to the nearest metaslab. This allows us to - * replace or attach devices which don't have the same physical size but - * can still satisfy the same number of allocations. 
- */ -uint64_t -vdev_get_min_asize(vdev_t *vd) -{ - vdev_t *pvd = vd->vdev_parent; - - /* - * If our parent is NULL (inactive spare or cache) or is the root, - * just return our own asize. - */ - if (pvd == NULL) - return (vd->vdev_asize); - - /* - * The top-level vdev just returns the allocatable size rounded - * to the nearest metaslab. - */ - if (vd == vd->vdev_top) - return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - - /* - * The allocatable space for a raidz vdev is N * sizeof(smallest child), - * so each child must provide at least 1/Nth of its asize. - */ - if (pvd->vdev_ops == &vdev_raidz_ops) - return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / - pvd->vdev_children); - - return (pvd->vdev_min_asize); -} - -void -vdev_set_min_asize(vdev_t *vd) -{ - vd->vdev_min_asize = vdev_get_min_asize(vd); - - for (int c = 0; c < vd->vdev_children; c++) - vdev_set_min_asize(vd->vdev_child[c]); -} - -vdev_t * -vdev_lookup_top(spa_t *spa, uint64_t vdev) -{ - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - if (vdev < rvd->vdev_children) { - ASSERT(rvd->vdev_child[vdev] != NULL); - return (rvd->vdev_child[vdev]); - } - - return (NULL); -} - -vdev_t * -vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) -{ - vdev_t *mvd; - - if (vd->vdev_guid == guid) - return (vd); - - for (int c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != - NULL) - return (mvd); - - return (NULL); -} - -static int -vdev_count_leaves_impl(vdev_t *vd) -{ - int n = 0; - - if (vd->vdev_ops->vdev_op_leaf) - return (1); - - for (int c = 0; c < vd->vdev_children; c++) - n += vdev_count_leaves_impl(vd->vdev_child[c]); - - return (n); -} - -int -vdev_count_leaves(spa_t *spa) -{ - return (vdev_count_leaves_impl(spa->spa_root_vdev)); -} - -void -vdev_add_child(vdev_t *pvd, vdev_t *cvd) -{ - size_t oldsize, newsize; - uint64_t id = cvd->vdev_id; - vdev_t **newchild; - spa_t *spa = cvd->vdev_spa; - - 
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(cvd->vdev_parent == NULL); - - cvd->vdev_parent = pvd; - - if (pvd == NULL) - return; - - ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); - - oldsize = pvd->vdev_children * sizeof (vdev_t *); - pvd->vdev_children = MAX(pvd->vdev_children, id + 1); - newsize = pvd->vdev_children * sizeof (vdev_t *); - - newchild = kmem_zalloc(newsize, KM_SLEEP); - if (pvd->vdev_child != NULL) { - bcopy(pvd->vdev_child, newchild, oldsize); - kmem_free(pvd->vdev_child, oldsize); - } - - pvd->vdev_child = newchild; - pvd->vdev_child[id] = cvd; - - cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); - ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); - - /* - * Walk up all ancestors to update guid sum. - */ - for (; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) { - list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); - cvd->vdev_spa->spa_leaf_list_gen++; - } -} - -void -vdev_remove_child(vdev_t *pvd, vdev_t *cvd) -{ - int c; - uint_t id = cvd->vdev_id; - - ASSERT(cvd->vdev_parent == pvd); - - if (pvd == NULL) - return; - - ASSERT(id < pvd->vdev_children); - ASSERT(pvd->vdev_child[id] == cvd); - - pvd->vdev_child[id] = NULL; - cvd->vdev_parent = NULL; - - for (c = 0; c < pvd->vdev_children; c++) - if (pvd->vdev_child[c]) - break; - - if (c == pvd->vdev_children) { - kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); - pvd->vdev_child = NULL; - pvd->vdev_children = 0; - } - - if (cvd->vdev_ops->vdev_op_leaf) { - spa_t *spa = cvd->vdev_spa; - list_remove(&spa->spa_leaf_list, cvd); - spa->spa_leaf_list_gen++; - } - - /* - * Walk up all ancestors to update guid sum. - */ - for (; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum -= cvd->vdev_guid_sum; -} - -/* - * Remove any holes in the child array. 
- */ -void -vdev_compact_children(vdev_t *pvd) -{ - vdev_t **newchild, *cvd; - int oldc = pvd->vdev_children; - int newc; - - ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if (oldc == 0) - return; - - for (int c = newc = 0; c < oldc; c++) - if (pvd->vdev_child[c]) - newc++; - - if (newc > 0) { - newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - - for (int c = newc = 0; c < oldc; c++) { - if ((cvd = pvd->vdev_child[c]) != NULL) { - newchild[newc] = cvd; - cvd->vdev_id = newc++; - } - } - } else { - newchild = NULL; - } - - kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); - pvd->vdev_child = newchild; - pvd->vdev_children = newc; -} - -/* - * Allocate and minimally initialize a vdev_t. - */ -vdev_t * -vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) -{ - vdev_t *vd; - vdev_indirect_config_t *vic; - - vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); - vic = &vd->vdev_indirect_config; - - if (spa->spa_root_vdev == NULL) { - ASSERT(ops == &vdev_root_ops); - spa->spa_root_vdev = vd; - spa->spa_load_guid = spa_generate_guid(NULL); - } - - if (guid == 0 && ops != &vdev_hole_ops) { - if (spa->spa_root_vdev == vd) { - /* - * The root vdev's guid will also be the pool guid, - * which must be unique among all pools. - */ - guid = spa_generate_guid(NULL); - } else { - /* - * Any other vdev's guid must be unique within the pool. 
- */ - guid = spa_generate_guid(spa); - } - ASSERT(!spa_guid_exists(spa_guid(spa), guid)); - } - - vd->vdev_spa = spa; - vd->vdev_id = id; - vd->vdev_guid = guid; - vd->vdev_guid_sum = guid; - vd->vdev_ops = ops; - vd->vdev_state = VDEV_STATE_CLOSED; - vd->vdev_ishole = (ops == &vdev_hole_ops); - vic->vic_prev_indirect_vdev = UINT64_MAX; - - rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); - mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); - - list_link_init(&vd->vdev_leaf_node); - mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); - cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); - - for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL); - } - txg_list_create(&vd->vdev_ms_list, spa, - offsetof(struct metaslab, ms_txg_node)); - txg_list_create(&vd->vdev_dtl_list, spa, - offsetof(struct vdev, vdev_dtl_node)); - vd->vdev_stat.vs_timestamp = gethrtime(); - vdev_queue_init(vd); - vdev_cache_init(vd); - - return (vd); -} - -/* - * Allocate a new vdev. The 'alloctype' is used to control whether we are - * creating a new vdev or loading an existing one - the behavior is slightly - * different for each case. 
- */ -int -vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - int alloctype) -{ - vdev_ops_t *ops; - char *type; - uint64_t guid = 0, islog, nparity; - vdev_t *vd; - vdev_indirect_config_t *vic; - vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; - boolean_t top_level = (parent && !parent->vdev_parent); - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) - return (SET_ERROR(EINVAL)); - - if ((ops = vdev_getops(type)) == NULL) - return (SET_ERROR(EINVAL)); - - /* - * If this is a load, get the vdev guid from the nvlist. - * Otherwise, vdev_alloc_common() will generate one for us. - */ - if (alloctype == VDEV_ALLOC_LOAD) { - uint64_t label_id; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || - label_id != id) - return (SET_ERROR(EINVAL)); - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_SPARE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_L2CACHE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } - - /* - * The first allocated vdev must be of type 'root'. - */ - if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) - return (SET_ERROR(EINVAL)); - - /* - * Determine whether we're a log vdev. - */ - islog = 0; - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); - if (islog && spa_version(spa) < SPA_VERSION_SLOGS) - return (SET_ERROR(ENOTSUP)); - - if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) - return (SET_ERROR(ENOTSUP)); - - /* - * Set the nparity property for RAID-Z vdevs. 
- */ - nparity = -1ULL; - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; - } - ASSERT(nparity != -1ULL); - - /* - * If creating a top-level vdev, check for allocation classes input - */ - if (top_level && alloctype == VDEV_ALLOC_ADD) { - char *bias; - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, - &bias) == 0) { - alloc_bias = vdev_derive_alloc_bias(bias); - - /* spa_vdev_add() expects feature to be enabled */ - if (alloc_bias != VDEV_BIAS_LOG && - spa->spa_load_state != SPA_LOAD_CREATE && - !spa_feature_is_enabled(spa, - SPA_FEATURE_ALLOCATION_CLASSES)) { - return (SET_ERROR(ENOTSUP)); - } - } - } - - vd = vdev_alloc_common(spa, id, guid, ops); - vic = &vd->vdev_indirect_config; - - vd->vdev_islog = islog; - vd->vdev_nparity = nparity; - if (top_level && alloc_bias != VDEV_BIAS_NONE) - vd->vdev_alloc_bias = alloc_bias; - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) - vd->vdev_devid = spa_strdup(vd->vdev_devid); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, - &vd->vdev_physpath) == 0) - vd->vdev_physpath = spa_strdup(vd->vdev_physpath); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) - 
vd->vdev_fru = spa_strdup(vd->vdev_fru); - - /* - * Set the whole_disk property. If it's not specified, leave the value - * as -1. - */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &vd->vdev_wholedisk) != 0) - vd->vdev_wholedisk = -1ULL; - - ASSERT0(vic->vic_mapping_object); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, - &vic->vic_mapping_object); - ASSERT0(vic->vic_births_object); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, - &vic->vic_births_object); - ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, - &vic->vic_prev_indirect_vdev); - - /* - * Look for the 'not present' flag. This will only be set if the device - * was not present at the time of import. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); - - /* - * Get the alignment requirement. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); - - /* - * Retrieve the vdev creation time. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, - &vd->vdev_crtxg); - - /* - * If we're a top-level vdev, try to load the allocation parameters. 
- */ - if (top_level && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, - &vd->vdev_ms_array); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, - &vd->vdev_ms_shift); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, - &vd->vdev_asize); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, - &vd->vdev_removing); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, - &vd->vdev_top_zap); - } else { - ASSERT0(vd->vdev_top_zap); - } - - if (top_level && alloctype != VDEV_ALLOC_ATTACH) { - ASSERT(alloctype == VDEV_ALLOC_LOAD || - alloctype == VDEV_ALLOC_ADD || - alloctype == VDEV_ALLOC_SPLIT || - alloctype == VDEV_ALLOC_ROOTPOOL); - /* Note: metaslab_group_create() is now deferred */ - } - - if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { - (void) nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); - } else { - ASSERT0(vd->vdev_leaf_zap); - } - - /* - * If we're a leaf vdev, try to load the DTL object and other state. - */ - - if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || - alloctype == VDEV_ALLOC_ROOTPOOL)) { - if (alloctype == VDEV_ALLOC_LOAD) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl_object); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, - &vd->vdev_unspare); - } - - if (alloctype == VDEV_ALLOC_ROOTPOOL) { - uint64_t spare = 0; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, - &spare) == 0 && spare) - spa_spare_add(vd); - } - - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, - &vd->vdev_offline); - - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, - &vd->vdev_resilver_txg); - - /* - * When importing a pool, we want to ignore the persistent fault - * state, as the diagnosis made on another system may not be - * valid in the current context. 
Local vdevs will - * remain in the faulted state. - */ - if (spa_load_state(spa) == SPA_LOAD_OPEN) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, - &vd->vdev_faulted); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, - &vd->vdev_degraded); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, - &vd->vdev_removed); - - if (vd->vdev_faulted || vd->vdev_degraded) { - char *aux; - - vd->vdev_label_aux = - VDEV_AUX_ERR_EXCEEDED; - if (nvlist_lookup_string(nv, - ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && - strcmp(aux, "external") == 0) - vd->vdev_label_aux = VDEV_AUX_EXTERNAL; - } - } - } - - /* - * Add ourselves to the parent's list of children. - */ - vdev_add_child(parent, vd); - - *vdp = vd; - - return (0); -} - -void -vdev_free(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - - /* - * Scan queues are normally destroyed at the end of a scan. If the - * queue exists here, that implies the vdev is being removed while - * the scan is still running. - */ - if (vd->vdev_scan_io_queue != NULL) { - mutex_enter(&vd->vdev_scan_io_queue_lock); - dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); - vd->vdev_scan_io_queue = NULL; - mutex_exit(&vd->vdev_scan_io_queue_lock); - } - - /* - * vdev_free() implies closing the vdev first. This is simpler than - * trying to ensure complicated semantics for all callers. - */ - vdev_close(vd); - - ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); - ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); - - /* - * Free all children. - */ - for (int c = 0; c < vd->vdev_children; c++) - vdev_free(vd->vdev_child[c]); - - ASSERT(vd->vdev_child == NULL); - ASSERT(vd->vdev_guid_sum == vd->vdev_guid); - ASSERT(vd->vdev_initialize_thread == NULL); - - /* - * Discard allocation state. 
- */ - if (vd->vdev_mg != NULL) { - vdev_metaslab_fini(vd); - metaslab_group_destroy(vd->vdev_mg); - } - - ASSERT0(vd->vdev_stat.vs_space); - ASSERT0(vd->vdev_stat.vs_dspace); - ASSERT0(vd->vdev_stat.vs_alloc); - - /* - * Remove this vdev from its parent's child list. - */ - vdev_remove_child(vd->vdev_parent, vd); - - ASSERT(vd->vdev_parent == NULL); - ASSERT(!list_link_active(&vd->vdev_leaf_node)); - - /* - * Clean up vdev structure. - */ - vdev_queue_fini(vd); - vdev_cache_fini(vd); - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - if (vd->vdev_physpath) - spa_strfree(vd->vdev_physpath); - if (vd->vdev_fru) - spa_strfree(vd->vdev_fru); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - if (vd->vdev_isl2cache) - spa_l2cache_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_close(vd->vdev_dtl_sm); - for (int t = 0; t < DTL_TYPES; t++) { - range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); - range_tree_destroy(vd->vdev_dtl[t]); - } - mutex_exit(&vd->vdev_dtl_lock); - - EQUIV(vd->vdev_indirect_births != NULL, - vd->vdev_indirect_mapping != NULL); - if (vd->vdev_indirect_births != NULL) { - vdev_indirect_mapping_close(vd->vdev_indirect_mapping); - vdev_indirect_births_close(vd->vdev_indirect_births); - } - - if (vd->vdev_obsolete_sm != NULL) { - ASSERT(vd->vdev_removing || - vd->vdev_ops == &vdev_indirect_ops); - space_map_close(vd->vdev_obsolete_sm); - vd->vdev_obsolete_sm = NULL; - } - range_tree_destroy(vd->vdev_obsolete_segments); - rw_destroy(&vd->vdev_indirect_rwlock); - mutex_destroy(&vd->vdev_obsolete_lock); - - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - mutex_destroy(&vd->vdev_probe_lock); - mutex_destroy(&vd->vdev_scan_io_queue_lock); - mutex_destroy(&vd->vdev_initialize_lock); - mutex_destroy(&vd->vdev_initialize_io_lock); - cv_destroy(&vd->vdev_initialize_io_cv); - 
cv_destroy(&vd->vdev_initialize_cv); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* - * Transfer top-level vdev state from svd to tvd. - */ -static void -vdev_top_transfer(vdev_t *svd, vdev_t *tvd) -{ - spa_t *spa = svd->vdev_spa; - metaslab_t *msp; - vdev_t *vd; - int t; - - ASSERT(tvd == tvd->vdev_top); - - tvd->vdev_ms_array = svd->vdev_ms_array; - tvd->vdev_ms_shift = svd->vdev_ms_shift; - tvd->vdev_ms_count = svd->vdev_ms_count; - tvd->vdev_top_zap = svd->vdev_top_zap; - - svd->vdev_ms_array = 0; - svd->vdev_ms_shift = 0; - svd->vdev_ms_count = 0; - svd->vdev_top_zap = 0; - - if (tvd->vdev_mg) - ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); - tvd->vdev_mg = svd->vdev_mg; - tvd->vdev_ms = svd->vdev_ms; - - svd->vdev_mg = NULL; - svd->vdev_ms = NULL; - - if (tvd->vdev_mg != NULL) - tvd->vdev_mg->mg_vd = tvd; - - tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; - svd->vdev_checkpoint_sm = NULL; - - tvd->vdev_alloc_bias = svd->vdev_alloc_bias; - svd->vdev_alloc_bias = VDEV_BIAS_NONE; - - tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; - tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; - tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; - - svd->vdev_stat.vs_alloc = 0; - svd->vdev_stat.vs_space = 0; - svd->vdev_stat.vs_dspace = 0; - - /* - * State which may be set on a top-level vdev that's in the - * process of being removed. 
- */ - ASSERT0(tvd->vdev_indirect_config.vic_births_object); - ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); - ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); - ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); - ASSERT3P(tvd->vdev_indirect_births, ==, NULL); - ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); - ASSERT0(tvd->vdev_removing); - tvd->vdev_removing = svd->vdev_removing; - tvd->vdev_indirect_config = svd->vdev_indirect_config; - tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; - tvd->vdev_indirect_births = svd->vdev_indirect_births; - range_tree_swap(&svd->vdev_obsolete_segments, - &tvd->vdev_obsolete_segments); - tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; - svd->vdev_indirect_config.vic_mapping_object = 0; - svd->vdev_indirect_config.vic_births_object = 0; - svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; - svd->vdev_indirect_mapping = NULL; - svd->vdev_indirect_births = NULL; - svd->vdev_obsolete_sm = NULL; - svd->vdev_removing = 0; - - for (t = 0; t < TXG_SIZE; t++) { - while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) - (void) txg_list_add(&tvd->vdev_ms_list, msp, t); - while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) - (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); - if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) - (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); - } - - if (list_link_active(&svd->vdev_config_dirty_node)) { - vdev_config_clean(svd); - vdev_config_dirty(tvd); - } - - if (list_link_active(&svd->vdev_state_dirty_node)) { - vdev_state_clean(svd); - vdev_state_dirty(tvd); - } - - tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; - svd->vdev_deflate_ratio = 0; - - tvd->vdev_islog = svd->vdev_islog; - svd->vdev_islog = 0; - - dsl_scan_io_queue_vdev_xfer(svd, tvd); -} - -static void -vdev_top_update(vdev_t *tvd, vdev_t *vd) -{ - if (vd == NULL) - return; - - vd->vdev_top = tvd; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_top_update(tvd, 
vd->vdev_child[c]); -} - -/* - * Add a mirror/replacing vdev above an existing vdev. - */ -vdev_t * -vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) -{ - spa_t *spa = cvd->vdev_spa; - vdev_t *pvd = cvd->vdev_parent; - vdev_t *mvd; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); - - mvd->vdev_asize = cvd->vdev_asize; - mvd->vdev_min_asize = cvd->vdev_min_asize; - mvd->vdev_max_asize = cvd->vdev_max_asize; - mvd->vdev_psize = cvd->vdev_psize; - mvd->vdev_ashift = cvd->vdev_ashift; - mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; - mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; - mvd->vdev_state = cvd->vdev_state; - mvd->vdev_crtxg = cvd->vdev_crtxg; - - vdev_remove_child(pvd, cvd); - vdev_add_child(pvd, mvd); - cvd->vdev_id = mvd->vdev_children; - vdev_add_child(mvd, cvd); - vdev_top_update(cvd->vdev_top, cvd->vdev_top); - - if (mvd == mvd->vdev_top) - vdev_top_transfer(cvd, mvd); - - return (mvd); -} - -/* - * Remove a 1-way mirror/replacing vdev from the tree. - */ -void -vdev_remove_parent(vdev_t *cvd) -{ - vdev_t *mvd = cvd->vdev_parent; - vdev_t *pvd = mvd->vdev_parent; - - ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - ASSERT(mvd->vdev_children == 1); - ASSERT(mvd->vdev_ops == &vdev_mirror_ops || - mvd->vdev_ops == &vdev_replacing_ops || - mvd->vdev_ops == &vdev_spare_ops); - cvd->vdev_ashift = mvd->vdev_ashift; - cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; - cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; - - vdev_remove_child(mvd, cvd); - vdev_remove_child(pvd, mvd); - - /* - * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. - * Otherwise, we could have detached an offline device, and when we - * go to import the pool we'll think we have two top-level vdevs, - * instead of a different version of the same top-level vdev. 
- */ - if (mvd->vdev_top == mvd) { - uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; - cvd->vdev_orig_guid = cvd->vdev_guid; - cvd->vdev_guid += guid_delta; - cvd->vdev_guid_sum += guid_delta; - } - cvd->vdev_id = mvd->vdev_id; - vdev_add_child(pvd, cvd); - vdev_top_update(cvd->vdev_top, cvd->vdev_top); - - if (cvd == cvd->vdev_top) - vdev_top_transfer(mvd, cvd); - - ASSERT(mvd->vdev_children == 0); - vdev_free(mvd); -} - -static void -vdev_metaslab_group_create(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - /* - * metaslab_group_create was delayed until allocation bias was available - */ - if (vd->vdev_mg == NULL) { - metaslab_class_t *mc; - - if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) - vd->vdev_alloc_bias = VDEV_BIAS_LOG; - - ASSERT3U(vd->vdev_islog, ==, - (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); - - switch (vd->vdev_alloc_bias) { - case VDEV_BIAS_LOG: - mc = spa_log_class(spa); - break; - case VDEV_BIAS_SPECIAL: - mc = spa_special_class(spa); - break; - case VDEV_BIAS_DEDUP: - mc = spa_dedup_class(spa); - break; - default: - mc = spa_normal_class(spa); - } - - vd->vdev_mg = metaslab_group_create(mc, vd, - spa->spa_alloc_count); - - /* - * The spa ashift values currently only reflect the - * general vdev classes. 
Class destination is late - * binding so ashift checking had to wait until now - */ - if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { - if (vd->vdev_ashift > spa->spa_max_ashift) - spa->spa_max_ashift = vd->vdev_ashift; - if (vd->vdev_ashift < spa->spa_min_ashift) - spa->spa_min_ashift = vd->vdev_ashift; - } - } -} - -int -vdev_metaslab_init(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - uint64_t m; - uint64_t oldc = vd->vdev_ms_count; - uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; - metaslab_t **mspp; - int error; - boolean_t expanding = (oldc != 0); - - ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - - /* - * This vdev is not being allocated from yet or is a hole. - */ - if (vd->vdev_ms_shift == 0) - return (0); - - ASSERT(!vd->vdev_ishole); - - ASSERT(oldc <= newc); - - mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); - - if (expanding) { - bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); - kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); - } - - vd->vdev_ms = mspp; - vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { - uint64_t object = 0; - - /* - * vdev_ms_array may be 0 if we are creating the "fake" - * metaslabs for an indirect vdev for zdb's leak detection. - * See zdb_leak_init(). - */ - if (txg == 0 && vd->vdev_ms_array != 0) { - error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object, - DMU_READ_PREFETCH); - if (error != 0) { - vdev_dbgmsg(vd, "unable to read the metaslab " - "array [error=%d]", error); - return (error); - } - } - -#ifndef _KERNEL - /* - * To accomodate zdb_leak_init() fake indirect - * metaslabs, we allocate a metaslab group for - * indirect vdevs which normally don't have one. 
- */ - if (vd->vdev_mg == NULL) { - ASSERT0(vdev_is_concrete(vd)); - vdev_metaslab_group_create(vd); - } -#endif - error = metaslab_init(vd->vdev_mg, m, object, txg, - &(vd->vdev_ms[m])); - if (error != 0) { - vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", - error); - return (error); - } - } - - if (txg == 0) - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); - - /* - * If the vdev is being removed we don't activate - * the metaslabs since we want to ensure that no new - * allocations are performed on this device. - */ - if (!expanding && !vd->vdev_removing) { - metaslab_group_activate(vd->vdev_mg); - } - - if (txg == 0) - spa_config_exit(spa, SCL_ALLOC, FTAG); - - return (0); -} - -void -vdev_metaslab_fini(vdev_t *vd) -{ - if (vd->vdev_checkpoint_sm != NULL) { - ASSERT(spa_feature_is_active(vd->vdev_spa, - SPA_FEATURE_POOL_CHECKPOINT)); - space_map_close(vd->vdev_checkpoint_sm); - /* - * Even though we close the space map, we need to set its - * pointer to NULL. The reason is that vdev_metaslab_fini() - * may be called multiple times for certain operations - * (i.e. when destroying a pool) so we need to ensure that - * this clause never executes twice. This logic is similar - * to the one used for the vdev_ms clause below. 
- */ - vd->vdev_checkpoint_sm = NULL; - } - - if (vd->vdev_ms != NULL) { - metaslab_group_t *mg = vd->vdev_mg; - metaslab_group_passivate(mg); - - uint64_t count = vd->vdev_ms_count; - for (uint64_t m = 0; m < count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - if (msp != NULL) - metaslab_fini(msp); - } - kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); - vd->vdev_ms = NULL; - - vd->vdev_ms_count = 0; - - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - ASSERT0(mg->mg_histogram[i]); - } - ASSERT0(vd->vdev_ms_count); -} - -typedef struct vdev_probe_stats { - boolean_t vps_readable; - boolean_t vps_writeable; - int vps_flags; -} vdev_probe_stats_t; - -static void -vdev_probe_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - vdev_t *vd = zio->io_vd; - vdev_probe_stats_t *vps = zio->io_private; - - ASSERT(vd->vdev_probe_zio != NULL); - - if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_error == 0) - vps->vps_readable = 1; - if (zio->io_error == 0 && spa_writeable(spa)) { - zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, - zio->io_offset, zio->io_size, zio->io_abd, - ZIO_CHECKSUM_OFF, vdev_probe_done, vps, - ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); - } else { - abd_free(zio->io_abd); - } - } else if (zio->io_type == ZIO_TYPE_WRITE) { - if (zio->io_error == 0) - vps->vps_writeable = 1; - abd_free(zio->io_abd); - } else if (zio->io_type == ZIO_TYPE_NULL) { - zio_t *pio; - - vd->vdev_cant_read |= !vps->vps_readable; - vd->vdev_cant_write |= !vps->vps_writeable; - - if (vdev_readable(vd) && - (vdev_writeable(vd) || !spa_writeable(spa))) { - zio->io_error = 0; - } else { - ASSERT(zio->io_error != 0); - vdev_dbgmsg(vd, "failed probe"); - zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, 0, 0); - zio->io_error = SET_ERROR(ENXIO); - } - - mutex_enter(&vd->vdev_probe_lock); - ASSERT(vd->vdev_probe_zio == zio); - vd->vdev_probe_zio = NULL; - mutex_exit(&vd->vdev_probe_lock); - - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, 
&zl)) != NULL) - if (!vdev_accessible(vd, pio)) - pio->io_error = SET_ERROR(ENXIO); - - kmem_free(vps, sizeof (*vps)); - } -} - -/* - * Determine whether this device is accessible. - * - * Read and write to several known locations: the pad regions of each - * vdev label but the first, which we leave alone in case it contains - * a VTOC. - */ -zio_t * -vdev_probe(vdev_t *vd, zio_t *zio) -{ - spa_t *spa = vd->vdev_spa; - vdev_probe_stats_t *vps = NULL; - zio_t *pio; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - /* - * Don't probe the probe. - */ - if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) - return (NULL); - - /* - * To prevent 'probe storms' when a device fails, we create - * just one probe i/o at a time. All zios that want to probe - * this vdev will become parents of the probe io. - */ - mutex_enter(&vd->vdev_probe_lock); - - if ((pio = vd->vdev_probe_zio) == NULL) { - vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); - - vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_TRYHARD; - - if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { - /* - * vdev_cant_read and vdev_cant_write can only - * transition from TRUE to FALSE when we have the - * SCL_ZIO lock as writer; otherwise they can only - * transition from FALSE to TRUE. This ensures that - * any zio looking at these values can assume that - * failures persist for the life of the I/O. That's - * important because when a device has intermittent - * connectivity problems, we want to ensure that - * they're ascribed to the device (ENXIO) and not - * the zio (EIO). - * - * Since we hold SCL_ZIO as writer here, clear both - * values so the probe can reevaluate from first - * principles. 
- */ - vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; - } - - vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, - vdev_probe_done, vps, - vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } - } - - if (zio != NULL) - zio_add_child(zio, pio); - - mutex_exit(&vd->vdev_probe_lock); - - if (vps == NULL) { - ASSERT(zio != NULL); - return (NULL); - } - - for (int l = 1; l < VDEV_LABELS; l++) { - zio_nowait(zio_read_phys(pio, vd, - vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, - abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), - ZIO_CHECKSUM_OFF, vdev_probe_done, vps, - ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); - } - - if (zio == NULL) - return (pio); - - zio_nowait(pio); - return (NULL); -} - -static void -vdev_open_child(void *arg) -{ - vdev_t *vd = arg; - - vd->vdev_open_thread = curthread; - vd->vdev_open_error = vdev_open(vd); - vd->vdev_open_thread = NULL; -} - -boolean_t -vdev_uses_zvols(vdev_t *vd) -{ - if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, - strlen(ZVOL_DIR)) == 0) - return (B_TRUE); - for (int c = 0; c < vd->vdev_children; c++) - if (vdev_uses_zvols(vd->vdev_child[c])) - return (B_TRUE); - return (B_FALSE); -} - -void -vdev_open_children(vdev_t *vd) -{ - taskq_t *tq; - int children = vd->vdev_children; - - vd->vdev_nonrot = B_TRUE; - - /* - * in order to handle pools on top of zvols, do the opens - * in a single thread so that the same thread holds the - * spa_namespace_lock - */ - if (B_TRUE || vdev_uses_zvols(vd)) { - for (int c = 0; c < children; c++) { - vd->vdev_child[c]->vdev_open_error = - vdev_open(vd->vdev_child[c]); - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; - } - return; - } - tq = taskq_create("vdev_open", children, 
minclsyspri, - children, children, TASKQ_PREPOPULATE); - - for (int c = 0; c < children; c++) - VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], - TQ_SLEEP) != 0); - - taskq_destroy(tq); - - for (int c = 0; c < children; c++) - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; -} - -/* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. - */ -static void -vdev_set_deflate_ratio(vdev_t *vd) -{ - if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { - vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); - } -} - -/* - * Prepare a virtual device for access. - */ -int -vdev_open(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - int error; - uint64_t osize = 0; - uint64_t max_osize = 0; - uint64_t asize, max_asize, psize; - uint64_t logical_ashift = 0; - uint64_t physical_ashift = 0; - - ASSERT(vd->vdev_open_thread == curthread || - spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || - vd->vdev_state == VDEV_STATE_CANT_OPEN || - vd->vdev_state == VDEV_STATE_OFFLINE); - - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; - vd->vdev_notrim = B_FALSE; - vd->vdev_min_asize = vdev_get_min_asize(vd); - - /* - * If this vdev is not removed, check its fault status. If it's - * faulted, bail out of the open. 
- */ - if (!vd->vdev_removed && vd->vdev_faulted) { - ASSERT(vd->vdev_children == 0); - ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || - vd->vdev_label_aux == VDEV_AUX_EXTERNAL); - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - vd->vdev_label_aux); - return (SET_ERROR(ENXIO)); - } else if (vd->vdev_offline) { - ASSERT(vd->vdev_children == 0); - vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); - return (SET_ERROR(ENXIO)); - } - - error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, - &logical_ashift, &physical_ashift); - - /* - * Reset the vdev_reopening flag so that we actually close - * the vdev on error. - */ - vd->vdev_reopening = B_FALSE; - if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, NULL, ENXIO); - - if (error) { - if (vd->vdev_removed && - vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) - vd->vdev_removed = B_FALSE; - - if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, - vd->vdev_stat.vs_aux); - } else { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - vd->vdev_stat.vs_aux); - } - return (error); - } - - vd->vdev_removed = B_FALSE; - - /* - * Recheck the faulted flag now that we have confirmed that - * the vdev is accessible. If we're faulted, bail. - */ - if (vd->vdev_faulted) { - ASSERT(vd->vdev_children == 0); - ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || - vd->vdev_label_aux == VDEV_AUX_EXTERNAL); - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - vd->vdev_label_aux); - return (SET_ERROR(ENXIO)); - } - - if (vd->vdev_degraded) { - ASSERT(vd->vdev_children == 0); - vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); - } else { - vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); - } - - /* - * For hole or missing vdevs we just return success. 
- */ - if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) - return (0); - - if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) - trim_map_create(vd); - - for (int c = 0; c < vd->vdev_children; c++) { - if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, - VDEV_AUX_NONE); - break; - } - } - - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); - - if (vd->vdev_children == 0) { - if (osize < SPA_MINDEVSIZE) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_TOO_SMALL); - return (SET_ERROR(EOVERFLOW)); - } - psize = osize; - asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); - max_asize = max_osize - (VDEV_LABEL_START_SIZE + - VDEV_LABEL_END_SIZE); - } else { - if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_TOO_SMALL); - return (SET_ERROR(EOVERFLOW)); - } - psize = 0; - asize = osize; - max_asize = max_osize; - } - - vd->vdev_psize = psize; - - /* - * Make sure the allocatable size hasn't shrunk too much. - */ - if (asize < vd->vdev_min_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (SET_ERROR(EINVAL)); - } - - vd->vdev_physical_ashift = - MAX(physical_ashift, vd->vdev_physical_ashift); - vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); - vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); - - if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_ASHIFT_TOO_BIG); - return (EINVAL); - } - - if (vd->vdev_asize == 0) { - /* - * This is the first-ever open, so use the computed values. - * For testing purposes, a higher ashift can be requested. 
- */ - vd->vdev_asize = asize; - vd->vdev_max_asize = max_asize; - } else { - /* - * Make sure the alignment requirement hasn't increased. - */ - if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && - vd->vdev_ops->vdev_op_leaf) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } - vd->vdev_max_asize = max_asize; - } - - /* - * If all children are healthy we update asize if either: - * The asize has increased, due to a device expansion caused by dynamic - * LUN growth or vdev replacement, and automatic expansion is enabled; - * making the additional space available. - * - * The asize has decreased, due to a device shrink usually caused by a - * vdev replace with a smaller device. This ensures that calculations - * based of max_asize and asize e.g. esize are always valid. It's safe - * to do this as we've already validated that asize is greater than - * vdev_min_asize. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - ((asize > vd->vdev_asize && - (vd->vdev_expanding || spa->spa_autoexpand)) || - (asize < vd->vdev_asize))) - vd->vdev_asize = asize; - - vdev_set_min_asize(vd); - - /* - * Ensure we can issue some IO before declaring the - * vdev open for business. - */ - if (vd->vdev_ops->vdev_op_leaf && - (error = zio_wait(vdev_probe(vd, NULL))) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); - return (error); - } - - /* - * Track the min and max ashift values for normal data devices. - * - * DJB - TBD these should perhaps be tracked per allocation class - * (e.g. 
spa_min_ashift is used to round up post compression buffers) - */ - if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - vd->vdev_alloc_bias == VDEV_BIAS_NONE && - vd->vdev_aux == NULL) { - if (vd->vdev_ashift > spa->spa_max_ashift) - spa->spa_max_ashift = vd->vdev_ashift; - if (vd->vdev_ashift < spa->spa_min_ashift) - spa->spa_min_ashift = vd->vdev_ashift; - } - - /* - * If a leaf vdev has a DTL, and seems healthy, then kick off a - * resilver. But don't do this if we are doing a reopen for a scrub, - * since this would just restart the scrub we are already doing. - */ - if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && - vdev_resilver_needed(vd, NULL, NULL)) - spa_async_request(spa, SPA_ASYNC_RESILVER); - - return (0); -} - -/* - * Called once the vdevs are all opened, this routine validates the label - * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device. - * - * This function will only return failure if one of the vdevs indicates that it - * has since been destroyed or exported. This is only possible if - * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state - * will be updated but the function will return 0. - */ -int -vdev_validate(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *label; - uint64_t guid = 0, aux_guid = 0, top_guid; - uint64_t state; - nvlist_t *nvl; - uint64_t txg; - - if (vdev_validate_skip) - return (0); - - for (uint64_t c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c]) != 0) - return (SET_ERROR(EBADF)); - - /* - * If the device has already failed, or was marked offline, don't do - * any further validation. Otherwise, label I/O will fail and we will - * overwrite the previous state. - */ - if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) - return (0); - - /* - * If we are performing an extreme rewind, we allow for a label that - * was modified at a point after the current txg. 
- * If config lock is not held do not check for the txg. spa_sync could - * be updating the vdev's label before updating spa_last_synced_txg. - */ - if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || - spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) - txg = UINT64_MAX; - else - txg = spa_last_synced_txg(spa); - - if ((label = vdev_label_read_config(vd, txg)) == NULL) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - vdev_dbgmsg(vd, "vdev_validate: failed reading config for " - "txg %llu", (u_longlong_t)txg); - return (0); - } - - /* - * Determine if this vdev has been split off into another - * pool. If so, then refuse to open it. - */ - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, - &aux_guid) == 0 && aux_guid == spa_guid(spa)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_SPLIT_POOL); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_POOL_GUID); - return (0); - } - - /* - * If config is not trusted then ignore the spa guid check. This is - * necessary because if the machine crashed during a re-guid the new - * guid might have been written to all of the vdev labels, but not the - * cached config. The check will be performed again once we have the - * trusted config from the MOS. 
- */ - if (spa->spa_trust_config && guid != spa_guid(spa)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " - "match config (%llu != %llu)", (u_longlong_t)guid, - (u_longlong_t)spa_guid(spa)); - return (0); - } - - if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) - != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, - &aux_guid) != 0) - aux_guid = 0; - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_GUID); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) - != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_TOP_GUID); - return (0); - } - - /* - * If this vdev just became a top-level vdev because its sibling was - * detached, it will have adopted the parent's vdev guid -- but the - * label may or may not be on disk yet. Fortunately, either version - * of the label will have the same top guid, so if we're a top-level - * vdev, we can safely compare to that instead. - * However, if the config comes from a cachefile that failed to update - * after the detach, a top-level vdev will appear as a non top-level - * vdev in the config. Also relax the constraints if we perform an - * extreme rewind. - * - * If we split this vdev off instead, then we also check the - * original pool's guid. We don't want to consider the vdev - * corrupt if it is partway through a split operation. 
- */ - if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { - boolean_t mismatch = B_FALSE; - if (spa->spa_trust_config && !spa->spa_extreme_rewind) { - if (vd != vd->vdev_top || vd->vdev_guid != top_guid) - mismatch = B_TRUE; - } else { - if (vd->vdev_guid != top_guid && - vd->vdev_top->vdev_guid != guid) - mismatch = B_TRUE; - } - - if (mismatch) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: config guid " - "doesn't match label guid"); - vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", - (u_longlong_t)vd->vdev_guid, - (u_longlong_t)vd->vdev_top->vdev_guid); - vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " - "aux_guid %llu", (u_longlong_t)guid, - (u_longlong_t)top_guid, (u_longlong_t)aux_guid); - return (0); - } - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_POOL_STATE); - return (0); - } - - nvlist_free(label); - - /* - * If this is a verbatim import, no need to check the - * state of the pool. - */ - if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && - spa_load_state(spa) == SPA_LOAD_OPEN && - state != POOL_STATE_ACTIVE) { - vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " - "for spa %s", (u_longlong_t)state, spa->spa_name); - return (SET_ERROR(EBADF)); - } - - /* - * If we were able to open and validate a vdev that was - * previously marked permanently unavailable, clear that state - * now. 
- */ - if (vd->vdev_not_present) - vd->vdev_not_present = 0; - - return (0); -} - -static void -vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) -{ - if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { - if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { - zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " - "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, - dvd->vdev_path, svd->vdev_path); - spa_strfree(dvd->vdev_path); - dvd->vdev_path = spa_strdup(svd->vdev_path); - } - } else if (svd->vdev_path != NULL) { - dvd->vdev_path = spa_strdup(svd->vdev_path); - zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", - (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); - } -} - -/* - * Recursively copy vdev paths from one vdev to another. Source and destination - * vdev trees must have same geometry otherwise return error. Intended to copy - * paths from userland config into MOS config. - */ -int -vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) -{ - if ((svd->vdev_ops == &vdev_missing_ops) || - (svd->vdev_ishole && dvd->vdev_ishole) || - (dvd->vdev_ops == &vdev_indirect_ops)) - return (0); - - if (svd->vdev_ops != dvd->vdev_ops) { - vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", - svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); - return (SET_ERROR(EINVAL)); - } - - if (svd->vdev_guid != dvd->vdev_guid) { - vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " - "%llu)", (u_longlong_t)svd->vdev_guid, - (u_longlong_t)dvd->vdev_guid); - return (SET_ERROR(EINVAL)); - } - - if (svd->vdev_children != dvd->vdev_children) { - vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " - "%llu != %llu", (u_longlong_t)svd->vdev_children, - (u_longlong_t)dvd->vdev_children); - return (SET_ERROR(EINVAL)); - } - - for (uint64_t i = 0; i < svd->vdev_children; i++) { - int error = vdev_copy_path_strict(svd->vdev_child[i], - dvd->vdev_child[i]); - if (error != 0) - return (error); - } - - if (svd->vdev_ops->vdev_op_leaf) - 
vdev_copy_path_impl(svd, dvd); - - return (0); -} - -static void -vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) -{ - ASSERT(stvd->vdev_top == stvd); - ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); - - for (uint64_t i = 0; i < dvd->vdev_children; i++) { - vdev_copy_path_search(stvd, dvd->vdev_child[i]); - } - - if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) - return; - - /* - * The idea here is that while a vdev can shift positions within - * a top vdev (when replacing, attaching mirror, etc.) it cannot - * step outside of it. - */ - vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); - - if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) - return; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - vdev_copy_path_impl(vd, dvd); -} - -/* - * Recursively copy vdev paths from one root vdev to another. Source and - * destination vdev trees may differ in geometry. For each destination leaf - * vdev, search a vdev with the same guid and top vdev id in the source. - * Intended to copy paths from userland config into MOS config. - */ -void -vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) -{ - uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); - ASSERT(srvd->vdev_ops == &vdev_root_ops); - ASSERT(drvd->vdev_ops == &vdev_root_ops); - - for (uint64_t i = 0; i < children; i++) { - vdev_copy_path_search(srvd->vdev_child[i], - drvd->vdev_child[i]); - } -} - -/* - * Close a virtual device. - */ -void -vdev_close(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *pvd = vd->vdev_parent; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - /* - * If our parent is reopening, then we are as well, unless we are - * going offline. 
- */ - if (pvd != NULL && pvd->vdev_reopening) - vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); - - vd->vdev_ops->vdev_op_close(vd); - - vdev_cache_purge(vd); - - if (vd->vdev_ops->vdev_op_leaf) - trim_map_destroy(vd); - - /* - * We record the previous state before we close it, so that if we are - * doing a reopen(), we don't generate FMA ereports if we notice that - * it's still faulted. - */ - vd->vdev_prevstate = vd->vdev_state; - - if (vd->vdev_offline) - vd->vdev_state = VDEV_STATE_OFFLINE; - else - vd->vdev_state = VDEV_STATE_CLOSED; - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; -} - -void -vdev_hold(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_is_root(spa)); - if (spa->spa_state == POOL_STATE_UNINITIALIZED) - return; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_hold(vd->vdev_child[c]); - - if (vd->vdev_ops->vdev_op_leaf) - vd->vdev_ops->vdev_op_hold(vd); -} - -void -vdev_rele(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_is_root(spa)); - for (int c = 0; c < vd->vdev_children; c++) - vdev_rele(vd->vdev_child[c]); - - if (vd->vdev_ops->vdev_op_leaf) - vd->vdev_ops->vdev_op_rele(vd); -} - -/* - * Reopen all interior vdevs and any unopened leaves. We don't actually - * reopen leaf vdevs which had previously been opened as they might deadlock - * on the spa_config_lock. Instead we only obtain the leaf's physical size. - * If the leaf has never been opened then open it, as usual. - */ -void -vdev_reopen(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - /* set the reopening flag unless we're taking the vdev offline */ - vd->vdev_reopening = !vd->vdev_offline; - vdev_close(vd); - (void) vdev_open(vd); - - /* - * Call vdev_validate() here to make sure we have the same device. - * Otherwise, a device with an invalid label could be successfully - * opened in response to vdev_reopen(). 
- */ - if (vd->vdev_aux) { - (void) vdev_validate_aux(vd); - if (vdev_readable(vd) && vdev_writeable(vd) && - vd->vdev_aux == &spa->spa_l2cache && - !l2arc_vdev_present(vd)) - l2arc_add_vdev(spa, vd); - } else { - (void) vdev_validate(vd); - } - - /* - * Reassess parent vdev's health. - */ - vdev_propagate_state(vd); -} - -int -vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) -{ - int error; - - /* - * Normally, partial opens (e.g. of a mirror) are allowed. - * For a create, however, we want to fail the request if - * there are any components we can't open. - */ - error = vdev_open(vd); - - if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { - vdev_close(vd); - return (error ? error : ENXIO); - } - - /* - * Recursively load DTLs and initialize all labels. - */ - if ((error = vdev_dtl_load(vd)) != 0 || - (error = vdev_label_init(vd, txg, isreplacing ? - VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { - vdev_close(vd); - return (error); - } - - return (0); -} - -void -vdev_metaslab_set_size(vdev_t *vd) -{ - uint64_t asize = vd->vdev_asize; - uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; - uint64_t ms_shift; - - /* - * There are two dimensions to the metaslab sizing calculation: - * the size of the metaslab and the count of metaslabs per vdev. - * - * The default values used below are a good balance between memory - * usage (larger metaslab size means more memory needed for loaded - * metaslabs; more metaslabs means more memory needed for the - * metaslab_t structs), metaslab load time (larger metaslabs take - * longer to load), and metaslab sync time (more metaslabs means - * more time spent syncing all of them). - * - * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. 
- * The range of the dimensions are as follows: - * - * 2^29 <= ms_size <= 2^34 - * 16 <= ms_count <= 131,072 - * - * On the lower end of vdev sizes, we aim for metaslabs sizes of - * at least 512MB (2^29) to minimize fragmentation effects when - * testing with smaller devices. However, the count constraint - * of at least 16 metaslabs will override this minimum size goal. - * - * On the upper end of vdev sizes, we aim for a maximum metaslab - * size of 16GB. However, we will cap the total count to 2^17 - * metaslabs to keep our memory footprint in check and let the - * metaslab size grow from there if that limit is hit. - * - * The net effect of applying above constrains is summarized below. - * - * vdev size metaslab count - * --------------|----------------- - * < 8GB ~16 - * 8GB - 100GB one per 512MB - * 100GB - 3TB ~200 - * 3TB - 2PB one per 16GB - * > 2PB ~131,072 - * -------------------------------- - * - * Finally, note that all of the above calculate the initial - * number of metaslabs. Expanding a top-level vdev will result - * in additional metaslabs being allocated making it possible - * to exceed the zfs_vdev_ms_count_limit. 
- */ - - if (ms_count < zfs_vdev_min_ms_count) - ms_shift = highbit64(asize / zfs_vdev_min_ms_count); - else if (ms_count > zfs_vdev_default_ms_count) - ms_shift = highbit64(asize / zfs_vdev_default_ms_count); - else - ms_shift = zfs_vdev_default_ms_shift; - - if (ms_shift < SPA_MAXBLOCKSHIFT) { - ms_shift = SPA_MAXBLOCKSHIFT; - } else if (ms_shift > zfs_vdev_max_ms_shift) { - ms_shift = zfs_vdev_max_ms_shift; - /* cap the total count to constrain memory footprint */ - if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) - ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); - } - - vd->vdev_ms_shift = ms_shift; - ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); -} - -/* - * Maximize performance by inflating the configured ashift for top level - * vdevs to be as close to the physical ashift as possible while maintaining - * administrator defined limits and ensuring it doesn't go below the - * logical ashift. - */ -void -vdev_ashift_optimize(vdev_t *vd) -{ - if (vd == vd->vdev_top) { - if (vd->vdev_ashift < vd->vdev_physical_ashift) { - vd->vdev_ashift = MIN( - MAX(zfs_max_auto_ashift, vd->vdev_ashift), - MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); - } else { - /* - * Unusual case where logical ashift > physical ashift - * so we can't cap the calculated ashift based on max - * ashift as that would cause failures. - * We still check if we need to increase it to match - * the min ashift. 
- */ - vd->vdev_ashift = MAX(zfs_min_auto_ashift, - vd->vdev_ashift); - } - } -} - -void -vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) -{ - ASSERT(vd == vd->vdev_top); - /* indirect vdevs don't have metaslabs or dtls */ - ASSERT(vdev_is_concrete(vd) || flags == 0); - ASSERT(ISP2(flags)); - ASSERT(spa_writeable(vd->vdev_spa)); - - if (flags & VDD_METASLAB) - (void) txg_list_add(&vd->vdev_ms_list, arg, txg); - - if (flags & VDD_DTL) - (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); - - (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); -} - -void -vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_dirty_leaves(vd->vdev_child[c], flags, txg); - - if (vd->vdev_ops->vdev_op_leaf) - vdev_dirty(vd->vdev_top, flags, vd, txg); -} - -/* - * DTLs. - * - * A vdev's DTL (dirty time log) is the set of transaction groups for which - * the vdev has less than perfect replication. There are four kinds of DTL: - * - * DTL_MISSING: txgs for which the vdev has no valid copies of the data - * - * DTL_PARTIAL: txgs for which data is available, but not fully replicated - * - * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon - * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of - * txgs that was scrubbed. - * - * DTL_OUTAGE: txgs which cannot currently be read, whether due to - * persistent errors or just some device being offline. - * Unlike the other three, the DTL_OUTAGE map is not generally - * maintained; it's only computed when needed, typically to - * determine whether a device can be detached. - * - * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device - * either has the data or it doesn't. - * - * For interior vdevs such as mirror and RAID-Z the picture is more complex. - * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because - * if any child is less than fully replicated, then so is its parent. 
- * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, - * comprising only those txgs which appear in 'maxfaults' or more children; - * those are the txgs we don't have enough replication to read. For example, - * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); - * thus, its DTL_MISSING consists of the set of txgs that appear in more than - * two child DTL_MISSING maps. - * - * It should be clear from the above that to compute the DTLs and outage maps - * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. - * Therefore, that is all we keep on disk. When loading the pool, or after - * a configuration change, we generate all other DTLs from first principles. - */ -void -vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) -{ - range_tree_t *rt = vd->vdev_dtl[t]; - - ASSERT(t < DTL_TYPES); - ASSERT(vd != vd->vdev_spa->spa_root_vdev); - ASSERT(spa_writeable(vd->vdev_spa)); - - mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_contains(rt, txg, size)) - range_tree_add(rt, txg, size); - mutex_exit(&vd->vdev_dtl_lock); -} - -boolean_t -vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) -{ - range_tree_t *rt = vd->vdev_dtl[t]; - boolean_t dirty = B_FALSE; - - ASSERT(t < DTL_TYPES); - ASSERT(vd != vd->vdev_spa->spa_root_vdev); - - /* - * While we are loading the pool, the DTLs have not been loaded yet. - * Ignore the DTLs and try all devices. This avoids a recursive - * mutex enter on the vdev_dtl_lock, and also makes us try hard - * when loading the pool (relying on the checksum to ensure that - * we get the right data -- note that we while loading, we are - * only reading the MOS, which is always checksummed). 
- */ - if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) - return (B_FALSE); - - mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_is_empty(rt)) - dirty = range_tree_contains(rt, txg, size); - mutex_exit(&vd->vdev_dtl_lock); - - return (dirty); -} - -boolean_t -vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) -{ - range_tree_t *rt = vd->vdev_dtl[t]; - boolean_t empty; - - mutex_enter(&vd->vdev_dtl_lock); - empty = range_tree_is_empty(rt); - mutex_exit(&vd->vdev_dtl_lock); - - return (empty); -} - -/* - * Returns B_TRUE if vdev determines offset needs to be resilvered. - */ -boolean_t -vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) -{ - ASSERT(vd != vd->vdev_spa->spa_root_vdev); - - if (vd->vdev_ops->vdev_op_need_resilver == NULL || - vd->vdev_ops->vdev_op_leaf) - return (B_TRUE); - - return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); -} - -/* - * Returns the lowest txg in the DTL range. - */ -static uint64_t -vdev_dtl_min(vdev_t *vd) -{ - range_seg_t *rs; - - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); - ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); - ASSERT0(vd->vdev_children); - - rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_start - 1); -} - -/* - * Returns the highest txg in the DTL. - */ -static uint64_t -vdev_dtl_max(vdev_t *vd) -{ - range_seg_t *rs; - - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); - ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); - ASSERT0(vd->vdev_children); - - rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_end); -} - -/* - * Determine if a resilvering vdev should remove any DTL entries from - * its range. If the vdev was resilvering for the entire duration of the - * scan then it should excise that range from its DTLs. Otherwise, this - * vdev is considered partially resilvered and should leave its DTL - * entries intact. The comment in vdev_dtl_reassess() describes how we - * excise the DTLs. 
- */ -static boolean_t -vdev_dtl_should_excise(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - - ASSERT0(scn->scn_phys.scn_errors); - ASSERT0(vd->vdev_children); - - if (vd->vdev_state < VDEV_STATE_DEGRADED) - return (B_FALSE); - - if (vd->vdev_resilver_txg == 0 || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) - return (B_TRUE); - - /* - * When a resilver is initiated the scan will assign the scn_max_txg - * value to the highest txg value that exists in all DTLs. If this - * device's max DTL is not part of this scan (i.e. it is not in - * the range (scn_min_txg, scn_max_txg] then it is not eligible - * for excision. - */ - if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { - ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); - ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); - ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Reassess DTLs after a config change or scrub completion. - */ -void -vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) -{ - spa_t *spa = vd->vdev_spa; - avl_tree_t reftree; - int minref; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - for (int c = 0; c < vd->vdev_children; c++) - vdev_dtl_reassess(vd->vdev_child[c], txg, - scrub_txg, scrub_done); - - if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) - return; - - if (vd->vdev_ops->vdev_op_leaf) { - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - - mutex_enter(&vd->vdev_dtl_lock); - - /* - * If we've completed a scan cleanly then determine - * if this vdev should remove any DTLs. We only want to - * excise regions on vdevs that were available during - * the entire duration of this scan. - */ - if (scrub_txg != 0 && - (spa->spa_scrub_started || - (scn != NULL && scn->scn_phys.scn_errors == 0)) && - vdev_dtl_should_excise(vd)) { - /* - * We completed a scrub up to scrub_txg. 
If we - * did it without rebooting, then the scrub dtl - * will be valid, so excise the old region and - * fold in the scrub dtl. Otherwise, leave the - * dtl as-is if there was an error. - * - * There's little trick here: to excise the beginning - * of the DTL_MISSING map, we put it into a reference - * tree and then add a segment with refcnt -1 that - * covers the range [0, scrub_txg). This means - * that each txg in that range has refcnt -1 or 0. - * We then add DTL_SCRUB with a refcnt of 2, so that - * entries in the range [0, scrub_txg) will have a - * positive refcnt -- either 1 or 2. We then convert - * the reference tree into the new DTL_MISSING map. - */ - space_reftree_create(&reftree); - space_reftree_add_map(&reftree, - vd->vdev_dtl[DTL_MISSING], 1); - space_reftree_add_seg(&reftree, 0, scrub_txg, -1); - space_reftree_add_map(&reftree, - vd->vdev_dtl[DTL_SCRUB], 2); - space_reftree_generate_map(&reftree, - vd->vdev_dtl[DTL_MISSING], 1); - space_reftree_destroy(&reftree); - } - range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); - range_tree_walk(vd->vdev_dtl[DTL_MISSING], - range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); - if (scrub_done) - range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); - range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); - if (!vdev_readable(vd)) - range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); - else - range_tree_walk(vd->vdev_dtl[DTL_MISSING], - range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); - - /* - * If the vdev was resilvering and no longer has any - * DTLs then reset its resilvering flag and dirty - * the top level so that we persist the change. 
- */ - if (vd->vdev_resilver_txg != 0 && - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && - range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { - vd->vdev_resilver_txg = 0; - vdev_config_dirty(vd->vdev_top); - } - - mutex_exit(&vd->vdev_dtl_lock); - - if (txg != 0) - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; - } - - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. non-zero */ - else if (vd->vdev_nparity != 0) - minref = vd->vdev_nparity + 1; /* RAID-Z */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); - } - mutex_exit(&vd->vdev_dtl_lock); -} - -int -vdev_dtl_load(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - int error = 0; - - if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { - ASSERT(vdev_is_concrete(vd)); - - error = space_map_open(&vd->vdev_dtl_sm, mos, - vd->vdev_dtl_object, 0, -1ULL, 0); - if (error) - return (error); - ASSERT(vd->vdev_dtl_sm != NULL); - - mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(vd->vdev_dtl_sm, - vd->vdev_dtl[DTL_MISSING], SM_ALLOC); - mutex_exit(&vd->vdev_dtl_lock); - - return (error); - } - - for (int c = 0; c < vd->vdev_children; c++) { - error = vdev_dtl_load(vd->vdev_child[c]); - if (error != 0) - break; - } - - return (error); -} - -static void -vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - 
vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; - const char *string; - - ASSERT(alloc_bias != VDEV_BIAS_NONE); - - string = - (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : - (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : - (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; - - ASSERT(string != NULL); - VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, - 1, strlen(string) + 1, string, tx)); - - if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { - spa_activate_allocation_classes(spa, tx); - } -} - -void -vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - - VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); - VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, - zapobj, tx)); -} - -uint64_t -vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, - DMU_OT_NONE, 0, tx); - - ASSERT(zap != 0); - VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, - zap, tx)); - - return (zap); -} - -void -vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) -{ - if (vd->vdev_ops != &vdev_hole_ops && - vd->vdev_ops != &vdev_missing_ops && - vd->vdev_ops != &vdev_root_ops && - !vd->vdev_top->vdev_removing) { - if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { - vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); - } - if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { - vd->vdev_top_zap = vdev_create_link_zap(vd, tx); - if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) - vdev_zap_allocation_data(vd, tx); - } - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - vdev_construct_zaps(vd->vdev_child[i], tx); - } -} - -void -vdev_dtl_sync(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; - objset_t *mos = spa->spa_meta_objset; - range_tree_t *rtsync; - dmu_tx_t *tx; - uint64_t 
object = space_map_object(vd->vdev_dtl_sm); - - ASSERT(vdev_is_concrete(vd)); - ASSERT(vd->vdev_ops->vdev_op_leaf); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - if (vd->vdev_detached || vd->vdev_top->vdev_removing) { - mutex_enter(&vd->vdev_dtl_lock); - space_map_free(vd->vdev_dtl_sm, tx); - space_map_close(vd->vdev_dtl_sm); - vd->vdev_dtl_sm = NULL; - mutex_exit(&vd->vdev_dtl_lock); - - /* - * We only destroy the leaf ZAP for detached leaves or for - * removed log devices. Removed data devices handle leaf ZAP - * cleanup later, once cancellation is no longer possible. - */ - if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || - vd->vdev_top->vdev_islog)) { - vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); - vd->vdev_leaf_zap = 0; - } - - dmu_tx_commit(tx); - return; - } - - if (vd->vdev_dtl_sm == NULL) { - uint64_t new_object; - - new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); - VERIFY3U(new_object, !=, 0); - - VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, - 0, -1ULL, 0)); - ASSERT(vd->vdev_dtl_sm != NULL); - } - - rtsync = range_tree_create(NULL, NULL); - - mutex_enter(&vd->vdev_dtl_lock); - range_tree_walk(rt, range_tree_add, rtsync); - mutex_exit(&vd->vdev_dtl_lock); - - space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); - space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(rtsync, NULL, NULL); - - range_tree_destroy(rtsync); - - /* - * If the object for the space map has changed then dirty - * the top level so that we update the config. - */ - if (object != space_map_object(vd->vdev_dtl_sm)) { - vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " - "new object %llu", (u_longlong_t)txg, spa_name(spa), - (u_longlong_t)object, - (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); - vdev_config_dirty(vd->vdev_top); - } - - dmu_tx_commit(tx); -} - -/* - * Determine whether the specified vdev can be offlined/detached/removed - * without losing data. 
- */ -boolean_t -vdev_dtl_required(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *tvd = vd->vdev_top; - uint8_t cant_read = vd->vdev_cant_read; - boolean_t required; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - if (vd == spa->spa_root_vdev || vd == tvd) - return (B_TRUE); - - /* - * Temporarily mark the device as unreadable, and then determine - * whether this results in any DTL outages in the top-level vdev. - * If not, we can safely offline/detach/remove the device. - */ - vd->vdev_cant_read = B_TRUE; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); - required = !vdev_dtl_empty(tvd, DTL_OUTAGE); - vd->vdev_cant_read = cant_read; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); - - if (!required && zio_injection_enabled) - required = !!zio_handle_device_injection(vd, NULL, ECHILD); - - return (required); -} - -/* - * Determine if resilver is needed, and if so the txg range. - */ -boolean_t -vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) -{ - boolean_t needed = B_FALSE; - uint64_t thismin = UINT64_MAX; - uint64_t thismax = 0; - - if (vd->vdev_children == 0) { - mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && - vdev_writeable(vd)) { - - thismin = vdev_dtl_min(vd); - thismax = vdev_dtl_max(vd); - needed = B_TRUE; - } - mutex_exit(&vd->vdev_dtl_lock); - } else { - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - uint64_t cmin, cmax; - - if (vdev_resilver_needed(cvd, &cmin, &cmax)) { - thismin = MIN(thismin, cmin); - thismax = MAX(thismax, cmax); - needed = B_TRUE; - } - } - } - - if (needed && minp) { - *minp = thismin; - *maxp = thismax; - } - return (needed); -} - -/* - * Gets the checkpoint space map object from the vdev's ZAP. - * Returns the spacemap object, or 0 if it wasn't in the ZAP - * or the ZAP doesn't exist yet. 
- */ -int -vdev_checkpoint_sm_object(vdev_t *vd) -{ - ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - if (vd->vdev_top_zap == 0) { - return (0); - } - - uint64_t sm_obj = 0; - int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); - - ASSERT(err == 0 || err == ENOENT); - - return (sm_obj); -} - -int -vdev_load(vdev_t *vd) -{ - int error = 0; - /* - * Recursively load all children. - */ - for (int c = 0; c < vd->vdev_children; c++) { - error = vdev_load(vd->vdev_child[c]); - if (error != 0) { - return (error); - } - } - - vdev_set_deflate_ratio(vd); - - /* - * On spa_load path, grab the allocation bias from our zap - */ - if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { - spa_t *spa = vd->vdev_spa; - char bias_str[64]; - - if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), - bias_str) == 0) { - ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); - vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); - } - } - - /* - * If this is a top-level vdev, initialize its metaslabs. - */ - if (vd == vd->vdev_top && vdev_is_concrete(vd)) { - vdev_metaslab_group_create(vd); - - if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - vdev_dbgmsg(vd, "vdev_load: invalid size. 
ashift=%llu, " - "asize=%llu", (u_longlong_t)vd->vdev_ashift, - (u_longlong_t)vd->vdev_asize); - return (SET_ERROR(ENXIO)); - } - - error = vdev_metaslab_init(vd, 0); - if (error != 0) { - vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " - "[error=%d]", error); - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - return (error); - } - - uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); - if (checkpoint_sm_obj != 0) { - objset_t *mos = spa_meta_objset(vd->vdev_spa); - ASSERT(vd->vdev_asize != 0); - ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); - - error = space_map_open(&vd->vdev_checkpoint_sm, - mos, checkpoint_sm_obj, 0, vd->vdev_asize, - vd->vdev_ashift); - if (error != 0) { - vdev_dbgmsg(vd, "vdev_load: space_map_open " - "failed for checkpoint spacemap (obj %llu) " - "[error=%d]", - (u_longlong_t)checkpoint_sm_obj, error); - return (error); - } - ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - - /* - * Since the checkpoint_sm contains free entries - * exclusively we can use space_map_allocated() to - * indicate the cumulative checkpointed space that - * has been freed. - */ - vd->vdev_stat.vs_checkpoint_space = - -space_map_allocated(vd->vdev_checkpoint_sm); - vd->vdev_spa->spa_checkpoint_info.sci_dspace += - vd->vdev_stat.vs_checkpoint_space; - } - } - - /* - * If this is a leaf vdev, load its DTL. 
- */ - if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " - "[error=%d]", error); - return (error); - } - - uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); - if (obsolete_sm_object != 0) { - objset_t *mos = vd->vdev_spa->spa_meta_objset; - ASSERT(vd->vdev_asize != 0); - ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); - - if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, - obsolete_sm_object, 0, vd->vdev_asize, 0))) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " - "obsolete spacemap (obj %llu) [error=%d]", - (u_longlong_t)obsolete_sm_object, error); - return (error); - } - } - - return (0); -} - -/* - * The special vdev case is used for hot spares and l2cache devices. Its - * sole purpose it to set the vdev state for the associated vdev. To do this, - * we make sure that we can open the underlying device, then try to read the - * label, and make sure that the label is sane and that it hasn't been - * repurposed to another pool. - */ -int -vdev_validate_aux(vdev_t *vd) -{ - nvlist_t *label; - uint64_t guid, version; - uint64_t state; - - if (!vdev_readable(vd)) - return (0); - - if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - return (-1); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - !SPA_VERSION_IS_SUPPORTED(version) || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || - guid != vd->vdev_guid || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - return (-1); - } - - /* - * We don't actually check the pool state here. 
If it's in fact in - * use by another pool, we update this fact on the fly when requested. - */ - nvlist_free(label); - return (0); -} - -/* - * Free the objects used to store this vdev's spacemaps, and the array - * that points to them. - */ -void -vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) -{ - if (vd->vdev_ms_array == 0) - return; - - objset_t *mos = vd->vdev_spa->spa_meta_objset; - uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; - size_t array_bytes = array_count * sizeof (uint64_t); - uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); - VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, - array_bytes, smobj_array, 0)); - - for (uint64_t i = 0; i < array_count; i++) { - uint64_t smobj = smobj_array[i]; - if (smobj == 0) - continue; - - space_map_free_obj(mos, smobj, tx); - } - - kmem_free(smobj_array, array_bytes); - VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); - vd->vdev_ms_array = 0; -} - -static void -vdev_remove_empty_log(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(vd->vdev_islog); - ASSERT(vd == vd->vdev_top); - ASSERT3U(txg, ==, spa_syncing_txg(spa)); - - dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - - vdev_destroy_spacemaps(vd, tx); - if (vd->vdev_top_zap != 0) { - vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); - vd->vdev_top_zap = 0; - } - - dmu_tx_commit(tx); -} - -void -vdev_sync_done(vdev_t *vd, uint64_t txg) -{ - metaslab_t *msp; - boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); - - ASSERT(vdev_is_concrete(vd)); - - while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) - != NULL) - metaslab_sync_done(msp, txg); - - if (reassess) - metaslab_sync_reassess(vd->vdev_mg); -} - -void -vdev_sync(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *lvd; - metaslab_t *msp; - - ASSERT3U(txg, ==, spa->spa_syncing_txg); - dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - if (range_tree_space(vd->vdev_obsolete_segments) > 
0) { - ASSERT(vd->vdev_removing || - vd->vdev_ops == &vdev_indirect_ops); - - vdev_indirect_sync_obsolete(vd, tx); - - /* - * If the vdev is indirect, it can't have dirty - * metaslabs or DTLs. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); - ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); - dmu_tx_commit(tx); - return; - } - } - - ASSERT(vdev_is_concrete(vd)); - - if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && - !vd->vdev_removing) { - ASSERT(vd == vd->vdev_top); - ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); - ASSERT(vd->vdev_ms_array != 0); - vdev_config_dirty(vd); - } - - while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { - metaslab_sync(msp, txg); - (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); - } - - while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) - vdev_dtl_sync(lvd, txg); - - /* - * If this is an empty log device being removed, destroy the - * metadata associated with it. - */ - if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) - vdev_remove_empty_log(vd, txg); - - (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); - dmu_tx_commit(tx); -} - -uint64_t -vdev_psize_to_asize(vdev_t *vd, uint64_t psize) -{ - return (vd->vdev_ops->vdev_op_asize(vd, psize)); -} - -/* - * Mark the given vdev faulted. A faulted vdev behaves as if the device could - * not be opened, and no I/O is attempted. 
- */ -int -vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) -{ - vdev_t *vd, *tvd; - - spa_vdev_state_enter(spa, SCL_NONE); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - tvd = vd->vdev_top; - - /* - * We don't directly use the aux state here, but if we do a - * vdev_reopen(), we need this value to be present to remember why we - * were faulted. - */ - vd->vdev_label_aux = aux; - - /* - * Faulted state takes precedence over degraded. - */ - vd->vdev_delayed_close = B_FALSE; - vd->vdev_faulted = 1ULL; - vd->vdev_degraded = 0ULL; - vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); - - /* - * If this device has the only valid copy of the data, then - * back off and simply mark the vdev as degraded instead. - */ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { - vd->vdev_degraded = 1ULL; - vd->vdev_faulted = 0ULL; - - /* - * If we reopen the device and it's not dead, only then do we - * mark it degraded. - */ - vdev_reopen(tvd); - - if (vdev_readable(vd)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); - } - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -/* - * Mark the given vdev degraded. A degraded vdev is purely an indication to the - * user that something is wrong. The vdev continues to operate as normal as far - * as I/O is concerned. - */ -int -vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) -{ - vdev_t *vd; - - spa_vdev_state_enter(spa, SCL_NONE); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - /* - * If the vdev is already faulted, then don't do anything. 
- */ - if (vd->vdev_faulted || vd->vdev_degraded) - return (spa_vdev_state_exit(spa, NULL, 0)); - - vd->vdev_degraded = 1ULL; - if (!vdev_is_dead(vd)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - aux); - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -/* - * Online the given vdev. - * - * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached - * spare device should be detached when the device finishes resilvering. - * Second, the online should be treated like a 'test' online case, so no FMA - * events are generated if the device fails to open. - */ -int -vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) -{ - vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - boolean_t wasoffline; - vdev_state_t oldstate; - - spa_vdev_state_enter(spa, SCL_NONE); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); - oldstate = vd->vdev_state; - - tvd = vd->vdev_top; - vd->vdev_offline = B_FALSE; - vd->vdev_tmpoffline = B_FALSE; - vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); - vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); - - /* XXX - L2ARC 1.0 does not support expansion */ - if (!vd->vdev_aux) { - for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); - } - - vdev_reopen(tvd); - vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; - - if (!vd->vdev_aux) { - for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - pvd->vdev_expanding = B_FALSE; - } - - if (newstate) - *newstate = vd->vdev_state; - if ((flags & ZFS_ONLINE_UNSPARE) && - !vdev_is_dead(vd) && vd->vdev_parent && - vd->vdev_parent->vdev_ops == &vdev_spare_ops && - vd->vdev_parent->vdev_child[0] == vd) - vd->vdev_unspare = B_TRUE; - - if ((flags & ZFS_ONLINE_EXPAND) || 
spa->spa_autoexpand) { - - /* XXX - L2ARC 1.0 does not support expansion */ - if (vd->vdev_aux) - return (spa_vdev_state_exit(spa, vd, ENOTSUP)); - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); - } - - /* Restart initializing if necessary */ - mutex_enter(&vd->vdev_initialize_lock); - if (vdev_writeable(vd) && - vd->vdev_initialize_thread == NULL && - vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { - (void) vdev_initialize(vd); - } - mutex_exit(&vd->vdev_initialize_lock); - - if (wasoffline || - (oldstate < VDEV_STATE_DEGRADED && - vd->vdev_state >= VDEV_STATE_DEGRADED)) - spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -static int -vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) -{ - vdev_t *vd, *tvd; - int error = 0; - uint64_t generation; - metaslab_group_t *mg; - -top: - spa_vdev_state_enter(spa, SCL_ALLOC); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - tvd = vd->vdev_top; - mg = tvd->vdev_mg; - generation = spa->spa_config_generation + 1; - - /* - * If the device isn't already offline, try to offline it. - */ - if (!vd->vdev_offline) { - /* - * If this device has the only valid copy of some data, - * don't allow it to be offlined. Log devices are always - * expendable. - */ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && - vdev_dtl_required(vd)) - return (spa_vdev_state_exit(spa, NULL, EBUSY)); - - /* - * If the top-level is a slog and it has had allocations - * then proceed. We check that the vdev's metaslab group - * is not NULL since it's possible that we may have just - * added this vdev but not yet initialized its metaslabs. - */ - if (tvd->vdev_islog && mg != NULL) { - /* - * Prevent any future allocations. 
- */ - metaslab_group_passivate(mg); - (void) spa_vdev_state_exit(spa, vd, 0); - - error = spa_reset_logs(spa); - - /* - * If the log device was successfully reset but has - * checkpointed data, do not offline it. - */ - if (error == 0 && - tvd->vdev_checkpoint_sm != NULL) { - error = ZFS_ERR_CHECKPOINT_EXISTS; - } - - spa_vdev_state_enter(spa, SCL_ALLOC); - - /* - * Check to see if the config has changed. - */ - if (error || generation != spa->spa_config_generation) { - metaslab_group_activate(mg); - if (error) - return (spa_vdev_state_exit(spa, - vd, error)); - (void) spa_vdev_state_exit(spa, vd, 0); - goto top; - } - ASSERT0(tvd->vdev_stat.vs_alloc); - } - - /* - * Offline this device and reopen its top-level vdev. - * If the top-level vdev is a log device then just offline - * it. Otherwise, if this action results in the top-level - * vdev becoming unusable, undo it and fail the request. - */ - vd->vdev_offline = B_TRUE; - vdev_reopen(tvd); - - if (!tvd->vdev_islog && vd->vdev_aux == NULL && - vdev_is_dead(tvd)) { - vd->vdev_offline = B_FALSE; - vdev_reopen(tvd); - return (spa_vdev_state_exit(spa, NULL, EBUSY)); - } - - /* - * Add the device back into the metaslab rotor so that - * once we online the device it's open for business. - */ - if (tvd->vdev_islog && mg != NULL) - metaslab_group_activate(mg); - } - - vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -int -vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) -{ - int error; - - mutex_enter(&spa->spa_vdev_top_lock); - error = vdev_offline_locked(spa, guid, flags); - mutex_exit(&spa->spa_vdev_top_lock); - - return (error); -} - -/* - * Clear the error counts associated with this vdev. Unlike vdev_online() and - * vdev_offline(), we assume the spa config is locked. We also clear all - * children. If 'vd' is NULL, then the user wants to clear all vdevs. 
- */ -void -vdev_clear(spa_t *spa, vdev_t *vd) -{ - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - if (vd == NULL) - vd = rvd; - - vd->vdev_stat.vs_read_errors = 0; - vd->vdev_stat.vs_write_errors = 0; - vd->vdev_stat.vs_checksum_errors = 0; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_clear(spa, vd->vdev_child[c]); - - if (vd == rvd) { - for (int c = 0; c < spa->spa_l2cache.sav_count; c++) - vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); - - for (int c = 0; c < spa->spa_spares.sav_count; c++) - vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); - } - - /* - * It makes no sense to "clear" an indirect vdev. - */ - if (!vdev_is_concrete(vd)) - return; - - /* - * If we're in the FAULTED state or have experienced failed I/O, then - * clear the persistent state and attempt to reopen the device. We - * also mark the vdev config dirty, so that the new faulted state is - * written out to disk. - */ - if (vd->vdev_faulted || vd->vdev_degraded || - !vdev_readable(vd) || !vdev_writeable(vd)) { - - /* - * When reopening in reponse to a clear event, it may be due to - * a fmadm repair request. In this case, if the device is - * still broken, we want to still post the ereport again. - */ - vd->vdev_forcefault = B_TRUE; - - vd->vdev_faulted = vd->vdev_degraded = 0ULL; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; - - vdev_reopen(vd == rvd ? rvd : vd->vdev_top); - - vd->vdev_forcefault = B_FALSE; - - if (vd != rvd && vdev_writeable(vd->vdev_top)) - vdev_state_dirty(vd->vdev_top); - - if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) - spa_async_request(spa, SPA_ASYNC_RESILVER); - - spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); - } - - /* - * When clearing a FMA-diagnosed fault, we always want to - * unspare the device, as we assume that the original spare was - * done in response to the FMA fault. 
- */ - if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && - vd->vdev_parent->vdev_ops == &vdev_spare_ops && - vd->vdev_parent->vdev_child[0] == vd) - vd->vdev_unspare = B_TRUE; -} - -boolean_t -vdev_is_dead(vdev_t *vd) -{ - /* - * Holes and missing devices are always considered "dead". - * This simplifies the code since we don't have to check for - * these types of devices in the various code paths. - * Instead we rely on the fact that we skip over dead devices - * before issuing I/O to them. - */ - return (vd->vdev_state < VDEV_STATE_DEGRADED || - vd->vdev_ops == &vdev_hole_ops || - vd->vdev_ops == &vdev_missing_ops); -} - -boolean_t -vdev_readable(vdev_t *vd) -{ - return (!vdev_is_dead(vd) && !vd->vdev_cant_read); -} - -boolean_t -vdev_writeable(vdev_t *vd) -{ - return (!vdev_is_dead(vd) && !vd->vdev_cant_write && - vdev_is_concrete(vd)); -} - -boolean_t -vdev_allocatable(vdev_t *vd) -{ - uint64_t state = vd->vdev_state; - - /* - * We currently allow allocations from vdevs which may be in the - * process of reopening (i.e. VDEV_STATE_CLOSED). If the device - * fails to reopen then we'll catch it later when we're holding - * the proper locks. Note that we have to get the vdev state - * in a local variable because although it changes atomically, - * we're asking two separate questions about it. 
- */ - return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && vdev_is_concrete(vd) && - vd->vdev_mg->mg_initialized); -} - -boolean_t -vdev_accessible(vdev_t *vd, zio_t *zio) -{ - ASSERT(zio->io_vd == vd); - - if (vdev_is_dead(vd) || vd->vdev_remove_wanted) - return (B_FALSE); - - if (zio->io_type == ZIO_TYPE_READ) - return (!vd->vdev_cant_read); - - if (zio->io_type == ZIO_TYPE_WRITE) - return (!vd->vdev_cant_write); - - return (B_TRUE); -} - -boolean_t -vdev_is_spacemap_addressable(vdev_t *vd) -{ - if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) - return (B_TRUE); - - /* - * If double-word space map entries are not enabled we assume - * 47 bits of the space map entry are dedicated to the entry's - * offset (see SM_OFFSET_BITS in space_map.h). We then use that - * to calculate the maximum address that can be described by a - * space map entry for the given device. - */ - uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; - - if (shift >= 63) /* detect potential overflow */ - return (B_TRUE); - - return (vd->vdev_asize < (1ULL << shift)); -} - -/* - * Get statistics for the given vdev. - */ -void -vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *tvd = vd->vdev_top; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - mutex_enter(&vd->vdev_stat_lock); - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_timestamp = gethrtime() - vs->vs_timestamp; - vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) { - vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - /* - * Report intializing progress. Since we don't have the - * initializing locks held, this is only an estimate (although a - * fairly accurate one). 
- */ - vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; - vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; - vs->vs_initialize_state = vd->vdev_initialize_state; - vs->vs_initialize_action_time = vd->vdev_initialize_action_time; - } - /* - * Report expandable space on top-level, non-auxillary devices only. - * The expandable space is reported in terms of metaslab sized units - * since that determines how much space the pool can expand. - */ - if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { - vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - - spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); - } - vs->vs_configured_ashift = vd->vdev_top != NULL - ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; - vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; - if (vd->vdev_aux == NULL && vd == vd->vdev_top && - vdev_is_concrete(vd)) { - vs->vs_fragmentation = (vd->vdev_mg != NULL) ? - vd->vdev_mg->mg_fragmentation : 0; - } - - /* - * If we're getting stats on the root vdev, aggregate the I/O counts - * over all top-level vdevs (i.e. the direct children of the root). 
- */ - if (vd == rvd) { - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - vdev_stat_t *cvs = &cvd->vdev_stat; - - for (int t = 0; t < ZIO_TYPES; t++) { - vs->vs_ops[t] += cvs->vs_ops[t]; - vs->vs_bytes[t] += cvs->vs_bytes[t]; - } - cvs->vs_scan_removing = cvd->vdev_removing; - } - } - mutex_exit(&vd->vdev_stat_lock); -} - -void -vdev_clear_stats(vdev_t *vd) -{ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space = 0; - vd->vdev_stat.vs_dspace = 0; - vd->vdev_stat.vs_alloc = 0; - mutex_exit(&vd->vdev_stat_lock); -} - -void -vdev_scan_stat_init(vdev_t *vd) -{ - vdev_stat_t *vs = &vd->vdev_stat; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_scan_stat_init(vd->vdev_child[c]); - - mutex_enter(&vd->vdev_stat_lock); - vs->vs_scan_processed = 0; - mutex_exit(&vd->vdev_stat_lock); -} - -void -vdev_stat_update(zio_t *zio, uint64_t psize) -{ - spa_t *spa = zio->io_spa; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; - vdev_t *pvd; - uint64_t txg = zio->io_txg; - vdev_stat_t *vs = &vd->vdev_stat; - zio_type_t type = zio->io_type; - int flags = zio->io_flags; - - /* - * If this i/o is a gang leader, it didn't do any actual work. - */ - if (zio->io_gang_tree) - return; - - if (zio->io_error == 0) { - /* - * If this is a root i/o, don't count it -- we've already - * counted the top-level vdevs, and vdev_get_stats() will - * aggregate them when asked. This reduces contention on - * the root vdev_stat_lock and implicitly handles blocks - * that compress away to holes, for which there is no i/o. - * (Holes never create vdev children, so all the counters - * remain zero, which is what we want.) - * - * Note: this only applies to successful i/o (io_error == 0) - * because unlike i/o counts, errors are not additive. - * When reading a ditto block, for example, failure of - * one top-level vdev does not imply a root-level error. 
- */ - if (vd == rvd) - return; - - ASSERT(vd == zio->io_vd); - - if (flags & ZIO_FLAG_IO_BYPASS) - return; - - mutex_enter(&vd->vdev_stat_lock); - - if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCAN_THREAD) { - dsl_scan_phys_t *scn_phys = - &spa->spa_dsl_pool->dp_scan->scn_phys; - uint64_t *processed = &scn_phys->scn_processed; - - /* XXX cleanup? */ - if (vd->vdev_ops->vdev_op_leaf) - atomic_add_64(processed, psize); - vs->vs_scan_processed += psize; - } - - if (flags & ZIO_FLAG_SELF_HEAL) - vs->vs_self_healed += psize; - } - - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; - - mutex_exit(&vd->vdev_stat_lock); - return; - } - - if (flags & ZIO_FLAG_SPECULATIVE) - return; - - /* - * If this is an I/O error that is going to be retried, then ignore the - * error. Otherwise, the user may interpret B_FAILFAST I/O errors as - * hard errors, when in reality they can happen for any number of - * innocuous reasons (bus resets, MPxIO link failure, etc). - */ - if (zio->io_error == EIO && - !(zio->io_flags & ZIO_FLAG_IO_RETRY)) - return; - - /* - * Intent logs writes won't propagate their error to the root - * I/O so don't mark these types of failures as pool-level - * errors. - */ - if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) - return; - - mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { - if (zio->io_error == ECKSUM) - vs->vs_checksum_errors++; - else - vs->vs_read_errors++; - } - if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) - vs->vs_write_errors++; - mutex_exit(&vd->vdev_stat_lock); - - if (spa->spa_load_state == SPA_LOAD_NONE && - type == ZIO_TYPE_WRITE && txg != 0 && - (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCAN_THREAD) || - spa->spa_claiming)) { - /* - * This is either a normal write (not a repair), or it's - * a repair induced by the scrub thread, or it's a repair - * made by zil_claim() during spa_load() in the first txg. 
- * In the normal case, we commit the DTL change in the same - * txg as the block was born. In the scrub-induced repair - * case, we know that scrubs run in first-pass syncing context, - * so we commit the DTL change in spa_syncing_txg(spa). - * In the zil_claim() case, we commit in spa_first_txg(spa). - * - * We currently do not make DTL entries for failed spontaneous - * self-healing writes triggered by normal (non-scrubbing) - * reads, because we have no transactional context in which to - * do so -- and it's not clear that it'd be desirable anyway. - */ - if (vd->vdev_ops->vdev_op_leaf) { - uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCAN_THREAD) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - ASSERT(spa_sync_pass(spa) == 1); - vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); - commit_txg = spa_syncing_txg(spa); - } else if (spa->spa_claiming) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - commit_txg = spa_first_txg(spa); - } - ASSERT(commit_txg >= spa_syncing_txg(spa)); - if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) - return; - for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); - vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); - } - if (vd != rvd) - vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); - } -} - -int64_t -vdev_deflated_space(vdev_t *vd, int64_t space) -{ - ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); - ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); - - return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); -} - -/* - * Update the in-core space usage stats for this vdev and the root vdev. - */ -void -vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, - int64_t space_delta) -{ - int64_t dspace_delta; - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(vd == vd->vdev_top); - - /* - * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion - * factor. 
We must calculate this here and not at the root vdev - * because the root vdev's psize-to-asize is simply the max of its - * childrens', thus not accurate enough for us. - */ - dspace_delta = vdev_deflated_space(vd, space_delta); - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_alloc += alloc_delta; - vd->vdev_stat.vs_space += space_delta; - vd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&vd->vdev_stat_lock); - - /* every class but log contributes to root space stats */ - if (vd->vdev_mg != NULL && !vd->vdev_islog) { - mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_alloc += alloc_delta; - rvd->vdev_stat.vs_space += space_delta; - rvd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&rvd->vdev_stat_lock); - } - /* Note: metaslab_class_space_update moved to metaslab_space_update */ -} - -/* - * Mark a top-level vdev's config as dirty, placing it on the dirty list - * so that it will be written out next time the vdev configuration is synced. - * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. - */ -void -vdev_config_dirty(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int c; - - ASSERT(spa_writeable(spa)); - - /* - * If this is an aux vdev (as with l2cache and spare devices), then we - * update the vdev config manually and set the sync flag. - */ - if (vd->vdev_aux != NULL) { - spa_aux_vdev_t *sav = vd->vdev_aux; - nvlist_t **aux; - uint_t naux; - - for (c = 0; c < sav->sav_count; c++) { - if (sav->sav_vdevs[c] == vd) - break; - } - - if (c == sav->sav_count) { - /* - * We're being removed. There's nothing more to do. 
- */ - ASSERT(sav->sav_sync == B_TRUE); - return; - } - - sav->sav_sync = B_TRUE; - - if (nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); - } - - ASSERT(c < naux); - - /* - * Setting the nvlist in the middle if the array is a little - * sketchy, but it will work. - */ - nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); - - return; - } - - /* - * The dirty list is protected by the SCL_CONFIG lock. The caller - * must either hold SCL_CONFIG as writer, or must be the sync thread - * (which holds SCL_CONFIG as reader). There's only one sync thread, - * so this is sufficient to ensure mutual exclusion. - */ - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_CONFIG, RW_READER))); - - if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_config_dirty(rvd->vdev_child[c]); - } else { - ASSERT(vd == vd->vdev_top); - - if (!list_link_active(&vd->vdev_config_dirty_node) && - vdev_is_concrete(vd)) { - list_insert_head(&spa->spa_config_dirty_list, vd); - } - } -} - -void -vdev_config_clean(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_CONFIG, RW_READER))); - - ASSERT(list_link_active(&vd->vdev_config_dirty_node)); - list_remove(&spa->spa_config_dirty_list, vd); -} - -/* - * Mark a top-level vdev's state as dirty, so that the next pass of - * spa_sync() can convert this into vdev_config_dirty(). We distinguish - * the state changes from larger config changes because they require - * much less locking, and are often needed for administrative actions. 
- */ -void -vdev_state_dirty(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_writeable(spa)); - ASSERT(vd == vd->vdev_top); - - /* - * The state list is protected by the SCL_STATE lock. The caller - * must either hold SCL_STATE as writer, or must be the sync thread - * (which holds SCL_STATE as reader). There's only one sync thread, - * so this is sufficient to ensure mutual exclusion. - */ - ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_STATE, RW_READER))); - - if (!list_link_active(&vd->vdev_state_dirty_node) && - vdev_is_concrete(vd)) - list_insert_head(&spa->spa_state_dirty_list, vd); -} - -void -vdev_state_clean(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_STATE, RW_READER))); - - ASSERT(list_link_active(&vd->vdev_state_dirty_node)); - list_remove(&spa->spa_state_dirty_list, vd); -} - -/* - * Propagate vdev state up from children to parent. - */ -void -vdev_propagate_state(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int degraded = 0, faulted = 0; - int corrupted = 0; - vdev_t *child; - - if (vd->vdev_children > 0) { - for (int c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - - /* - * Don't factor holes or indirect vdevs into the - * decision. - */ - if (!vdev_is_concrete(child)) - continue; - - if (!vdev_readable(child) || - (!vdev_writeable(child) && spa_writeable(spa))) { - /* - * Root special: if there is a top-level log - * device, treat the root vdev as if it were - * degraded. 
- */ - if (child->vdev_islog && vd == rvd) - degraded++; - else - faulted++; - } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { - degraded++; - } - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } - - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); - - /* - * Root special: if there is a top-level vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == rvd && - rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - } - - if (vd->vdev_parent) - vdev_propagate_state(vd->vdev_parent); -} - -/* - * Set a vdev's state. If this is during an open, we don't update the parent - * state, because we're in the process of opening children depth-first. - * Otherwise, we propagate the change to the parent. - * - * If this routine places a device in a faulted state, an appropriate ereport is - * generated. - */ -void -vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) -{ - uint64_t save_state; - spa_t *spa = vd->vdev_spa; - - if (state == vd->vdev_state) { - vd->vdev_stat.vs_aux = aux; - return; - } - - save_state = vd->vdev_state; - - vd->vdev_state = state; - vd->vdev_stat.vs_aux = aux; - - /* - * If we are setting the vdev state to anything but an open state, then - * always close the underlying device unless the device has requested - * a delayed close (i.e. we're about to remove or fault the device). - * Otherwise, we keep accessible but invalid devices open forever. - * We don't call vdev_close() itself, because that implies some extra - * checks (offline, etc) that we don't want here. This is limited to - * leaf devices, because otherwise closing the device will affect other - * children. 
- */ - if (!vd->vdev_delayed_close && vdev_is_dead(vd) && - vd->vdev_ops->vdev_op_leaf) - vd->vdev_ops->vdev_op_close(vd); - - if (vd->vdev_removed && - state == VDEV_STATE_CANT_OPEN && - (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { - /* - * If the previous state is set to VDEV_STATE_REMOVED, then this - * device was previously marked removed and someone attempted to - * reopen it. If this failed due to a nonexistent device, then - * keep the device in the REMOVED state. We also let this be if - * it is one of our special test online cases, which is only - * attempting to online the device and shouldn't generate an FMA - * fault. - */ - vd->vdev_state = VDEV_STATE_REMOVED; - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - } else if (state == VDEV_STATE_REMOVED) { - vd->vdev_removed = B_TRUE; - } else if (state == VDEV_STATE_CANT_OPEN) { - /* - * If we fail to open a vdev during an import or recovery, we - * mark it as "not available", which signifies that it was - * never there to begin with. Failure to open such a device - * is not considered an error. - */ - if ((spa_load_state(spa) == SPA_LOAD_IMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) && - vd->vdev_ops->vdev_op_leaf) - vd->vdev_not_present = 1; - - /* - * Post the appropriate ereport. If the 'prevstate' field is - * set to something other than VDEV_STATE_UNKNOWN, it indicates - * that this is part of a vdev_reopen(). In this case, we don't - * want to post the ereport if the device was already in the - * CANT_OPEN state beforehand. - * - * If the 'checkremove' flag is set, then this is an attempt to - * online the device in response to an insertion event. If we - * hit this case, then we have detected an insertion event for a - * faulted or offline device that wasn't in the removed state. - * In this scenario, we don't post an ereport because we are - * about to replace the device, or attempt an online with - * vdev_forcefault, which will generate the fault for us. 
- */ - if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && - !vd->vdev_not_present && !vd->vdev_checkremove && - vd != spa->spa_root_vdev) { - const char *class; - - switch (aux) { - case VDEV_AUX_OPEN_FAILED: - class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; - break; - case VDEV_AUX_CORRUPT_DATA: - class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; - break; - case VDEV_AUX_NO_REPLICAS: - class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; - break; - case VDEV_AUX_BAD_GUID_SUM: - class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; - break; - case VDEV_AUX_TOO_SMALL: - class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; - break; - case VDEV_AUX_BAD_LABEL: - class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; - break; - default: - class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; - } - - zfs_ereport_post(class, spa, vd, NULL, save_state, 0); - } - - /* Erase any notion of persistent removed state */ - vd->vdev_removed = B_FALSE; - } else { - vd->vdev_removed = B_FALSE; - } - - /* - * Notify the fmd of the state change. Be verbose and post - * notifications even for stuff that's not important; the fmd agent can - * sort it out. Don't emit state change events for non-leaf vdevs since - * they can't change state on their own. The FMD can check their state - * if it wants to when it sees that a leaf vdev had a state change. - */ - if (vd->vdev_ops->vdev_op_leaf) - zfs_post_state_change(spa, vd); - - if (!isopen && vd->vdev_parent) - vdev_propagate_state(vd->vdev_parent); -} - -boolean_t -vdev_children_are_offline(vdev_t *vd) -{ - ASSERT(!vd->vdev_ops->vdev_op_leaf); - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) - return (B_FALSE); - } - - return (B_TRUE); -} - -/* - * Check the vdev configuration to ensure that it's capable of supporting - * a root pool. We do not support partial configuration. - * In addition, only a single top-level vdev is allowed. - * - * FreeBSD does not have above limitations. 
- */ -boolean_t -vdev_is_bootable(vdev_t *vd) -{ -#ifdef illumos - if (!vd->vdev_ops->vdev_op_leaf) { - char *vdev_type = vd->vdev_ops->vdev_op_type; - - if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && - vd->vdev_children > 1) { - return (B_FALSE); - } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || - strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { - return (B_FALSE); - } - } - - for (int c = 0; c < vd->vdev_children; c++) { - if (!vdev_is_bootable(vd->vdev_child[c])) - return (B_FALSE); - } -#endif /* illumos */ - return (B_TRUE); -} - -boolean_t -vdev_is_concrete(vdev_t *vd) -{ - vdev_ops_t *ops = vd->vdev_ops; - if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || - ops == &vdev_missing_ops || ops == &vdev_root_ops) { - return (B_FALSE); - } else { - return (B_TRUE); - } -} - -/* - * Determine if a log device has valid content. If the vdev was - * removed or faulted in the MOS config then we know that - * the content on the log device has already been written to the pool. - */ -boolean_t -vdev_log_state_valid(vdev_t *vd) -{ - if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && - !vd->vdev_removed) - return (B_TRUE); - - for (int c = 0; c < vd->vdev_children; c++) - if (vdev_log_state_valid(vd->vdev_child[c])) - return (B_TRUE); - - return (B_FALSE); -} - -/* - * Expand a vdev if possible. - */ -void -vdev_expand(vdev_t *vd, uint64_t txg) -{ - ASSERT(vd->vdev_top == vd); - ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(vdev_is_concrete(vd)); - - vdev_set_deflate_ratio(vd); - - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && - vdev_is_concrete(vd)) { - vdev_metaslab_group_create(vd); - VERIFY(vdev_metaslab_init(vd, txg) == 0); - vdev_config_dirty(vd); - } -} - -/* - * Split a vdev. 
- */ -void -vdev_split(vdev_t *vd) -{ - vdev_t *cvd, *pvd = vd->vdev_parent; - - vdev_remove_child(pvd, vd); - vdev_compact_children(pvd); - - cvd = pvd->vdev_child[0]; - if (pvd->vdev_children == 1) { - vdev_remove_parent(cvd); - cvd->vdev_splitting = B_TRUE; - } - vdev_propagate_state(cvd); -} - -void -vdev_deadman(vdev_t *vd) -{ - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - vdev_deadman(cvd); - } - - if (vd->vdev_ops->vdev_op_leaf) { - vdev_queue_t *vq = &vd->vdev_queue; - - mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { - spa_t *spa = vd->vdev_spa; - zio_t *fio; - uint64_t delta; - - /* - * Look at the head of all the pending queues, - * if any I/O has been outstanding for longer than - * the spa_deadman_synctime we panic the system. - */ - fio = avl_first(&vq->vq_active_tree); - delta = gethrtime() - fio->io_timestamp; - if (delta > spa_deadman_synctime(spa)) { - vdev_dbgmsg(vd, "SLOW IO: zio timestamp " - "%lluns, delta %lluns, last io %lluns", - fio->io_timestamp, (u_longlong_t)delta, - vq->vq_io_complete_ts); - fm_panic("I/O to pool '%s' appears to be " - "hung on vdev guid %llu at '%s'.", - spa_name(spa), - (long long unsigned int) vd->vdev_guid, - vd->vdev_path); - } - } - mutex_exit(&vq->vq_lock); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c deleted file mode 100644 index 69421bb61897..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ /dev/null @@ -1,434 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 128 back-to-back 512-byte - * reads into a single 64k read followed by 127 cache hits; this reduces - * latency dramatically. In the worst case, it can turn an isolated 512-byte - * read into a 64k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A futher enhancement could take advantage of - * more semantic information about the I/O. And it could use something - * faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. 
- * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<ve_offset, ve2->ve_offset)); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused); - if (likely(cmp)) - return (cmp); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. - */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - ASSERT3P(ve->ve_abd, !=, NULL); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - abd_free(ve->ve_abd); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. 
- */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). - */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) - return (NULL); - ASSERT3U(ve->ve_hits, !=, 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = ddi_get_lbolt(); - ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - - if (ve->ve_lastused != ddi_get_lbolt()) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = ddi_get_lbolt(); - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. - */ -static void -vdev_cache_fill(zio_t *fio) -{ - vdev_t *vd = fio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = fio->io_private; - zio_t *pio; - - ASSERT3U(fio->io_size, ==, VCBS); - - /* - * Add data to the cache. 
- */ - mutex_enter(&vc->vc_lock); - - ASSERT3P(ve->ve_fill_io, ==, fio); - ASSERT3U(ve->ve_offset, ==, fio->io_offset); - ASSERT3P(ve->ve_abd, ==, fio->io_abd); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(fio, &zl)) != NULL) - vdev_cache_hit(vc, ve, pio); - - if (fio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); -} - -/* - * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. - */ -boolean_t -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - zio_t *fio; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (B_FALSE); - - if (zio->io_size > zfs_vdev_cache_max) - return (B_FALSE); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. 
- */ - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (B_FALSE); - - ASSERT3U(cache_phase + zio->io_size, <=, VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_delegations); - return (B_TRUE); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_hits); - return (B_TRUE); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - VDCSTAT_BUMP(vdc_stat_misses); - - return (B_TRUE); -} - -/* - * Update cache contents upon write completion. 
- */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - abd_copy_off(ve->ve_abd, zio->io_abd, - start - ve->ve_offset, start - io_start, - end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_purge(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - vdev_cache_purge(vd); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} - -void -vdev_cache_stat_init(void) -{ - vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", 
"misc", - KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (vdc_ksp != NULL) { - vdc_ksp->ks_data = &vdc_stats; - kstat_install(vdc_ksp); - } -} - -void -vdev_cache_stat_fini(void) -{ - if (vdc_ksp != NULL) { - kstat_delete(vdc_ksp); - vdc_ksp = NULL; - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c deleted file mode 100644 index 2fe7b35f4fa0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ /dev/null @@ -1,971 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 Joyent, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for disks. 
- */ - -extern ldi_ident_t zfs_li; - -static void vdev_disk_close(vdev_t *); - -typedef struct vdev_disk_ldi_cb { - list_node_t lcb_next; - ldi_callback_id_t lcb_id; -} vdev_disk_ldi_cb_t; - -/* - * Bypass the devid when opening a disk vdev. - * There have been issues where the devids of several devices were shuffled, - * causing pool open failures. Note, that this flag is intended to be used - * for pool recovery only. - * - * Note that if a pool is imported with the devids bypassed, all its vdevs will - * cease storing devid information permanently. In practice, the devid is rarely - * useful as vdev paths do not tend to change unless the hardware is - * reconfigured. That said, if the paths do change and a pool fails to open - * automatically at boot, a simple zpool import should re-scan the paths and fix - * the issue. - */ -boolean_t vdev_disk_bypass_devid = B_FALSE; - -static void -vdev_disk_alloc(vdev_t *vd) -{ - vdev_disk_t *dvd; - - dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); - /* - * Create the LDI event callback list. - */ - list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), - offsetof(vdev_disk_ldi_cb_t, lcb_next)); -} - -static void -vdev_disk_free(vdev_t *vd) -{ - vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_ldi_cb_t *lcb; - - if (dvd == NULL) - return; - - /* - * We have already closed the LDI handle. Clean up the LDI event - * callbacks and free vd->vdev_tsd. - */ - while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { - list_remove(&dvd->vd_ldi_cbs, lcb); - (void) ldi_ev_remove_callbacks(lcb->lcb_id); - kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); - } - list_destroy(&dvd->vd_ldi_cbs); - kmem_free(dvd, sizeof (vdev_disk_t)); - vd->vdev_tsd = NULL; -} - -/* ARGSUSED */ -static int -vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, - void *ev_data) -{ - vdev_t *vd = (vdev_t *)arg; - vdev_disk_t *dvd = vd->vdev_tsd; - - /* - * Ignore events other than offline. 
- */ - if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) - return (LDI_EV_SUCCESS); - - /* - * All LDI handles must be closed for the state change to succeed, so - * call on vdev_disk_close() to do this. - * - * We inform vdev_disk_close that it is being called from offline - * notify context so it will defer cleanup of LDI event callbacks and - * freeing of vd->vdev_tsd to the offline finalize or a reopen. - */ - dvd->vd_ldi_offline = B_TRUE; - vdev_disk_close(vd); - - /* - * Now that the device is closed, request that the spa_async_thread - * mark the device as REMOVED and notify FMA of the removal. - */ - zfs_post_remove(vd->vdev_spa, vd); - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); - - return (LDI_EV_SUCCESS); -} - -/* ARGSUSED */ -static void -vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, - int ldi_result, void *arg, void *ev_data) -{ - vdev_t *vd = (vdev_t *)arg; - - /* - * Ignore events other than offline. - */ - if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) - return; - - /* - * We have already closed the LDI handle in notify. - * Clean up the LDI event callbacks and free vd->vdev_tsd. - */ - vdev_disk_free(vd); - - /* - * Request that the vdev be reopened if the offline state change was - * unsuccessful. - */ - if (ldi_result != LDI_EV_SUCCESS) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); - } -} - -static ldi_ev_callback_t vdev_disk_off_callb = { - .cb_vers = LDI_EV_CB_VERS, - .cb_notify = vdev_disk_off_notify, - .cb_finalize = vdev_disk_off_finalize -}; - -/* ARGSUSED */ -static void -vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, - int ldi_result, void *arg, void *ev_data) -{ - vdev_t *vd = (vdev_t *)arg; - - /* - * Ignore events other than degrade. - */ - if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) - return; - - /* - * Degrade events always succeed. Mark the vdev as degraded. 
- * This status is purely informative for the user. - */ - (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); -} - -static ldi_ev_callback_t vdev_disk_dgrd_callb = { - .cb_vers = LDI_EV_CB_VERS, - .cb_notify = NULL, - .cb_finalize = vdev_disk_dgrd_finalize -}; - -static void -vdev_disk_hold(vdev_t *vd) -{ - ddi_devid_t devid; - char *minor; - - ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') - return; - - /* - * Only prefetch path and devid info if the device has - * never been opened. - */ - if (vd->vdev_tsd != NULL) - return; - - if (vd->vdev_wholedisk == -1ULL) { - size_t len = strlen(vd->vdev_path) + 3; - char *buf = kmem_alloc(len, KM_SLEEP); - - (void) snprintf(buf, len, "%ss0", vd->vdev_path); - - (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); - kmem_free(buf, len); - } - - if (vd->vdev_name_vp == NULL) - (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); - - if (vd->vdev_devid != NULL && - ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { - (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); - ddi_devid_str_free(minor); - ddi_devid_free(devid); - } -} - -static void -vdev_disk_rele(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); - - if (vd->vdev_name_vp) { - VN_RELE_ASYNC(vd->vdev_name_vp, - dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); - vd->vdev_name_vp = NULL; - } - if (vd->vdev_devid_vp) { - VN_RELE_ASYNC(vd->vdev_devid_vp, - dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); - vd->vdev_devid_vp = NULL; - } -} - -/* - * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when - * even a fallback to DKIOCGMEDIAINFO fails. - */ -#ifdef DEBUG -#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) -#else -#define VDEV_DEBUG(...) /* Nothing... 
*/ -#endif - -static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) -{ - spa_t *spa = vd->vdev_spa; - vdev_disk_t *dvd = vd->vdev_tsd; - ldi_ev_cookie_t ecookie; - vdev_disk_ldi_cb_t *lcb; - union { - struct dk_minfo_ext ude; - struct dk_minfo ud; - } dks; - struct dk_minfo_ext *dkmext = &dks.ude; - struct dk_minfo *dkm = &dks.ud; - int error; - dev_t dev; - int otyp; - boolean_t validate_devid = B_FALSE; - ddi_devid_t devid; - uint64_t capacity = 0, blksz = 0, pbsize; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. - */ - if (dvd != NULL) { - if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { - /* - * If we are opening a device in its offline notify - * context, the LDI handle was just closed. Clean - * up the LDI event callbacks and free vd->vdev_tsd. - */ - vdev_disk_free(vd); - } else { - ASSERT(vd->vdev_reopening); - goto skip_open; - } - } - - /* - * Create vd->vdev_tsd. - */ - vdev_disk_alloc(vd); - dvd = vd->vdev_tsd; - - /* - * Allow bypassing the devid. - */ - if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) { - vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed", - vd->vdev_devid); - spa_strfree(vd->vdev_devid); - vd->vdev_devid = NULL; - } - - /* - * When opening a disk device, we want to preserve the user's original - * intent. We always want to open the device by the path the user gave - * us, even if it is one of multiple paths to the save device. But we - * also want to be able to survive disks being removed/recabled. - * Therefore the sequence of opening devices is: - * - * 1. Try opening the device by path. For legacy pools without the - * 'whole_disk' property, attempt to fix the path by appending 's0'. - * - * 2. 
If the devid of the device matches the stored value, return - * success. - * - * 3. Otherwise, the device may have moved. Try opening the device - * by the devid instead. - */ - if (vd->vdev_devid != NULL) { - if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, - &dvd->vd_minor) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - vdev_dbgmsg(vd, "vdev_disk_open: invalid " - "vdev_devid '%s'", vd->vdev_devid); - return (SET_ERROR(EINVAL)); - } - } - - error = EINVAL; /* presume failure */ - - if (vd->vdev_path != NULL) { - - if (vd->vdev_wholedisk == -1ULL) { - size_t len = strlen(vd->vdev_path) + 3; - char *buf = kmem_alloc(len, KM_SLEEP); - - (void) snprintf(buf, len, "%ss0", vd->vdev_path); - - error = ldi_open_by_name(buf, spa_mode(spa), kcred, - &dvd->vd_lh, zfs_li); - if (error == 0) { - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - vd->vdev_wholedisk = 1ULL; - } else { - kmem_free(buf, len); - } - } - - /* - * If we have not yet opened the device, try to open it by the - * specified path. - */ - if (error != 0) { - error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), - kcred, &dvd->vd_lh, zfs_li); - } - - /* - * Compare the devid to the stored value. - */ - if (error == 0 && vd->vdev_devid != NULL && - ldi_get_devid(dvd->vd_lh, &devid) == 0) { - if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { - /* - * A mismatch here is unexpected, log it. - */ - char *devid_str = ddi_devid_str_encode(devid, - dvd->vd_minor); - vdev_dbgmsg(vd, "vdev_disk_open: devid " - "mismatch: %s != %s", vd->vdev_devid, - devid_str); - cmn_err(CE_NOTE, "vdev_disk_open %s: devid " - "mismatch: %s != %s", vd->vdev_path, - vd->vdev_devid, devid_str); - ddi_devid_str_free(devid_str); - - error = SET_ERROR(EINVAL); - (void) ldi_close(dvd->vd_lh, spa_mode(spa), - kcred); - dvd->vd_lh = NULL; - } - ddi_devid_free(devid); - } - - /* - * If we succeeded in opening the device, but 'vdev_wholedisk' - * is not yet set, then this must be a slice. 
- */ - if (error == 0 && vd->vdev_wholedisk == -1ULL) - vd->vdev_wholedisk = 0; - } - - /* - * If we were unable to open by path, or the devid check fails, open by - * devid instead. - */ - if (error != 0 && vd->vdev_devid != NULL) { - error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, - spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); - if (error != 0) { - vdev_dbgmsg(vd, "Failed to open by devid (%s)", - vd->vdev_devid); - } - } - - /* - * If all else fails, then try opening by physical path (if available) - * or the logical path (if we failed due to the devid check). While not - * as reliable as the devid, this will give us something, and the higher - * level vdev validation will prevent us from opening the wrong device. - */ - if (error) { - if (vd->vdev_devid != NULL) - validate_devid = B_TRUE; - - if (vd->vdev_physpath != NULL && - (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) - error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), - kcred, &dvd->vd_lh, zfs_li); - - /* - * Note that we don't support the legacy auto-wholedisk support - * as above. This hasn't been used in a very long time and we - * don't need to propagate its oddities to this edge condition. - */ - if (error && vd->vdev_path != NULL) - error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), - kcred, &dvd->vd_lh, zfs_li); - } - - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", - error); - return (error); - } - - /* - * Now that the device has been successfully opened, update the devid - * if necessary. 
- */ - if (validate_devid && spa_writeable(spa) && - ldi_get_devid(dvd->vd_lh, &devid) == 0) { - if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { - char *vd_devid; - - vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); - vdev_dbgmsg(vd, "vdev_disk_open: update devid from " - "'%s' to '%s'", vd->vdev_devid, vd_devid); - cmn_err(CE_NOTE, "vdev_disk_open %s: update devid " - "from '%s' to '%s'", vd->vdev_path != NULL ? - vd->vdev_path : "?", vd->vdev_devid, vd_devid); - spa_strfree(vd->vdev_devid); - vd->vdev_devid = spa_strdup(vd_devid); - ddi_devid_str_free(vd_devid); - } - ddi_devid_free(devid); - } - - /* - * Once a device is opened, verify that the physical device path (if - * available) is up to date. - */ - if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && - ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { - char *physpath, *minorname; - - physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); - minorname = NULL; - if (ddi_dev_pathname(dev, otyp, physpath) == 0 && - ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && - (vd->vdev_physpath == NULL || - strcmp(vd->vdev_physpath, physpath) != 0)) { - if (vd->vdev_physpath) - spa_strfree(vd->vdev_physpath); - (void) strlcat(physpath, ":", MAXPATHLEN); - (void) strlcat(physpath, minorname, MAXPATHLEN); - vd->vdev_physpath = spa_strdup(physpath); - } - if (minorname) - kmem_free(minorname, strlen(minorname) + 1); - kmem_free(physpath, MAXPATHLEN); - } - - /* - * Register callbacks for the LDI offline event. - */ - if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == - LDI_EV_SUCCESS) { - lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); - list_insert_tail(&dvd->vd_ldi_cbs, lcb); - (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, - &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); - } - - /* - * Register callbacks for the LDI degrade event. 
- */ - if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == - LDI_EV_SUCCESS) { - lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); - list_insert_tail(&dvd->vd_ldi_cbs, lcb); - (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, - &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); - } -skip_open: - /* - * Determine the actual size of the device. - */ - if (ldi_get_size(dvd->vd_lh, psize) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_dbgmsg(vd, "vdev_disk_open: failed to get size"); - return (SET_ERROR(EINVAL)); - } - - *max_psize = *psize; - - /* - * Determine the device's minimum transfer size. - * If the ioctl isn't supported, assume DEV_BSIZE. - */ - if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, - (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { - capacity = dkmext->dki_capacity - 1; - blksz = dkmext->dki_lbsize; - pbsize = dkmext->dki_pbsize; - } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, - (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { - VDEV_DEBUG( - "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", - vd->vdev_path); - capacity = dkm->dki_capacity - 1; - blksz = dkm->dki_lbsize; - pbsize = blksz; - } else { - VDEV_DEBUG("vdev_disk_open(\"%s\"): " - "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", - vd->vdev_path, error); - pbsize = DEV_BSIZE; - } - - *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; - - if (vd->vdev_wholedisk == 1) { - int wce = 1; - - if (error == 0) { - /* - * If we have the capability to expand, we'd have - * found out via success from DKIOCGMEDIAINFO{,EXT}. - * Adjust max_psize upward accordingly since we know - * we own the whole disk now. - */ - *max_psize = capacity * blksz; - } - - /* - * Since we own the whole disk, try to enable disk write - * caching. We ignore errors because it's OK if we can't do it. 
- */ - (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, - FKIOCTL, kcred, NULL); - } - - /* - * Clear the nowritecache bit, so that on a vdev_reopen() we will - * try again. - */ - vd->vdev_nowritecache = B_FALSE; - - return (0); -} - -static void -vdev_disk_close(vdev_t *vd) -{ - vdev_disk_t *dvd = vd->vdev_tsd; - - if (vd->vdev_reopening || dvd == NULL) - return; - - if (dvd->vd_minor != NULL) { - ddi_devid_str_free(dvd->vd_minor); - dvd->vd_minor = NULL; - } - - if (dvd->vd_devid != NULL) { - ddi_devid_free(dvd->vd_devid); - dvd->vd_devid = NULL; - } - - if (dvd->vd_lh != NULL) { - (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); - dvd->vd_lh = NULL; - } - - vd->vdev_delayed_close = B_FALSE; - /* - * If we closed the LDI handle due to an offline notify from LDI, - * don't free vd->vdev_tsd or unregister the callbacks here; - * the offline finalize callback or a reopen will take care of it. - */ - if (dvd->vd_ldi_offline) - return; - - vdev_disk_free(vd); -} - -int -vdev_disk_physio(vdev_t *vd, caddr_t data, - size_t size, uint64_t offset, int flags, boolean_t isdump) -{ - vdev_disk_t *dvd = vd->vdev_tsd; - - /* - * If the vdev is closed, it's likely in the REMOVED or FAULTED state. - * Nothing to be done here but return failure. - */ - if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) - return (EIO); - - ASSERT(vd->vdev_ops == &vdev_disk_ops); - - /* - * If in the context of an active crash dump, use the ldi_dump(9F) - * call instead of ldi_strategy(9F) as usual. 
- */ - if (isdump) { - ASSERT3P(dvd, !=, NULL); - return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), - lbtodb(size))); - } - - return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); -} - -int -vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, - size_t size, uint64_t offset, int flags) -{ - buf_t *bp; - int error = 0; - - if (vd_lh == NULL) - return (SET_ERROR(EINVAL)); - - ASSERT(flags & B_READ || flags & B_WRITE); - - bp = getrbuf(KM_SLEEP); - bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; - bp->b_bcount = size; - bp->b_un.b_addr = (void *)data; - bp->b_lblkno = lbtodb(offset); - bp->b_bufsize = size; - - error = ldi_strategy(vd_lh, bp); - ASSERT(error == 0); - if ((error = biowait(bp)) == 0 && bp->b_resid != 0) - error = SET_ERROR(EIO); - freerbuf(bp); - - return (error); -} - -static void -vdev_disk_io_intr(buf_t *bp) -{ - vdev_buf_t *vb = (vdev_buf_t *)bp; - zio_t *zio = vb->vb_io; - - /* - * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. - * Rather than teach the rest of the stack about other error - * possibilities (EFAULT, etc), we normalize the error value here. - */ - zio->io_error = (geterror(bp) != 0 ? 
SET_ERROR(EIO) : 0); - - if (zio->io_error == 0 && bp->b_resid != 0) - zio->io_error = SET_ERROR(EIO); - - if (zio->io_type == ZIO_TYPE_READ) { - abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); - } else { - abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); - } - - kmem_free(vb, sizeof (vdev_buf_t)); - - zio_delay_interrupt(zio); -} - -static void -vdev_disk_ioctl_free(zio_t *zio) -{ - kmem_free(zio->io_vsd, sizeof (struct dk_callback)); -} - -static const zio_vsd_ops_t vdev_disk_vsd_ops = { - vdev_disk_ioctl_free, - zio_vsd_default_cksum_report -}; - -static void -vdev_disk_ioctl_done(void *zio_arg, int error) -{ - zio_t *zio = zio_arg; - - zio->io_error = error; - - zio_interrupt(zio); -} - -static void -vdev_disk_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_disk_t *dvd = vd->vdev_tsd; - vdev_buf_t *vb; - struct dk_callback *dkc; - buf_t *bp; - int error; - - /* - * If the vdev is closed, it's likely in the REMOVED or FAULTED state. - * Nothing to be done here but return failure. - */ - if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_IOCTL) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - if (vd->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - - zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); - zio->io_vsd_ops = &vdev_disk_vsd_ops; - - dkc->dkc_callback = vdev_disk_ioctl_done; - dkc->dkc_flag = FLUSH_VOLATILE; - dkc->dkc_cookie = zio; - - error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, - (uintptr_t)dkc, FKIOCTL, kcred, NULL); - - if (error == 0) { - /* - * The ioctl will be done asychronously, - * and will call vdev_disk_ioctl_done() - * upon completion. 
- */ - return; - } - - zio->io_error = error; - - break; - - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - - zio_execute(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - zio->io_target_timestamp = zio_handle_io_delay(zio); - - vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - - vb->vb_io = zio; - bp = &vb->vb_buf; - - bioinit(bp); - bp->b_flags = B_BUSY | B_NOCACHE | - (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); - if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) - bp->b_flags |= B_FAILFAST; - bp->b_bcount = zio->io_size; - - if (zio->io_type == ZIO_TYPE_READ) { - bp->b_un.b_addr = - abd_borrow_buf(zio->io_abd, zio->io_size); - } else { - bp->b_un.b_addr = - abd_borrow_buf_copy(zio->io_abd, zio->io_size); - } - - bp->b_lblkno = lbtodb(zio->io_offset); - bp->b_bufsize = zio->io_size; - bp->b_iodone = (int (*)())vdev_disk_io_intr; - - /* ldi_strategy() will return non-zero only on programming errors */ - VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); -} - -static void -vdev_disk_io_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - - /* - * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if - * the device has been removed. If this is the case, then we trigger an - * asynchronous removal of the device. Otherwise, probe the device and - * make sure it's still accessible. - */ - if (zio->io_error == EIO && !vd->vdev_remove_wanted) { - vdev_disk_t *dvd = vd->vdev_tsd; - int state = DKIO_NONE; - - if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, - FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { - /* - * We post the resource as soon as possible, instead of - * when the async removal actually happens, because the - * DE is using this information to discard previous I/O - * errors. 
- */ - zfs_post_remove(zio->io_spa, vd); - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); - } else if (!vd->vdev_delayed_close) { - vd->vdev_delayed_close = B_TRUE; - } - } -} - -vdev_ops_t vdev_disk_ops = { - vdev_disk_open, - vdev_disk_close, - vdev_default_asize, - vdev_disk_io_start, - vdev_disk_io_done, - NULL, - NULL, - vdev_disk_hold, - vdev_disk_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -/* - * Given the root disk device devid or pathname, read the label from - * the device, and construct a configuration nvlist. - */ -int -vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) -{ - ldi_handle_t vd_lh; - vdev_label_t *label; - uint64_t s, size; - int l; - ddi_devid_t tmpdevid; - int error = -1; - char *minor_name; - - /* - * Read the device label and build the nvlist. - */ - if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, - &minor_name) == 0) { - error = ldi_open_by_devid(tmpdevid, minor_name, - FREAD, kcred, &vd_lh, zfs_li); - ddi_devid_free(tmpdevid); - ddi_devid_str_free(minor_name); - } - - if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, - zfs_li))) - return (error); - - if (ldi_get_size(vd_lh, &s)) { - (void) ldi_close(vd_lh, FREAD, kcred); - return (SET_ERROR(EIO)); - } - - size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); - label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); - - *config = NULL; - for (l = 0; l < VDEV_LABELS; l++) { - uint64_t offset, state, txg = 0; - - /* read vdev label */ - offset = vdev_label_offset(size, l, 0); - if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, - VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) - continue; - - if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, - sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { - *config = NULL; - continue; - } - - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state >= 
POOL_STATE_DESTROYED) { - nvlist_free(*config); - *config = NULL; - continue; - } - - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0) { - nvlist_free(*config); - *config = NULL; - continue; - } - - break; - } - - kmem_free(label, sizeof (vdev_label_t)); - (void) ldi_close(vd_lh, FREAD, kcred); - if (*config == NULL) - error = SET_ERROR(EIDRM); - - return (error); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c deleted file mode 100644 index aa80e028f7df..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ /dev/null @@ -1,307 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for files. 
- */ - -static taskq_t *vdev_file_taskq; - -void -vdev_file_init(void) -{ - vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), - minclsyspri, max_ncpus, INT_MAX, 0); -} - -void -vdev_file_fini(void) -{ - taskq_destroy(vdev_file_taskq); -} - -static void -vdev_file_hold(vdev_t *vd) -{ - ASSERT(vd->vdev_path != NULL); -} - -static void -vdev_file_rele(vdev_t *vd) -{ - ASSERT(vd->vdev_path != NULL); -} - -static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - vdev_file_t *vf; - vnode_t *vp; - vattr_t vattr; - int error; - - /* Rotational optimizations only make sense on block devices */ - vd->vdev_nonrot = B_TRUE; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. - */ - if (vd->vdev_tsd != NULL) { - ASSERT(vd->vdev_reopening); - vf = vd->vdev_tsd; - vp = vf->vf_vnode; - goto skip_open; - } - - vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); - - /* - * We always open the files from the root of the global zone, even if - * we're in a local zone. If the user has gotten to this point, the - * administrator has already decided that the pool should be available - * to local zone users, so the underlying devices should be as well. - */ - ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); - - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; - return (error); - } - - vf->vf_vnode = vp; - -#ifdef _KERNEL - /* - * Make sure it's a regular file. 
- */ - if (vp->v_type != VREG) { -#ifdef __FreeBSD__ - (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); -#endif - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; -#ifdef __FreeBSD__ - kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -#endif - return (SET_ERROR(ENODEV)); - } -#endif /* _KERNEL */ - -skip_open: - /* - * Determine the physical size of the file. - */ - vattr.va_mask = AT_SIZE; - vn_lock(vp, LK_SHARED | LK_RETRY); - error = VOP_GETATTR(vp, &vattr, kcred); - VOP_UNLOCK(vp); - if (error) { - (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; - return (error); - } - - vd->vdev_notrim = B_TRUE; - - *max_psize = *psize = vattr.va_size; - *logical_ashift = SPA_MINBLOCKSHIFT; - *physical_ashift = SPA_MINBLOCKSHIFT; - - return (0); -} - -static void -vdev_file_close(vdev_t *vd) -{ - vdev_file_t *vf = vd->vdev_tsd; - - if (vd->vdev_reopening || vf == NULL) - return; - - if (vf->vf_vnode != NULL) { - (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, - kcred, NULL); - } - - vd->vdev_delayed_close = B_FALSE; - kmem_free(vf, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -} - -/* - * Implements the interrupt side for file vdev types. This routine will be - * called when the I/O completes allowing us to transfer the I/O to the - * interrupt taskqs. For consistency, the code structure mimics disk vdev - * types. 
- */ -static void -vdev_file_io_intr(zio_t *zio) -{ - zio_delay_interrupt(zio); -} - -static void -vdev_file_io_strategy(void *arg) -{ - zio_t *zio = arg; - vdev_t *vd = zio->io_vd; - vdev_file_t *vf; - vnode_t *vp; - void *addr; - ssize_t resid; - - vf = vd->vdev_tsd; - vp = vf->vf_vnode; - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - if (zio->io_type == ZIO_TYPE_READ) { - addr = abd_borrow_buf(zio->io_abd, zio->io_size); - } else { - addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - } - - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vp, addr, zio->io_size, - zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - if (zio->io_type == ZIO_TYPE_READ) { - abd_return_buf_copy(zio->io_abd, addr, zio->io_size); - } else { - abd_return_buf(zio->io_abd, addr, zio->io_size); - } - - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; - - vdev_file_io_intr(zio); -} - -static void -vdev_file_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - - if (zio->io_type == ZIO_TYPE_IOCTL) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, - kcred, NULL); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - - zio_execute(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - zio->io_target_timestamp = zio_handle_io_delay(zio); - - VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, - TQ_SLEEP), !=, 0); -} - -/* ARGSUSED */ -static void -vdev_file_io_done(zio_t *zio) -{ -} - -vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - 
vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -/* - * From userland we access disks just like files. - */ -#ifndef _KERNEL - -vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c deleted file mode 100644 index 5ff895ce472c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ /dev/null @@ -1,1193 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2006 Pawel Jakub Dawidek - * All rights reserved. - * - * Portions Copyright (c) 2012 Martin Matuska - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for GEOM. 
- */ - -static g_attrchanged_t vdev_geom_attrchanged; -struct g_class zfs_vdev_class = { - .name = "ZFS::VDEV", - .version = G_VERSION, - .attrchanged = vdev_geom_attrchanged, -}; - -struct consumer_vdev_elem { - SLIST_ENTRY(consumer_vdev_elem) elems; - vdev_t *vd; -}; - -SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); -_Static_assert(sizeof(((struct g_consumer*)NULL)->private) - == sizeof(struct consumer_priv_t*), - "consumer_priv_t* can't be stored in g_consumer.private"); - -DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); - -SYSCTL_DECL(_vfs_zfs_vdev); -/* Don't send BIO_FLUSH. */ -static int vdev_geom_bio_flush_disable; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, - &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); -/* Don't send BIO_DELETE. */ -static int vdev_geom_bio_delete_disable; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, - &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); - -/* Declare local functions */ -static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); - -/* - * Thread local storage used to indicate when a thread is probing geoms - * for their guids. If NULL, this thread is not tasting geoms. If non NULL, - * it is looking for a replacement for the vdev_t* that is its value. 
- */ -uint_t zfs_geom_probe_vdev_key; - -static void -vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) -{ - int error; - uint16_t rate; - - error = g_getattr("GEOM::rotation_rate", cp, &rate); - if (error == 0 && rate == 1) - vd->vdev_nonrot = B_TRUE; - else - vd->vdev_nonrot = B_FALSE; -} - -static void -vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, - boolean_t do_null_update) -{ - boolean_t needs_update = B_FALSE; - char *physpath; - int error, physpath_len; - - physpath_len = MAXPATHLEN; - physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); - error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); - if (error == 0) { - char *old_physpath; - - /* g_topology lock ensures that vdev has not been closed */ - g_topology_assert(); - old_physpath = vd->vdev_physpath; - vd->vdev_physpath = spa_strdup(physpath); - - if (old_physpath != NULL) { - needs_update = (strcmp(old_physpath, - vd->vdev_physpath) != 0); - spa_strfree(old_physpath); - } else - needs_update = do_null_update; - } - g_free(physpath); - - /* - * If the physical path changed, update the config. - * Only request an update for previously unset physpaths if - * requested by the caller. 
- */ - if (needs_update) - spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); - -} - -static void -vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) -{ - char *old_physpath; - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - int error; - - priv = (struct consumer_priv_t*)&cp->private; - if (SLIST_EMPTY(priv)) - return; - - SLIST_FOREACH(elem, priv, elems) { - vdev_t *vd = elem->vd; - if (strcmp(attr, "GEOM::rotation_rate") == 0) { - vdev_geom_set_rotation_rate(vd, cp); - return; - } - if (strcmp(attr, "GEOM::physpath") == 0) { - vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE); - return; - } - } -} - -static void -vdev_geom_resize(struct g_consumer *cp) -{ - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - spa_t *spa; - vdev_t *vd; - - priv = (struct consumer_priv_t *)&cp->private; - if (SLIST_EMPTY(priv)) - return; - - SLIST_FOREACH(elem, priv, elems) { - vd = elem->vd; - if (vd->vdev_state != VDEV_STATE_HEALTHY) - continue; - spa = vd->vdev_spa; - if (!spa->spa_autoexpand) - continue; - vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); - } -} - -static void -vdev_geom_orphan(struct g_consumer *cp) -{ - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - - g_topology_assert(); - - priv = (struct consumer_priv_t*)&cp->private; - if (SLIST_EMPTY(priv)) - /* Vdev close in progress. Ignore the event. */ - return; - - /* - * Orphan callbacks occur from the GEOM event thread. - * Concurrent with this call, new I/O requests may be - * working their way through GEOM about to find out - * (only once executed by the g_down thread) that we've - * been orphaned from our disk provider. These I/Os - * must be retired before we can detach our consumer. - * This is most easily achieved by acquiring the - * SPA ZIO configuration lock as a writer, but doing - * so with the GEOM topology lock held would cause - * a lock order reversal. 
Instead, rely on the SPA's - * async removal support to invoke a close on this - * vdev once it is safe to do so. - */ - SLIST_FOREACH(elem, priv, elems) { - vdev_t *vd = elem->vd; - - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); - } -} - -static struct g_consumer * -vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) -{ - struct g_geom *gp; - struct g_consumer *cp; - int error; - - g_topology_assert(); - - ZFS_LOG(1, "Attaching to %s.", pp->name); - - if (sanity) { - if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { - ZFS_LOG(1, "Failing attach of %s. " - "Incompatible sectorsize %d\n", - pp->name, pp->sectorsize); - return (NULL); - } else if (pp->mediasize < SPA_MINDEVSIZE) { - ZFS_LOG(1, "Failing attach of %s. " - "Incompatible mediasize %ju\n", - pp->name, pp->mediasize); - return (NULL); - } - } - - /* Do we have geom already? No? Create one. */ - LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - if (strcmp(gp->name, "zfs::vdev") != 0) - continue; - break; - } - if (gp == NULL) { - gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); - gp->orphan = vdev_geom_orphan; - gp->attrchanged = vdev_geom_attrchanged; - gp->resize = vdev_geom_resize; - cp = g_new_consumer(gp); - error = g_attach(cp, pp); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, - __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - error = g_access(cp, 1, 0, 1); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, - __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); - } else { - /* Check if we are already connected to this provider. 
*/ - LIST_FOREACH(cp, &gp->consumer, consumer) { - if (cp->provider == pp) { - ZFS_LOG(1, "Found consumer for %s.", pp->name); - break; - } - } - if (cp == NULL) { - cp = g_new_consumer(gp); - error = g_attach(cp, pp); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", - __func__, __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - error = g_access(cp, 1, 0, 1); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_access failed: %d\n", - __func__, __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - ZFS_LOG(1, "Created consumer for %s.", pp->name); - } else { - error = g_access(cp, 1, 0, 1); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_access failed: %d\n", - __func__, __LINE__, error); - return (NULL); - } - ZFS_LOG(1, "Used existing consumer for %s.", pp->name); - } - } - - if (vd != NULL) - vd->vdev_tsd = cp; - - cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; - return (cp); -} - -static void -vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) -{ - struct g_geom *gp; - - g_topology_assert(); - - ZFS_LOG(1, "Detaching from %s.", - cp->provider && cp->provider->name ? cp->provider->name : "NULL"); - - gp = cp->geom; - if (open_for_read) - g_access(cp, -1, 0, -1); - /* Destroy consumer on last close. */ - if (cp->acr == 0 && cp->ace == 0) { - if (cp->acw > 0) - g_access(cp, 0, -cp->acw, 0); - if (cp->provider != NULL) { - ZFS_LOG(1, "Destroying consumer for %s.", - cp->provider->name ? cp->provider->name : "NULL"); - g_detach(cp); - } - g_destroy_consumer(cp); - } - /* Destroy geom if there are no consumers left. 
*/ - if (LIST_EMPTY(&gp->consumer)) { - ZFS_LOG(1, "Destroyed geom %s.", gp->name); - g_wither_geom(gp, ENXIO); - } -} - -static void -vdev_geom_close_locked(vdev_t *vd) -{ - struct g_consumer *cp; - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem, *elem_temp; - - g_topology_assert(); - - cp = vd->vdev_tsd; - vd->vdev_delayed_close = B_FALSE; - if (cp == NULL) - return; - - ZFS_LOG(1, "Closing access to %s.", cp->provider->name); - KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); - priv = (struct consumer_priv_t*)&cp->private; - vd->vdev_tsd = NULL; - SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { - if (elem->vd == vd) { - SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); - g_free(elem); - } - } - - vdev_geom_detach(cp, B_TRUE); -} - -/* - * Issue one or more bios to the vdev in parallel - * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO - * operation is described by parallel entries from each array. There may be - * more bios actually issued than entries in the array - */ -static void -vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, - off_t *sizes, int *errors, int ncmds) -{ - struct bio **bios; - u_char *p; - off_t off, maxio, s, end; - int i, n_bios, j; - size_t bios_size; - - maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); - n_bios = 0; - - /* How many bios are required for all commands ? 
*/ - for (i = 0; i < ncmds; i++) - n_bios += (sizes[i] + maxio - 1) / maxio; - - /* Allocate memory for the bios */ - bios_size = n_bios * sizeof(struct bio*); - bios = kmem_zalloc(bios_size, KM_SLEEP); - - /* Prepare and issue all of the bios */ - for (i = j = 0; i < ncmds; i++) { - off = offsets[i]; - p = datas[i]; - s = sizes[i]; - end = off + s; - ASSERT((off % cp->provider->sectorsize) == 0); - ASSERT((s % cp->provider->sectorsize) == 0); - - for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { - bios[j] = g_alloc_bio(); - bios[j]->bio_cmd = cmds[i]; - bios[j]->bio_done = NULL; - bios[j]->bio_offset = off; - bios[j]->bio_length = MIN(s, maxio); - bios[j]->bio_data = p; - g_io_request(bios[j], cp); - } - } - ASSERT(j == n_bios); - - /* Wait for all of the bios to complete, and clean them up */ - for (i = j = 0; i < ncmds; i++) { - off = offsets[i]; - s = sizes[i]; - end = off + s; - - for (; off < end; off += maxio, s -= maxio, j++) { - errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; - g_destroy_bio(bios[j]); - } - } - kmem_free(bios, bios_size); -} - -/* - * Read the vdev config from a device. Return the number of valid labels that - * were found. The vdev config will be returned in config if and only if at - * least one valid label was found. 
- */ -static int -vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) -{ - struct g_provider *pp; - nvlist_t *config; - vdev_phys_t *vdev_lists[VDEV_LABELS]; - char *buf; - size_t buflen; - uint64_t psize, state, txg; - off_t offsets[VDEV_LABELS]; - off_t size; - off_t sizes[VDEV_LABELS]; - int cmds[VDEV_LABELS]; - int errors[VDEV_LABELS]; - int l, nlabels; - - g_topology_assert_not(); - - pp = cp->provider; - ZFS_LOG(1, "Reading config from %s...", pp->name); - - psize = pp->mediasize; - psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); - - size = sizeof(*vdev_lists[0]) + pp->sectorsize - - ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1; - - buflen = sizeof(vdev_lists[0]->vp_nvlist); - - /* Create all of the IO requests */ - for (l = 0; l < VDEV_LABELS; l++) { - cmds[l] = BIO_READ; - vdev_lists[l] = kmem_alloc(size, KM_SLEEP); - offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; - sizes[l] = size; - errors[l] = 0; - ASSERT(offsets[l] % pp->sectorsize == 0); - } - - /* Issue the IO requests */ - vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, - VDEV_LABELS); - - /* Parse the labels */ - config = *configp = NULL; - nlabels = 0; - for (l = 0; l < VDEV_LABELS; l++) { - if (errors[l] != 0) - continue; - - buf = vdev_lists[l]->vp_nvlist; - - if (nvlist_unpack(buf, buflen, &config, 0) != 0) - continue; - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state > POOL_STATE_L2CACHE) { - nvlist_free(config); - continue; - } - - if (state != POOL_STATE_SPARE && - state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0)) { - nvlist_free(config); - continue; - } - - if (*configp != NULL) - nvlist_free(*configp); - *configp = config; - - nlabels++; - } - - /* Free the label storage */ - for (l = 0; l < VDEV_LABELS; l++) - kmem_free(vdev_lists[l], size); - - return (nlabels); -} - -static void -resize_configs(nvlist_t ***configs, uint64_t 
*count, uint64_t id) -{ - nvlist_t **new_configs; - uint64_t i; - - if (id < *count) - return; - new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), - KM_SLEEP); - for (i = 0; i < *count; i++) - new_configs[i] = (*configs)[i]; - if (*configs != NULL) - kmem_free(*configs, *count * sizeof(void *)); - *configs = new_configs; - *count = id + 1; -} - -static void -process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, - const char *name, uint64_t* known_pool_guid) -{ - nvlist_t *vdev_tree; - uint64_t pool_guid; - uint64_t vdev_guid, known_guid; - uint64_t id, txg, known_txg; - char *pname; - int i; - - if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || - strcmp(pname, name) != 0) - goto ignore; - - if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) - goto ignore; - - if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) - goto ignore; - - if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) - goto ignore; - - if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) - goto ignore; - - VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); - - if (*known_pool_guid != 0) { - if (pool_guid != *known_pool_guid) - goto ignore; - } else - *known_pool_guid = pool_guid; - - resize_configs(configs, count, id); - - if ((*configs)[id] != NULL) { - VERIFY(nvlist_lookup_uint64((*configs)[id], - ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); - if (txg <= known_txg) - goto ignore; - nvlist_free((*configs)[id]); - } - - (*configs)[id] = cfg; - return; - -ignore: - nvlist_free(cfg); -} - -int -vdev_geom_read_pool_label(const char *name, - nvlist_t ***configs, uint64_t *count) -{ - struct g_class *mp; - struct g_geom *gp; - struct g_provider *pp; - struct g_consumer *zcp; - nvlist_t *vdev_cfg; - uint64_t pool_guid; - int error, nlabels; - - DROP_GIANT(); - g_topology_lock(); - - *configs = NULL; - *count = 0; - pool_guid = 0; - LIST_FOREACH(mp, &g_classes, class) { - if (mp 
== &zfs_vdev_class) - continue; - LIST_FOREACH(gp, &mp->geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - LIST_FOREACH(pp, &gp->provider, provider) { - if (pp->flags & G_PF_WITHER) - continue; - zcp = vdev_geom_attach(pp, NULL, B_TRUE); - if (zcp == NULL) - continue; - g_topology_unlock(); - nlabels = vdev_geom_read_config(zcp, &vdev_cfg); - g_topology_lock(); - vdev_geom_detach(zcp, B_TRUE); - if (nlabels == 0) - continue; - ZFS_LOG(1, "successfully read vdev config"); - - process_vdev_config(configs, count, - vdev_cfg, name, &pool_guid); - } - } - } - g_topology_unlock(); - PICKUP_GIANT(); - - return (*count > 0 ? 0 : ENOENT); -} - -enum match { - NO_MATCH = 0, /* No matching labels found */ - TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid*/ - ZERO_MATCH = 1, /* Should never be returned */ - ONE_MATCH = 2, /* 1 label matching the vdev_guid */ - TWO_MATCH = 3, /* 2 label matching the vdev_guid */ - THREE_MATCH = 4, /* 3 label matching the vdev_guid */ - FULL_MATCH = 5 /* all labels match the vdev_guid */ -}; - -static enum match -vdev_attach_ok(vdev_t *vd, struct g_provider *pp) -{ - nvlist_t *config; - uint64_t pool_guid, top_guid, vdev_guid; - struct g_consumer *cp; - int nlabels; - - cp = vdev_geom_attach(pp, NULL, B_TRUE); - if (cp == NULL) { - ZFS_LOG(1, "Unable to attach tasting instance to %s.", - pp->name); - return (NO_MATCH); - } - g_topology_unlock(); - nlabels = vdev_geom_read_config(cp, &config); - g_topology_lock(); - vdev_geom_detach(cp, B_TRUE); - if (nlabels == 0) { - ZFS_LOG(1, "Unable to read config from %s.", pp->name); - return (NO_MATCH); - } - - pool_guid = 0; - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); - top_guid = 0; - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); - vdev_guid = 0; - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); - nvlist_free(config); - - /* - * Check that the label's pool guid matches the desired guid. 
- * Inactive spares and L2ARCs do not have any pool guid in the label. - */ - if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { - ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", - pp->name, - (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); - return (NO_MATCH); - } - - /* - * Check that the label's vdev guid matches the desired guid. - * The second condition handles possible race on vdev detach, when - * remaining vdev receives GUID of destroyed top level mirror vdev. - */ - if (vdev_guid == vd->vdev_guid) { - ZFS_LOG(1, "guids match for provider %s.", pp->name); - return (ZERO_MATCH + nlabels); - } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { - ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); - return (TOPGUID_MATCH); - } - ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", - pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); - return (NO_MATCH); -} - -static struct g_consumer * -vdev_geom_attach_by_guids(vdev_t *vd) -{ - struct g_class *mp; - struct g_geom *gp; - struct g_provider *pp, *best_pp; - struct g_consumer *cp; - const char *vdpath; - enum match match, best_match; - - g_topology_assert(); - - vdpath = vd->vdev_path + sizeof("/dev/") - 1; - cp = NULL; - best_pp = NULL; - best_match = NO_MATCH; - LIST_FOREACH(mp, &g_classes, class) { - if (mp == &zfs_vdev_class) - continue; - LIST_FOREACH(gp, &mp->geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - LIST_FOREACH(pp, &gp->provider, provider) { - match = vdev_attach_ok(vd, pp); - if (match > best_match) { - best_match = match; - best_pp = pp; - } else if (match == best_match) { - if (strcmp(pp->name, vdpath) == 0) { - best_pp = pp; - } - } - if (match == FULL_MATCH) - goto out; - } - } - } - -out: - if (best_pp) { - cp = vdev_geom_attach(best_pp, vd, B_TRUE); - if (cp == NULL) { - printf("ZFS WARNING: Unable to attach to %s.\n", - best_pp->name); - } - } - return (cp); -} - -static struct g_consumer * 
-vdev_geom_open_by_guids(vdev_t *vd) -{ - struct g_consumer *cp; - char *buf; - size_t len; - - g_topology_assert(); - - ZFS_LOG(1, "Searching by guids [%ju:%ju].", - (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); - cp = vdev_geom_attach_by_guids(vd); - if (cp != NULL) { - len = strlen(cp->provider->name) + strlen("/dev/") + 1; - buf = kmem_alloc(len, KM_SLEEP); - - snprintf(buf, len, "/dev/%s", cp->provider->name); - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - - ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", - (uintmax_t)spa_guid(vd->vdev_spa), - (uintmax_t)vd->vdev_guid, cp->provider->name); - } else { - ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", - (uintmax_t)spa_guid(vd->vdev_spa), - (uintmax_t)vd->vdev_guid); - } - - return (cp); -} - -static struct g_consumer * -vdev_geom_open_by_path(vdev_t *vd, int check_guid) -{ - struct g_provider *pp; - struct g_consumer *cp; - - g_topology_assert(); - - cp = NULL; - pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); - if (pp != NULL) { - ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); - if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) - cp = vdev_geom_attach(pp, vd, B_FALSE); - } - - return (cp); -} - -static int -vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - struct g_provider *pp; - struct g_consumer *cp; - size_t bufsize; - int error; - - /* Set the TLS to indicate downstack that we should not access zvols*/ - VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. 
- */ - if ((cp = vd->vdev_tsd) != NULL) { - ASSERT(vd->vdev_reopening); - goto skip_open; - } - - DROP_GIANT(); - g_topology_lock(); - error = 0; - - if (vd->vdev_spa->spa_splitting_newspa || - (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && - vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || - vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) { - /* - * We are dealing with a vdev that hasn't been previously - * opened (since boot), and we are not loading an - * existing pool configuration. This looks like a - * vdev add operation to a new or existing pool. - * Assume the user knows what he/she is doing and find - * GEOM provider by its name, ignoring GUID mismatches. - * - * XXPOLICY: It would be safer to only allow a device - * that is unlabeled or labeled but missing - * GUID information to be opened in this fashion, - * unless we are doing a split, in which case we - * should allow any guid. - */ - cp = vdev_geom_open_by_path(vd, 0); - } else { - /* - * Try using the recorded path for this device, but only - * accept it if its label data contains the expected GUIDs. - */ - cp = vdev_geom_open_by_path(vd, 1); - if (cp == NULL) { - /* - * The device at vd->vdev_path doesn't have the - * expected GUIDs. The disks might have merely - * moved around so try all other GEOM providers - * to find one with the right GUIDs. 
- */ - cp = vdev_geom_open_by_guids(vd); - } - } - - /* Clear the TLS now that tasting is done */ - VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); - - if (cp == NULL) { - ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); - error = ENOENT; - } else { - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - int spamode; - - priv = (struct consumer_priv_t*)&cp->private; - if (cp->private == NULL) - SLIST_INIT(priv); - elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO); - elem->vd = vd; - SLIST_INSERT_HEAD(priv, elem, elems); - - spamode = spa_mode(vd->vdev_spa); - if (cp->provider->sectorsize > VDEV_PAD_SIZE || - !ISP2(cp->provider->sectorsize)) { - ZFS_LOG(1, "Provider %s has unsupported sectorsize.", - cp->provider->name); - - vdev_geom_close_locked(vd); - error = EINVAL; - cp = NULL; - } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { - int i; - - for (i = 0; i < 5; i++) { - error = g_access(cp, 0, 1, 0); - if (error == 0) - break; - g_topology_unlock(); - tsleep(vd, 0, "vdev", hz / 2); - g_topology_lock(); - } - if (error != 0) { - printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", - cp->provider->name, error); - vdev_geom_close_locked(vd); - cp = NULL; - } - } - } - - /* Fetch initial physical path information for this device. */ - if (cp != NULL) { - vdev_geom_attrchanged(cp, "GEOM::physpath"); - - /* Set other GEOM characteristics */ - vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE); - vdev_geom_set_rotation_rate(vd, cp); - } - - g_topology_unlock(); - PICKUP_GIANT(); - if (cp == NULL) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", - error); - return (error); - } -skip_open: - pp = cp->provider; - - /* - * Determine the actual size of the device. - */ - *max_psize = *psize = pp->mediasize; - - /* - * Determine the device's minimum transfer size and preferred - * transfer size. 
- */ - *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; - *physical_ashift = 0; - if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && - pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0) - *physical_ashift = highbit(pp->stripesize) - 1; - - /* - * Clear the nowritecache settings, so that on a vdev_reopen() - * we will try again. - */ - vd->vdev_nowritecache = B_FALSE; - - return (0); -} - -static void -vdev_geom_close(vdev_t *vd) -{ - struct g_consumer *cp; - int locked; - - cp = vd->vdev_tsd; - - DROP_GIANT(); - locked = g_topology_locked(); - if (!locked) - g_topology_lock(); - - if (!vd->vdev_reopening || - (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || - (cp->provider != NULL && cp->provider->error != 0)))) - vdev_geom_close_locked(vd); - - if (!locked) - g_topology_unlock(); - PICKUP_GIANT(); -} - -static void -vdev_geom_io_intr(struct bio *bp) -{ - vdev_t *vd; - zio_t *zio; - - zio = bp->bio_caller1; - vd = zio->io_vd; - zio->io_error = bp->bio_error; - if (zio->io_error == 0 && bp->bio_resid != 0) - zio->io_error = SET_ERROR(EIO); - - switch(zio->io_error) { - case ENOTSUP: - /* - * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know - * that future attempts will never succeed. In this case - * we set a persistent flag so that we don't bother with - * requests in the future. - */ - switch(bp->bio_cmd) { - case BIO_FLUSH: - vd->vdev_nowritecache = B_TRUE; - break; - case BIO_DELETE: - vd->vdev_notrim = B_TRUE; - break; - } - break; - case ENXIO: - if (!vd->vdev_remove_wanted) { - /* - * If provider's error is set we assume it is being - * removed. 
- */ - if (bp->bio_to->error != 0) { - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(zio->io_spa, - SPA_ASYNC_REMOVE); - } else if (!vd->vdev_delayed_close) { - vd->vdev_delayed_close = B_TRUE; - } - } - break; - } - - /* - * We have to split bio freeing into two parts, because the ABD code - * cannot be called in this context and vdev_op_io_done is not called - * for ZIO_TYPE_IOCTL zio-s. - */ - if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { - g_destroy_bio(bp); - zio->io_bio = NULL; - } - zio_delay_interrupt(zio); -} - -static void -vdev_geom_io_start(zio_t *zio) -{ - vdev_t *vd; - struct g_consumer *cp; - struct bio *bp; - int error; - - vd = zio->io_vd; - - switch (zio->io_type) { - case ZIO_TYPE_IOCTL: - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } else { - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || vdev_geom_bio_flush_disable) - break; - if (vd->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - goto sendreq; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - } - - zio_execute(zio); - return; - case ZIO_TYPE_FREE: - if (vd->vdev_notrim) { - zio->io_error = SET_ERROR(ENOTSUP); - } else if (!vdev_geom_bio_delete_disable) { - goto sendreq; - } - zio_execute(zio); - return; - } -sendreq: - ASSERT(zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || - zio->io_type == ZIO_TYPE_FREE || - zio->io_type == ZIO_TYPE_IOCTL); - - cp = vd->vdev_tsd; - if (cp == NULL) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - bp = g_alloc_bio(); - bp->bio_caller1 = zio; - switch (zio->io_type) { - case ZIO_TYPE_READ: - case ZIO_TYPE_WRITE: - zio->io_target_timestamp = zio_handle_io_delay(zio); - bp->bio_offset = zio->io_offset; - bp->bio_length = zio->io_size; - if (zio->io_type == ZIO_TYPE_READ) { - bp->bio_cmd = BIO_READ; - bp->bio_data = - abd_borrow_buf(zio->io_abd, 
zio->io_size); - } else { - bp->bio_cmd = BIO_WRITE; - bp->bio_data = - abd_borrow_buf_copy(zio->io_abd, zio->io_size); - } - break; - case ZIO_TYPE_FREE: - bp->bio_cmd = BIO_DELETE; - bp->bio_data = NULL; - bp->bio_offset = zio->io_offset; - bp->bio_length = zio->io_size; - break; - case ZIO_TYPE_IOCTL: - bp->bio_cmd = BIO_FLUSH; - bp->bio_data = NULL; - bp->bio_offset = cp->provider->mediasize; - bp->bio_length = 0; - break; - } - bp->bio_done = vdev_geom_io_intr; - zio->io_bio = bp; - - g_io_request(bp, cp); -} - -static void -vdev_geom_io_done(zio_t *zio) -{ - struct bio *bp = zio->io_bio; - - if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { - ASSERT(bp == NULL); - return; - } - - if (bp == NULL) { - ASSERT3S(zio->io_error, ==, ENXIO); - return; - } - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); - else - abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); - - g_destroy_bio(bp); - zio->io_bio = NULL; -} - -static void -vdev_geom_hold(vdev_t *vd) -{ -} - -static void -vdev_geom_rele(vdev_t *vd) -{ -} - -vdev_ops_t vdev_geom_ops = { - vdev_geom_open, - vdev_geom_close, - vdev_default_asize, - vdev_geom_io_start, - vdev_geom_io_done, - NULL, - NULL, - vdev_geom_hold, - vdev_geom_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c deleted file mode 100644 index 469150a4b72f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c +++ /dev/null @@ -1,1849 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. 
- * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * An indirect vdev corresponds to a vdev that has been removed. Since - * we cannot rewrite block pointers of snapshots, etc., we keep a - * mapping from old location on the removed device to the new location - * on another device in the pool and use this mapping whenever we need - * to access the DVA. Unfortunately, this mapping did not respect - * logical block boundaries when it was first created, and so a DVA on - * this indirect vdev may be "split" into multiple sections that each - * map to a different location. As a consequence, not all DVAs can be - * translated to an equivalent new DVA. Instead we must provide a - * "vdev_remap" operation that executes a callback on each contiguous - * segment of the new location. This function is used in multiple ways: - * - * - i/os to this vdev use the callback to determine where the - * data is now located, and issue child i/os for each segment's new - * location. - * - * - frees and claims to this vdev use the callback to free or claim - * each mapped segment. (Note that we don't actually need to claim - * log blocks on indirect vdevs, because we don't allocate to - * removing vdevs. However, zdb uses zio_claim() for its leak - * detection.) - */ - -/* - * "Big theory statement" for how we mark blocks obsolete. - * - * When a block on an indirect vdev is freed or remapped, a section of - * that vdev's mapping may no longer be referenced (aka "obsolete"). We - * keep track of how much of each mapping entry is obsolete. 
When - * an entry becomes completely obsolete, we can remove it, thus reducing - * the memory used by the mapping. The complete picture of obsolescence - * is given by the following data structures, described below: - * - the entry-specific obsolete count - * - the vdev-specific obsolete spacemap - * - the pool-specific obsolete bpobj - * - * == On disk data structures used == - * - * We track the obsolete space for the pool using several objects. Each - * of these objects is created on demand and freed when no longer - * needed, and is assumed to be empty if it does not exist. - * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. - * - * - Each vic_mapping_object (associated with an indirect vdev) can - * have a vimp_counts_object. This is an array of uint32_t's - * with the same number of entries as the vic_mapping_object. When - * the mapping is condensed, entries from the vic_obsolete_sm_object - * (see below) are folded into the counts. Therefore, each - * obsolete_counts entry tells us the number of bytes in the - * corresponding mapping entry that were not referenced when the - * mapping was last condensed. - * - * - Each indirect or removing vdev can have a vic_obsolete_sm_object. - * This is a space map containing an alloc entry for every DVA that - * has been obsoleted since the last time this indirect vdev was - * condensed. We use this object in order to improve performance - * when marking a DVA as obsolete. Instead of modifying an arbitrary - * offset of the vimp_counts_object, we only need to append an entry - * to the end of this object. When a DVA becomes obsolete, it is - * added to the obsolete space map. This happens when the DVA is - * freed, remapped and not referenced by a snapshot, or the last - * snapshot referencing it is destroyed. - * - * - Each dataset can have a ds_remap_deadlist object. This is a - * deadlist object containing all blocks that were remapped in this - * dataset but referenced in a previous snapshot. 
Blocks can *only* - * appear on this list if they were remapped (dsl_dataset_block_remapped); - * blocks that were killed in a head dataset are put on the normal - * ds_deadlist and marked obsolete when they are freed. - * - * - The pool can have a dp_obsolete_bpobj. This is a list of blocks - * in the pool that need to be marked obsolete. When a snapshot is - * destroyed, we move some of the ds_remap_deadlist to the obsolete - * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then - * asynchronously process the obsolete bpobj, moving its entries to - * the specific vdevs' obsolete space maps. - * - * == Summary of how we mark blocks as obsolete == - * - * - When freeing a block: if any DVA is on an indirect vdev, append to - * vic_obsolete_sm_object. - * - When remapping a block, add dva to ds_remap_deadlist (if prev snap - * references; otherwise append to vic_obsolete_sm_object). - * - When freeing a snapshot: move parts of ds_remap_deadlist to - * dp_obsolete_bpobj (same algorithm as ds_deadlist). - * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to - * individual vdev's vic_obsolete_sm_object. - */ - -/* - * "Big theory statement" for how we condense indirect vdevs. - * - * Condensing an indirect vdev's mapping is the process of determining - * the precise counts of obsolete space for each mapping entry (by - * integrating the obsolete spacemap into the obsolete counts) and - * writing out a new mapping that contains only referenced entries. - * - * We condense a vdev when we expect the mapping to shrink (see - * vdev_indirect_should_condense()), but only perform one condense at a - * time to limit the memory usage. In addition, we use a separate - * open-context thread (spa_condense_indirect_thread) to incrementally - * create the new mapping object in a way that minimizes the impact on - * the rest of the system. - * - * == Generating a new mapping == - * - * To generate a new mapping, we follow these steps: - * - * 1. 
Save the old obsolete space map and create a new mapping object - * (see spa_condense_indirect_start_sync()). This initializes the - * spa_condensing_indirect_phys with the "previous obsolete space map", - * which is now read only. Newly obsolete DVAs will be added to a - * new (initially empty) obsolete space map, and will not be - * considered as part of this condense operation. - * - * 2. Construct in memory the precise counts of obsolete space for each - * mapping entry, by incorporating the obsolete space map into the - * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) - * - * 3. Iterate through each mapping entry, writing to the new mapping any - * entries that are not completely obsolete (i.e. which don't have - * obsolete count == mapping length). (See - * spa_condense_indirect_generate_new_mapping().) - * - * 4. Destroy the old mapping object and switch over to the new one - * (spa_condense_indirect_complete_sync). - * - * == Restarting from failure == - * - * To restart the condense when we import/open the pool, we must start - * at the 2nd step above: reconstruct the precise counts in memory, - * based on the space map + counts. Then in the 3rd step, we start - * iterating where we left off: at vimp_max_offset of the new mapping - * object. - */ - -boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; - -/* - * Condense if at least this percent of the bytes in the mapping is - * obsolete. With the default of 25%, the amount of space mapped - * will be reduced to 1% of its original size after at most 16 - * condenses. Higher values will condense less often (causing less - * i/o); lower values will reduce the mapping size more quickly. - */ -int zfs_indirect_condense_obsolete_pct = 25; - -/* - * Condense if the obsolete space map takes up more than this amount of - * space on disk (logically). 
This limits the amount of disk space - * consumed by the obsolete space map; the default of 1GB is small enough - * that we typically don't mind "wasting" it. - */ -uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; - -/* - * Don't bother condensing if the mapping uses less than this amount of - * memory. The default of 128KB is considered a "trivial" amount of - * memory and not worth reducing. - */ -uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; - -/* - * This is used by the test suite so that it can ensure that certain - * actions happen while in the middle of a condense (which might otherwise - * complete too quickly). If used to reduce the performance impact of - * condensing in production, a maximum value of 1 should be sufficient. - */ -int zfs_condense_indirect_commit_entry_delay_ticks = 0; - -/* - * If an indirect split block contains more than this many possible unique - * combinations when being reconstructed, consider it too computationally - * expensive to check them all. Instead, try at most 100 randomly-selected - * combinations each time the block is accessed. This allows all segment - * copies to participate fairly in the reconstruction when all combinations - * cannot be checked and prevents repeated use of one bad copy. - */ -int zfs_reconstruct_indirect_combinations_max = 256; - - -/* - * Enable to simulate damaged segments and validate reconstruction. - * Used by ztest - */ -unsigned long zfs_reconstruct_indirect_damage_fraction = 0; - -/* - * The indirect_child_t represents the vdev that we will read from, when we - * need to read all copies of the data (e.g. for scrub or reconstruction). - * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), - * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, - * ic_vdev is a child of the mirror. 
- */ -typedef struct indirect_child { - abd_t *ic_data; - vdev_t *ic_vdev; - - /* - * ic_duplicate is NULL when the ic_data contents are unique, when it - * is determined to be a duplicate it references the primary child. - */ - struct indirect_child *ic_duplicate; - list_node_t ic_node; /* node on is_unique_child */ -} indirect_child_t; - -/* - * The indirect_split_t represents one mapped segment of an i/o to the - * indirect vdev. For non-split (contiguously-mapped) blocks, there will be - * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. - * For split blocks, there will be several of these. - */ -typedef struct indirect_split { - list_node_t is_node; /* link on iv_splits */ - - /* - * is_split_offset is the offset into the i/o. - * This is the sum of the previous splits' is_size's. - */ - uint64_t is_split_offset; - - vdev_t *is_vdev; /* top-level vdev */ - uint64_t is_target_offset; /* offset on is_vdev */ - uint64_t is_size; - int is_children; /* number of entries in is_child[] */ - int is_unique_children; /* number of entries in is_unique_child */ - list_t is_unique_child; - - /* - * is_good_child is the child that we are currently using to - * attempt reconstruction. - */ - indirect_child_t *is_good_child; - - indirect_child_t is_child[1]; /* variable-length */ -} indirect_split_t; - -/* - * The indirect_vsd_t is associated with each i/o to the indirect vdev. - * It is the "Vdev-Specific Data" in the zio_t's io_vsd. 
- */ -typedef struct indirect_vsd { - boolean_t iv_split_block; - boolean_t iv_reconstruct; - uint64_t iv_unique_combinations; - uint64_t iv_attempts; - uint64_t iv_attempts_max; - - list_t iv_splits; /* list of indirect_split_t's */ -} indirect_vsd_t; - -static void -vdev_indirect_map_free(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - indirect_split_t *is; - while ((is = list_head(&iv->iv_splits)) != NULL) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - if (ic->ic_data != NULL) - abd_free(ic->ic_data); - } - list_remove(&iv->iv_splits, is); - - indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); - - list_destroy(&is->is_unique_child); - - kmem_free(is, - offsetof(indirect_split_t, is_child[is->is_children])); - } - kmem_free(iv, sizeof (*iv)); -} - -static const zio_vsd_ops_t vdev_indirect_vsd_ops = { - vdev_indirect_map_free, - zio_vsd_default_cksum_report -}; -/* - * Mark the given offset and size as being obsolete. - */ -void -vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); - ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); - ASSERT(size > 0); - VERIFY(vdev_indirect_mapping_entry_for_offset( - vd->vdev_indirect_mapping, offset) != NULL); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - mutex_enter(&vd->vdev_obsolete_lock); - range_tree_add(vd->vdev_obsolete_segments, offset, size); - mutex_exit(&vd->vdev_obsolete_lock); - vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); - } -} - -/* - * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This - * wrapper is provided because the DMU does not know about vdev_t's and - * cannot directly call vdev_indirect_mark_obsolete. 
- */ -void -spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, - uint64_t size, dmu_tx_t *tx) -{ - vdev_t *vd = vdev_lookup_top(spa, vdev_id); - ASSERT(dmu_tx_is_syncing(tx)); - - /* The DMU can only remap indirect vdevs. */ - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - vdev_indirect_mark_obsolete(vd, offset, size); -} - -static spa_condensing_indirect_t * -spa_condensing_indirect_create(spa_t *spa) -{ - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); - objset_t *mos = spa->spa_meta_objset; - - for (int i = 0; i < TXG_SIZE; i++) { - list_create(&sci->sci_new_mapping_entries[i], - sizeof (vdev_indirect_mapping_entry_t), - offsetof(vdev_indirect_mapping_entry_t, vime_node)); - } - - sci->sci_new_mapping = - vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); - - return (sci); -} - -static void -spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) -{ - for (int i = 0; i < TXG_SIZE; i++) - list_destroy(&sci->sci_new_mapping_entries[i]); - - if (sci->sci_new_mapping != NULL) - vdev_indirect_mapping_close(sci->sci_new_mapping); - - kmem_free(sci, sizeof (*sci)); -} - -boolean_t -vdev_indirect_should_condense(vdev_t *vd) -{ - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - spa_t *spa = vd->vdev_spa; - - ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); - - if (!zfs_condense_indirect_vdevs_enable) - return (B_FALSE); - - /* - * We can only condense one indirect vdev at a time. - */ - if (spa->spa_condensing_indirect != NULL) - return (B_FALSE); - - if (spa_shutting_down(spa)) - return (B_FALSE); - - /* - * The mapping object size must not change while we are - * condensing, so we can only condense indirect vdevs - * (not vdevs that are still in the middle of being removed). 
- */ - if (vd->vdev_ops != &vdev_indirect_ops) - return (B_FALSE); - - /* - * If nothing new has been marked obsolete, there is no - * point in condensing. - */ - if (vd->vdev_obsolete_sm == NULL) { - ASSERT0(vdev_obsolete_sm_object(vd)); - return (B_FALSE); - } - - ASSERT(vd->vdev_obsolete_sm != NULL); - - ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - - uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); - uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); - uint64_t mapping_size = vdev_indirect_mapping_size(vim); - uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); - - ASSERT3U(bytes_obsolete, <=, bytes_mapped); - - /* - * If a high percentage of the bytes that are mapped have become - * obsolete, condense (unless the mapping is already small enough). - * This has a good chance of reducing the amount of memory used - * by the mapping. - */ - if (bytes_obsolete * 100 / bytes_mapped >= - zfs_indirect_condense_obsolete_pct && - mapping_size > zfs_condense_min_mapping_bytes) { - zfs_dbgmsg("should condense vdev %llu because obsolete " - "spacemap covers %d%% of %lluMB mapping", - (u_longlong_t)vd->vdev_id, - (int)(bytes_obsolete * 100 / bytes_mapped), - (u_longlong_t)bytes_mapped / 1024 / 1024); - return (B_TRUE); - } - - /* - * If the obsolete space map takes up too much space on disk, - * condense in order to free up this disk space. - */ - if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { - zfs_dbgmsg("should condense vdev %llu because obsolete sm " - "length %lluMB >= max size %lluMB", - (u_longlong_t)vd->vdev_id, - (u_longlong_t)obsolete_sm_size / 1024 / 1024, - (u_longlong_t)zfs_condense_max_obsolete_bytes / - 1024 / 1024); - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * This sync task completes (finishes) a condense, deleting the old - * mapping and replacing it with the new one. 
- */ -static void -spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) -{ - spa_condensing_indirect_t *sci = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - objset_t *mos = spa->spa_meta_objset; - vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; - uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); - uint64_t new_count = - vdev_indirect_mapping_num_entries(sci->sci_new_mapping); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - ASSERT3P(sci, ==, spa->spa_condensing_indirect); - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); - } - ASSERT(vic->vic_mapping_object != 0); - ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); - ASSERT(scip->scip_next_mapping_object != 0); - ASSERT(scip->scip_prev_obsolete_sm_object != 0); - - /* - * Reset vdev_indirect_mapping to refer to the new object. 
- */ - rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); - vdev_indirect_mapping_close(vd->vdev_indirect_mapping); - vd->vdev_indirect_mapping = sci->sci_new_mapping; - rw_exit(&vd->vdev_indirect_rwlock); - - sci->sci_new_mapping = NULL; - vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); - vic->vic_mapping_object = scip->scip_next_mapping_object; - scip->scip_next_mapping_object = 0; - - space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - scip->scip_prev_obsolete_sm_object = 0; - - scip->scip_vdev = 0; - - VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CONDENSING_INDIRECT, tx)); - spa_condensing_indirect_destroy(spa->spa_condensing_indirect); - spa->spa_condensing_indirect = NULL; - - zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " - "new mapping object %llu has %llu entries " - "(was %llu entries)", - vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, - new_count, old_count); - - vdev_config_dirty(spa->spa_root_vdev); -} - -/* - * This sync task appends entries to the new mapping object. - */ -static void -spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) -{ - spa_condensing_indirect_t *sci = arg; - uint64_t txg = dmu_tx_get_txg(tx); - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3P(sci, ==, spa->spa_condensing_indirect); - - vdev_indirect_mapping_add_entries(sci->sci_new_mapping, - &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); - ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); -} - -/* - * Open-context function to add one entry to the new mapping. The new - * entry will be remembered and written from syncing context. 
- */ -static void -spa_condense_indirect_commit_entry(spa_t *spa, - vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) -{ - spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; - - ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); - - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - - /* - * If we are the first entry committed this txg, kick off the sync - * task to write to the MOS on our behalf. - */ - if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { - dsl_sync_task_nowait(dmu_tx_pool(tx), - spa_condense_indirect_commit_sync, sci, - 0, ZFS_SPACE_CHECK_NONE, tx); - } - - vdev_indirect_mapping_entry_t *vime = - kmem_alloc(sizeof (*vime), KM_SLEEP); - vime->vime_mapping = *vimep; - vime->vime_obsolete_count = count; - list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); - - dmu_tx_commit(tx); -} - -static void -spa_condense_indirect_generate_new_mapping(vdev_t *vd, - uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) -{ - spa_t *spa = vd->vdev_spa; - uint64_t mapi = start_index; - vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; - uint64_t old_num_entries = - vdev_indirect_mapping_num_entries(old_mapping); - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); - - zfs_dbgmsg("starting condense of vdev %llu from index %llu", - (u_longlong_t)vd->vdev_id, - (u_longlong_t)mapi); - - while (mapi < old_num_entries) { - - if (zthr_iscancelled(zthr)) { - zfs_dbgmsg("pausing condense of vdev %llu " - "at index %llu", (u_longlong_t)vd->vdev_id, - (u_longlong_t)mapi); - break; - } - - vdev_indirect_mapping_entry_phys_t *entry = - &old_mapping->vim_entries[mapi]; - uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); - ASSERT3U(obsolete_counts[mapi], <=, entry_size); - if 
(obsolete_counts[mapi] < entry_size) { - spa_condense_indirect_commit_entry(spa, entry, - obsolete_counts[mapi]); - - /* - * This delay may be requested for testing, debugging, - * or performance reasons. - */ - delay(zfs_condense_indirect_commit_entry_delay_ticks); - } - - mapi++; - } -} - -/* ARGSUSED */ -static boolean_t -spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - - return (spa->spa_condensing_indirect != NULL); -} - -/* ARGSUSED */ -static void -spa_condense_indirect_thread(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - vdev_t *vd; - - ASSERT3P(spa->spa_condensing_indirect, !=, NULL); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); - ASSERT3P(vd, !=, NULL); - spa_config_exit(spa, SCL_VDEV, FTAG); - - spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - uint32_t *counts; - uint64_t start_index; - vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; - space_map_t *prev_obsolete_sm = NULL; - - ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); - ASSERT(scip->scip_next_mapping_object != 0); - ASSERT(scip->scip_prev_obsolete_sm_object != 0); - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - - for (int i = 0; i < TXG_SIZE; i++) { - /* - * The list must start out empty in order for the - * _commit_sync() sync task to be properly registered - * on the first call to _commit_entry(); so it's wise - * to double check and ensure we actually are starting - * with empty lists. 
- */ - ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); - } - - VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, - scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); - if (prev_obsolete_sm != NULL) { - vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, - counts, prev_obsolete_sm); - } - space_map_close(prev_obsolete_sm); - - /* - * Generate new mapping. Determine what index to continue from - * based on the max offset that we've already written in the - * new mapping. - */ - uint64_t max_offset = - vdev_indirect_mapping_max_offset(sci->sci_new_mapping); - if (max_offset == 0) { - /* We haven't written anything to the new mapping yet. */ - start_index = 0; - } else { - /* - * Pick up from where we left off. _entry_for_offset() - * returns a pointer into the vim_entries array. If - * max_offset is greater than any of the mappings - * contained in the table NULL will be returned and - * that indicates we've exhausted our iteration of the - * old_mapping. - */ - - vdev_indirect_mapping_entry_phys_t *entry = - vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, - max_offset); - - if (entry == NULL) { - /* - * We've already written the whole new mapping. - * This special value will cause us to skip the - * generate_new_mapping step and just do the sync - * task to complete the condense. - */ - start_index = UINT64_MAX; - } else { - start_index = entry - old_mapping->vim_entries; - ASSERT3U(start_index, <, - vdev_indirect_mapping_num_entries(old_mapping)); - } - } - - spa_condense_indirect_generate_new_mapping(vd, counts, - start_index, zthr); - - vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); - - /* - * If the zthr has received a cancellation signal while running - * in generate_new_mapping() or at any point after that, then bail - * early. We don't want to complete the condense if the spa is - * shutting down. 
- */ - if (zthr_iscancelled(zthr)) - return; - - VERIFY0(dsl_sync_task(spa_name(spa), NULL, - spa_condense_indirect_complete_sync, sci, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -/* - * Sync task to begin the condensing process. - */ -void -spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - - ASSERT0(scip->scip_next_mapping_object); - ASSERT0(scip->scip_prev_obsolete_sm_object); - ASSERT0(scip->scip_vdev); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); - ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); - - uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); - ASSERT(obsolete_sm_obj != 0); - - scip->scip_vdev = vd->vdev_id; - scip->scip_next_mapping_object = - vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); - - scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; - - /* - * We don't need to allocate a new space map object, since - * vdev_indirect_sync_obsolete will allocate one when needed. 
- */ - space_map_close(vd->vdev_obsolete_sm); - vd->vdev_obsolete_sm = NULL; - VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); - - VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), - sizeof (*scip) / sizeof (uint64_t), scip, tx)); - - ASSERT3P(spa->spa_condensing_indirect, ==, NULL); - spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); - - zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " - "posm=%llu nm=%llu", - vd->vdev_id, dmu_tx_get_txg(tx), - (u_longlong_t)scip->scip_prev_obsolete_sm_object, - (u_longlong_t)scip->scip_next_mapping_object); - - zthr_wakeup(spa->spa_condense_zthr); -} - -/* - * Sync to the given vdev's obsolete space map any segments that are no longer - * referenced as of the given txg. - * - * If the obsolete space map doesn't exist yet, create and open it. - */ -void -vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - ASSERT3U(vic->vic_mapping_object, !=, 0); - ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); - ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); - ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); - - if (vdev_obsolete_sm_object(vd) == 0) { - uint64_t obsolete_sm_object = - space_map_alloc(spa->spa_meta_objset, - vdev_standard_sm_blksz, tx); - - ASSERT(vd->vdev_top_zap != 0); - VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, - sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); - ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0); - - spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - VERIFY0(space_map_open(&vd->vdev_obsolete_sm, - spa->spa_meta_objset, obsolete_sm_object, - 0, vd->vdev_asize, 0)); - } - - ASSERT(vd->vdev_obsolete_sm != NULL); - 
ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - - space_map_write(vd->vdev_obsolete_sm, - vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); -} - -int -spa_condense_init(spa_t *spa) -{ - int error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), - sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), - &spa->spa_condensing_indirect_phys); - if (error == 0) { - if (spa_writeable(spa)) { - spa->spa_condensing_indirect = - spa_condensing_indirect_create(spa); - } - return (0); - } else if (error == ENOENT) { - return (0); - } else { - return (error); - } -} - -void -spa_condense_fini(spa_t *spa) -{ - if (spa->spa_condensing_indirect != NULL) { - spa_condensing_indirect_destroy(spa->spa_condensing_indirect); - spa->spa_condensing_indirect = NULL; - } -} - -void -spa_start_indirect_condensing_thread(spa_t *spa) -{ - ASSERT3P(spa->spa_condense_zthr, ==, NULL); - spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check, - spa_condense_indirect_thread, spa); -} - -/* - * Gets the obsolete spacemap object from the vdev's ZAP. - * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't - * exist yet. 
- */ -int -vdev_obsolete_sm_object(vdev_t *vd) -{ - ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - if (vd->vdev_top_zap == 0) { - return (0); - } - - uint64_t sm_obj = 0; - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj); - - ASSERT(err == 0 || err == ENOENT); - - return (sm_obj); -} - -boolean_t -vdev_obsolete_counts_are_precise(vdev_t *vd) -{ - ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - if (vd->vdev_top_zap == 0) { - return (B_FALSE); - } - - uint64_t val = 0; - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); - - ASSERT(err == 0 || err == ENOENT); - - return (val != 0); -} - -/* ARGSUSED */ -static void -vdev_indirect_close(vdev_t *vd) -{ -} - -/* ARGSUSED */ -static int -vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - *psize = *max_psize = vd->vdev_asize + - VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - *logical_ashift = vd->vdev_ashift; - *physical_ashift = vd->vdev_physical_ashift; - return (0); -} - -typedef struct remap_segment { - vdev_t *rs_vd; - uint64_t rs_offset; - uint64_t rs_asize; - uint64_t rs_split_offset; - list_node_t rs_node; -} remap_segment_t; - -remap_segment_t * -rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) -{ - remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); - rs->rs_vd = vd; - rs->rs_offset = offset; - rs->rs_asize = asize; - rs->rs_split_offset = split_offset; - return (rs); -} - -/* - * Given an indirect vdev and an extent on that vdev, it duplicates the - * physical entries of the indirect mapping that correspond to the extent - * to a new array and returns a pointer to it. In addition, copied_entries - * is populated with the number of mapping entries that were duplicated. 
- * - * Note that the function assumes that the caller holds vdev_indirect_rwlock. - * This ensures that the mapping won't change due to condensing as we - * copy over its contents. - * - * Finally, since we are doing an allocation, it is up to the caller to - * free the array allocated in this function. - */ -vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, - uint64_t asize, uint64_t *copied_entries) -{ - vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t entries = 0; - - ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); - - vdev_indirect_mapping_entry_phys_t *first_mapping = - vdev_indirect_mapping_entry_for_offset(vim, offset); - ASSERT3P(first_mapping, !=, NULL); - - vdev_indirect_mapping_entry_phys_t *m = first_mapping; - while (asize > 0) { - uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); - - ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); - ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); - - uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); - uint64_t inner_size = MIN(asize, size - inner_offset); - - offset += inner_size; - asize -= inner_size; - entries++; - m++; - } - - size_t copy_length = entries * sizeof (*first_mapping); - duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); - bcopy(first_mapping, duplicate_mappings, copy_length); - *copied_entries = entries; - - return (duplicate_mappings); -} - -/* - * Goes through the relevant indirect mappings until it hits a concrete vdev - * and issues the callback. On the way to the concrete vdev, if any other - * indirect vdevs are encountered, then the callback will also be called on - * each of those indirect vdevs. 
For example, if the segment is mapped to - * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is - * mapped to segment B on concrete vdev 2, then the callback will be called on - * both vdev 1 and vdev 2. - * - * While the callback passed to vdev_indirect_remap() is called on every vdev - * the function encounters, certain callbacks only care about concrete vdevs. - * These types of callbacks should return immediately and explicitly when they - * are called on an indirect vdev. - * - * Because there is a possibility that a DVA section in the indirect device - * has been split into multiple sections in our mapping, we keep track - * of the relevant contiguous segments of the new location (remap_segment_t) - * in a stack. This way we can call the callback for each of the new sections - * created by a single section of the indirect device. Note though, that in - * this scenario the callbacks in each split block won't occur in-order in - * terms of offset, so callers should not make any assumptions about that. - * - * For callbacks that don't handle split blocks and immediately return when - * they encounter them (as is the case for remap_blkptr_cb), the caller can - * assume that its callback will be applied from the first indirect vdev - * encountered to the last one and then the concrete vdev, in that order. - */ -static void -vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, - void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) -{ - list_t stack; - spa_t *spa = vd->vdev_spa; - - list_create(&stack, sizeof (remap_segment_t), - offsetof(remap_segment_t, rs_node)); - - for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); - rs != NULL; rs = list_remove_head(&stack)) { - vdev_t *v = rs->rs_vd; - uint64_t num_entries = 0; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - ASSERT(rs->rs_asize > 0); - - /* - * Note: As this function can be called from open context - * (e.g. 
zio_read()), we need the following rwlock to - * prevent the mapping from being changed by condensing. - * - * So we grab the lock and we make a copy of the entries - * that are relevant to the extent that we are working on. - * Once that is done, we drop the lock and iterate over - * our copy of the mapping. Once we are done with the with - * the remap segment and we free it, we also free our copy - * of the indirect mapping entries that are relevant to it. - * - * This way we don't need to wait until the function is - * finished with a segment, to condense it. In addition, we - * don't need a recursive rwlock for the case that a call to - * vdev_indirect_remap() needs to call itself (through the - * codepath of its callback) for the same vdev in the middle - * of its execution. - */ - rw_enter(&v->vdev_indirect_rwlock, RW_READER); - vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping; - ASSERT3P(vim, !=, NULL); - - vdev_indirect_mapping_entry_phys_t *mapping = - vdev_indirect_mapping_duplicate_adjacent_entries(v, - rs->rs_offset, rs->rs_asize, &num_entries); - ASSERT3P(mapping, !=, NULL); - ASSERT3U(num_entries, >, 0); - rw_exit(&v->vdev_indirect_rwlock); - - for (uint64_t i = 0; i < num_entries; i++) { - /* - * Note: the vdev_indirect_mapping can not change - * while we are running. It only changes while the - * removal is in progress, and then only from syncing - * context. While a removal is in progress, this - * function is only called for frees, which also only - * happen from syncing context. 
- */ - vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; - - ASSERT3P(m, !=, NULL); - ASSERT3U(rs->rs_asize, >, 0); - - uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); - uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); - uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); - - ASSERT3U(rs->rs_offset, >=, - DVA_MAPPING_GET_SRC_OFFSET(m)); - ASSERT3U(rs->rs_offset, <, - DVA_MAPPING_GET_SRC_OFFSET(m) + size); - ASSERT3U(dst_vdev, !=, v->vdev_id); - - uint64_t inner_offset = rs->rs_offset - - DVA_MAPPING_GET_SRC_OFFSET(m); - uint64_t inner_size = - MIN(rs->rs_asize, size - inner_offset); - - vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); - ASSERT3P(dst_v, !=, NULL); - - if (dst_v->vdev_ops == &vdev_indirect_ops) { - list_insert_head(&stack, - rs_alloc(dst_v, dst_offset + inner_offset, - inner_size, rs->rs_split_offset)); - - } - - if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && - IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { - /* - * Note: This clause exists only solely for - * testing purposes. We use it to ensure that - * split blocks work and that the callbacks - * using them yield the same result if issued - * in reverse order. 
- */ - uint64_t inner_half = inner_size / 2; - - func(rs->rs_split_offset + inner_half, dst_v, - dst_offset + inner_offset + inner_half, - inner_half, arg); - - func(rs->rs_split_offset, dst_v, - dst_offset + inner_offset, - inner_half, arg); - } else { - func(rs->rs_split_offset, dst_v, - dst_offset + inner_offset, - inner_size, arg); - } - - rs->rs_offset += inner_size; - rs->rs_asize -= inner_size; - rs->rs_split_offset += inner_size; - } - VERIFY0(rs->rs_asize); - - kmem_free(mapping, num_entries * sizeof (*mapping)); - kmem_free(rs, sizeof (remap_segment_t)); - } - list_destroy(&stack); -} - -static void -vdev_indirect_child_io_done(zio_t *zio) -{ - zio_t *pio = zio->io_private; - - mutex_enter(&pio->io_lock); - pio->io_error = zio_worst_error(pio->io_error, zio->io_error); - mutex_exit(&pio->io_lock); - -#ifdef __FreeBSD__ - if (zio->io_abd != NULL) -#endif - abd_put(zio->io_abd); -} - -/* - * This is a callback for vdev_indirect_remap() which allocates an - * indirect_split_t for each split segment and adds it to iv_splits. - */ -static void -vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - zio_t *zio = arg; - indirect_vsd_t *iv = zio->io_vsd; - - ASSERT3P(vd, !=, NULL); - - if (vd->vdev_ops == &vdev_indirect_ops) - return; - - int n = 1; - if (vd->vdev_ops == &vdev_mirror_ops) - n = vd->vdev_children; - - indirect_split_t *is = - kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); - - is->is_children = n; - is->is_size = size; - is->is_split_offset = split_offset; - is->is_target_offset = offset; - is->is_vdev = vd; - list_create(&is->is_unique_child, sizeof (indirect_child_t), - offsetof(indirect_child_t, ic_node)); - - /* - * Note that we only consider multiple copies of the data for - * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even - * though they use the same ops as mirror, because there's only one - * "good" copy under the replacing/spare. 
- */ - if (vd->vdev_ops == &vdev_mirror_ops) { - for (int i = 0; i < n; i++) { - is->is_child[i].ic_vdev = vd->vdev_child[i]; - list_link_init(&is->is_child[i].ic_node); - } - } else { - is->is_child[0].ic_vdev = vd; - } - - list_insert_tail(&iv->iv_splits, is); -} - -static void -vdev_indirect_read_split_done(zio_t *zio) -{ - indirect_child_t *ic = zio->io_private; - - if (zio->io_error != 0) { - /* - * Clear ic_data to indicate that we do not have data for this - * child. - */ - abd_free(ic->ic_data); - ic->ic_data = NULL; - } -} - -/* - * Issue reads for all copies (mirror children) of all splits. - */ -static void -vdev_indirect_read_all(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int i = 0; i < is->is_children; i++) { - indirect_child_t *ic = &is->is_child[i]; - - if (!vdev_readable(ic->ic_vdev)) - continue; - - /* - * Note, we may read from a child whose DTL - * indicates that the data may not be present here. - * While this might result in a few i/os that will - * likely return incorrect data, it simplifies the - * code since we can treat scrub and resilver - * identically. (The incorrect data will be - * detected and ignored when we verify the - * checksum.) 
- */ - - ic->ic_data = abd_alloc_sametype(zio->io_abd, - is->is_size); - ic->ic_duplicate = NULL; - - zio_nowait(zio_vdev_child_io(zio, NULL, - ic->ic_vdev, is->is_target_offset, ic->ic_data, - is->is_size, zio->io_type, zio->io_priority, 0, - vdev_indirect_read_split_done, ic)); - } - } - iv->iv_reconstruct = B_TRUE; -} - -static void -vdev_indirect_io_start(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); - list_create(&iv->iv_splits, - sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); - - zio->io_vsd = iv; - zio->io_vsd_ops = &vdev_indirect_vsd_ops; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); -#ifdef __FreeBSD__ - if (zio->io_type == ZIO_TYPE_WRITE) { -#else - if (zio->io_type != ZIO_TYPE_READ) { - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); -#endif - /* - * Note: this code can handle other kinds of writes, - * but we don't expect them. - */ - ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); - } - - vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, - vdev_indirect_gather_splits, zio); - - indirect_split_t *first = list_head(&iv->iv_splits); - if (first->is_size == zio->io_size) { - /* - * This is not a split block; we are pointing to the entire - * data, which will checksum the same as the original data. - * Pass the BP down so that the child i/o can verify the - * checksum, and try a different location if available - * (e.g. on a mirror). - * - * While this special case could be handled the same as the - * general (split block) case, doing it this way ensures - * that the vast majority of blocks on indirect vdevs - * (which are not split) are handled identically to blocks - * on non-indirect vdevs. This allows us to be less strict - * about performance in the general (but rare) case. 
- */ - ASSERT0(first->is_split_offset); - ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - first->is_vdev, first->is_target_offset, -#ifdef __FreeBSD__ - zio->io_abd == NULL ? NULL : -#endif - abd_get_offset(zio->io_abd, 0), - zio->io_size, zio->io_type, zio->io_priority, 0, - vdev_indirect_child_io_done, zio)); - } else { - iv->iv_split_block = B_TRUE; - if (zio->io_type == ZIO_TYPE_READ && - zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { - /* - * Read all copies. Note that for simplicity, - * we don't bother consulting the DTL in the - * resilver case. - */ - vdev_indirect_read_all(zio); - } else { - /* - * If this is a read zio, we read one copy of each - * split segment, from the top-level vdev. Since - * we don't know the checksum of each split - * individually, the child zio can't ensure that - * we get the right data. E.g. if it's a mirror, - * it will just read from a random (healthy) leaf - * vdev. We have to verify the checksum in - * vdev_indirect_io_done(). - * - * For write zios, the vdev code will ensure we write - * to all children. - */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - zio_nowait(zio_vdev_child_io(zio, NULL, - is->is_vdev, is->is_target_offset, -#ifdef __FreeBSD__ - zio->io_abd == NULL ? NULL : -#endif - abd_get_offset(zio->io_abd, - is->is_split_offset), - is->is_size, zio->io_type, - zio->io_priority, 0, - vdev_indirect_child_io_done, zio)); - } - } - } - - zio_execute(zio); -} - -/* - * Report a checksum error for a child. 
- */ -static void -vdev_indirect_checksum_error(zio_t *zio, - indirect_split_t *is, indirect_child_t *ic) -{ - vdev_t *vd = ic->ic_vdev; - - if (zio->io_flags & ZIO_FLAG_SPECULATIVE) - return; - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - zio_bad_cksum_t zbc = { 0 }; - void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); - abd_t *good_abd = is->is_good_child->ic_data; - void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); - abd_return_buf(ic->ic_data, bad_buf, is->is_size); - abd_return_buf(good_abd, good_buf, is->is_size); -} - -/* - * Issue repair i/os for any incorrect copies. We do this by comparing - * each split segment's correct data (is_good_child's ic_data) with each - * other copy of the data. If they differ, then we overwrite the bad data - * with the good copy. Note that we do this without regard for the DTL's, - * which simplifies this code and also issues the optimal number of writes - * (based on which copies actually read bad data, as opposed to which we - * think might be wrong). For the same reason, we always use - * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). 
- */ -static void -vdev_indirect_repair(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - enum zio_flag flags = ZIO_FLAG_IO_REPAIR; - - if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) - flags |= ZIO_FLAG_SELF_HEAL; - - if (!spa_writeable(zio->io_spa)) - return; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - if (ic == is->is_good_child) - continue; - if (ic->ic_data == NULL) - continue; - if (ic->ic_duplicate == is->is_good_child) - continue; - - zio_nowait(zio_vdev_child_io(zio, NULL, - ic->ic_vdev, is->is_target_offset, - is->is_good_child->ic_data, is->is_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, - NULL, NULL)); - - vdev_indirect_checksum_error(zio, is, ic); - } - } -} - -/* - * Report checksum errors on all children that we read from. - */ -static void -vdev_indirect_all_checksum_errors(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - if (zio->io_flags & ZIO_FLAG_SPECULATIVE) - return; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - - if (ic->ic_data == NULL) - continue; - - vdev_t *vd = ic->ic_vdev; - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - is->is_target_offset, is->is_size, - NULL, NULL, NULL); - } - } -} - -/* - * Copy data from all the splits to a main zio then validate the checksum. - * If then checksum is successfully validated return success. 
- */ -static int -vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) -{ - zio_bad_cksum_t zbc; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - - ASSERT3P(is->is_good_child->ic_data, !=, NULL); - ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); - - abd_copy_off(zio->io_abd, is->is_good_child->ic_data, - is->is_split_offset, 0, is->is_size); - } - - return (zio_checksum_error(zio, &zbc)); -} - -/* - * There are relatively few possible combinations making it feasible to - * deterministically check them all. We do this by setting the good_child - * to the next unique split version. If we reach the end of the list then - * "carry over" to the next unique split version (like counting in base - * is_unique_children, but each digit can have a different base). - */ -static int -vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) -{ - boolean_t more = B_TRUE; - - iv->iv_attempts = 0; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) - is->is_good_child = list_head(&is->is_unique_child); - - while (more == B_TRUE) { - iv->iv_attempts++; - more = B_FALSE; - - if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) - return (0); - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_good_child = list_next(&is->is_unique_child, - is->is_good_child); - if (is->is_good_child != NULL) { - more = B_TRUE; - break; - } - - is->is_good_child = list_head(&is->is_unique_child); - } - } - - ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); - - return (SET_ERROR(ECKSUM)); -} - -/* - * There are too many combinations to try all of them in a reasonable amount - * of time. So try a fixed number of random combinations from the unique - * split versions, after which we'll consider the block unrecoverable. 
- */ -static int -vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) -{ - iv->iv_attempts = 0; - - while (iv->iv_attempts < iv->iv_attempts_max) { - iv->iv_attempts++; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - indirect_child_t *ic = list_head(&is->is_unique_child); - int children = is->is_unique_children; - - for (int i = spa_get_random(children); i > 0; i--) - ic = list_next(&is->is_unique_child, ic); - - ASSERT3P(ic, !=, NULL); - is->is_good_child = ic; - } - - if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) - return (0); - } - - return (SET_ERROR(ECKSUM)); -} - -/* - * This is a validation function for reconstruction. It randomly selects - * a good combination, if one can be found, and then it intentionally - * damages all other segment copes by zeroing them. This forces the - * reconstruction algorithm to locate the one remaining known good copy. - */ -static int -vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) -{ - /* Presume all the copies are unique for initial selection. */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_unique_children = 0; - - for (int i = 0; i < is->is_children; i++) { - indirect_child_t *ic = &is->is_child[i]; - if (ic->ic_data != NULL) { - is->is_unique_children++; - list_insert_tail(&is->is_unique_child, ic); - } - } - } - - /* - * Set each is_good_child to a randomly-selected child which - * is known to contain validated data. - */ - int error = vdev_indirect_splits_enumerate_randomly(iv, zio); - if (error) - goto out; - - /* - * Damage all but the known good copy by zeroing it. This will - * result in two or less unique copies per indirect_child_t. - * Both may need to be checked in order to reconstruct the block. - * Set iv->iv_attempts_max such that all unique combinations will - * enumerated, but limit the damage to at most 16 indirect splits. 
- */ - iv->iv_attempts_max = 1; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - - if (ic == is->is_good_child) - continue; - if (ic->ic_data == NULL) - continue; - - abd_zero(ic->ic_data, ic->ic_data->abd_size); - } - - iv->iv_attempts_max *= 2; - if (iv->iv_attempts_max > (1ULL << 16)) { - iv->iv_attempts_max = UINT64_MAX; - break; - } - } - -out: - /* Empty the unique children lists so they can be reconstructed. */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); - - is->is_unique_children = 0; - } - - return (error); -} - -/* - * This function is called when we have read all copies of the data and need - * to try to find a combination of copies that gives us the right checksum. - * - * If we pointed to any mirror vdevs, this effectively does the job of the - * mirror. The mirror vdev code can't do its own job because we don't know - * the checksum of each split segment individually. - * - * We have to try every unique combination of copies of split segments, until - * we find one that checksums correctly. Duplicate segment copies are first - * identified and latter skipped during reconstruction. This optimization - * reduces the search space and ensures that of the remaining combinations - * at most one is correct. - * - * When the total number of combinations is small they can all be checked. 
- * For example, if we have 3 segments in the split, and each points to a - * 2-way mirror with unique copies, we will have the following pieces of data: - * - * | mirror child - * split | [0] [1] - * ======|===================== - * A | data_A_0 data_A_1 - * B | data_B_0 data_B_1 - * C | data_C_0 data_C_1 - * - * We will try the following (mirror children)^(number of splits) (2^3=8) - * combinations, which is similar to bitwise-little-endian counting in - * binary. In general each "digit" corresponds to a split segment, and the - * base of each digit is is_children, which can be different for each - * digit. - * - * "low bit" "high bit" - * v v - * data_A_0 data_B_0 data_C_0 - * data_A_1 data_B_0 data_C_0 - * data_A_0 data_B_1 data_C_0 - * data_A_1 data_B_1 data_C_0 - * data_A_0 data_B_0 data_C_1 - * data_A_1 data_B_0 data_C_1 - * data_A_0 data_B_1 data_C_1 - * data_A_1 data_B_1 data_C_1 - * - * Note that the split segments may be on the same or different top-level - * vdevs. In either case, we may need to try lots of combinations (see - * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror - * has small silent errors on all of its children, we can still reconstruct - * the correct data, as long as those errors are at sufficiently-separated - * offsets (specifically, separated by the largest block size - default of - * 128KB, but up to 16MB). - */ -static void -vdev_indirect_reconstruct_io_done(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - boolean_t known_good = B_FALSE; - int error; - - iv->iv_unique_combinations = 1; - iv->iv_attempts_max = UINT64_MAX; - - if (zfs_reconstruct_indirect_combinations_max > 0) - iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; - - /* - * If nonzero, every 1/x blocks will be damaged, in order to validate - * reconstruction when there are split segments with damaged copies. - * Known_good will TRUE when reconstruction is known to be possible. 
- */ - if (zfs_reconstruct_indirect_damage_fraction != 0 && - spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) - known_good = (vdev_indirect_splits_damage(iv, zio) == 0); - - /* - * Determine the unique children for a split segment and add them - * to the is_unique_child list. By restricting reconstruction - * to these children, only unique combinations will be considered. - * This can vastly reduce the search space when there are a large - * number of indirect splits. - */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_unique_children = 0; - - for (int i = 0; i < is->is_children; i++) { - indirect_child_t *ic_i = &is->is_child[i]; - - if (ic_i->ic_data == NULL || - ic_i->ic_duplicate != NULL) - continue; - - for (int j = i + 1; j < is->is_children; j++) { - indirect_child_t *ic_j = &is->is_child[j]; - - if (ic_j->ic_data == NULL || - ic_j->ic_duplicate != NULL) - continue; - - if (abd_cmp(ic_i->ic_data, ic_j->ic_data, - is->is_size) == 0) { - ic_j->ic_duplicate = ic_i; - } - } - - is->is_unique_children++; - list_insert_tail(&is->is_unique_child, ic_i); - } - - /* Reconstruction is impossible, no valid children */ - EQUIV(list_is_empty(&is->is_unique_child), - is->is_unique_children == 0); - if (list_is_empty(&is->is_unique_child)) { - zio->io_error = EIO; - vdev_indirect_all_checksum_errors(zio); - zio_checksum_verified(zio); - return; - } - - iv->iv_unique_combinations *= is->is_unique_children; - } - - if (iv->iv_unique_combinations <= iv->iv_attempts_max) - error = vdev_indirect_splits_enumerate_all(iv, zio); - else - error = vdev_indirect_splits_enumerate_randomly(iv, zio); - - if (error != 0) { - /* All attempted combinations failed. */ - ASSERT3B(known_good, ==, B_FALSE); - zio->io_error = error; - vdev_indirect_all_checksum_errors(zio); - } else { - /* - * The checksum has been successfully validated. 
Issue - * repair I/Os to any copies of splits which don't match - * the validated version. - */ - ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); - vdev_indirect_repair(zio); - zio_checksum_verified(zio); - } -} - -static void -vdev_indirect_io_done(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - if (iv->iv_reconstruct) { - /* - * We have read all copies of the data (e.g. from mirrors), - * either because this was a scrub/resilver, or because the - * one-copy read didn't checksum correctly. - */ - vdev_indirect_reconstruct_io_done(zio); - return; - } - - if (!iv->iv_split_block) { - /* - * This was not a split block, so we passed the BP down, - * and the checksum was handled by the (one) child zio. - */ - return; - } - - zio_bad_cksum_t zbc; - int ret = zio_checksum_error(zio, &zbc); - if (ret == 0) { - zio_checksum_verified(zio); - return; - } - - /* - * The checksum didn't match. Read all copies of all splits, and - * then we will try to reconstruct. The next time - * vdev_indirect_io_done() is called, iv_reconstruct will be set. - */ - vdev_indirect_read_all(zio); - - zio_vdev_io_redone(zio); -} - -vdev_ops_t vdev_indirect_ops = { - vdev_indirect_open, - vdev_indirect_close, - vdev_default_asize, - vdev_indirect_io_start, - vdev_indirect_io_done, - NULL, - NULL, - NULL, - NULL, - vdev_indirect_remap, - NULL, - VDEV_TYPE_INDIRECT, /* name of this vdev type */ - B_FALSE /* leaf vdev */ -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c deleted file mode 100644 index fbecbe830929..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include - -static boolean_t -vdev_indirect_births_verify(vdev_indirect_births_t *vib) -{ - ASSERT(vib != NULL); - - ASSERT(vib->vib_object != 0); - ASSERT(vib->vib_objset != NULL); - ASSERT(vib->vib_phys != NULL); - ASSERT(vib->vib_dbuf != NULL); - - EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL); - - return (B_TRUE); -} - -uint64_t -vdev_indirect_births_count(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - - return (vib->vib_phys->vib_count); -} - -uint64_t -vdev_indirect_births_object(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - - return (vib->vib_object); -} - -static uint64_t -vdev_indirect_births_size_impl(vdev_indirect_births_t *vib) -{ - return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries)); -} - -void -vdev_indirect_births_close(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - - if (vib->vib_phys->vib_count > 0) { - uint64_t births_size = vdev_indirect_births_size_impl(vib); - - kmem_free(vib->vib_entries, births_size); - vib->vib_entries = NULL; - } - - dmu_buf_rele(vib->vib_dbuf, vib); - - vib->vib_objset = NULL; - vib->vib_object = 0; - vib->vib_dbuf = NULL; - vib->vib_phys = NULL; - - kmem_free(vib, sizeof (*vib)); -} - -uint64_t -vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - - return (dmu_object_alloc(os, - DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t), - tx)); -} - -vdev_indirect_births_t * 
-vdev_indirect_births_open(objset_t *os, uint64_t births_object) -{ - vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); - - vib->vib_objset = os; - vib->vib_object = births_object; - - VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf)); - vib->vib_phys = vib->vib_dbuf->db_data; - - if (vib->vib_phys->vib_count > 0) { - uint64_t births_size = vdev_indirect_births_size_impl(vib); - vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); - VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, - births_size, vib->vib_entries, DMU_READ_PREFETCH)); - } - - ASSERT(vdev_indirect_births_verify(vib)); - - return (vib); -} - -void -vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - VERIFY0(dmu_object_free(os, object, tx)); -} - -void -vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, - uint64_t max_offset, uint64_t txg, dmu_tx_t *tx) -{ - vdev_indirect_birth_entry_phys_t vibe; - uint64_t old_size; - uint64_t new_size; - vdev_indirect_birth_entry_phys_t *new_entries; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); - ASSERT(vdev_indirect_births_verify(vib)); - - dmu_buf_will_dirty(vib->vib_dbuf, tx); - - vibe.vibe_offset = max_offset; - vibe.vibe_phys_birth_txg = txg; - - old_size = vdev_indirect_births_size_impl(vib); - dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe), - &vibe, tx); - vib->vib_phys->vib_count++; - new_size = vdev_indirect_births_size_impl(vib); - - new_entries = kmem_alloc(new_size, KM_SLEEP); - if (old_size > 0) { - bcopy(vib->vib_entries, new_entries, old_size); - kmem_free(vib->vib_entries, old_size); - } - new_entries[vib->vib_phys->vib_count - 1] = vibe; - vib->vib_entries = new_entries; -} - -uint64_t -vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - ASSERT(vib->vib_phys->vib_count > 0); - - vdev_indirect_birth_entry_phys_t *last = - 
&vib->vib_entries[vib->vib_phys->vib_count - 1]; - return (last->vibe_phys_birth_txg); -} - -/* - * Return the txg in which the given range was copied (i.e. its physical - * birth txg). The specified offset+asize must be contiguously mapped - * (i.e. not a split block). - * - * The entries are sorted by increasing phys_birth, and also by increasing - * offset. We find the specified offset by binary search. Note that we - * can not use bsearch() because looking at each entry independently is - * insufficient to find the correct entry. Each entry implicitly relies - * on the previous entry: an entry indicates that the offsets from the - * end of the previous entry to the end of this entry were written in the - * specified txg. - */ -uint64_t -vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset, - uint64_t asize) -{ - vdev_indirect_birth_entry_phys_t *base; - vdev_indirect_birth_entry_phys_t *last; - - ASSERT(vdev_indirect_births_verify(vib)); - ASSERT(vib->vib_phys->vib_count > 0); - - base = vib->vib_entries; - last = base + vib->vib_phys->vib_count - 1; - - ASSERT3U(offset, <, last->vibe_offset); - - while (last >= base) { - vdev_indirect_birth_entry_phys_t *p = - base + ((last - base) / 2); - if (offset >= p->vibe_offset) { - base = p + 1; - } else if (p == vib->vib_entries || - offset >= (p - 1)->vibe_offset) { - ASSERT3U(offset + asize, <=, p->vibe_offset); - return (p->vibe_phys_birth_txg); - } else { - last = p - 1; - } - } - ASSERT(!"offset not found"); - return (-1); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c deleted file mode 100644 index 3d0f1344dd88..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c +++ /dev/null @@ -1,593 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License 
("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -static boolean_t -vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) -{ - ASSERT(vim != NULL); - - ASSERT(vim->vim_object != 0); - ASSERT(vim->vim_objset != NULL); - ASSERT(vim->vim_phys != NULL); - ASSERT(vim->vim_dbuf != NULL); - - EQUIV(vim->vim_phys->vimp_num_entries > 0, - vim->vim_entries != NULL); - if (vim->vim_phys->vimp_num_entries > 0) { - vdev_indirect_mapping_entry_phys_t *last_entry = - &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; - uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry); - uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst); - - ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); - } - if (vim->vim_havecounts) { - ASSERT(vim->vim_phys->vimp_counts_object != 0); - } - - return (B_TRUE); -} - -uint64_t -vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_phys->vimp_num_entries); -} - -uint64_t -vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_phys->vimp_max_offset); -} - -uint64_t -vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_object); -} - -uint64_t -vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_phys->vimp_bytes_mapped); -} - -/* - * The length (in bytes) of the mapping object array in memory and - * 
(logically) on disk. - * - * Note that unlike most of our accessor functions, - * we don't assert that the struct is consistent; therefore it can be - * called while there may be concurrent changes, if we don't care about - * the value being immediately stale (e.g. from spa_removal_get_stats()). - */ -uint64_t -vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) -{ - return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); -} - -/* - * Compare an offset with an indirect mapping entry; there are three - * possible scenarios: - * - * 1. The offset is "less than" the mapping entry; meaning the - * offset is less than the source offset of the mapping entry. In - * this case, there is no overlap between the offset and the - * mapping entry and -1 will be returned. - * - * 2. The offset is "greater than" the mapping entry; meaning the - * offset is greater than the mapping entry's source offset plus - * the entry's size. In this case, there is no overlap between - * the offset and the mapping entry and 1 will be returned. - * - * NOTE: If the offset is actually equal to the entry's offset - * plus size, this is considered to be "greater" than the entry, - * and this case applies (i.e. 1 will be returned). Thus, the - * entry's "range" can be considered to be inclusive at its - * start, but exclusive at its end: e.g. [src, src + size). - * - * 3. The last case to consider is if the offset actually falls - * within the mapping entry's range. If this is the case, the - * offset is considered to be "equal to" the mapping entry and - * 0 will be returned. - * - * NOTE: If the offset is equal to the entry's source offset, - * this case applies and 0 will be returned. If the offset is - * equal to the entry's source plus its size, this case does - * *not* apply (see "NOTE" above for scenario 2), and 1 will be - * returned. 
- */ -static int -dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) -{ - const uint64_t *key = v_key; - const vdev_indirect_mapping_entry_phys_t *array_elem = - v_array_elem; - uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); - - if (*key < src_offset) { - return (-1); - } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { - return (0); - } else { - return (1); - } -} - -/* - * Returns the mapping entry for the given offset. - * - * It's possible that the given offset will not be in the mapping table - * (i.e. no mapping entries contain this offset), in which case, the - * return value value depends on the "next_if_missing" parameter. - * - * If the offset is not found in the table and "next_if_missing" is - * B_FALSE, then NULL will always be returned. The behavior is intended - * to allow consumers to get the entry corresponding to the offset - * parameter, iff the offset overlaps with an entry in the table. - * - * If the offset is not found in the table and "next_if_missing" is - * B_TRUE, then the entry nearest to the given offset will be returned, - * such that the entry's source offset is greater than the offset - * passed in (i.e. the "next" mapping entry in the table is returned, if - * the offset is missing from the table). If there are no entries whose - * source offset is greater than the passed in offset, NULL is returned. - */ -static vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, - uint64_t offset, boolean_t next_if_missing) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - ASSERT(vim->vim_phys->vimp_num_entries > 0); - - vdev_indirect_mapping_entry_phys_t *entry = NULL; - - uint64_t last = vim->vim_phys->vimp_num_entries - 1; - uint64_t base = 0; - - /* - * We don't define these inside of the while loop because we use - * their value in the case that offset isn't in the mapping. 
- */ - uint64_t mid; - int result; - - while (last >= base) { - mid = base + ((last - base) >> 1); - - result = dva_mapping_overlap_compare(&offset, - &vim->vim_entries[mid]); - - if (result == 0) { - entry = &vim->vim_entries[mid]; - break; - } else if (result < 0) { - last = mid - 1; - } else { - base = mid + 1; - } - } - - if (entry == NULL && next_if_missing) { - ASSERT3U(base, ==, last + 1); - ASSERT(mid == base || mid == last); - ASSERT3S(result, !=, 0); - - /* - * The offset we're looking for isn't actually contained - * in the mapping table, thus we need to return the - * closest mapping entry that is greater than the - * offset. We reuse the result of the last comparison, - * comparing the mapping entry at index "mid" and the - * offset. The offset is guaranteed to lie between - * indices one less than "mid", and one greater than - * "mid"; we just need to determine if offset is greater - * than, or less than the mapping entry contained at - * index "mid". - */ - - uint64_t index; - if (result < 0) - index = mid; - else - index = mid + 1; - - ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); - - if (index == vim->vim_phys->vimp_num_entries) { - /* - * If "index" is past the end of the entries - * array, then not only is the offset not in the - * mapping table, but it's actually greater than - * all entries in the table. In this case, we - * can't return a mapping entry greater than the - * offset (since none exist), so we return NULL. - */ - - ASSERT3S(dva_mapping_overlap_compare(&offset, - &vim->vim_entries[index - 1]), >, 0); - - return (NULL); - } else { - /* - * Just to be safe, we verify the offset falls - * in between the mapping entries at index and - * one less than index. Since we know the offset - * doesn't overlap an entry, and we're supposed - * to return the entry just greater than the - * offset, both of the following tests must be - * true. 
- */ - ASSERT3S(dva_mapping_overlap_compare(&offset, - &vim->vim_entries[index]), <, 0); - IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, - &vim->vim_entries[index - 1]) > 0); - - return (&vim->vim_entries[index]); - } - } else { - return (entry); - } -} - -vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, - uint64_t offset) -{ - return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, - B_FALSE)); -} - -vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, - uint64_t offset) -{ - return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, - B_TRUE)); -} - -void -vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - if (vim->vim_phys->vimp_num_entries > 0) { - uint64_t map_size = vdev_indirect_mapping_size(vim); - kmem_free(vim->vim_entries, map_size); - vim->vim_entries = NULL; - } - - dmu_buf_rele(vim->vim_dbuf, vim); - - vim->vim_objset = NULL; - vim->vim_object = 0; - vim->vim_dbuf = NULL; - vim->vim_phys = NULL; - - kmem_free(vim, sizeof (*vim)); -} - -uint64_t -vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) -{ - uint64_t object; - ASSERT(dmu_tx_is_syncing(tx)); - uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; - - if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - bonus_size = sizeof (vdev_indirect_mapping_phys_t); - } - - object = dmu_object_alloc(os, - DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, bonus_size, - tx); - - if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - dmu_buf_t *dbuf; - vdev_indirect_mapping_phys_t *vimp; - - VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - vimp = dbuf->db_data; - vimp->vimp_counts_object = dmu_object_alloc(os, - DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OT_NONE, 0, tx); - spa_feature_incr(os->os_spa, 
SPA_FEATURE_OBSOLETE_COUNTS, tx); - dmu_buf_rele(dbuf, FTAG); - } - - return (object); -} - - -vdev_indirect_mapping_t * -vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) -{ - vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); - dmu_object_info_t doi; - VERIFY0(dmu_object_info(os, mapping_object, &doi)); - - vim->vim_objset = os; - vim->vim_object = mapping_object; - - VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, - &vim->vim_dbuf)); - vim->vim_phys = vim->vim_dbuf->db_data; - - vim->vim_havecounts = - (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); - - if (vim->vim_phys->vimp_num_entries > 0) { - uint64_t map_size = vdev_indirect_mapping_size(vim); - vim->vim_entries = kmem_alloc(map_size, KM_SLEEP); - VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, - vim->vim_entries, DMU_READ_PREFETCH)); - } - - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim); -} - -void -vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); - if (vim->vim_havecounts) { - VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, - tx)); - spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - } - vdev_indirect_mapping_close(vim); - - VERIFY0(dmu_object_free(os, object, tx)); -} - -/* - * Append the list of vdev_indirect_mapping_entry_t's to the on-disk - * mapping object. Also remove the entries from the list and free them. - * This also implicitly extends the max_offset of the mapping (to the end - * of the last entry). 
- */ -void -vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, - list_t *list, dmu_tx_t *tx) -{ - vdev_indirect_mapping_entry_phys_t *mapbuf; - uint64_t old_size; - uint32_t *countbuf = NULL; - vdev_indirect_mapping_entry_phys_t *old_entries; - uint64_t old_count; - uint64_t entries_written = 0; - - ASSERT(vdev_indirect_mapping_verify(vim)); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); - ASSERT(!list_is_empty(list)); - - old_size = vdev_indirect_mapping_size(vim); - old_entries = vim->vim_entries; - old_count = vim->vim_phys->vimp_num_entries; - - dmu_buf_will_dirty(vim->vim_dbuf, tx); - - mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); - if (vim->vim_havecounts) { - countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); - ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, - SPA_FEATURE_OBSOLETE_COUNTS)); - } - while (!list_is_empty(list)) { - uint64_t i; - /* - * Write entries from the list to the - * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. - */ - for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { - vdev_indirect_mapping_entry_t *entry = - list_remove_head(list); - if (entry == NULL) - break; - - uint64_t size = - DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); - uint64_t src_offset = - DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); - - /* - * We shouldn't be adding an entry which is fully - * obsolete. 
- */ - ASSERT3U(entry->vime_obsolete_count, <, size); - IMPLY(entry->vime_obsolete_count != 0, - vim->vim_havecounts); - - mapbuf[i] = entry->vime_mapping; - if (vim->vim_havecounts) - countbuf[i] = entry->vime_obsolete_count; - - vim->vim_phys->vimp_bytes_mapped += size; - ASSERT3U(src_offset, >=, - vim->vim_phys->vimp_max_offset); - vim->vim_phys->vimp_max_offset = src_offset + size; - - entries_written++; - - kmem_free(entry, sizeof (*entry)); - } - dmu_write(vim->vim_objset, vim->vim_object, - vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), - i * sizeof (*mapbuf), - mapbuf, tx); - if (vim->vim_havecounts) { - dmu_write(vim->vim_objset, - vim->vim_phys->vimp_counts_object, - vim->vim_phys->vimp_num_entries * - sizeof (*countbuf), - i * sizeof (*countbuf), countbuf, tx); - } - vim->vim_phys->vimp_num_entries += i; - } - zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); - if (vim->vim_havecounts) - zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE); - - /* - * Update the entry array to reflect the new entries. First, copy - * over any old entries then read back the new entries we just wrote. - */ - uint64_t new_size = vdev_indirect_mapping_size(vim); - ASSERT3U(new_size, >, old_size); - ASSERT3U(new_size - old_size, ==, - entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); - vim->vim_entries = kmem_alloc(new_size, KM_SLEEP); - if (old_size > 0) { - bcopy(old_entries, vim->vim_entries, old_size); - kmem_free(old_entries, old_size); - } - VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, - new_size - old_size, &vim->vim_entries[old_count], - DMU_READ_PREFETCH)); - - zfs_dbgmsg("txg %llu: wrote %llu entries to " - "indirect mapping obj %llu; max offset=0x%llx", - (u_longlong_t)dmu_tx_get_txg(tx), - (u_longlong_t)entries_written, - (u_longlong_t)vim->vim_object, - (u_longlong_t)vim->vim_phys->vimp_max_offset); -} - -/* - * Increment the relevant counts for the specified offset and length. 
- * The counts array must be obtained from - * vdev_indirect_mapping_load_obsolete_counts(). - */ -void -vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, - uint64_t offset, uint64_t length, uint32_t *counts) -{ - vdev_indirect_mapping_entry_phys_t *mapping; - uint64_t index; - - mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); - - ASSERT(length > 0); - ASSERT3P(mapping, !=, NULL); - - index = mapping - vim->vim_entries; - - while (length > 0) { - ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); - - uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); - uint64_t inner_offset = offset - - DVA_MAPPING_GET_SRC_OFFSET(mapping); - VERIFY3U(inner_offset, <, size); - uint64_t inner_size = MIN(length, size - inner_offset); - - VERIFY3U(counts[index] + inner_size, <=, size); - counts[index] += inner_size; - - offset += inner_size; - length -= inner_size; - mapping++; - index++; - } -} - -typedef struct load_obsolete_space_map_arg { - vdev_indirect_mapping_t *losma_vim; - uint32_t *losma_counts; -} load_obsolete_space_map_arg_t; - -static int -load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) -{ - load_obsolete_space_map_arg_t *losma = arg; - ASSERT3S(sme->sme_type, ==, SM_ALLOC); - - vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, - sme->sme_offset, sme->sme_run, losma->losma_counts); - - return (0); -} - -/* - * Modify the counts (increment them) based on the spacemap. - */ -void -vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, - uint32_t *counts, space_map_t *obsolete_space_sm) -{ - load_obsolete_space_map_arg_t losma; - losma.losma_counts = counts; - losma.losma_vim = vim; - VERIFY0(space_map_iterate(obsolete_space_sm, - space_map_length(obsolete_space_sm), - load_obsolete_sm_callback, &losma)); -} - -/* - * Read the obsolete counts from disk, returning them in an array. 
- */ -uint32_t * -vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - uint64_t counts_size = - vim->vim_phys->vimp_num_entries * sizeof (uint32_t); - uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP); - if (vim->vim_havecounts) { - VERIFY0(dmu_read(vim->vim_objset, - vim->vim_phys->vimp_counts_object, - 0, counts_size, - counts, DMU_READ_PREFETCH)); - } else { - bzero(counts, counts_size); - } - return (counts); -} - -extern void -vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, - uint32_t *counts) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c deleted file mode 100644 index 34d959008bd5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c +++ /dev/null @@ -1,782 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Maximum number of metaslabs per group that can be initialized - * simultaneously. - */ -int max_initialize_ms = 3; - -/* - * Value that is written to disk during initialization. - */ -uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL; - -/* maximum number of I/Os outstanding per leaf vdev */ -int zfs_initialize_limit = 1; - -/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ -uint64_t zfs_initialize_chunk_size = 1024 * 1024; - -static boolean_t -vdev_initialize_should_stop(vdev_t *vd) -{ - return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); -} - -static void -vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) -{ - /* - * We pass in the guid instead of the vdev_t since the vdev may - * have been freed prior to the sync task being processed. This - * happens when a vdev is detached as we call spa_config_vdev_exit(), - * stop the intializing thread, schedule the sync task, and free - * the vdev. Later when the scheduled sync task is invoked, it would - * find that the vdev has been freed. 
- */ - uint64_t guid = *(uint64_t *)arg; - uint64_t txg = dmu_tx_get_txg(tx); - kmem_free(arg, sizeof (uint64_t)); - - vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) - return; - - uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; - vd->vdev_initialize_offset[txg & TXG_MASK] = 0; - - VERIFY(vd->vdev_leaf_zap != 0); - - objset_t *mos = vd->vdev_spa->spa_meta_objset; - - if (last_offset > 0) { - vd->vdev_initialize_last_offset = last_offset; - VERIFY0(zap_update(mos, vd->vdev_leaf_zap, - VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, - sizeof (last_offset), 1, &last_offset, tx)); - } - if (vd->vdev_initialize_action_time > 0) { - uint64_t val = (uint64_t)vd->vdev_initialize_action_time; - VERIFY0(zap_update(mos, vd->vdev_leaf_zap, - VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), - 1, &val, tx)); - } - - uint64_t initialize_state = vd->vdev_initialize_state; - VERIFY0(zap_update(mos, vd->vdev_leaf_zap, - VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, - &initialize_state, tx)); -} - -static void -vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) -{ - ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); - spa_t *spa = vd->vdev_spa; - - if (new_state == vd->vdev_initialize_state) - return; - - /* - * Copy the vd's guid, this will be freed by the sync task. - */ - uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - *guid = vd->vdev_guid; - - /* - * If we're suspending, then preserving the original start time. 
- */ - if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { - vd->vdev_initialize_action_time = gethrestime_sec(); - } - vd->vdev_initialize_state = new_state; - - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); - - switch (new_state) { - case VDEV_INITIALIZE_ACTIVE: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s activated", vd->vdev_path); - break; - case VDEV_INITIALIZE_SUSPENDED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s suspended", vd->vdev_path); - break; - case VDEV_INITIALIZE_CANCELED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s canceled", vd->vdev_path); - break; - case VDEV_INITIALIZE_COMPLETE: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s complete", vd->vdev_path); - break; - default: - panic("invalid state %llu", (unsigned long long)new_state); - } - - dmu_tx_commit(tx); -} - -static void -vdev_initialize_cb(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - mutex_enter(&vd->vdev_initialize_io_lock); - if (zio->io_error == ENXIO && !vdev_writeable(vd)) { - /* - * The I/O failed because the vdev was unavailable; roll the - * last offset back. (This works because spa_sync waits on - * spa_txg_zio before it runs sync tasks.) - */ - uint64_t *off = - &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; - *off = MIN(*off, zio->io_offset); - } else { - /* - * Since initializing is best-effort, we ignore I/O errors and - * rely on vdev_probe to determine if the errors are more - * critical. 
- */ - if (zio->io_error != 0) - vd->vdev_stat.vs_initialize_errors++; - - vd->vdev_initialize_bytes_done += zio->io_orig_size; - } - ASSERT3U(vd->vdev_initialize_inflight, >, 0); - vd->vdev_initialize_inflight--; - cv_broadcast(&vd->vdev_initialize_io_cv); - mutex_exit(&vd->vdev_initialize_io_lock); - - spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); -} - -/* Takes care of physical writing and limiting # of concurrent ZIOs. */ -static int -vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) -{ - spa_t *spa = vd->vdev_spa; - - /* Limit inflight initializing I/Os */ - mutex_enter(&vd->vdev_initialize_io_lock); - while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { - cv_wait(&vd->vdev_initialize_io_cv, - &vd->vdev_initialize_io_lock); - } - vd->vdev_initialize_inflight++; - mutex_exit(&vd->vdev_initialize_io_lock); - - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - uint64_t txg = dmu_tx_get_txg(tx); - - spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); - mutex_enter(&vd->vdev_initialize_lock); - - if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { - uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - *guid = vd->vdev_guid; - - /* This is the first write of this txg. */ - dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_initialize_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); - } - - /* - * We know the vdev struct will still be around since all - * consumers of vdev_free must stop the initialization first. 
- */ - if (vdev_initialize_should_stop(vd)) { - mutex_enter(&vd->vdev_initialize_io_lock); - ASSERT3U(vd->vdev_initialize_inflight, >, 0); - vd->vdev_initialize_inflight--; - mutex_exit(&vd->vdev_initialize_io_lock); - spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); - mutex_exit(&vd->vdev_initialize_lock); - dmu_tx_commit(tx); - return (SET_ERROR(EINTR)); - } - mutex_exit(&vd->vdev_initialize_lock); - - vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; - zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, - size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, - ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); - /* vdev_initialize_cb releases SCL_STATE_ALL */ - - dmu_tx_commit(tx); - - return (0); -} - -/* - * Translate a logical range to the physical range for the specified vdev_t. - * This function is initially called with a leaf vdev and will walk each - * parent vdev until it reaches a top-level vdev. Once the top-level is - * reached the physical range is initialized and the recursive function - * begins to unwind. As it unwinds it calls the parent's vdev specific - * translation function to do the real conversion. - */ -void -vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) -{ - /* - * Walk up the vdev tree - */ - if (vd != vd->vdev_top) { - vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); - } else { - /* - * We've reached the top-level vdev, initialize the - * physical range to the logical range and start to - * unwind. - */ - physical_rs->rs_start = logical_rs->rs_start; - physical_rs->rs_end = logical_rs->rs_end; - return; - } - - vdev_t *pvd = vd->vdev_parent; - ASSERT3P(pvd, !=, NULL); - ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); - - /* - * As this recursive function unwinds, translate the logical - * range into its physical components by calling the - * vdev specific translate function. 
- */ - range_seg_t intermediate = { 0 }; - pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); - - physical_rs->rs_start = intermediate.rs_start; - physical_rs->rs_end = intermediate.rs_end; -} - -/* - * Callback to fill each ABD chunk with zfs_initialize_value. len must be - * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD - * allocation will guarantee these for us. - */ -/* ARGSUSED */ -static int -vdev_initialize_block_fill(void *buf, size_t len, void *unused) -{ - ASSERT0(len % sizeof (uint64_t)); - for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { - *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; - } - return (0); -} - -static abd_t * -vdev_initialize_block_alloc() -{ - /* Allocate ABD for filler data */ - abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); - - ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); - (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, - vdev_initialize_block_fill, NULL); - - return (data); -} - -static void -vdev_initialize_block_free(abd_t *data) -{ - abd_free(data); -} - -static int -vdev_initialize_ranges(vdev_t *vd, abd_t *data) -{ - avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; - - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; - - /* Split range into legally-sized physical chunks */ - uint64_t writes_required = - ((size - 1) / zfs_initialize_chunk_size) + 1; - - for (uint64_t w = 0; w < writes_required; w++) { - int error; - - error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + rs->rs_start + - (w * zfs_initialize_chunk_size), - MIN(size - (w * zfs_initialize_chunk_size), - zfs_initialize_chunk_size), data); - if (error != 0) - return (error); - } - } - return (0); -} - -static void -vdev_initialize_mg_wait(metaslab_group_t *mg) -{ - ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); - while (mg->mg_initialize_updating) { - cv_wait(&mg->mg_ms_initialize_cv, 
&mg->mg_ms_initialize_lock); - } -} - -static void -vdev_initialize_mg_mark(metaslab_group_t *mg) -{ - ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); - ASSERT(mg->mg_initialize_updating); - - while (mg->mg_ms_initializing >= max_initialize_ms) { - cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); - } - mg->mg_ms_initializing++; - ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); -} - -/* - * Mark the metaslab as being initialized to prevent any allocations - * on this metaslab. We must also track how many metaslabs are currently - * being initialized within a metaslab group and limit them to prevent - * allocation failures from occurring because all metaslabs are being - * initialized. - */ -static void -vdev_initialize_ms_mark(metaslab_t *msp) -{ - ASSERT(!MUTEX_HELD(&msp->ms_lock)); - metaslab_group_t *mg = msp->ms_group; - - mutex_enter(&mg->mg_ms_initialize_lock); - - /* - * To keep an accurate count of how many threads are initializing - * a specific metaslab group, we only allow one thread to mark - * the metaslab group at a time. This ensures that the value of - * ms_initializing will be accurate when we decide to mark a metaslab - * group as being initialized. To do this we force all other threads - * to wait till the metaslab's mg_initialize_updating flag is no - * longer set. 
- */ - vdev_initialize_mg_wait(mg); - mg->mg_initialize_updating = B_TRUE; - if (msp->ms_initializing == 0) { - vdev_initialize_mg_mark(mg); - } - mutex_enter(&msp->ms_lock); - msp->ms_initializing++; - mutex_exit(&msp->ms_lock); - - mg->mg_initialize_updating = B_FALSE; - cv_broadcast(&mg->mg_ms_initialize_cv); - mutex_exit(&mg->mg_ms_initialize_lock); -} - -static void -vdev_initialize_ms_unmark(metaslab_t *msp) -{ - ASSERT(!MUTEX_HELD(&msp->ms_lock)); - metaslab_group_t *mg = msp->ms_group; - mutex_enter(&mg->mg_ms_initialize_lock); - mutex_enter(&msp->ms_lock); - if (--msp->ms_initializing == 0) { - mg->mg_ms_initializing--; - cv_broadcast(&mg->mg_ms_initialize_cv); - } - mutex_exit(&msp->ms_lock); - mutex_exit(&mg->mg_ms_initialize_lock); -} - -static void -vdev_initialize_calculate_progress(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || - spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); - ASSERT(vd->vdev_leaf_zap != 0); - - vd->vdev_initialize_bytes_est = 0; - vd->vdev_initialize_bytes_done = 0; - - for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { - metaslab_t *msp = vd->vdev_top->vdev_ms[i]; - mutex_enter(&msp->ms_lock); - - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; - - /* - * Convert the metaslab range to a physical range - * on our vdev. We use this to determine if we are - * in the middle of this metaslab range. 
- */ - range_seg_t logical_rs, physical_rs; - logical_rs.rs_start = msp->ms_start; - logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); - - if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { - vd->vdev_initialize_bytes_est += ms_free; - mutex_exit(&msp->ms_lock); - continue; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += ms_free; - vd->vdev_initialize_bytes_est += ms_free; - mutex_exit(&msp->ms_lock); - continue; - } - - /* - * If we get here, we're in the middle of initializing this - * metaslab. Load it and walk the free tree for more accurate - * progress estimation. - */ - VERIFY0(metaslab_load(msp)); - - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; - vdev_xlate(vd, &logical_rs, &physical_rs); - - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_initialize_bytes_est += size; - if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += size; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_start && - vd->vdev_initialize_last_offset < - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += - vd->vdev_initialize_last_offset - - physical_rs.rs_start; - } - } - mutex_exit(&msp->ms_lock); - } -} - -static void -vdev_initialize_load(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || - spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); - ASSERT(vd->vdev_leaf_zap != 0); - - if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || - vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, - vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, - sizeof (vd->vdev_initialize_last_offset), 1, - &vd->vdev_initialize_last_offset); - ASSERT(err 
== 0 || err == ENOENT); - } - - vdev_initialize_calculate_progress(vd); -} - - -/* - * Convert the logical range into a physcial range and add it to our - * avl tree. - */ -void -vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) -{ - vdev_t *vd = arg; - range_seg_t logical_rs, physical_rs; - logical_rs.rs_start = start; - logical_rs.rs_end = start + size; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) - return; - - /* Pick up where we left off mid-range. */ - if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { - zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " - "(%llu, %llu)", vd->vdev_path, - (u_longlong_t)physical_rs.rs_start, - (u_longlong_t)physical_rs.rs_end, - (u_longlong_t)vd->vdev_initialize_last_offset, - (u_longlong_t)physical_rs.rs_end); - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_initialize_last_offset); - physical_rs.rs_start = vd->vdev_initialize_last_offset; - } - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. 
- */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } -} - -static void -vdev_initialize_thread(void *arg) -{ - vdev_t *vd = arg; - spa_t *spa = vd->vdev_spa; - int error = 0; - uint64_t ms_count = 0; - - ASSERT(vdev_is_concrete(vd)); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - vd->vdev_initialize_last_offset = 0; - vdev_initialize_load(vd); - - abd_t *deadbeef = vdev_initialize_block_alloc(); - - vd->vdev_initialize_tree = range_tree_create(NULL, NULL); - - for (uint64_t i = 0; !vd->vdev_detached && - i < vd->vdev_top->vdev_ms_count; i++) { - metaslab_t *msp = vd->vdev_top->vdev_ms[i]; - - /* - * If we've expanded the top-level vdev or it's our - * first pass, calculate our progress. - */ - if (vd->vdev_top->vdev_ms_count != ms_count) { - vdev_initialize_calculate_progress(vd); - ms_count = vd->vdev_top->vdev_ms_count; - } - - vdev_initialize_ms_mark(msp); - mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp)); - - range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, - vd); - mutex_exit(&msp->ms_lock); - - spa_config_exit(spa, SCL_CONFIG, FTAG); - error = vdev_initialize_ranges(vd, deadbeef); - vdev_initialize_ms_unmark(msp); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); - if (error != 0) - break; - } - - spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_initialize_io_lock); - while (vd->vdev_initialize_inflight > 0) { - cv_wait(&vd->vdev_initialize_io_cv, - &vd->vdev_initialize_io_lock); - } - mutex_exit(&vd->vdev_initialize_io_lock); - - range_tree_destroy(vd->vdev_initialize_tree); - vdev_initialize_block_free(deadbeef); - vd->vdev_initialize_tree = NULL; - - mutex_enter(&vd->vdev_initialize_lock); - if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { - 
vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); - } - ASSERT(vd->vdev_initialize_thread != NULL || - vd->vdev_initialize_inflight == 0); - - /* - * Drop the vdev_initialize_lock while we sync out the - * txg since it's possible that a device might be trying to - * come online and must check to see if it needs to restart an - * initialization. That thread will be holding the spa_config_lock - * which would prevent the txg_wait_synced from completing. - */ - mutex_exit(&vd->vdev_initialize_lock); - txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&vd->vdev_initialize_lock); - - vd->vdev_initialize_thread = NULL; - cv_broadcast(&vd->vdev_initialize_cv); - mutex_exit(&vd->vdev_initialize_lock); - thread_exit(); -} - -/* - * Initiates a device. Caller must hold vdev_initialize_lock. - * Device must be a leaf and not already be initializing. - */ -void -vdev_initialize(vdev_t *vd) -{ - ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); - ASSERT(vd->vdev_ops->vdev_op_leaf); - ASSERT(vdev_is_concrete(vd)); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - ASSERT(!vd->vdev_detached); - ASSERT(!vd->vdev_initialize_exit_wanted); - ASSERT(!vd->vdev_top->vdev_removing); - - vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); - vd->vdev_initialize_thread = thread_create(NULL, 0, - vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); -} - -/* - * Stop initializng a device, with the resultant initialing state being - * tgt_state. Blocks until the initializing thread has exited. - * Caller must hold vdev_initialize_lock and must not be writing to the spa - * config, as the initializing thread may try to enter the config as a reader - * before exiting. 
- */ -void -vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) -{ - spa_t *spa = vd->vdev_spa; - ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); - - ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); - ASSERT(vd->vdev_ops->vdev_op_leaf); - ASSERT(vdev_is_concrete(vd)); - - /* - * Allow cancel requests to proceed even if the initialize thread - * has stopped. - */ - if (vd->vdev_initialize_thread == NULL && - tgt_state != VDEV_INITIALIZE_CANCELED) { - return; - } - - vdev_initialize_change_state(vd, tgt_state); - vd->vdev_initialize_exit_wanted = B_TRUE; - while (vd->vdev_initialize_thread != NULL) - cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); - - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - vd->vdev_initialize_exit_wanted = B_FALSE; -} - -static void -vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) -{ - if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { - mutex_enter(&vd->vdev_initialize_lock); - vdev_initialize_stop(vd, tgt_state); - mutex_exit(&vd->vdev_initialize_lock); - return; - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); - } -} - -/* - * Convenience function to stop initializing of a vdev tree and set all - * initialize thread pointers to NULL. 
- */ -void -vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) -{ - vdev_initialize_stop_all_impl(vd, tgt_state); - - if (vd->vdev_spa->spa_sync_on) { - /* Make sure that our state has been synced to disk */ - txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); - } -} - -void -vdev_initialize_restart(vdev_t *vd) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - - if (vd->vdev_leaf_zap != 0) { - mutex_enter(&vd->vdev_initialize_lock); - uint64_t initialize_state = VDEV_INITIALIZE_NONE; - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, - vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, - sizeof (initialize_state), 1, &initialize_state); - ASSERT(err == 0 || err == ENOENT); - vd->vdev_initialize_state = initialize_state; - - uint64_t timestamp = 0; - err = zap_lookup(vd->vdev_spa->spa_meta_objset, - vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, - sizeof (timestamp), 1, ×tamp); - ASSERT(err == 0 || err == ENOENT); - vd->vdev_initialize_action_time = (time_t)timestamp; - - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { - /* load progress for reporting, but don't resume */ - vdev_initialize_load(vd); - } else if (vd->vdev_initialize_state == - VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { - vdev_initialize(vd); - } - - mutex_exit(&vd->vdev_initialize_lock); - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - vdev_initialize_restart(vd->vdev_child[i]); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c deleted file mode 100644 index 0b777c8870c5..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ /dev/null @@ -1,1701 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - * Copyright 2019 Joyent, Inc. - */ - -/* - * Virtual Device Labels - * --------------------- - * - * The vdev label serves several distinct purposes: - * - * 1. Uniquely identify this device as part of a ZFS pool and confirm its - * identity within the pool. - * - * 2. Verify that all the devices given in a configuration are present - * within the pool. - * - * 3. Determine the uberblock for the pool. - * - * 4. In case of an import operation, determine the configuration of the - * toplevel vdev of which it is a part. - * - * 5. If an import operation cannot find all the devices in the pool, - * provide enough information to the administrator to determine which - * devices are missing. - * - * It is important to note that while the kernel is responsible for writing the - * label, it only consumes the information in the first three cases. The - * latter information is only consumed in userland when determining the - * configuration to import a pool. 
- * - * - * Label Organization - * ------------------ - * - * Before describing the contents of the label, it's important to understand how - * the labels are written and updated with respect to the uberblock. - * - * When the pool configuration is altered, either because it was newly created - * or a device was added, we want to update all the labels such that we can deal - * with fatal failure at any point. To this end, each disk has two labels which - * are updated before and after the uberblock is synced. Assuming we have - * labels and an uberblock with the following transaction groups: - * - * L1 UB L2 - * +------+ +------+ +------+ - * | | | | | | - * | t10 | | t10 | | t10 | - * | | | | | | - * +------+ +------+ +------+ - * - * In this stable state, the labels and the uberblock were all updated within - * the same transaction group (10). Each label is mirrored and checksummed, so - * that we can detect when we fail partway through writing the label. - * - * In order to identify which labels are valid, the labels are written in the - * following manner: - * - * 1. For each vdev, update 'L1' to the new label - * 2. Update the uberblock - * 3. For each vdev, update 'L2' to the new label - * - * Given arbitrary failure, we can determine the correct label to use based on - * the transaction group. If we fail after updating L1 but before updating the - * UB, we will notice that L1's transaction group is greater than the uberblock, - * so L2 must be valid. If we fail after writing the uberblock but before - * writing L2, we will notice that L2's transaction group is less than L1, and - * therefore L1 is valid. - * - * Another added complexity is that not every label is updated when the config - * is synced. If we add a single device, we do not want to have to re-write - * every label for every device in the pool. This means that both L1 and L2 may - * be older than the pool uberblock, because the necessary information is stored - * on another vdev. 
- * - * - * On-disk Format - * -------------- - * - * The vdev label consists of two distinct parts, and is wrapped within the - * vdev_label_t structure. The label includes 8k of padding to permit legacy - * VTOC disk labels, but is otherwise ignored. - * - * The first half of the label is a packed nvlist which contains pool wide - * properties, per-vdev properties, and configuration information. It is - * described in more detail below. - * - * The latter half of the label consists of a redundant array of uberblocks. - * These uberblocks are updated whenever a transaction group is committed, - * or when the configuration is updated. When a pool is loaded, we scan each - * vdev for the 'best' uberblock. - * - * - * Configuration Information - * ------------------------- - * - * The nvlist describing the pool and vdev contains the following elements: - * - * version ZFS on-disk version - * name Pool name - * state Pool state - * txg Transaction group in which this label was written - * pool_guid Unique identifier for this pool - * vdev_tree An nvlist describing vdev tree. - * features_for_read - * An nvlist of the features necessary for reading the MOS. - * - * Each leaf device label also contains the following: - * - * top_guid Unique ID for top-level vdev in which this is contained - * guid Unique ID for the leaf vdev - * - * The 'vs' configuration follows the format described in 'spa_config.c'. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static boolean_t vdev_trim_on_init = B_TRUE; -SYSCTL_DECL(_vfs_zfs_vdev); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RWTUN, - &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); - -/* - * Basic routines to read and write from a vdev label. - * Used throughout the rest of this file. 
- */ -uint64_t -vdev_label_offset(uint64_t psize, int l, uint64_t offset) -{ - ASSERT(offset < sizeof (vdev_label_t)); - ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); - - return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? - 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); -} - -/* - * Returns back the vdev label associated with the passed in offset. - */ -int -vdev_label_number(uint64_t psize, uint64_t offset) -{ - int l; - - if (offset >= psize - VDEV_LABEL_END_SIZE) { - offset -= psize - VDEV_LABEL_END_SIZE; - offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); - } - l = offset / sizeof (vdev_label_t); - return (l < VDEV_LABELS ? l : -1); -} - -static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) -{ - ASSERT( - spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || - spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); - ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); - - zio_nowait(zio_read_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), - size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); -} - -void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) -{ - ASSERT( - spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || - spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); - ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); - - zio_nowait(zio_write_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), - size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); -} - -static void -root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) -{ - spa_t *spa = vd->vdev_spa; - - if (vd != spa->spa_root_vdev) - return; - - /* provide either current or previous scan information */ - pool_scan_stat_t ps; - if 
(spa_scan_get_stats(spa, &ps) == 0) { - fnvlist_add_uint64_array(nvl, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, - sizeof (pool_scan_stat_t) / sizeof (uint64_t)); - } - - pool_removal_stat_t prs; - if (spa_removal_get_stats(spa, &prs) == 0) { - fnvlist_add_uint64_array(nvl, - ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, - sizeof (prs) / sizeof (uint64_t)); - } - - pool_checkpoint_stat_t pcs; - if (spa_checkpoint_get_stats(spa, &pcs) == 0) { - fnvlist_add_uint64_array(nvl, - ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, - sizeof (pcs) / sizeof (uint64_t)); - } -} - -/* - * Generate the nvlist representing this vdev's config. - */ -nvlist_t * -vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - vdev_config_flag_t flags) -{ - nvlist_t *nv = NULL; - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - nv = fnvlist_alloc(); - - fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); - if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); - - if (vd->vdev_path != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); - - if (vd->vdev_devid != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); - - if (vd->vdev_physpath != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, - vd->vdev_physpath); - - if (vd->vdev_fru != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); - - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. 
- */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. - */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - vd->vdev_wholedisk); - - if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); - - if (vd->vdev_isspare) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); - - if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && - vd == vd->vdev_top) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, - vd->vdev_ms_array); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, - vd->vdev_ms_shift); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, - vd->vdev_asize); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, - vd->vdev_removing); - } - - /* zpool command expects alloc class data */ - if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { - const char *bias = NULL; - - switch (vd->vdev_alloc_bias) { - case VDEV_BIAS_LOG: - bias = VDEV_ALLOC_BIAS_LOG; - break; - case VDEV_BIAS_SPECIAL: - bias = VDEV_ALLOC_BIAS_SPECIAL; - break; - case VDEV_BIAS_DEDUP: - bias = VDEV_ALLOC_BIAS_DEDUP; - break; - default: - ASSERT3U(vd->vdev_alloc_bias, ==, - VDEV_BIAS_NONE); - } - fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, - bias); - } - } - - if (vd->vdev_dtl_sm != NULL) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - space_map_object(vd->vdev_dtl_sm)); - } - - if (vic->vic_mapping_object != 0) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, - 
vic->vic_mapping_object); - } - - if (vic->vic_births_object != 0) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, - vic->vic_births_object); - } - - if (vic->vic_prev_indirect_vdev != UINT64_MAX) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, - vic->vic_prev_indirect_vdev); - } - - if (vd->vdev_crtxg) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); - - if (flags & VDEV_CONFIG_MOS) { - if (vd->vdev_leaf_zap != 0) { - ASSERT(vd->vdev_ops->vdev_op_leaf); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, - vd->vdev_leaf_zap); - } - - if (vd->vdev_top_zap != 0) { - ASSERT(vd == vd->vdev_top); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, - vd->vdev_top_zap); - } - } - - if (getstats) { - vdev_stat_t vs; - - vdev_get_stats(vd, &vs); - fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); - - root_vdev_actions_getprogress(vd, nv); - - /* - * Note: this can be called from open context - * (spa_get_stats()), so we need the rwlock to prevent - * the mapping from being changed by condensing. - */ - rw_enter(&vd->vdev_indirect_rwlock, RW_READER); - if (vd->vdev_indirect_mapping != NULL) { - ASSERT(vd->vdev_indirect_births != NULL); - vdev_indirect_mapping_t *vim = - vd->vdev_indirect_mapping; - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, - vdev_indirect_mapping_size(vim)); - } - rw_exit(&vd->vdev_indirect_rwlock); - if (vd->vdev_mg != NULL && - vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { - /* - * Compute approximately how much memory would be used - * for the indirect mapping if this device were to - * be removed. - * - * Note: If the frag metric is invalid, then not - * enough metaslabs have been converted to have - * histograms. - */ - uint64_t seg_count = 0; - uint64_t to_alloc = vd->vdev_stat.vs_alloc; - - /* - * There are the same number of allocated segments - * as free segments, so we will have at least one - * entry per free segment. 
However, small free - * segments (smaller than vdev_removal_max_span) - * will be combined with adjacent allocated segments - * as a single mapping. - */ - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - if (1ULL << (i + 1) < vdev_removal_max_span) { - to_alloc += - vd->vdev_mg->mg_histogram[i] << - i + 1; - } else { - seg_count += - vd->vdev_mg->mg_histogram[i]; - } - } - - /* - * The maximum length of a mapping is - * zfs_remove_max_segment, so we need at least one entry - * per zfs_remove_max_segment of allocated data. - */ - seg_count += to_alloc / zfs_remove_max_segment; - - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, - seg_count * - sizeof (vdev_indirect_mapping_entry_phys_t)); - } - } - - if (!vd->vdev_ops->vdev_op_leaf) { - nvlist_t **child; - int c, idx; - - ASSERT(!vd->vdev_ishole); - - child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), - KM_SLEEP); - - for (c = 0, idx = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - /* - * If we're generating an nvlist of removing - * vdevs then skip over any device which is - * not being removed. 
- */ - if ((flags & VDEV_CONFIG_REMOVING) && - !cvd->vdev_removing) - continue; - - child[idx++] = vdev_config_generate(spa, cvd, - getstats, flags); - } - - if (idx) { - fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, idx); - } - - for (c = 0; c < idx; c++) - nvlist_free(child[c]); - - kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); - - } else { - const char *aux = NULL; - - if (vd->vdev_offline && !vd->vdev_tmpoffline) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); - if (vd->vdev_resilver_txg != 0) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, - vd->vdev_resilver_txg); - if (vd->vdev_faulted) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); - if (vd->vdev_degraded) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); - if (vd->vdev_removed) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); - if (vd->vdev_unspare) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); - if (vd->vdev_ishole) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); - - switch (vd->vdev_stat.vs_aux) { - case VDEV_AUX_ERR_EXCEEDED: - aux = "err_exceeded"; - break; - - case VDEV_AUX_EXTERNAL: - aux = "external"; - break; - } - - if (aux != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); - - if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, - vd->vdev_orig_guid); - } - } - - return (nv); -} - -/* - * Generate a view of the top-level vdevs. If we currently have holes - * in the namespace, then generate an array which contains a list of holey - * vdevs. Additionally, add the number of top-level children that currently - * exist. 
- */ -void -vdev_top_config_generate(spa_t *spa, nvlist_t *config) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t *array; - uint_t c, idx; - - array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); - - for (c = 0, idx = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - if (tvd->vdev_ishole) { - array[idx++] = c; - } - } - - if (idx) { - VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, - array, idx) == 0); - } - - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - rvd->vdev_children) == 0); - - kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); -} - -/* - * Returns the configuration from the label of the given vdev. For vdevs - * which don't have a txg value stored on their label (i.e. spares/cache) - * or have not been completely initialized (txg = 0) just return - * the configuration from the first valid label we find. Otherwise, - * find the most up-to-date label that does not exceed the specified - * 'txg' value. 
- */ -nvlist_t * -vdev_label_read_config(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *config = NULL; - vdev_phys_t *vp; - abd_t *vp_abd; - zio_t *zio; - uint64_t best_txg = 0; - uint64_t label_txg = 0; - int error = 0; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - if (!vdev_readable(vd)) - return (NULL); - - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - vp = abd_to_buf(vp_abd); - -retry: - for (int l = 0; l < VDEV_LABELS; l++) { - nvlist_t *label = NULL; - - zio = zio_root(spa, NULL, NULL, flags); - - vdev_label_read(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL, flags); - - if (zio_wait(zio) == 0 && - nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), - &label, 0) == 0) { - /* - * Auxiliary vdevs won't have txg values in their - * labels and newly added vdevs may not have been - * completely initialized so just return the - * configuration from the first valid label we - * encounter. - */ - error = nvlist_lookup_uint64(label, - ZPOOL_CONFIG_POOL_TXG, &label_txg); - if ((error || label_txg == 0) && !config) { - config = label; - break; - } else if (label_txg <= txg && label_txg > best_txg) { - best_txg = label_txg; - nvlist_free(config); - config = fnvlist_dup(label); - } - } - - if (label != NULL) { - nvlist_free(label); - label = NULL; - } - } - - if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - - /* - * We found a valid label but it didn't pass txg restrictions. - */ - if (config == NULL && label_txg != 0) { - vdev_dbgmsg(vd, "label discarded as txg is too large " - "(%llu > %llu)", (u_longlong_t)label_txg, - (u_longlong_t)txg); - } - - abd_free(vp_abd); - - return (config); -} - -/* - * Determine if a device is in use. 
The 'spare_guid' parameter will be filled - * in with the device guid if this spare is active elsewhere on the system. - */ -static boolean_t -vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, - uint64_t *spare_guid, uint64_t *l2cache_guid) -{ - spa_t *spa = vd->vdev_spa; - uint64_t state, pool_guid, device_guid, txg, spare_pool; - uint64_t vdtxg = 0; - nvlist_t *label; - - if (spare_guid) - *spare_guid = 0ULL; - if (l2cache_guid) - *l2cache_guid = 0ULL; - - /* - * Read the label, if any, and perform some basic sanity checks. - */ - if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) - return (B_FALSE); - - (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - &vdtxg); - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &device_guid) != 0) { - nvlist_free(label); - return (B_FALSE); - } - - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, - &pool_guid) != 0 || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0)) { - nvlist_free(label); - return (B_FALSE); - } - - nvlist_free(label); - - /* - * Check to see if this device indeed belongs to the pool it claims to - * be a part of. The only way this is allowed is if the device is a hot - * spare (which we check for later on). - */ - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - !spa_guid_exists(pool_guid, device_guid) && - !spa_spare_exists(device_guid, NULL, NULL) && - !spa_l2cache_exists(device_guid, NULL)) - return (B_FALSE); - - /* - * If the transaction group is zero, then this an initialized (but - * unused) label. This is only an error if the create transaction - * on-disk is the same as the one we're using now, in which case the - * user has attempted to add the same vdev multiple times in the same - * transaction. 
- */ - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - txg == 0 && vdtxg == crtxg) - return (B_TRUE); - - /* - * Check to see if this is a spare device. We do an explicit check for - * spa_has_spare() here because it may be on our pending list of spares - * to add. We also check if it is an l2cache device. - */ - if (spa_spare_exists(device_guid, &spare_pool, NULL) || - spa_has_spare(spa, device_guid)) { - if (spare_guid) - *spare_guid = device_guid; - - switch (reason) { - case VDEV_LABEL_CREATE: - case VDEV_LABEL_L2CACHE: - return (B_TRUE); - - case VDEV_LABEL_REPLACE: - return (!spa_has_spare(spa, device_guid) || - spare_pool != 0ULL); - - case VDEV_LABEL_SPARE: - return (spa_has_spare(spa, device_guid)); - } - } - - /* - * Check to see if this is an l2cache device. - */ - if (spa_l2cache_exists(device_guid, NULL)) - return (B_TRUE); - - /* - * We can't rely on a pool's state if it's been imported - * read-only. Instead we look to see if the pools is marked - * read-only in the namespace and set the state to active. - */ - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (spa = spa_by_guid(pool_guid, device_guid)) != NULL && - spa_mode(spa) == FREAD) - state = POOL_STATE_ACTIVE; - - /* - * If the device is marked ACTIVE, then this device is in use by another - * pool on the system. - */ - return (state == POOL_STATE_ACTIVE); -} - -/* - * Initialize a vdev label. We check to make sure each leaf device is not in - * use, and writable. We put down an initial label which we will later - * overwrite with a complete label. Note that it's important to do this - * sequentially, not in parallel, so that we catch cases of multiple use of the - * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with - * itself. 
- */ -int -vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *label; - vdev_phys_t *vp; - abd_t *vp_abd; - abd_t *bootenv; - uberblock_t *ub; - abd_t *ub_abd; - zio_t *zio; - char *buf; - size_t buflen; - int error; - uint64_t spare_guid, l2cache_guid; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - for (int c = 0; c < vd->vdev_children; c++) - if ((error = vdev_label_init(vd->vdev_child[c], - crtxg, reason)) != 0) - return (error); - - /* Track the creation time for this vdev */ - vd->vdev_crtxg = crtxg; - - if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) - return (0); - - /* - * Dead vdevs cannot be initialized. - */ - if (vdev_is_dead(vd)) - return (SET_ERROR(EIO)); - - /* - * Determine if the vdev is in use. - */ - if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && - vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) - return (SET_ERROR(EBUSY)); - - /* - * If this is a request to add or replace a spare or l2cache device - * that is in use elsewhere on the system, then we must update the - * guid (which was initialized to a random value) to reflect the - * actual GUID (which is shared between multiple pools). - */ - if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && - spare_guid != 0ULL) { - uint64_t guid_delta = spare_guid - vd->vdev_guid; - - vd->vdev_guid += guid_delta; - - for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum += guid_delta; - - /* - * If this is a replacement, then we want to fallthrough to the - * rest of the code. If we're adding a spare, then it's already - * labeled appropriately and we can just return. 
- */ - if (reason == VDEV_LABEL_SPARE) - return (0); - ASSERT(reason == VDEV_LABEL_REPLACE || - reason == VDEV_LABEL_SPLIT); - } - - if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && - l2cache_guid != 0ULL) { - uint64_t guid_delta = l2cache_guid - vd->vdev_guid; - - vd->vdev_guid += guid_delta; - - for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum += guid_delta; - - /* - * If this is a replacement, then we want to fallthrough to the - * rest of the code. If we're adding an l2cache, then it's - * already labeled appropriately and we can just return. - */ - if (reason == VDEV_LABEL_L2CACHE) - return (0); - ASSERT(reason == VDEV_LABEL_REPLACE); - } - - /* - * TRIM the whole thing, excluding the blank space and boot header - * as specified by ZFS On-Disk Specification (section 1.3), so that - * we start with a clean slate. - * It's just an optimization, so we don't care if it fails. - * Don't TRIM if removing so that we don't interfere with zpool - * disaster recovery. - */ - if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim && - (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE || - reason == VDEV_LABEL_L2CACHE)) - zio_wait(zio_trim(NULL, spa, vd, VDEV_SKIP_SIZE, - vd->vdev_psize - VDEV_SKIP_SIZE)); - - /* - * Initialize its label. - */ - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - abd_zero(vp_abd, sizeof (vdev_phys_t)); - vp = abd_to_buf(vp_abd); - - /* - * Generate a label describing the pool and our top-level vdev. - * We mark it as being from txg 0 to indicate that it's not - * really part of an active pool just yet. The labels will - * be written again with a meaningful txg by spa_sync(). - */ - if (reason == VDEV_LABEL_SPARE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { - /* - * For inactive hot spares, we generate a special label that - * identifies as a mutually shared hot spare. 
We write the - * label if we are adding a hot spare, or if we are removing an - * active hot spare (in which case we want to revert the - * labels). - */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_SPARE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } else if (reason == VDEV_LABEL_L2CACHE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { - /* - * For level 2 ARC devices, add a special label. - */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_L2CACHE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } else { - uint64_t txg = 0ULL; - - if (reason == VDEV_LABEL_SPLIT) - txg = spa->spa_uberblock.ub_txg; - label = spa_config_generate(spa, vd, txg, B_FALSE); - - /* - * Add our creation time. This allows us to detect multiple - * vdev uses as described above, and automatically expires if we - * fail. - */ - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - crtxg) == 0); - } - - buf = vp->vp_nvlist; - buflen = sizeof (vp->vp_nvlist); - - error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); - if (error != 0) { - nvlist_free(label); - abd_free(vp_abd); - /* EFAULT means nvlist_pack ran out of room */ - return (error == EFAULT ? ENAMETOOLONG : EINVAL); - } - - /* - * Initialize uberblock template. - */ - ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_RING); - abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); - ub = abd_to_buf(ub_abd); - ub->ub_txg = 0; - - /* Initialize the 2nd padding area. 
*/ - bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(bootenv, VDEV_PAD_SIZE); - - /* - * Write everything in parallel. - */ -retry: - zio = zio_root(spa, NULL, NULL, flags); - - for (int l = 0; l < VDEV_LABELS; l++) { - - vdev_label_write(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL, flags); - - /* - * Skip the 1st padding area. - * Zero out the 2nd padding area where it might have - * left over data from previous filesystem format. - */ - vdev_label_write(zio, vd, l, bootenv, - offsetof(vdev_label_t, vl_be), - VDEV_PAD_SIZE, NULL, NULL, flags); - - vdev_label_write(zio, vd, l, ub_abd, - offsetof(vdev_label_t, vl_uberblock), - VDEV_UBERBLOCK_RING, NULL, NULL, flags); - } - - error = zio_wait(zio); - - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - - nvlist_free(label); - abd_free(bootenv); - abd_free(ub_abd); - abd_free(vp_abd); - - /* - * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labeling it as a spare, or b) it - * exists as a spare elsewhere in the system. Do the same for - * level 2 ARC devices. - */ - if (error == 0 && !vd->vdev_isspare && - (reason == VDEV_LABEL_SPARE || - spa_spare_exists(vd->vdev_guid, NULL, NULL))) - spa_spare_add(vd); - - if (error == 0 && !vd->vdev_isl2cache && - (reason == VDEV_LABEL_L2CACHE || - spa_l2cache_exists(vd->vdev_guid, NULL))) - spa_l2cache_add(vd); - - return (error); -} - -/* - * Done callback for vdev_label_read_bootenv_impl. If this is the first - * callback to finish, store our abd in the callback pointer. Otherwise, we - * just free our abd and return. 
- */ -static void -vdev_label_read_bootenv_done(zio_t *zio) -{ - zio_t *rio = zio->io_private; - abd_t **cbp = rio->io_private; - - ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); - - if (zio->io_error == 0) { - mutex_enter(&rio->io_lock); - if (*cbp == NULL) { - /* Will free this buffer in vdev_label_read_bootenv. */ - *cbp = zio->io_abd; - } else { - abd_free(zio->io_abd); - } - mutex_exit(&rio->io_lock); - } else { - abd_free(zio->io_abd); - } -} - -static void -vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); - - /* - * We just use the first label that has a correct checksum; the - * bootloader should have rewritten them all to be the same on boot, - * and any changes we made since boot have been the same across all - * labels. - * - * While grub supports writing to all four labels, other bootloaders - * don't, so we only use the first two labels to store boot - * information. 
- */ - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { - for (int l = 0; l < VDEV_LABELS / 2; l++) { - vdev_label_read(zio, vd, l, - abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), - offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, - vdev_label_read_bootenv_done, zio, flags); - } - } -} - -int -vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command) -{ - spa_t *spa = rvd->vdev_spa; - abd_t *abd = NULL; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; - - ASSERT(command); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - zio_t *zio = zio_root(spa, NULL, &abd, flags); - vdev_label_read_bootenv_impl(zio, rvd, flags); - int err = zio_wait(zio); - - if (abd != NULL) { - vdev_boot_envblock_t *vbe = abd_to_buf(abd); - if (vbe->vbe_version != VB_RAW) { - abd_free(abd); - return (SET_ERROR(ENOTSUP)); - } - vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; - fnvlist_add_string(command, "envmap", vbe->vbe_bootenv); - /* abd was allocated in vdev_label_read_bootenv_impl() */ - abd_free(abd); - /* If we managed to read any successfully, return success. */ - return (0); - } - return (err); -} - -int -vdev_label_write_bootenv(vdev_t *vd, char *envmap) -{ - zio_t *zio; - spa_t *spa = vd->vdev_spa; - vdev_boot_envblock_t *bootenv; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - int error = ENXIO; - - if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) { - return (SET_ERROR(E2BIG)); - } - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - for (int c = 0; c < vd->vdev_children; c++) { - int child_err = vdev_label_write_bootenv(vd->vdev_child[c], - envmap); - /* - * As long as any of the disks managed to write all of their - * labels successfully, return success. 
- */ - if (child_err == 0) - error = child_err; - } - - if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || - !vdev_writeable(vd)) { - return (error); - } - ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); - abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(abd, VDEV_PAD_SIZE); - bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); - - char *buf = bootenv->vbe_bootenv; - (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv)); - bootenv->vbe_version = VB_RAW; - abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); - -retry: - zio = zio_root(spa, NULL, NULL, flags); - for (int l = 0; l < VDEV_LABELS / 2; l++) { - vdev_label_write(zio, vd, l, abd, - offsetof(vdev_label_t, vl_be), - VDEV_PAD_SIZE, NULL, NULL, flags); - } - - error = zio_wait(zio); - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - - abd_free(abd); - return (error); -} - -int -vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) -{ - spa_t *spa = vd->vdev_spa; - zio_t *zio; - abd_t *pad2; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - int error; - - if (size > VDEV_PAD_SIZE) - return (EINVAL); - - if (!vd->vdev_ops->vdev_op_leaf) - return (ENODEV); - if (vdev_is_dead(vd)) - return (ENXIO); - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); - abd_copy_from_buf(pad2, buf, size); - -retry: - zio = zio_root(spa, NULL, NULL, flags); - vdev_label_write(zio, vd, 0, pad2, - offsetof(vdev_label_t, vl_be), - VDEV_PAD_SIZE, NULL, NULL, flags); - error = zio_wait(zio); - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - - abd_free(pad2); - return (error); -} - -/* - * ========================================================================== - * uberblock load/sync - * ========================================================================== - */ - -/* - * Consider the following 
situation: txg is safely synced to disk. We've - * written the first uberblock for txg + 1, and then we lose power. When we - * come back up, we fail to see the uberblock for txg + 1 because, say, - * it was on a mirrored device and the replica to which we wrote txg + 1 - * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a few seconds we'll have two - * conflicting uberblocks on disk with the same txg. The solution is simple: - * among uberblocks with equal txg, choose the one with the latest timestamp. - */ -static int -vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) -{ - int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); - - if (likely(cmp)) - return (cmp); - - cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); - if (likely(cmp)) - return (cmp); - - /* - * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware - * ZFS, e.g. zfsonlinux >= 0.7. - * - * If one ub has MMP and the other does not, they were written by - * different hosts, which matters for MMP. So we treat no MMP/no SEQ as - * a 0 value. - * - * Since timestamp and txg are the same if we get this far, either is - * acceptable for importing the pool. 
- */ - unsigned int seq1 = 0; - unsigned int seq2 = 0; - - if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) - seq1 = MMP_SEQ(ub1); - - if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) - seq2 = MMP_SEQ(ub2); - - return (AVL_CMP(seq1, seq2)); -} - -struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ - vdev_t *ubl_vd; /* vdev associated with the above */ -}; - -static void -vdev_uberblock_load_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - spa_t *spa = zio->io_spa; - zio_t *rio = zio->io_private; - uberblock_t *ub = abd_to_buf(zio->io_abd); - struct ubl_cbdata *cbp = rio->io_private; - - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); - - if (zio->io_error == 0 && uberblock_verify(ub) == 0) { - mutex_enter(&rio->io_lock); - if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { - /* - * Keep track of the vdev in which this uberblock - * was found. We will use this information later - * to obtain the config nvlist associated with - * this uberblock. - */ - *cbp->ubl_ubbest = *ub; - cbp->ubl_vd = vd; - } - mutex_exit(&rio->io_lock); - } - - abd_free(zio->io_abd); -} - -static void -vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, - struct ubl_cbdata *cbp) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { - for (int l = 0; l < VDEV_LABELS; l++) { - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_read(zio, vd, l, - abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), - B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_load_done, zio, flags); - } - } - } -} - -/* - * Reads the 'best' uberblock from disk along with its associated - * configuration. First, we read the uberblock array of each label of each - * vdev, keeping track of the uberblock with the highest txg in each array. 
- * Then, we read the configuration from the same vdev as the best uberblock. - */ -void -vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) -{ - zio_t *zio; - spa_t *spa = rvd->vdev_spa; - struct ubl_cbdata cb; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; - - ASSERT(ub); - ASSERT(config); - - bzero(ub, sizeof (uberblock_t)); - *config = NULL; - - cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, &cb, flags); - vdev_uberblock_load_impl(zio, rvd, flags, &cb); - (void) zio_wait(zio); - - /* - * It's possible that the best uberblock was discovered on a label - * that has a configuration which was written in a future txg. - * Search all labels on this vdev to find the configuration that - * matches the txg for our uberblock. - */ - if (cb.ubl_vd != NULL) { - vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " - "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); - - *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); - if (*config == NULL && spa->spa_extreme_rewind) { - vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " - "Trying again without txg restrictions."); - *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); - } - if (*config == NULL) { - vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); - } - } - spa_config_exit(spa, SCL_ALL, FTAG); -} - -/* - * On success, increment root zio's count of good writes. - * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). - */ -static void -vdev_uberblock_sync_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_private; - - if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) - atomic_inc_64(good_writes); -} - -/* - * Write the uberblock to all labels of all leaves of the specified vdev. 
- */ -static void -vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, - uberblock_t *ub, vdev_t *vd, int flags) -{ - for (uint64_t c = 0; c < vd->vdev_children; c++) { - vdev_uberblock_sync(zio, good_writes, - ub, vd->vdev_child[c], flags); - } - - if (!vd->vdev_ops->vdev_op_leaf) - return; - - if (!vdev_writeable(vd)) - return; - - int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); - - /* Copy the uberblock_t into the ABD */ - abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); - abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); - - for (int l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ub_abd, - VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, good_writes, - flags | ZIO_FLAG_DONT_PROPAGATE); - - abd_free(ub_abd); -} - -/* Sync the uberblocks to all vdevs in svd[] */ -int -vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) -{ - spa_t *spa = svd[0]->vdev_spa; - zio_t *zio; - uint64_t good_writes = 0; - - zio = zio_root(spa, NULL, NULL, flags); - - for (int v = 0; v < svdcount; v++) - vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); - - (void) zio_wait(zio); - - /* - * Flush the uberblocks to disk. This ensures that the odd labels - * are no longer needed (because the new uberblocks and the even - * labels are safely on disk), so it is safe to overwrite them. - */ - zio = zio_root(spa, NULL, NULL, flags); - - for (int v = 0; v < svdcount; v++) { - if (vdev_writeable(svd[v])) { - zio_flush(zio, svd[v]); - } - } - - (void) zio_wait(zio); - - return (good_writes >= 1 ? 0 : EIO); -} - -/* - * On success, increment the count of good writes for our top-level vdev. 
- */ -static void -vdev_label_sync_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_private; - - if (zio->io_error == 0) - atomic_inc_64(good_writes); -} - -/* - * If there weren't enough good writes, indicate failure to the parent. - */ -static void -vdev_label_sync_top_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_private; - - if (*good_writes == 0) - zio->io_error = SET_ERROR(EIO); - - kmem_free(good_writes, sizeof (uint64_t)); -} - -/* - * We ignore errors for log and cache devices, simply free the private data. - */ -static void -vdev_label_sync_ignore_done(zio_t *zio) -{ - kmem_free(zio->io_private, sizeof (uint64_t)); -} - -/* - * Write all even or odd labels to all leaves of the specified vdev. - */ -static void -vdev_label_sync(zio_t *zio, uint64_t *good_writes, - vdev_t *vd, int l, uint64_t txg, int flags) -{ - nvlist_t *label; - vdev_phys_t *vp; - abd_t *vp_abd; - char *buf; - size_t buflen; - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_label_sync(zio, good_writes, - vd->vdev_child[c], l, txg, flags); - } - - if (!vd->vdev_ops->vdev_op_leaf) - return; - - if (!vdev_writeable(vd)) - return; - - /* - * Generate a label describing the top-level config to which we belong. 
- */ - label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - abd_zero(vp_abd, sizeof (vdev_phys_t)); - vp = abd_to_buf(vp_abd); - - buf = vp->vp_nvlist; - buflen = sizeof (vp->vp_nvlist); - - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { - for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), - vdev_label_sync_done, good_writes, - flags | ZIO_FLAG_DONT_PROPAGATE); - } - } - - abd_free(vp_abd); - nvlist_free(label); -} - -int -vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) -{ - list_t *dl = &spa->spa_config_dirty_list; - vdev_t *vd; - zio_t *zio; - int error; - - /* - * Write the new labels to disk. - */ - zio = zio_root(spa, NULL, NULL, flags); - - for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { - uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), - KM_SLEEP); - - ASSERT(!vd->vdev_ishole); - - zio_t *vio = zio_null(zio, spa, NULL, - (vd->vdev_islog || vd->vdev_aux != NULL) ? - vdev_label_sync_ignore_done : vdev_label_sync_top_done, - good_writes, flags); - vdev_label_sync(vio, good_writes, vd, l, txg, flags); - zio_nowait(vio); - } - - error = zio_wait(zio); - - /* - * Flush the new labels to disk. - */ - zio = zio_root(spa, NULL, NULL, flags); - - for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) - zio_flush(zio, vd); - - (void) zio_wait(zio); - - return (error); -} - -/* - * Sync the uberblock and any changes to the vdev configuration. - * - * The order of operations is carefully crafted to ensure that - * if the system panics or loses power at any time, the state on disk - * is still transactionally consistent. The in-line comments below - * describe the failure semantics at each stage. - * - * Moreover, vdev_config_sync() is designed to be idempotent: if it fails - * at any time, you can just call it again, and it will resume its work. 
- */ -int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) -{ - spa_t *spa = svd[0]->vdev_spa; - uberblock_t *ub = &spa->spa_uberblock; - int error = 0; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - - ASSERT(svdcount != 0); -retry: - /* - * Normally, we don't want to try too hard to write every label and - * uberblock. If there is a flaky disk, we don't want the rest of the - * sync process to block while we retry. But if we can't write a - * single label out, we should retry with ZIO_FLAG_TRYHARD before - * bailing out and declaring the pool faulted. - */ - if (error != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) - return (error); - flags |= ZIO_FLAG_TRYHARD; - } - - ASSERT(ub->ub_txg <= txg); - - /* - * If this isn't a resync due to I/O errors, - * and nothing changed in this transaction group, - * and the vdev configuration hasn't changed, - * then there's nothing to do. - */ - if (ub->ub_txg < txg) { - boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, - txg, spa->spa_mmp.mmp_delay); - - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) - return (0); - } - - if (txg > spa_freeze_txg(spa)) - return (0); - - ASSERT(txg <= spa->spa_final_txg); - - /* - * Flush the write cache of every disk that's been written to - * in this transaction group. This ensures that all blocks - * written in this txg will be committed to stable storage - * before any uberblock that references them. - */ - zio_t *zio = zio_root(spa, NULL, NULL, flags); - - for (vdev_t *vd = - txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; - vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) - zio_flush(zio, vd); - - (void) zio_wait(zio); - - /* - * Sync out the even labels (L0, L2) for every dirty vdev. If the - * system dies in the middle of this process, that's OK: all of the - * even labels that made it to disk will be newer than any uberblock, - * and will therefore be considered invalid. 
The odd labels (L1, L3), - * which have not yet been touched, will still be valid. We flush - * the new labels to disk to ensure that all even-label updates - * are committed to stable storage before the uberblock update. - */ - if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { - zfs_dbgmsg("vdev_label_sync_list() returned error %d " - "for pool '%s' when syncing out the even labels " - "of dirty vdevs", error, spa_name(spa)); - } - goto retry; - } - - /* - * Sync the uberblocks to all vdevs in svd[]. - * If the system dies in the middle of this step, there are two cases - * to consider, and the on-disk state is consistent either way: - * - * (1) If none of the new uberblocks made it to disk, then the - * previous uberblock will be the newest, and the odd labels - * (which had not yet been touched) will be valid with respect - * to that uberblock. - * - * (2) If one or more new uberblocks made it to disk, then they - * will be the newest, and the even labels (which had all - * been successfully committed) will be valid with respect - * to the new uberblocks. - */ - if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { - zfs_dbgmsg("vdev_uberblock_sync_list() returned error " - "%d for pool '%s'", error, spa_name(spa)); - } - goto retry; - } - - if (spa_multihost(spa)) - mmp_update_uberblock(spa, ub); - - /* - * Sync out odd labels for every dirty vdev. If the system dies - * in the middle of this process, the even labels and the new - * uberblocks will suffice to open the pool. The next time - * the pool is opened, the first thing we'll do -- before any - * user data is modified -- is mark every vdev dirty so that - * all labels will be brought up to date. We flush the new labels - * to disk to ensure that all odd-label updates are committed to - * stable storage before the next transaction group begins. 
- */ - if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { - zfs_dbgmsg("vdev_label_sync_list() returned error %d " - "for pool '%s' when syncing out the odd labels of " - "dirty vdevs", error, spa_name(spa)); - } - goto retry;; - } - - trim_thread_wakeup(spa); - - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c deleted file mode 100644 index 391cee87af08..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ /dev/null @@ -1,779 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for mirroring. 
- */ - -typedef struct mirror_child { - vdev_t *mc_vd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_resilvering; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; - -static int vdev_mirror_shift = 21; - -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs_vdev); -static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, - CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "ZFS VDEV Mirror"); -#endif - -/* - * The load configuration settings below are tuned by default for - * the case where all devices are of the same rotational type. - * - * If there is a mixture of rotating and non-rotating media, setting - * non_rotating_seek_inc to 0 may well provide better results as it - * will direct more reads to the non-rotating vdevs which are more - * likely to have a higher performance. - */ - -/* Rotating media load calculation configuration. */ -static int rotating_inc = 0; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN, - &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's"); -#endif - -static int rotating_seek_inc = 5; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN, - &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's"); -#endif - -static int rotating_seek_offset = 1 * 1024 * 1024; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN, - &rotating_seek_offset, 0, "Offset in bytes from the last I/O which " - "triggers a reduced rotating media seek increment"); -#endif - -/* Non-rotating media load calculation configuration. 
*/ -static int non_rotating_inc = 0; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN, - &non_rotating_inc, 0, - "Non-rotating media load increment for non-seeking I/O's"); -#endif - -static int non_rotating_seek_inc = 1; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN, - &non_rotating_seek_inc, 0, - "Non-rotating media load increment for seeking I/O's"); -#endif - - -static inline size_t -vdev_mirror_map_size(int children) -{ - return (offsetof(mirror_map_t, mm_child[children]) + - sizeof(int) * children); -} - -static inline mirror_map_t * -vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) -{ - mirror_map_t *mm; - - mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); - mm->mm_children = children; - mm->mm_resilvering = resilvering; - mm->mm_root = root; - mm->mm_preferred = (int *)((uintptr_t)mm + - offsetof(mirror_map_t, mm_child[children])); - - return mm; -} - -static void -vdev_mirror_map_free(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - - kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); -} - -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { - vdev_mirror_map_free, - zio_vsd_default_cksum_report -}; - -static int -vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) -{ - uint64_t lastoffset; - int load; - - /* All DVAs have equal weight at the root. */ - if (mm->mm_root) - return (INT_MAX); - - /* - * We don't return INT_MAX if the device is resilvering i.e. - * vdev_resilver_txg != 0 as when tested performance was slightly - * worse overall when resilvering with compared to without. - */ - - /* Standard load based on pending queue length. */ - load = vdev_queue_length(vd); - lastoffset = vdev_queue_lastoffset(vd); - - if (vd->vdev_nonrot) { - /* Non-rotating media. 
*/ - if (lastoffset == zio_offset) - return (load + non_rotating_inc); - - /* - * Apply a seek penalty even for non-rotating devices as - * sequential I/O'a can be aggregated into fewer operations - * on the device, thus avoiding unnecessary per-command - * overhead and boosting performance. - */ - return (load + non_rotating_seek_inc); - } - - /* Rotating media I/O's which directly follow the last I/O. */ - if (lastoffset == zio_offset) - return (load + rotating_inc); - - /* - * Apply half the seek increment to I/O's within seek offset - * of the last I/O queued to this vdev as they should incure less - * of a seek increment. - */ - if (ABS(lastoffset - zio_offset) < rotating_seek_offset) - return (load + (rotating_seek_inc / 2)); - - /* Apply the full seek increment to all other I/O's. */ - return (load + rotating_seek_inc); -} - - -static mirror_map_t * -vdev_mirror_map_init(zio_t *zio) -{ - mirror_map_t *mm = NULL; - mirror_child_t *mc; - vdev_t *vd = zio->io_vd; - int c; - - if (vd == NULL) { - dva_t *dva = zio->io_bp->blk_dva; - spa_t *spa = zio->io_spa; - dva_t dva_copy[SPA_DVAS_PER_BP]; - - c = BP_GET_NDVAS(zio->io_bp); - - /* - * If we do not trust the pool config, some DVAs might be - * invalid or point to vdevs that do not exist. We skip them. 
- */ - if (!spa_trust_config(spa)) { - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - int j = 0; - for (int i = 0; i < c; i++) { - if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) - dva_copy[j++] = dva[i]; - } - if (j == 0) { - zio->io_vsd = NULL; - zio->io_error = ENXIO; - return (NULL); - } - if (j < c) { - dva = dva_copy; - c = j; - } - } - - mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); - mc->mc_offset = DVA_GET_OFFSET(&dva[c]); - } - } else { - /* - * If we are resilvering, then we should handle scrub reads - * differently; we shouldn't issue them to the resilvering - * device because it might not have those blocks. - * - * We are resilvering iff: - * 1) We are a replacing vdev (ie our name is "replacing-1" or - * "spare-1" or something like that), and - * 2) The pool is currently being resilvered. - * - * We cannot simply check vd->vdev_resilver_txg, because it's - * not set in this path. - * - * Nor can we just check our vdev_ops; there are cases (such as - * when a user types "zpool replace pool odev spare_dev" and - * spare_dev is in the spare list, or when a spare device is - * automatically used to replace a DEGRADED device) when - * resilvering is complete but both the original vdev and the - * spare vdev remain in the pool. That behavior is intentional. - * It helps implement the policy that a spare should be - * automatically removed from the pool after the user replaces - * the device that originally failed. - * - * If a spa load is in progress, then spa_dsl_pool may be - * uninitialized. But we shouldn't be resilvering during a spa - * load anyway. 
- */ - boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops) && - spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && - dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); - mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, - B_FALSE); - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - mc->mc_vd = vd->vdev_child[c]; - mc->mc_offset = zio->io_offset; - } - } - - zio->io_vsd = mm; - zio->io_vsd_ops = &vdev_mirror_vsd_ops; - return (mm); -} - -static int -vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - int numerrors = 0; - int lasterror = 0; - - if (vd->vdev_children == 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - vdev_open_children(vd); - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - if (cvd->vdev_open_error) { - lasterror = cvd->vdev_open_error; - numerrors++; - continue; - } - - *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); - } - - if (numerrors == vd->vdev_children) { - if (vdev_children_are_offline(vd)) - vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; - else - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - return (0); -} - -static void -vdev_mirror_close(vdev_t *vd) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -static void -vdev_mirror_child_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - -static void -vdev_mirror_scrub_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - if (zio->io_error == 0) { - zio_t *pio; - zio_link_t *zl = NULL; - - mutex_enter(&zio->io_lock); - 
while ((pio = zio_walk_parents(zio, &zl)) != NULL) { - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - abd_copy(pio->io_abd, zio->io_abd, pio->io_size); - mutex_exit(&pio->io_lock); - } - mutex_exit(&zio->io_lock); - } - abd_free(zio->io_abd); - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - -/* - * Check the other, lower-index DVAs to see if they're on the same - * vdev as the child we picked. If they are, use them since they - * are likely to have been allocated from the primary metaslab in - * use at the time, and hence are more likely to have locality with - * single-copy data. - */ -static int -vdev_mirror_dva_select(zio_t *zio, int p) -{ - dva_t *dva = zio->io_bp->blk_dva; - mirror_map_t *mm = zio->io_vsd; - int preferred; - int c; - - preferred = mm->mm_preferred[p]; - for (p-- ; p >= 0; p--) { - c = mm->mm_preferred[p]; - if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) - preferred = c; - } - return (preferred); -} - -static int -vdev_mirror_preferred_child_randomize(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - int p; - - if (mm->mm_root) { - p = spa_get_random(mm->mm_preferred_cnt); - return (vdev_mirror_dva_select(zio, p)); - } - - /* - * To ensure we don't always favour the first matching vdev, - * which could lead to wear leveling issues on SSD's, we - * use the I/O offset as a pseudo random seed into the vdevs - * which have the lowest load. - */ - p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; - return (mm->mm_preferred[p]); -} - -/* - * Try to find a vdev whose DTL doesn't contain the block we want to read - * prefering vdevs based on determined load. - * - * If we can't, try the read on any vdev we haven't already tried. 
- */ -static int -vdev_mirror_child_select(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - uint64_t txg = zio->io_txg; - int c, lowest_load; - - ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); - - lowest_load = INT_MAX; - mm->mm_preferred_cnt = 0; - for (c = 0; c < mm->mm_children; c++) { - mirror_child_t *mc; - - mc = &mm->mm_child[c]; - if (mc->mc_tried || mc->mc_skipped) - continue; - - if (!vdev_readable(mc->mc_vd)) { - mc->mc_error = SET_ERROR(ENXIO); - mc->mc_tried = 1; /* don't even try */ - mc->mc_skipped = 1; - continue; - } - - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { - mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; - mc->mc_speculative = 1; - continue; - } - - mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); - if (mc->mc_load > lowest_load) - continue; - - if (mc->mc_load < lowest_load) { - lowest_load = mc->mc_load; - mm->mm_preferred_cnt = 0; - } - mm->mm_preferred[mm->mm_preferred_cnt] = c; - mm->mm_preferred_cnt++; - } - - if (mm->mm_preferred_cnt == 1) { - vdev_queue_register_lastoffset( - mm->mm_child[mm->mm_preferred[0]].mc_vd, zio); - return (mm->mm_preferred[0]); - } - - if (mm->mm_preferred_cnt > 1) { - int c = vdev_mirror_preferred_child_randomize(zio); - - vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio); - return (c); - } - - /* - * Every device is either missing or has this txg in its DTL. - * Look for any child we haven't already tried before giving up. - */ - for (c = 0; c < mm->mm_children; c++) { - if (!mm->mm_child[c].mc_tried) { - vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, - zio); - return (c); - } - } - - /* - * Every child failed. There's no place left to look. 
- */ - return (-1); -} - -static void -vdev_mirror_io_start(zio_t *zio) -{ - mirror_map_t *mm; - mirror_child_t *mc; - int c, children; - - mm = vdev_mirror_map_init(zio); - - if (mm == NULL) { - ASSERT(!spa_trust_config(zio->io_spa)); - ASSERT(zio->io_type == ZIO_TYPE_READ); - zio_execute(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_bp != NULL && - (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering && - mm->mm_children > 1) { - /* - * For scrubbing reads (if we can verify the - * checksum here, as indicated by io_bp being - * non-NULL) we need to allocate a read buffer for - * each child and issue reads to all children. If - * any child succeeds, it will copy its data into - * zio->io_data in vdev_mirror_scrub_done. - */ - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - abd_alloc_sametype(zio->io_abd, - zio->io_size), zio->io_size, - zio->io_type, zio->io_priority, 0, - vdev_mirror_scrub_done, mc)); - } - zio_execute(zio); - return; - } - /* - * For normal reads just pick one child. - */ - c = vdev_mirror_child_select(zio); - children = (c >= 0); - } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE || - zio->io_type == ZIO_TYPE_FREE); - - /* - * Writes and frees go to all children. - */ - c = 0; - children = mm->mm_children; - } - - while (children--) { - mc = &mm->mm_child[c]; - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, - zio->io_type, zio->io_priority, 0, - vdev_mirror_child_done, mc)); - c++; - } - - zio_execute(zio); -} - -static int -vdev_mirror_worst_error(mirror_map_t *mm) -{ - int error[2] = { 0, 0 }; - - for (int c = 0; c < mm->mm_children; c++) { - mirror_child_t *mc = &mm->mm_child[c]; - int s = mc->mc_speculative; - error[s] = zio_worst_error(error[s], mc->mc_error); - } - - return (error[0] ? 
error[0] : error[1]); -} - -static void -vdev_mirror_io_done(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; - int c; - int good_copies = 0; - int unexpected_errors = 0; - - if (mm == NULL) - return; - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - - if (mc->mc_error) { - if (!mc->mc_skipped) - unexpected_errors++; - } else if (mc->mc_tried) { - good_copies++; - } - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as success. - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. - */ - /* XXPOLICY */ - if (good_copies != mm->mm_children) { - /* - * Always require at least one good copy. - * - * For ditto blocks (io_vd == NULL), require - * all copies to be good. - * - * XXX -- for replacing vdevs, there's no great answer. - * If the old device is really dead, we may not even - * be able to access it -- so we only want to - * require good writes to the new device. But if - * the new device turns out to be flaky, we want - * to be able to detach it -- which requires all - * writes to the old device to have succeeded. - */ - if (good_copies == 0 || zio->io_vd == NULL) - zio->io_error = vdev_mirror_worst_error(mm); - } - return; - } else if (zio->io_type == ZIO_TYPE_FREE) { - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); - - /* - * If we don't have a good copy yet, keep trying other children. 
- */ - /* XXPOLICY */ - if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { - ASSERT(c >= 0 && c < mm->mm_children); - mc = &mm->mm_child[c]; - zio_vdev_io_redone(zio); - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, - ZIO_TYPE_READ, zio->io_priority, 0, - vdev_mirror_child_done, mc)); - return; - } - - /* XXPOLICY */ - if (good_copies == 0) { - zio->io_error = vdev_mirror_worst_error(mm); - ASSERT(zio->io_error != 0); - } - - if (good_copies && spa_writeable(zio->io_spa) && - (unexpected_errors || - (zio->io_flags & ZIO_FLAG_RESILVER) || - ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) { - /* - * Use the good data we have in hand to repair damaged children. - */ - for (c = 0; c < mm->mm_children; c++) { - /* - * Don't rewrite known good children. - * Not only is it unnecessary, it could - * actually be harmful: if the system lost - * power while rewriting the only good copy, - * there would be no good copies left! - */ - mc = &mm->mm_child[c]; - - if (mc->mc_error == 0) { - if (mc->mc_tried) - continue; - /* - * We didn't try this child. We need to - * repair it if: - * 1. it's a scrub (in which case we have - * tried everything that was healthy) - * - or - - * 2. it's an indirect vdev (in which case - * it could point to any other vdev, which - * might have a bad DTL) - * - or - - * 3. the DTL indicates that this data is - * missing from this vdev - */ - if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - mc->mc_vd->vdev_ops != &vdev_indirect_ops && - !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, - zio->io_txg, 1)) - continue; - mc->mc_error = SET_ERROR(ESTALE); - } - - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - zio->io_abd, zio->io_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); - } - } -} - -static void -vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted == vd->vdev_children) { - if (vdev_children_are_offline(vd)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE, - VDEV_AUX_CHILDREN_OFFLINE); - } else { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - } - } else if (degraded + faulted != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - } else { - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); - } -} - -vdev_ops_t vdev_mirror_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_MIRROR, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; - -vdev_ops_t vdev_replacing_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_REPLACING, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; - -vdev_ops_t vdev_spare_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_SPARE, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c deleted file mode 100644 index 6852de445049..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -/* - * The 'missing' vdev is a special vdev type used only during import. It - * signifies a placeholder in the root vdev for some vdev that we know is - * missing. We pass it down to the kernel to allow the rest of the - * configuration to parsed and an attempt made to open all available devices. - * Because its GUID is always 0, we know that the guid sum will mismatch and we - * won't be able to open the pool anyway. - */ - -#include -#include -#include -#include -#include - -/* ARGSUSED */ -static int -vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - /* - * Really this should just fail. But then the root vdev will be in the - * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is - * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we - * will fail the GUID sum check before ever trying to open the pool. 
- */ - *psize = 0; - *max_psize = 0; - *logical_ashift = 0; - *physical_ashift = 0; - return (0); -} - -/* ARGSUSED */ -static void -vdev_missing_close(vdev_t *vd) -{ -} - -/* ARGSUSED */ -static void -vdev_missing_io_start(zio_t *zio) -{ - zio->io_error = SET_ERROR(ENOTSUP); - zio_execute(zio); -} - -/* ARGSUSED */ -static void -vdev_missing_io_done(zio_t *zio) -{ -} - -vdev_ops_t vdev_missing_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_MISSING, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -vdev_ops_t vdev_hole_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_HOLE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c deleted file mode 100644 index 71e3a1fd54bc..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ /dev/null @@ -1,1047 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS I/O Scheduler - * --------------- - * - * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The - * I/O scheduler determines when and in what order those operations are - * issued. The I/O scheduler divides operations into six I/O classes - * prioritized in the following order: sync read, sync write, async read, - * async write, scrub/resilver and trim. Each queue defines the minimum and - * maximum number of concurrent operations that may be issued to the device. - * In addition, the device has an aggregate maximum. Note that the sum of the - * per-queue minimums must not exceed the aggregate maximum, and if the - * aggregate maximum is equal to or greater than the sum of the per-queue - * maximums, the per-queue minimum has no effect. - * - * For many physical devices, throughput increases with the number of - * concurrent operations, but latency typically suffers. Further, physical - * devices typically have a limit at which more concurrent operations have no - * effect on throughput or can actually cause it to decrease. - * - * The scheduler selects the next operation to issue by first looking for an - * I/O class whose minimum has not been satisfied. Once all are satisfied and - * the aggregate maximum has not been hit, the scheduler looks for classes - * whose maximum has not been satisfied. Iteration through the I/O classes is - * done in the order specified above. 
No further operations are issued if the - * aggregate maximum number of concurrent operations has been hit or if there - * are no operations queued for an I/O class that has not hit its maximum. - * Every time an I/O is queued or an operation completes, the I/O scheduler - * looks for new operations to issue. - * - * All I/O classes have a fixed maximum number of outstanding operations - * except for the async write class. Asynchronous writes represent the data - * that is committed to stable storage during the syncing stage for - * transaction groups (see txg.c). Transaction groups enter the syncing state - * periodically so the number of queued async writes will quickly burst up and - * then bleed down to zero. Rather than servicing them as quickly as possible, - * the I/O scheduler changes the maximum number of active async write I/Os - * according to the amount of dirty data in the pool (see dsl_pool.c). Since - * both throughput and latency typically increase with the number of - * concurrent operations issued to physical devices, reducing the burstiness - * in the number of concurrent operations also stabilizes the response time of - * operations from other -- and in particular synchronous -- queues. In broad - * strokes, the I/O scheduler will issue more concurrent operations from the - * async write queue as there's more dirty data in the pool. - * - * Async Writes - * - * The number of concurrent operations issued for the async write I/O class - * follows a piece-wise linear function defined by a few adjustable points. 
- * - * | o---------| <-- zfs_vdev_async_write_max_active - * ^ | /^ | - * | | / | | - * active | / | | - * I/O | / | | - * count | / | | - * | / | | - * |------------o | | <-- zfs_vdev_async_write_min_active - * 0|____________^______|_________| - * 0% | | 100% of zfs_dirty_data_max - * | | - * | `-- zfs_vdev_async_write_active_max_dirty_percent - * `--------- zfs_vdev_async_write_active_min_dirty_percent - * - * Until the amount of dirty data exceeds a minimum percentage of the dirty - * data allowed in the pool, the I/O scheduler will limit the number of - * concurrent operations to the minimum. As that threshold is crossed, the - * number of concurrent operations issued increases linearly to the maximum at - * the specified maximum percentage of the dirty data allowed in the pool. - * - * Ideally, the amount of dirty data on a busy pool will stay in the sloped - * part of the function between zfs_vdev_async_write_active_min_dirty_percent - * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the - * maximum percentage, this indicates that the rate of incoming data is - * greater than the rate that the backend storage can handle. In this case, we - * must further throttle incoming writes (see dmu_tx_delay() for details). - */ - -/* - * The maximum number of I/Os active to each device. Ideally, this will be >= - * the sum of each queue's max_active. It must be at least the sum of each - * queue's min_active. - */ -uint32_t zfs_vdev_max_active = 1000; - -/* - * Per-queue limits on the number of I/Os active to each device. If the - * sum of the queue's max_active is < zfs_vdev_max_active, then the - * min_active comes into play. We will send min_active from each queue, - * and then select from queues in the order defined by zio_priority_t. - * - * In general, smaller max_active's will lead to lower latency of synchronous - * operations. Larger max_active's may lead to higher overall throughput, - * depending on underlying storage. 
- * - * The ratio of the queues' max_actives determines the balance of performance - * between reads, writes, and scrubs. E.g., increasing - * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete - * more quickly, but reads and writes to have higher latency and lower - * throughput. - */ -uint32_t zfs_vdev_sync_read_min_active = 10; -uint32_t zfs_vdev_sync_read_max_active = 10; -uint32_t zfs_vdev_sync_write_min_active = 10; -uint32_t zfs_vdev_sync_write_max_active = 10; -uint32_t zfs_vdev_async_read_min_active = 1; -uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; -uint32_t zfs_vdev_async_write_max_active = 10; -uint32_t zfs_vdev_scrub_min_active = 1; -uint32_t zfs_vdev_scrub_max_active = 2; -uint32_t zfs_vdev_trim_min_active = 1; -/* - * TRIM max active is large in comparison to the other values due to the fact - * that TRIM IOs are coalesced at the device layer. This value is set such - * that a typical SSD can process the queued IOs in a single request. - */ -uint32_t zfs_vdev_trim_max_active = 64; -uint32_t zfs_vdev_removal_min_active = 1; -uint32_t zfs_vdev_removal_max_active = 2; -uint32_t zfs_vdev_initializing_min_active = 1; -uint32_t zfs_vdev_initializing_max_active = 1; - - -/* - * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent - * dirty data, use zfs_vdev_async_write_min_active. When it has more than - * zfs_vdev_async_write_active_max_dirty_percent, use - * zfs_vdev_async_write_max_active. The value is linearly interpolated - * between min and max. - */ -int zfs_vdev_async_write_active_min_dirty_percent = 30; -int zfs_vdev_async_write_active_max_dirty_percent = 60; - -/* - * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. - * For read I/Os, we also aggregate across small adjacency gaps; for writes - * we include spans of optional I/Os to aid aggregation at the disk even when - * they aren't able to help us aggregate at this level. 
- */ -int zfs_vdev_aggregation_limit = 1 << 20; -int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; -int zfs_vdev_read_gap_limit = 32 << 10; -int zfs_vdev_write_gap_limit = 4 << 10; - -/* - * Define the queue depth percentage for each top-level. This percentage is - * used in conjunction with zfs_vdev_async_max_active to determine how many - * allocations a specific top-level vdev should handle. Once the queue depth - * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 - * then allocator will stop allocating blocks on that top-level device. - * The default kernel setting is 1000% which will yield 100 allocations per - * device. For userland testing, the default setting is 300% which equates - * to 30 allocations per device. - */ -#ifdef _KERNEL -int zfs_vdev_queue_depth_pct = 1000; -#else -int zfs_vdev_queue_depth_pct = 300; -#endif - -/* - * When performing allocations for a given metaslab, we want to make sure that - * there are enough IOs to aggregate together to improve throughput. We want to - * ensure that there are at least 128k worth of IOs that can be aggregated, and - * we assume that the average allocation size is 4k, so we need the queue depth - * to be 32 per allocator to get good aggregation of sequential writes. 
- */ -int zfs_vdev_def_queue_depth = 32; - -#ifdef __FreeBSD__ -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs_vdev); - -static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS); -SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_zfs_async_write_active_min_dirty_percent, "I", - "Percentage of async write dirty data below which " - "async_write_min_active is used."); - -static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS); -SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_zfs_async_write_active_max_dirty_percent, "I", - "Percentage of async write dirty data above which " - "async_write_max_active is used."); - -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN, - &zfs_vdev_max_active, 0, - "The maximum number of I/Os of all types active for each device."); - -#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ - &zfs_vdev_ ## name ## _min_active, 0, \ - "Initial number of I/O requests of type " #name \ - " active for each device"); - -#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\ - &zfs_vdev_ ## name ## _max_active, 0, \ - "Maximum number of I/O requests of type " #name \ - " active for each device"); - -ZFS_VDEV_QUEUE_KNOB_MIN(sync_read); -ZFS_VDEV_QUEUE_KNOB_MAX(sync_read); -ZFS_VDEV_QUEUE_KNOB_MIN(sync_write); -ZFS_VDEV_QUEUE_KNOB_MAX(sync_write); -ZFS_VDEV_QUEUE_KNOB_MIN(async_read); -ZFS_VDEV_QUEUE_KNOB_MAX(async_read); -ZFS_VDEV_QUEUE_KNOB_MIN(async_write); -ZFS_VDEV_QUEUE_KNOB_MAX(async_write); -ZFS_VDEV_QUEUE_KNOB_MIN(scrub); -ZFS_VDEV_QUEUE_KNOB_MAX(scrub); -ZFS_VDEV_QUEUE_KNOB_MIN(trim); -ZFS_VDEV_QUEUE_KNOB_MAX(trim); -ZFS_VDEV_QUEUE_KNOB_MIN(removal); 
-ZFS_VDEV_QUEUE_KNOB_MAX(removal); -ZFS_VDEV_QUEUE_KNOB_MIN(initializing); -ZFS_VDEV_QUEUE_KNOB_MAX(initializing); - -#undef ZFS_VDEV_QUEUE_KNOB - -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN, - &zfs_vdev_aggregation_limit, 0, - "I/O requests are aggregated up to this size"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit_non_rotating, CTLFLAG_RWTUN, - &zfs_vdev_aggregation_limit_non_rotating, 0, - "I/O requests are aggregated up to this size for non-rotating media"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN, - &zfs_vdev_read_gap_limit, 0, - "Acceptable gap between two reads being aggregated"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, - &zfs_vdev_write_gap_limit, 0, - "Acceptable gap between two writes being aggregated"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN, - &zfs_vdev_queue_depth_pct, 0, - "Queue depth percentage for each top-level"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, - &zfs_vdev_def_queue_depth, 0, - "Default queue depth for each allocator"); - -static int -sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_vdev_async_write_active_min_dirty_percent; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < 0 || val > 100 || - val >= zfs_vdev_async_write_active_max_dirty_percent) - return (EINVAL); - - zfs_vdev_async_write_active_min_dirty_percent = val; - - return (0); -} - -static int -sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_vdev_async_write_active_max_dirty_percent; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < 0 || val > 100 || - val <= zfs_vdev_async_write_active_min_dirty_percent) - return (EINVAL); - - zfs_vdev_async_write_active_max_dirty_percent = val; - - return 
(0); -} -#endif -#endif - -int -vdev_queue_offset_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = (const zio_t *)x1; - const zio_t *z2 = (const zio_t *)x2; - - int cmp = AVL_CMP(z1->io_offset, z2->io_offset); - - if (likely(cmp)) - return (cmp); - - return (AVL_PCMP(z1, z2)); -} - -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (NULL); -} - -int -vdev_queue_timestamp_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = (const zio_t *)x1; - const zio_t *z2 = (const zio_t *)x2; - - int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp); - - if (likely(cmp)) - return (cmp); - - return (AVL_PCMP(z1, z2)); -} - -void -vdev_queue_init(vdev_t *vd) -{ - vdev_queue_t *vq = &vd->vdev_queue; - - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); - vq->vq_vdev = vd; - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
- */ - if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) - compfn = vdev_queue_timestamp_compare; - else - compfn = vdev_queue_offset_compare; - - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - } - - vq->vq_lastoffset = 0; -} - -void -vdev_queue_fini(vdev_t *vd) -{ - vdev_queue_t *vq = &vd->vdev_queue; - - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - - mutex_destroy(&vq->vq_lock); -} - -static void -vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - avl_tree_t *qtt; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - qtt = vdev_queue_type_tree(vq, zio->io_type); - if (qtt) - avl_add(qtt, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - spa->spa_queue_stats[zio->io_priority].spa_queued++; - if (spa->spa_iokstat != NULL) - kstat_waitq_enter(spa->spa_iokstat->ks_data); - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - avl_tree_t *qtt; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - qtt = vdev_queue_type_tree(vq, zio->io_type); - if (qtt) - avl_remove(qtt, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0); - spa->spa_queue_stats[zio->io_priority].spa_queued--; - if (spa->spa_iokstat != NULL) - kstat_waitq_exit(spa->spa_iokstat->ks_data); - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_pending_add(vdev_queue_t 
*vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; - avl_add(&vq->vq_active_tree, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - spa->spa_queue_stats[zio->io_priority].spa_active++; - if (spa->spa_iokstat != NULL) - kstat_runq_enter(spa->spa_iokstat->ks_data); - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; - avl_remove(&vq->vq_active_tree, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0); - spa->spa_queue_stats[zio->io_priority].spa_active--; - if (spa->spa_iokstat != NULL) { - kstat_io_t *ksio = spa->spa_iokstat->ks_data; - - kstat_runq_exit(spa->spa_iokstat->ks_data); - if (zio->io_type == ZIO_TYPE_READ) { - ksio->reads++; - ksio->nread += zio->io_size; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - ksio->writes++; - ksio->nwritten += zio->io_size; - } - } - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_agg_io_done(zio_t *aio) -{ - if (aio->io_type == ZIO_TYPE_READ) { - zio_t *pio; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - abd_copy_off(pio->io_abd, aio->io_abd, - 0, pio->io_offset - aio->io_offset, pio->io_size); - } - } - - abd_free(aio->io_abd); -} - -static int -vdev_queue_class_min_active(zio_priority_t p) -{ - switch (p) { - case ZIO_PRIORITY_SYNC_READ: - return (zfs_vdev_sync_read_min_active); - case ZIO_PRIORITY_SYNC_WRITE: - return (zfs_vdev_sync_write_min_active); - case ZIO_PRIORITY_ASYNC_READ: - return (zfs_vdev_async_read_min_active); - case ZIO_PRIORITY_ASYNC_WRITE: - return 
(zfs_vdev_async_write_min_active); - case ZIO_PRIORITY_SCRUB: - return (zfs_vdev_scrub_min_active); - case ZIO_PRIORITY_TRIM: - return (zfs_vdev_trim_min_active); - case ZIO_PRIORITY_REMOVAL: - return (zfs_vdev_removal_min_active); - case ZIO_PRIORITY_INITIALIZING: - return (zfs_vdev_initializing_min_active); - default: - panic("invalid priority %u", p); - return (0); - } -} - -static __noinline int -vdev_queue_max_async_writes(spa_t *spa) -{ - int writes; - uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; - uint64_t min_bytes = zfs_dirty_data_max * - zfs_vdev_async_write_active_min_dirty_percent / 100; - uint64_t max_bytes = zfs_dirty_data_max * - zfs_vdev_async_write_active_max_dirty_percent / 100; - - /* - * Sync tasks correspond to interactive user actions. To reduce the - * execution time of those actions we push data out as fast as possible. - */ - if (spa_has_pending_synctask(spa)) { - return (zfs_vdev_async_write_max_active); - } - - if (dirty < min_bytes) - return (zfs_vdev_async_write_min_active); - if (dirty > max_bytes) - return (zfs_vdev_async_write_max_active); - - /* - * linear interpolation: - * slope = (max_writes - min_writes) / (max_bytes - min_bytes) - * move right by min_bytes - * move up by min_writes - */ - writes = (dirty - min_bytes) * - (zfs_vdev_async_write_max_active - - zfs_vdev_async_write_min_active) / - (max_bytes - min_bytes) + - zfs_vdev_async_write_min_active; - ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); - ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); - return (writes); -} - -static int -vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) -{ - switch (p) { - case ZIO_PRIORITY_SYNC_READ: - return (zfs_vdev_sync_read_max_active); - case ZIO_PRIORITY_SYNC_WRITE: - return (zfs_vdev_sync_write_max_active); - case ZIO_PRIORITY_ASYNC_READ: - return (zfs_vdev_async_read_max_active); - case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); - case ZIO_PRIORITY_SCRUB: - return 
(zfs_vdev_scrub_max_active); - case ZIO_PRIORITY_TRIM: - return (zfs_vdev_trim_max_active); - case ZIO_PRIORITY_REMOVAL: - return (zfs_vdev_removal_max_active); - case ZIO_PRIORITY_INITIALIZING: - return (zfs_vdev_initializing_max_active); - default: - panic("invalid priority %u", p); - return (0); - } -} - -/* - * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if - * there is no eligible class. - */ -static zio_priority_t -vdev_queue_class_to_issue(vdev_queue_t *vq) -{ - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) - return (ZIO_PRIORITY_NUM_QUEUEABLE); - - /* find a queue that has not reached its minimum # outstanding i/os */ - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(p)) - return (p); - } - - /* - * If we haven't found a queue, look for one that hasn't reached its - * maximum # outstanding i/os. - */ - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, p)) - return (p); - } - - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); -} - -/* - * Compute the range spanned by two i/os, which is the endpoint of the last - * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). - * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); - * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. 
- */ -#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) -#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) - -static zio_t * -vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) -{ - zio_t *first, *last, *aio, *dio, *mandatory, *nio; - zio_link_t *zl = NULL; - uint64_t maxgap = 0; - uint64_t size; - uint64_t limit; - int maxblocksize; - boolean_t stretch; - avl_tree_t *t; - enum zio_flag flags; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); - if (vq->vq_vdev->vdev_nonrot) - limit = zfs_vdev_aggregation_limit_non_rotating; - else - limit = zfs_vdev_aggregation_limit; - limit = MAX(MIN(limit, maxblocksize), 0); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - first = last = zio; - - if (zio->io_type == ZIO_TYPE_READ) - maxgap = zfs_vdev_read_gap_limit; - - /* - * We can aggregate I/Os that are sufficiently adjacent and of - * the same flavor, as expressed by the AGG_INHERIT flags. - * The latter requirement is necessary so that certain - * attributes of the I/O, such as whether it's a normal I/O - * or a scrub/resilver, can be preserved in the aggregate. - * We can include optional I/Os, but don't allow them - * to begin a range as they add no benefit in that situation. - */ - - /* - * We keep track of the last non-optional I/O. - */ - mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; - - /* - * Walk backwards through sufficiently contiguous I/Os - * recording the last non-optional I/O. - */ - flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; - t = vdev_queue_type_tree(vq, zio->io_type); - while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && - (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(dio, last) <= limit && - IO_GAP(dio, first) <= maxgap && - dio->io_type == zio->io_type) { - first = dio; - if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) - mandatory = first; - } - - /* - * Skip any initial optional I/Os. 
- */ - while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { - first = AVL_NEXT(t, first); - ASSERT(first != NULL); - } - - /* - * Walk forward through sufficiently contiguous I/Os. - * The aggregation limit does not apply to optional i/os, so that - * we can issue contiguous writes even if they are larger than the - * aggregation limit. - */ - while ((dio = AVL_NEXT(t, last)) != NULL && - (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - (IO_SPAN(first, dio) <= limit || - (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && - IO_GAP(last, dio) <= maxgap && - dio->io_type == zio->io_type) { - last = dio; - if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) - mandatory = last; - } - - /* - * Now that we've established the range of the I/O aggregation - * we must decide what to do with trailing optional I/Os. - * For reads, there's nothing to do. While we are unable to - * aggregate further, it's possible that a trailing optional - * I/O would allow the underlying device to aggregate with - * subsequent I/Os. We must therefore determine if the next - * non-optional I/O is close enough to make aggregation - * worthwhile. - */ - stretch = B_FALSE; - if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { - zio_t *nio = last; - while ((dio = AVL_NEXT(t, nio)) != NULL && - IO_GAP(nio, dio) == 0 && - IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { - nio = dio; - if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { - stretch = B_TRUE; - break; - } - } - } - - if (stretch) { - /* - * We are going to include an optional io in our aggregated - * span, thus closing the write gap. Only mandatory i/os can - * start aggregated spans, so make sure that the next i/o - * after our span is mandatory. 
- */ - dio = AVL_NEXT(t, last); - dio->io_flags &= ~ZIO_FLAG_OPTIONAL; - } else { - /* do not include the optional i/o */ - while (last != mandatory && last != first) { - ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); - last = AVL_PREV(t, last); - ASSERT(last != NULL); - } - } - - if (first == last) - return (NULL); - - size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); - - aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - abd_alloc_for_io(size, B_TRUE), size, first->io_type, - zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); - aio->io_timestamp = first->io_timestamp; - - nio = first; - do { - dio = nio; - nio = AVL_NEXT(t, dio); - zio_add_child(dio, aio); - vdev_queue_io_remove(vq, dio); - } while (dio != last); - - /* - * We need to drop the vdev queue's lock during zio_execute() to - * avoid a deadlock that we could encounter due to lock order - * reversal between vq_lock and io_lock in zio_change_priority(). - * Use the dropped lock to do memory copy without congestion. 
- */ - mutex_exit(&vq->vq_lock); - while ((dio = zio_walk_parents(aio, &zl)) != NULL) { - ASSERT3U(dio->io_type, ==, aio->io_type); - - if (dio->io_flags & ZIO_FLAG_NODATA) { - ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - abd_zero_off(aio->io_abd, - dio->io_offset - aio->io_offset, dio->io_size); - } else if (dio->io_type == ZIO_TYPE_WRITE) { - abd_copy_off(aio->io_abd, dio->io_abd, - dio->io_offset - aio->io_offset, 0, dio->io_size); - } - - zio_vdev_io_bypass(dio); - zio_execute(dio); - } - mutex_enter(&vq->vq_lock); - - return (aio); -} - -static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq) -{ - zio_t *zio, *aio; - zio_priority_t p; - avl_index_t idx; - avl_tree_t *tree; - zio_t search; - -again: - ASSERT(MUTEX_HELD(&vq->vq_lock)); - - p = vdev_queue_class_to_issue(vq); - - if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { - /* No eligible queued i/os */ - return (NULL); - } - - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync), issue the i/o with the lowest timestamp. - */ - tree = vdev_queue_class_tree(vq, p); - search.io_timestamp = 0; - search.io_offset = vq->vq_last_offset + 1; - VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); - ASSERT3U(zio->io_priority, ==, p); - - aio = vdev_queue_aggregate(vq, zio); - if (aio != NULL) - zio = aio; - else - vdev_queue_io_remove(vq, zio); - - /* - * If the I/O is or was optional and therefore has no data, we need to - * simply discard it. We need to drop the vdev queue's lock to avoid a - * deadlock that we could encounter since this I/O will complete - * immediately. 
- */ - if (zio->io_flags & ZIO_FLAG_NODATA) { - mutex_exit(&vq->vq_lock); - zio_vdev_io_bypass(zio); - zio_execute(zio); - mutex_enter(&vq->vq_lock); - goto again; - } - - vdev_queue_pending_add(vq, zio); - vq->vq_last_offset = zio->io_offset; - - return (zio); -} - -zio_t * -vdev_queue_io(zio_t *zio) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - - if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) - return (zio); - - /* - * Children i/os inherent their parent's priority, which might - * not match the child's i/o type. Fix it up here. - */ - if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && - zio->io_priority != ZIO_PRIORITY_ASYNC_READ && - zio->io_priority != ZIO_PRIORITY_SCRUB && - zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) - zio->io_priority = ZIO_PRIORITY_ASYNC_READ; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) - zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; - } else { - ASSERT(zio->io_type == ZIO_TYPE_FREE); - zio->io_priority = ZIO_PRIORITY_TRIM; - } - - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; - - mutex_enter(&vq->vq_lock); - zio->io_timestamp = gethrtime(); - vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq); - mutex_exit(&vq->vq_lock); - - if (nio == NULL) - return (NULL); - - if (nio->io_done == vdev_queue_agg_io_done) { - zio_nowait(nio); - return (NULL); - } - - return (nio); -} - -void -vdev_queue_io_done(zio_t *zio) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - - mutex_enter(&vq->vq_lock); - - vdev_queue_pending_remove(vq, zio); - - vq->vq_io_complete_ts = gethrtime(); - - while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { - mutex_exit(&vq->vq_lock); - if (nio->io_done == vdev_queue_agg_io_done) { - 
zio_nowait(nio); - } else { - zio_vdev_io_reissue(nio); - zio_execute(nio); - } - mutex_enter(&vq->vq_lock); - } - - mutex_exit(&vq->vq_lock); -} - -void -vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; - - /* - * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio - * code to issue IOs without adding them to the vdev queue. In this - * case, the zio is already going to be issued as quickly as possible - * and so it doesn't need any reprioitization to help. - */ - if (zio->io_priority == ZIO_PRIORITY_NOW) - return; - - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - - if (zio->io_type == ZIO_TYPE_READ) { - if (priority != ZIO_PRIORITY_SYNC_READ && - priority != ZIO_PRIORITY_ASYNC_READ && - priority != ZIO_PRIORITY_SCRUB) - priority = ZIO_PRIORITY_ASYNC_READ; - } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - if (priority != ZIO_PRIORITY_SYNC_WRITE && - priority != ZIO_PRIORITY_ASYNC_WRITE) - priority = ZIO_PRIORITY_ASYNC_WRITE; - } - - mutex_enter(&vq->vq_lock); - - /* - * If the zio is in none of the queues we can simply change - * the priority. If the zio is waiting to be submitted we must - * remove it from the queue and re-insert it with the new priority. - * Otherwise, the zio is currently active and we cannot change its - * priority. 
- */ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { - zio->io_priority = priority; - } - - mutex_exit(&vq->vq_lock); -} - -/* - * As these three methods are only used for load calculations we're not concerned - * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex - * use here, instead we prefer to keep it lock free for performance. - */ -int -vdev_queue_length(vdev_t *vd) -{ - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); -} - -uint64_t -vdev_queue_lastoffset(vdev_t *vd) -{ - return (vd->vdev_queue.vq_lastoffset); -} - -void -vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) -{ - vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c deleted file mode 100644 index 29878ea6eaf6..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ /dev/null @@ -1,2707 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#ifdef illumos -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef ZFS_DEBUG -#include /* vdev_xlate testing */ -#endif - -/* - * Virtual device vector for RAID-Z. - * - * This vdev supports single, double, and triple parity. For single parity, - * we use a simple XOR of all the data columns. For double or triple parity, - * we use a special case of Reed-Solomon coding. This extends the - * technique described in "The mathematics of RAID-6" by H. Peter Anvin by - * drawing on the system described in "A Tutorial on Reed-Solomon Coding for - * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the - * former is also based. The latter is designed to provide higher performance - * for writes. - * - * Note that the Plank paper claimed to support arbitrary N+M, but was then - * amended six years later identifying a critical flaw that invalidates its - * claims. Nevertheless, the technique can be adapted to work for up to - * triple parity. For additional parity, the amendment "Note: Correction to - * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding - * is viable, but the additional complexity means that write performance will - * suffer. - * - * All of the methods above operate on a Galois field, defined over the - * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements - * can be expressed with a single byte. 
Briefly, the operations on the - * field are defined as follows: - * - * o addition (+) is represented by a bitwise XOR - * o subtraction (-) is therefore identical to addition: A + B = A - B - * o multiplication of A by 2 is defined by the following bitwise expression: - * - * (A * 2)_7 = A_6 - * (A * 2)_6 = A_5 - * (A * 2)_5 = A_4 - * (A * 2)_4 = A_3 + A_7 - * (A * 2)_3 = A_2 + A_7 - * (A * 2)_2 = A_1 + A_7 - * (A * 2)_1 = A_0 - * (A * 2)_0 = A_7 - * - * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). - * As an aside, this multiplication is derived from the error correcting - * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. - * - * Observe that any number in the field (except for 0) can be expressed as a - * power of 2 -- a generator for the field. We store a table of the powers of - * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can - * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is therefore - * A ^ (255 - 1) = A^254. - * - * The up-to-three parity columns, P, Q, R over several data columns, - * D_0, ... D_n-1, can be expressed by field operations: - * - * P = D_0 + D_1 + ... + D_n-2 + D_n-1 - * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 - * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 - * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 - * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 - * - * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival - * XOR operation, and 2 and 4 can be computed quickly and generate linearly- - * independent coefficients. (There are no additional coefficients that have - * this property which is why the uncorrected Plank method breaks down.) - * - * See the reconstruction code below for how P, Q and R can used individually - * or in concert to recover missing data columns. 
- */ - -typedef struct raidz_col { - uint64_t rc_devidx; /* child device index for I/O */ - uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ - abd_t *rc_abd; /* I/O data */ - void *rc_gdata; /* used to store the "good" version */ - int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? */ -} raidz_col_t; - -typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ - uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ -} raidz_map_t; - -#define VDEV_RAIDZ_P 0 -#define VDEV_RAIDZ_Q 1 -#define VDEV_RAIDZ_R 2 - -#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) -#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) - -/* - * We provide a mechanism to perform the field multiplication operation on a - * 64-bit value all at once rather than a byte at a time. This works by - * creating a mask from the top bit in each byte and using that to - * conditionally apply the XOR of 0x1d. 
- */ -#define VDEV_RAIDZ_64MUL_2(x, mask) \ -{ \ - (mask) = (x) & 0x8080808080808080ULL; \ - (mask) = ((mask) << 1) - ((mask) >> 7); \ - (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ - ((mask) & 0x1d1d1d1d1d1d1d1d); \ -} - -#define VDEV_RAIDZ_64MUL_4(x, mask) \ -{ \ - VDEV_RAIDZ_64MUL_2((x), mask); \ - VDEV_RAIDZ_64MUL_2((x), mask); \ -} - -#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) - -/* - * Force reconstruction to use the general purpose method. - */ -int vdev_raidz_default_to_general; - -/* Powers of 2 in the Galois field defined above. */ -static const uint8_t vdev_raidz_pow2[256] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, - 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, - 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, - 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, - 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, - 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, - 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, - 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, - 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, - 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, - 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, - 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, - 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, - 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, - 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, - 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, - 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, - 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, - 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, - 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, - 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, - 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, - 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, - 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, - 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, - 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, - 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 
0x8a, 0x09, - 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, - 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, - 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, - 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 -}; -/* Logs of 2 in the Galois field defined above. */ -static const uint8_t vdev_raidz_log2[256] = { - 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, - 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, - 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, - 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, - 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, - 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, - 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, - 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, - 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, - 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, - 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, - 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, - 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, - 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, - 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, - 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, - 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, - 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, - 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, - 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, - 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, - 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, - 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, - 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, - 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, - 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, - 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, - 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, - 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, - 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, - 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, - 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, -}; - -static void vdev_raidz_generate_parity(raidz_map_t *rm); - -/* - * Multiply a 
given number by 2 raised to the given power. - */ -static uint8_t -vdev_raidz_exp2(uint_t a, int exp) -{ - if (a == 0) - return (0); - - ASSERT(exp >= 0); - ASSERT(vdev_raidz_log2[a] > 0 || a == 1); - - exp += vdev_raidz_log2[a]; - if (exp > 255) - exp -= 255; - - return (vdev_raidz_pow2[exp]); -} - -static void -vdev_raidz_map_free(raidz_map_t *rm) -{ - int c; - - for (c = 0; c < rm->rm_firstdatacol; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_free(rm->rm_col[c].rc_abd); - - if (rm->rm_col[c].rc_gdata != NULL) - zio_buf_free(rm->rm_col[c].rc_gdata, - rm->rm_col[c].rc_size); - } - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_put(rm->rm_col[c].rc_abd); - } - - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); - - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); -} - -static void -vdev_raidz_map_free_vsd(zio_t *zio) -{ - raidz_map_t *rm = zio->io_vsd; - - ASSERT0(rm->rm_freed); - rm->rm_freed = 1; - - if (rm->rm_reports == 0) - vdev_raidz_map_free(rm); -} - -/*ARGSUSED*/ -static void -vdev_raidz_cksum_free(void *arg, size_t ignored) -{ - raidz_map_t *rm = arg; - - ASSERT3U(rm->rm_reports, >, 0); - - if (--rm->rm_reports == 0 && rm->rm_freed != 0) - vdev_raidz_map_free(rm); -} - -static void -vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) -{ - raidz_map_t *rm = zcr->zcr_cbdata; - size_t c = zcr->zcr_cbinfo; - size_t x; - - const char *good = NULL; - char *bad; - - if (good_data == NULL) { - zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); - return; - } - - if (c < rm->rm_firstdatacol) { - /* - * The first time through, calculate the parity blocks for - * the good data (this relies on the fact that the good - * data never changes for a given logical ZIO) - */ - if (rm->rm_col[0].rc_gdata == NULL) { - abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; - char *buf; - int offset; - - /* - * Set up the rm_col[]s to generate the parity for - * good_data, first saving the parity 
bufs and - * replacing them with buffers to hold the result. - */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_gdata = - zio_buf_alloc(rm->rm_col[x].rc_size); - rm->rm_col[x].rc_abd = - abd_get_from_buf(rm->rm_col[x].rc_gdata, - rm->rm_col[x].rc_size); - } - - /* fill in the data columns from good_data */ - buf = (char *)good_data; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_from_buf(buf, - rm->rm_col[x].rc_size); - buf += rm->rm_col[x].rc_size; - } - - /* - * Construct the parity from the good data. - */ - vdev_raidz_generate_parity(rm); - - /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = bad_parity[x]; - } - - offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset( - rm->rm_abd_copy, offset); - offset += rm->rm_col[x].rc_size; - } - } - - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = rm->rm_col[c].rc_gdata; - } else { - /* adjust good_data to point at the start of our column */ - good = good_data; - - for (x = rm->rm_firstdatacol; x < c; x++) - good += rm->rm_col[x].rc_size; - } - - bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); - /* we drop the ereport if it ends up that the data was good */ - zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); -} - -/* - * Invoked indirectly by zfs_ereport_start_checksum(), called - * below when our read operation fails completely. The main point - * is to keep a copy of everything we read from disk, so that at - * vdev_raidz_cksum_finish() time we can compare it with the good data. 
- */ -static void -vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) -{ - size_t c = (size_t)(uintptr_t)arg; - size_t offset; - - raidz_map_t *rm = zio->io_vsd; - size_t size; - - /* set up the report and bump the refcount */ - zcr->zcr_cbdata = rm; - zcr->zcr_cbinfo = c; - zcr->zcr_finish = vdev_raidz_cksum_finish; - zcr->zcr_free = vdev_raidz_cksum_free; - - rm->rm_reports++; - ASSERT3U(rm->rm_reports, >, 0); - - if (rm->rm_abd_copy != NULL) - return; - - /* - * It's the first time we're called for this raidz_map_t, so we need - * to copy the data aside; there's no guarantee that our zio's buffer - * won't be re-used for something else. - * - * Our parity data is already in separate buffers, so there's no need - * to copy them. - */ - - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; - - rm->rm_abd_copy = - abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); - - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); - - abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; - - offset += col->rc_size; - } - ASSERT3U(offset, ==, size); -} - -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { - vdev_raidz_map_free_vsd, - vdev_raidz_cksum_report -}; - -/* - * Divides the IO evenly across all child vdevs; usually, dcols is - * the number of children in the target vdev. - */ -static raidz_map_t * -vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, - uint64_t unit_shift, uint64_t dcols, uint64_t nparity) -{ - raidz_map_t *rm; - /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> unit_shift; - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> unit_shift; - /* The first column for this stripe. 
*/ - uint64_t f = b % dcols; - /* The starting byte offset on each child vdev. */ - uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; - uint64_t off = 0; - - /* - * "Quotient": The number of data sectors for this stripe on all but - * the "big column" child vdevs that also contain "remainder" data. - */ - q = s / (dcols - nparity); - - /* - * "Remainder": The number of partial stripe data sectors in this I/O. - * This will add a sector to some, but not all, child vdevs. - */ - r = s - q * (dcols - nparity); - - /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); - - /* - * The total number of data and parity sectors associated with - * this I/O. - */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); - - /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ - if (q == 0) { - /* Our I/O request doesn't span all child vdevs. */ - acols = bc; - scols = MIN(dcols, roundup(bc, nparity + 1)); - } else { - acols = dcols; - scols = dcols; - } - - ASSERT3U(acols, <=, scols); - - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); - - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; - - asize = 0; - - for (c = 0; c < scols; c++) { - col = f + c; - coff = o; - if (col >= dcols) { - col -= dcols; - coff += 1ULL << unit_shift; - } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; - - if (c >= acols) - rm->rm_col[c].rc_size = 0; - else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << unit_shift; - else - 
rm->rm_col[c].rc_size = q << unit_shift; - - asize += rm->rm_col[c].rc_size; - } - - ASSERT3U(asize, ==, tot << unit_shift); - rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); - rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); - ASSERT3U(rm->rm_nskip, <=, nparity); - - if (!dofree) { - for (c = 0; c < rm->rm_firstdatacol; c++) { - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); - } - - for (off = 0, c = rm->rm_firstdatacol; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset(abd, off); - off += rm->rm_col[c].rc_size; - } - } - - /* - * If all data stored spans all columns, there's a danger that parity - * will always be on the same device and, since parity isn't read - * during normal operation, that that device's I/O bandwidth won't be - * used effectively. We therefore switch the parity every 1MB. - * - * ... at least that was, ostensibly, the theory. As a practical - * matter unless we juggle the parity between all devices evenly, we - * won't see any benefit. Further, occasional writes that aren't a - * multiple of the LCM of the number of children and the minimum - * stripe width are sufficient to avoid pessimal behavior. - * Unfortunately, this decision created an implicit on-disk format - * requirement that we need to support for all eternity, but only - * for single-parity RAID-Z. - * - * If we intend to skip a sector in the zeroth column for padding - * we must make sure to note this swap. We will never intend to - * skip the first column since at least one data and one parity - * column must appear in each row. 
- */ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - - if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; - - if (rm->rm_skipstart == 0) - rm->rm_skipstart = 1; - } - - return (rm); -} - -struct pqr_struct { - uint64_t *p; - uint64_t *q; - uint64_t *r; -}; - -static int -vdev_raidz_p_func(void *buf, size_t size, void *private) -{ - struct pqr_struct *pqr = private; - const uint64_t *src = buf; - int i, cnt = size / sizeof (src[0]); - - ASSERT(pqr->p && !pqr->q && !pqr->r); - - for (i = 0; i < cnt; i++, src++, pqr->p++) - *pqr->p ^= *src; - - return (0); -} - -static int -vdev_raidz_pq_func(void *buf, size_t size, void *private) -{ - struct pqr_struct *pqr = private; - const uint64_t *src = buf; - uint64_t mask; - int i, cnt = size / sizeof (src[0]); - - ASSERT(pqr->p && pqr->q && !pqr->r); - - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { - *pqr->p ^= *src; - VDEV_RAIDZ_64MUL_2(*pqr->q, mask); - *pqr->q ^= *src; - } - - return (0); -} - -static int -vdev_raidz_pqr_func(void *buf, size_t size, void *private) -{ - struct pqr_struct *pqr = private; - const uint64_t *src = buf; - uint64_t mask; - int i, cnt = size / sizeof (src[0]); - - ASSERT(pqr->p && pqr->q && pqr->r); - - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { - *pqr->p ^= *src; - VDEV_RAIDZ_64MUL_2(*pqr->q, mask); - *pqr->q ^= *src; - VDEV_RAIDZ_64MUL_4(*pqr->r, mask); - *pqr->r ^= *src; - } - - return (0); -} - -static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) -{ - uint64_t *p; - int c; - abd_t *src; - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - - if (c == rm->rm_firstdatacol) { - 
abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_p_func, &pqr); - } - } -} - -static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) -{ - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, q, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_pq_func, &pqr); - } - - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { - p[i] = 0; - q[i] = 0; - } - } else { - /* - * Treat short columns as though they are full of 0s. - * Note that there's therefore nothing needed for P. 
- */ - for (i = ccnt; i < pcnt; i++) { - VDEV_RAIDZ_64MUL_2(q[i], mask); - } - } - } -} - -static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) -{ - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, q, r }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_pqr_func, &pqr); - } - - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { - p[i] = 0; - q[i] = 0; - r[i] = 0; - } - } else { - /* - * Treat short columns as though they are full of 0s. - * Note that there's therefore nothing needed for P. - */ - for (i = ccnt; i < pcnt; i++) { - VDEV_RAIDZ_64MUL_2(q[i], mask); - VDEV_RAIDZ_64MUL_4(r[i], mask); - } - } - } -} - -/* - * Generate RAID parity in the first virtual columns according to the number of - * parity columns available. 
- */ -static void -vdev_raidz_generate_parity(raidz_map_t *rm) -{ - switch (rm->rm_firstdatacol) { - case 1: - vdev_raidz_generate_parity_p(rm); - break; - case 2: - vdev_raidz_generate_parity_pq(rm); - break; - case 3: - vdev_raidz_generate_parity_pqr(rm); - break; - default: - cmn_err(CE_PANIC, "invalid RAID-Z configuration"); - } -} - -/* ARGSUSED */ -static int -vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) -{ - uint64_t *dst = dbuf; - uint64_t *src = sbuf; - int cnt = size / sizeof (src[0]); - - for (int i = 0; i < cnt; i++) { - dst[i] ^= src[i]; - } - - return (0); -} - -/* ARGSUSED */ -static int -vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, - void *private) -{ - uint64_t *dst = dbuf; - uint64_t *src = sbuf; - uint64_t mask; - int cnt = size / sizeof (dst[0]); - - for (int i = 0; i < cnt; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - return (0); -} - -/* ARGSUSED */ -static int -vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) -{ - uint64_t *dst = buf; - uint64_t mask; - int cnt = size / sizeof (dst[0]); - - for (int i = 0; i < cnt; i++, dst++) { - /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ - VDEV_RAIDZ_64MUL_2(*dst, mask); - } - - return (0); -} - -struct reconst_q_struct { - uint64_t *q; - int exp; -}; - -static int -vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) -{ - struct reconst_q_struct *rq = private; - uint64_t *dst = buf; - int cnt = size / sizeof (dst[0]); - - for (int i = 0; i < cnt; i++, dst++, rq->q++) { - *dst ^= *rq->q; - - int j; - uint8_t *b; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, rq->exp); - } - } - - return (0); -} - -struct reconst_pq_struct { - uint8_t *p; - uint8_t *q; - uint8_t *pxy; - uint8_t *qxy; - int aexp; - int bexp; -}; - -static int -vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) -{ - struct 
reconst_pq_struct *rpq = private; - uint8_t *xd = xbuf; - uint8_t *yd = ybuf; - - for (int i = 0; i < size; - i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ - vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); - *yd = *rpq->p ^ *rpq->pxy ^ *xd; - } - - return (0); -} - -static int -vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) -{ - struct reconst_pq_struct *rpq = private; - uint8_t *xd = xbuf; - - for (int i = 0; i < size; - i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { - /* same operation as vdev_raidz_reconst_pq_func() on xd */ - *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ - vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); - } - - return (0); -} - -static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) -{ - int x = tgts[0]; - int c; - abd_t *dst, *src; - - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); - - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); - - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; - - abd_copy(dst, src, rm->rm_col[x].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); - - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; - - if (c == x) - continue; - - (void) abd_iterate_func2(dst, src, 0, 0, size, - vdev_raidz_reconst_p_func, NULL); - } - - return (1 << VDEV_RAIDZ_P); -} - -static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) -{ - int x = tgts[0]; - int c, exp; - abd_t *dst, *src; - - ASSERT(ntgts == 1); - - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 
0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); - - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; - - if (c == rm->rm_firstdatacol) { - abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) - abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); - } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); - (void) abd_iterate_func2(dst, src, 0, 0, size, - vdev_raidz_reconst_q_pre_func, NULL); - (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, - vdev_raidz_reconst_q_pre_tail_func, NULL); - } - } - - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); - - struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, - vdev_raidz_reconst_q_post_func, &rq); - - return (1 << VDEV_RAIDZ_Q); -} - -static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) -{ - uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; - abd_t *pdata, *qdata; - uint64_t xsize, ysize; - int x = tgts[0]; - int y = tgts[1]; - abd_t *xd, *yd; - - ASSERT(ntgts == 2); - ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); - - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); - - /* - * Move the parity data aside -- we're going to compute parity as - * though columns x and y were full of zeros -- Pxy and Qxy. We want to - * reuse the parity generation mechanism without trashing the actual - * parity so we make those columns appear to be full of zeros by - * setting their lengths to zero. 
- */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; - - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; - - vdev_raidz_generate_parity_pq(rm); - - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; - - p = abd_to_buf(pdata); - q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; - - /* - * We now have: - * Pxy = P + D_x + D_y - * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y - * - * We can then solve for D_x: - * D_x = A * (P + Pxy) + B * (Q + Qxy) - * where - * A = 2^(x - y) * (2^(x - y) + 1)^-1 - * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 - * - * With D_x in hand, we can easily solve for D_y: - * D_y = P + Pxy + D_x - */ - - a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; - tmp = 255 - vdev_raidz_log2[a ^ 1]; - - aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; - bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - - ASSERT3U(xsize, >=, ysize); - struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; - (void) abd_iterate_func2(xd, yd, 0, 0, ysize, - vdev_raidz_reconst_pq_func, &rpq); - (void) abd_iterate_func(xd, ysize, xsize - ysize, - vdev_raidz_reconst_pq_tail_func, &rpq); - - abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - - /* - * Restore the saved parity data. 
- */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; - - return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); -} - -/* BEGIN CSTYLED */ -/* - * In the general case of reconstruction, we must solve the system of linear - * equations defined by the coeffecients used to generate parity as well as - * the contents of the data and parity disks. This can be expressed with - * vectors for the original data (D) and the actual data (d) and parity (p) - * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): - * - * __ __ __ __ - * | | __ __ | p_0 | - * | V | | D_0 | | p_m-1 | - * | | x | : | = | d_0 | - * | I | | D_n-1 | | : | - * | | ~~ ~~ | d_n-1 | - * ~~ ~~ ~~ ~~ - * - * I is simply a square identity matrix of size n, and V is a vandermonde - * matrix defined by the coeffecients we chose for the various parity columns - * (1, 2, 4). Note that these values were chosen both for simplicity, speedy - * computation as well as linear separability. - * - * __ __ __ __ - * | 1 .. 1 1 1 | | p_0 | - * | 2^n-1 .. 4 2 1 | __ __ | : | - * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | - * | 1 .. 0 0 0 | | D_1 | | d_0 | - * | 0 .. 0 0 0 | x | D_2 | = | d_1 | - * | : : : : | | : | | d_2 | - * | 0 .. 1 0 0 | | D_n-1 | | : | - * | 0 .. 0 1 0 | ~~ ~~ | : | - * | 0 .. 0 0 1 | | d_n-1 | - * ~~ ~~ ~~ ~~ - * - * Note that I, V, d, and p are known. To compute D, we must invert the - * matrix and use the known data and parity values to reconstruct the unknown - * data values. We begin by removing the rows in V|I and d|p that correspond - * to failed or missing columns; we then make V|I square (n x n) and d|p - * sized n by removing rows corresponding to unused parity from the bottom up - * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' - * using Gauss-Jordan elimination. 
In the example below we use m=3 parity - * columns, n=8 data columns, with errors in d_1, d_2, and p_1: - * __ __ - * | 1 1 1 1 1 1 1 1 | - * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks - * | 19 205 116 29 64 16 4 1 | / / - * | 1 0 0 0 0 0 0 0 | / / - * | 0 1 0 0 0 0 0 0 | <--' / - * (V|I) = | 0 0 1 0 0 0 0 0 | <---' - * | 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 1 1 1 1 1 1 1 | - * | 19 205 116 29 64 16 4 1 | - * | 1 0 0 0 0 0 0 0 | - * (V|I)' = | 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * - * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We - * have carefully chosen the seed values 1, 2, and 4 to ensure that this - * matrix is not singular. - * __ __ - * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | - * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | - * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | - * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 
1 | - * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | - * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | - * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 0 0 1 0 0 0 0 0 | - * | 167 100 5 41 159 169 217 208 | - * | 166 100 4 40 158 168 216 209 | - * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * - * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values - * of the missing data. - * - * As is apparent from the example above, the only non-trivial rows in the - * inverse matrix correspond to the data disks that we're trying to - * reconstruct. Indeed, those are the only rows we need as the others would - * only be useful for reconstructing data known or assumed to be valid. For - * that reason, we only build the coefficients in the rows that correspond to - * targeted columns. 
- */ -/* END CSTYLED */ - -static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, - uint8_t **rows) -{ - int i, j; - int pow; - - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); - - /* - * Fill in the missing rows of interest. - */ - for (i = 0; i < nmap; i++) { - ASSERT3S(0, <=, map[i]); - ASSERT3S(map[i], <=, 2); - - pow = map[i] * n; - if (pow > 255) - pow -= 255; - ASSERT(pow <= 255); - - for (j = 0; j < n; j++) { - pow -= map[i]; - if (pow < 0) - pow += 255; - rows[i][j] = vdev_raidz_pow2[pow]; - } - } -} - -static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, - uint8_t **rows, uint8_t **invrows, const uint8_t *used) -{ - int i, j, ii, jj; - uint8_t log; - - /* - * Assert that the first nmissing entries from the array of used - * columns correspond to parity columns and that subsequent entries - * correspond to data columns. - */ - for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); - } - for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); - } - - /* - * First initialize the storage where we'll compute the inverse rows. - */ - for (i = 0; i < nmissing; i++) { - for (j = 0; j < n; j++) { - invrows[i][j] = (i == j) ? 1 : 0; - } - } - - /* - * Subtract all trivial rows from the rows of consequence. - */ - for (i = 0; i < nmissing; i++) { - for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; - ASSERT3S(jj, <, n); - invrows[i][j] = rows[i][jj]; - rows[i][jj] = 0; - } - } - - /* - * For each of the rows of interest, we must normalize it and subtract - * a multiple of it from the other rows. - */ - for (i = 0; i < nmissing; i++) { - for (j = 0; j < missing[i]; j++) { - ASSERT0(rows[i][j]); - } - ASSERT3U(rows[i][missing[i]], !=, 0); - - /* - * Compute the inverse of the first element and multiply each - * element in the row by that value. 
- */ - log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; - - for (j = 0; j < n; j++) { - rows[i][j] = vdev_raidz_exp2(rows[i][j], log); - invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); - } - - for (ii = 0; ii < nmissing; ii++) { - if (i == ii) - continue; - - ASSERT3U(rows[ii][missing[i]], !=, 0); - - log = vdev_raidz_log2[rows[ii][missing[i]]]; - - for (j = 0; j < n; j++) { - rows[ii][j] ^= - vdev_raidz_exp2(rows[i][j], log); - invrows[ii][j] ^= - vdev_raidz_exp2(invrows[i][j], log); - } - } - } - - /* - * Verify that the data that is left in the rows are properly part of - * an identity matrix. - */ - for (i = 0; i < nmissing; i++) { - for (j = 0; j < n; j++) { - if (j == missing[i]) { - ASSERT3U(rows[i][j], ==, 1); - } else { - ASSERT0(rows[i][j]); - } - } - } -} - -static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, - int *missing, uint8_t **invrows, const uint8_t *used) -{ - int i, j, x, cc, c; - uint8_t *src; - uint64_t ccount; - uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; - uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; - uint8_t log = 0; - uint8_t val; - int ll; - uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; - uint8_t *p, *pp; - size_t psize; - - psize = sizeof (invlog[0][0]) * n * nmissing; - p = kmem_alloc(psize, KM_SLEEP); - - for (pp = p, i = 0; i < nmissing; i++) { - invlog[i] = pp; - pp += n; - } - - for (i = 0; i < nmissing; i++) { - for (j = 0; j < n; j++) { - ASSERT3U(invrows[i][j], !=, 0); - invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; - } - } - - for (i = 0; i < n; i++) { - c = used[i]; - ASSERT3U(c, <, rm->rm_cols); - - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; - for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); - ASSERT3U(cc, !=, c); - - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; - } - - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - - for (x = 0; x < 
ccount; x++, src++) { - if (*src != 0) - log = vdev_raidz_log2[*src]; - - for (cc = 0; cc < nmissing; cc++) { - if (x >= dcount[cc]) - continue; - - if (*src == 0) { - val = 0; - } else { - if ((ll = log + invlog[cc][i]) >= 255) - ll -= 255; - val = vdev_raidz_pow2[ll]; - } - - if (i == 0) - dst[cc][x] = val; - else - dst[cc][x] ^= val; - } - } - } - - kmem_free(p, psize); -} - -static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) -{ - int n, i, c, t, tt; - int nmissing_rows; - int missing_rows[VDEV_RAIDZ_MAXPARITY]; - int parity_map[VDEV_RAIDZ_MAXPARITY]; - - uint8_t *p, *pp; - size_t psize; - - uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; - uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; - uint8_t *used; - - abd_t **bufs = NULL; - - int code = 0; - - /* - * Matrix reconstruction can't use scatter ABDs yet, so we allocate - * temporary linear ABDs. - */ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - - bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); - } - } - - n = rm->rm_cols - rm->rm_firstdatacol; - - /* - * Figure out which data columns are missing. - */ - nmissing_rows = 0; - for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { - missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; - } - } - - /* - * Figure out which parity columns to use to help generate the missing - * data columns. - */ - for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { - ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); - - /* - * Skip any targeted parity columns. 
- */ - if (c == tgts[tt]) { - tt++; - continue; - } - - code |= 1 << c; - - parity_map[i] = c; - i++; - } - - ASSERT(code != 0); - ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); - - psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * - nmissing_rows * n + sizeof (used[0]) * n; - p = kmem_alloc(psize, KM_SLEEP); - - for (pp = p, i = 0; i < nmissing_rows; i++) { - rows[i] = pp; - pp += n; - invrows[i] = pp; - pp += n; - } - used = pp; - - for (i = 0; i < nmissing_rows; i++) { - used[i] = parity_map[i]; - } - - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { - tt++; - continue; - } - - ASSERT3S(i, <, n); - used[i] = c; - i++; - } - - /* - * Initialize the interesting rows of the matrix. - */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); - - /* - * Invert the matrix. - */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, - invrows, used); - - /* - * Reconstruct the missing data using the generated matrix. - */ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, - invrows, used); - - kmem_free(p, psize); - - /* - * copy back from temporary linear abds and free them - */ - if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); - col->rc_abd = bufs[c]; - } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); - } - - return (code); -} - -static int -vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) -{ - int tgts[VDEV_RAIDZ_MAXPARITY], *dt; - int ntgts; - int i, c; - int code; - int nbadparity, nbaddata; - int parity_valid[VDEV_RAIDZ_MAXPARITY]; - - /* - * The tgts list must already be sorted. 
- */ - for (i = 1; i < nt; i++) { - ASSERT(t[i] > t[i - 1]); - } - - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; - ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) - parity_valid[c] = B_FALSE; - - if (i < nt && c == t[i]) { - tgts[ntgts++] = c; - i++; - } else if (rm->rm_col[c].rc_error != 0) { - tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { - nbaddata--; - } else { - parity_valid[c] = B_TRUE; - nbadparity--; - } - } - - ASSERT(ntgts >= nt); - ASSERT(nbaddata >= 0); - ASSERT(nbaddata + nbadparity == ntgts); - - dt = &tgts[nbadparity]; - - /* - * See if we can use any of our optimized reconstruction routines. - */ - if (!vdev_raidz_default_to_general) { - switch (nbaddata) { - case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); - - ASSERT(rm->rm_firstdatacol > 1); - - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); - - ASSERT(rm->rm_firstdatacol > 2); - break; - - case 2: - ASSERT(rm->rm_firstdatacol > 1); - - if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); - - ASSERT(rm->rm_firstdatacol > 2); - - break; - } - } - - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); - ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); - ASSERT(code > 0); - return (code); -} - -static int -vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; - int c; - int lasterror = 0; - int numerrors = 0; - - ASSERT(nparity > 0); - - if (nparity > VDEV_RAIDZ_MAXPARITY || - vd->vdev_children < nparity + 1) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - vdev_open_children(vd); - - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; - - if (cvd->vdev_open_error != 0) { - lasterror = cvd->vdev_open_error; - 
numerrors++; - continue; - } - - *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); - } - - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; - - if (numerrors > nparity) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - return (0); -} - -static void -vdev_raidz_close(vdev_t *vd) -{ - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -#ifdef illumos -/* - * Handle a read or write I/O to a RAID-Z dump device. - * - * The dump device is in a unique situation compared to other ZFS datasets: - * writing to this device should be as simple and fast as possible. In - * addition, durability matters much less since the dump will be extracted - * once the machine reboots. For that reason, this function eschews parity for - * performance and simplicity. The dump device uses the checksum setting - * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this - * dataset. - * - * Blocks of size 128 KB have been preallocated for this volume. I/Os less than - * 128 KB will not fill an entire block; in addition, they may not be properly - * aligned. In that case, this function uses the preallocated 128 KB block and - * omits reading or writing any "empty" portions of that block, as opposed to - * allocating a fresh appropriately-sized block. - * - * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: - * - * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) - * - * If this were a standard RAID-Z dataset, a block of at least 40 KB would be - * allocated which spans all five child vdevs. 8 KB of data would be written to - * each of four vdevs, with the fifth containing the parity bits. 
- * - * parity data data data data - * | PP | XX | XX | XX | XX | - * ^ ^ ^ ^ ^ - * | | | | | - * 8 KB parity ------8 KB data blocks------ - * - * However, when writing to the dump device, the behavior is different: - * - * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) - * - * Unlike the normal RAID-Z case in which the block is allocated based on the - * I/O size, reads and writes here always use a 128 KB logical I/O size. If the - * I/O size is less than 128 KB, only the actual portions of data are written. - * In this example the data is written to the third data vdev since that vdev - * contains the offset [64 KB, 96 KB). - * - * parity data data data data - * | | | | XX | | - * ^ - * | - * 32 KB data block - * - * As a result, an individual I/O may not span all child vdevs; moreover, a - * small I/O may only operate on a single child vdev. - * - * Note that since there are no parity bits calculated or written, this format - * remains the same no matter how many parity bits are used in a normal RAID-Z - * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above - * would look like: - * - * parity parity parity data data data data - * | | | | | | XX | | - * ^ - * | - * 32 KB data block - */ -int -vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, - uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) -{ - vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, err = 0; - - uint64_t start, end, colstart, colend; - uint64_t coloffset, colsize, colskip; - - int flags = doread ? BIO_READ : BIO_WRITE; - -#ifdef _KERNEL - - /* - * Don't write past the end of the block - */ - VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); - - start = offset; - end = start + size; - - /* - * Allocate a RAID-Z map for this block. 
Note that this block starts - * from the "original" offset, this is, the offset of the extent which - * contains the requisite offset of the data being read or written. - * - * Even if this I/O operation doesn't span the full block size, let's - * treat the on-disk format as if the only blocks are the complete 128 - * KB size. - */ - abd_t *abd = abd_get_from_buf(data - (offset - origoffset), - SPA_OLD_MAXBLOCKSIZE); - rm = vdev_raidz_map_alloc(abd, - SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, - vd->vdev_children, vd->vdev_nparity); - - coloffset = origoffset; - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; - c++, coloffset += rc->rc_size) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Find the start and end of this column in the RAID-Z map, - * keeping in mind that the stated size and offset of the - * operation may not fill the entire column for this vdev. - * - * If any portion of the data spans this column, issue the - * appropriate operation to the vdev. - */ - if (coloffset + rc->rc_size <= start) - continue; - if (coloffset >= end) - continue; - - colstart = MAX(coloffset, start); - colend = MIN(end, coloffset + rc->rc_size); - colsize = colend - colstart; - colskip = colstart - coloffset; - - VERIFY3U(colsize, <=, rc->rc_size); - VERIFY3U(colskip, <=, rc->rc_size); - - /* - * Note that the child vdev will have a vdev label at the start - * of its range of offsets, hence the need for - * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another - * example of why this calculation is needed. 
- */ - if ((err = vdev_disk_physio(cvd, - ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, - VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, - flags, isdump)) != 0) - break; - } - - vdev_raidz_map_free(rm); - abd_put(abd); -#endif /* KERNEL */ - - return (err); -} -#endif - -static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) -{ - uint64_t asize; - uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; - - asize = ((psize - 1) >> ashift) + 1; - asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); - asize = roundup(asize, nparity + 1) << ashift; - - return (asize); -} - -static void -vdev_raidz_child_done(zio_t *zio) -{ - raidz_col_t *rc = zio->io_private; - - rc->rc_error = zio->io_error; - rc->rc_tried = 1; - rc->rc_skipped = 0; -} - -static void -vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) -{ -#ifdef ZFS_DEBUG - vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - - range_seg_t logical_rs, physical_rs; - logical_rs.rs_start = zio->io_offset; - logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, zio->io_size); - - raidz_col_t *rc = &rm->rm_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - - vdev_xlate(cvd, &logical_rs, &physical_rs); - ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); - ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); - /* - * It would be nice to assert that rs_end is equal - * to rc_offset + rc_size but there might be an - * optional I/O at the end that is not accounted in - * rc_size. - */ - if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { - ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); - } else { - ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); - } -#endif -} - -/* - * Start an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Generate the parity data - * 2. 
Create child zio write operations to each column's vdev, for both - * data and parity. - * 3. If the column skips any sectors for padding, create optional dummy - * write zio children for those areas to improve aggregation continuity. - * - For read operations: - * 1. Create child zio read operations to each data column's vdev to read - * the range of data required for zio. - * 2. If this is a scrub or resilver operation, or if any of the data - * vdevs have had errors, then create zio read operations to the parity - * columns' VDevs as well. - */ -static void -vdev_raidz_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, i; - - rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, - zio->io_type == ZIO_TYPE_FREE, - tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); - - if (zio->io_type == ZIO_TYPE_FREE) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - - zio_execute(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Verify physical to logical translation. - */ - vdev_raidz_io_verify(zio, rm, c); - - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. 
- */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } - - zio_execute(zio); - return; - } - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - /* - * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity. - */ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ENXIO); - rc->rc_tried = 1; /* don't even try */ - rc->rc_skipped = 1; - continue; - } - if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ESTALE); - rc->rc_skipped = 1; - continue; - } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - } - - zio_execute(zio); -} - - -/* - * Report a checksum error for a child of a RAID-Z device. 
- */ -static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) -{ - void *buf; - vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - zio_bad_cksum_t zbc; - raidz_map_t *rm = zio->io_vsd; - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = rm->rm_ecksuminjected; - - buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, buf, bad_data, - &zbc); - abd_return_buf(rc->rc_abd, buf, rc->rc_size); - } -} - -/* - * We keep track of whether or not there were any injected errors, so that - * any ereports we generate can note it. - */ -static int -raidz_checksum_verify(zio_t *zio) -{ - zio_bad_cksum_t zbc; - raidz_map_t *rm = zio->io_vsd; - - int ret = zio_checksum_error(zio, &zbc); - if (ret != 0 && zbc.zbc_injected != 0) - rm->rm_ecksuminjected = 1; - - return (ret); -} - -/* - * Generate the parity from the data columns. If we tried and were able to - * read the parity without error, verify that the generated parity matches the - * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. - */ -static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) -{ - void *orig[VDEV_RAIDZ_MAXPARITY]; - int c, ret = 0; - raidz_col_t *rc; - - blkptr_t *bp = zio->io_bp; - enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - - if (checksum == ZIO_CHECKSUM_NOPARITY) - return (ret); - - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; - if (!rc->rc_tried || rc->rc_error != 0) - continue; - orig[c] = zio_buf_alloc(rc->rc_size); - abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); - } - - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; - if (!rc->rc_tried || rc->rc_error != 0) - continue; - if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { - raidz_checksum_error(zio, rc, orig[c]); - rc->rc_error = SET_ERROR(ECKSUM); - ret++; - } - zio_buf_free(orig[c], rc->rc_size); - } - - return (ret); -} - -/* - * Keep statistics on all the ways that we used parity to correct data. - */ -static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; - -static int -vdev_raidz_worst_error(raidz_map_t *rm) -{ - int error = 0; - - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); - - return (error); -} - -/* - * Iterate over all combinations of bad data and attempt a reconstruction. - * Note that the algorithm below is non-optimal because it doesn't take into - * account how reconstruction is actually performed. For example, with - * triple-parity RAID-Z the reconstruction procedure is the same if column 4 - * is targeted as invalid as if columns 1 and 4 are targeted since in both - * cases we'd only use parity information in column 0. - */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) -{ - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - void *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int current, next, i, c, n; - int code, ret = 0; - - ASSERT(total_errors < rm->rm_firstdatacol); - - /* - * This simplifies one edge condition. 
- */ - tgts[-1] = -1; - - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { - /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. - */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; - } - - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); - } - - tgts[i] = c++; - } - - /* - * Setting tgts[n] simplifies the other edge condition. - */ - tgts[n] = rm->rm_cols; - - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); - } - - orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); - - current = 0; - next = tgts[current]; - - while (current != n) { - tgts[current] = next; - current = 0; - - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy_to_buf(orig[i], rc->rc_abd, - rc->rc_size); - } - - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } - - ret = code; - goto done; - } - - /* - * Restore the original data. 
- */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy_from_buf(rc->rc_abd, orig[i], - rc->rc_size); - } - - do { - /* - * Find the next valid column after the current - * position.. - */ - for (next = tgts[current] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; - - ASSERT(next <= tgts[current + 1]); - - /* - * If that spot is available, we're done here. - */ - if (next != tgts[current + 1]) - break; - - /* - * Otherwise, find the next valid column after - * the previous position. - */ - for (c = tgts[current - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[current] = c; - current++; - - } while (current != n); - } - } - n--; -done: - for (i = 0; i < n; i++) { - zio_buf_free(orig[i], rm->rm_col[0].rc_size); - } - - return (ret); -} - -/* - * Complete an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Check for errors on the child IOs. - * 2. Return, setting an error code if too few child VDevs were written - * to reconstruct the data later. Note that partial writes are - * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. 
- */ -static void -vdev_raidz_io_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - int unexpected_errors = 0; - int parity_errors = 0; - int parity_untried = 0; - int data_errors = 0; - int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ - - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - - if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - if (c < rm->rm_firstdatacol) - parity_errors++; - else - data_errors++; - - if (!rc->rc_skipped) - unexpected_errors++; - - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { - parity_untried++; - } - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. - */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); - - return; - } else if (zio->io_type == ZIO_TYPE_FREE) { - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); - /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. perform combinatorial reconstruction - * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. 
- */ - - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; - } - } else { - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - - /* - * Identify the data columns that reported an error. - */ - n = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) { - ASSERT(n < VDEV_RAIDZ_MAXPARITY); - tgts[n++] = c; - } - } - - ASSERT(rm->rm_firstdatacol >= n); - - code = vdev_raidz_reconstruct(rm, tgts, n); - - if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); - - /* - * If we read more parity disks than were used - * for reconstruction, confirm that the other - * parity disks produced correct data. 
This - * routine is suboptimal in that it regenerates - * the parity that we already used in addition - * to the parity that we're attempting to - * verify, but this should be a relatively - * uncommon case, and can be optimized if it - * becomes a problem. Note that we regenerate - * parity when resilvering so we can write it - * out to failed devices later. - */ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - - goto done; - } - } - } - - /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. - */ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; - - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); - - return; - } - - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. 
- */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); - - } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { - /* - * If we didn't use all the available parity for the - * combinatorial reconstruction, verify that the remaining - * parity is correct. - */ - if (code != (1 << rm->rm_firstdatacol) - 1) - (void) raidz_parity_verify(zio, rm); - } else { - /* - * We're here because either: - * - * total_errors == rm_firstdatacol, or - * vdev_raidz_combrec() failed - * - * In either case, there is enough bad data to prevent - * reconstruction. - * - * Start checksum ereports for all children which haven't - * failed, and the IO wasn't speculative. - */ - zio->io_error = SET_ERROR(ECKSUM); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error == 0) { - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = - rm->rm_ecksuminjected; - - zfs_ereport_start_checksum( - zio->io_spa, - vd->vdev_child[rc->rc_devidx], - zio, rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); - } - } - } - } - -done: - zio_checksum_verified(zio); - - if (zio->io_error == 0 && spa_writeable(zio->io_spa) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { - /* - * Use the good data we have in hand to repair damaged children. - */ - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - if (rc->rc_error == 0) - continue; - - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); - } - } -} - -static void -vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted > vd->vdev_nparity) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded + faulted != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); -} - -/* - * Determine if any portion of the provided block resides on a child vdev - * with a dirty DTL and therefore needs to be resilvered. The function - * assumes that at least one DTL is dirty which imples that full stripe - * width blocks must be resilvered. - */ -static boolean_t -vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) -{ - uint64_t dcols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; - uint64_t ashift = vd->vdev_top->vdev_ashift; - /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> ashift; - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = ((psize - 1) >> ashift) + 1; - /* The first column for this stripe. */ - uint64_t f = b % dcols; - - if (s + nparity >= dcols) - return (B_TRUE); - - for (uint64_t c = 0; c < s + nparity; c++) { - uint64_t devidx = (f + c) % dcols; - vdev_t *cvd = vd->vdev_child[devidx]; - - /* - * dsl_scan_need_resilver() already checked vd with - * vdev_dtl_contains(). So here just check cvd with - * vdev_dtl_empty(), cheaper and a good approximation. 
- */ - if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) - return (B_TRUE); - } - - return (B_FALSE); -} - -static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) -{ - vdev_t *raidvd = cvd->vdev_parent; - ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - - uint64_t width = raidvd->vdev_children; - uint64_t tgt_col = cvd->vdev_id; - uint64_t ashift = raidvd->vdev_top->vdev_ashift; - - /* make sure the offsets are block-aligned */ - ASSERT0(in->rs_start % (1 << ashift)); - ASSERT0(in->rs_end % (1 << ashift)); - uint64_t b_start = in->rs_start >> ashift; - uint64_t b_end = in->rs_end >> ashift; - - uint64_t start_row = 0; - if (b_start > tgt_col) /* avoid underflow */ - start_row = ((b_start - tgt_col - 1) / width) + 1; - - uint64_t end_row = 0; - if (b_end > tgt_col) - end_row = ((b_end - tgt_col - 1) / width) + 1; - - res->rs_start = start_row << ashift; - res->rs_end = end_row << ashift; - - ASSERT3U(res->rs_start, <=, in->rs_start); - ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); -} - -vdev_ops_t vdev_raidz_ops = { - vdev_raidz_open, - vdev_raidz_close, - vdev_raidz_asize, - vdev_raidz_io_start, - vdev_raidz_io_done, - vdev_raidz_state_change, - vdev_raidz_need_resilver, - NULL, - NULL, - NULL, - vdev_raidz_xlate, - VDEV_TYPE_RAIDZ, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c deleted file mode 100644 index ab51c8c79055..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c +++ /dev/null @@ -1,2156 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * This file contains the necessary logic to remove vdevs from a - * storage pool. Currently, the only devices that can be removed - * are log, cache, and spare devices; and top level vdevs from a pool - * w/o raidz. (Note that members of a mirror can also be removed - * by the detach operation.) - * - * Log vdevs are removed by evacuating them and then turning the vdev - * into a hole vdev while holding spa config locks. - * - * Top level vdevs are removed and converted into an indirect vdev via - * a multi-step process: - * - * - Disable allocations from this device (spa_vdev_remove_top). - * - * - From a new thread (spa_vdev_remove_thread), copy data from - * the removing vdev to a different vdev. The copy happens in open - * context (spa_vdev_copy_impl) and issues a sync task - * (vdev_mapping_sync) so the sync thread can update the partial - * indirect mappings in core and on disk. 
- * - * - If a free happens during a removal, it is freed from the - * removing vdev, and if it has already been copied, from the new - * location as well (free_from_removing_vdev). - * - * - After the removal is completed, the copy thread converts the vdev - * into an indirect vdev (vdev_remove_complete) before instructing - * the sync thread to destroy the space maps and finish the removal - * (spa_finish_removal). - */ - -typedef struct vdev_copy_arg { - metaslab_t *vca_msp; - uint64_t vca_outstanding_bytes; - kcondvar_t vca_cv; - kmutex_t vca_lock; -} vdev_copy_arg_t; - -/* - * The maximum amount of memory we can use for outstanding i/o while - * doing a device removal. This determines how much i/o we can have - * in flight concurrently. - */ -int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; - -/* - * The largest contiguous segment that we will attempt to allocate when - * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If - * there is a performance problem with attempting to allocate large blocks, - * consider decreasing this. - * - * Note: we will issue I/Os of up to this size. The mpt driver does not - * respond well to I/Os larger than 1MB, so we set this to 1MB. (When - * mpt processes an I/O larger than 1MB, it needs to do an allocation of - * 2 physically contiguous pages; if this allocation fails, mpt will drop - * the I/O and hang the device.) - */ -int zfs_remove_max_segment = 1024 * 1024; - -/* - * Allow a remap segment to span free chunks of at most this size. The main - * impact of a larger span is that we will read and write larger, more - * contiguous chunks, with more "unnecessary" data -- trading off bandwidth - * for iops. The value here was chosen to align with - * zfs_vdev_read_gap_limit, which is a similar concept when doing regular - * reads (but there's no reason it has to be the same). 
- * - * Additionally, a higher span will have the following relatively minor - * effects: - * - the mapping will be smaller, since one entry can cover more allocated - * segments - * - more of the fragmentation in the removing device will be preserved - * - we'll do larger allocations, which may fail and fall back on smaller - * allocations - */ -int vdev_removal_max_span = 32 * 1024; - -/* - * This is used by the test suite so that it can ensure that certain - * actions happen while in the middle of a removal. - */ -uint64_t zfs_remove_max_bytes_pause = UINT64_MAX; - -#define VDEV_REMOVAL_ZAP_OBJS "lzap" - -static void spa_vdev_remove_thread(void *arg); - -static void -spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) -{ - VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_REMOVING, sizeof (uint64_t), - sizeof (spa->spa_removing_phys) / sizeof (uint64_t), - &spa->spa_removing_phys, tx)); -} - -static nvlist_t * -spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) -{ - for (int i = 0; i < count; i++) { - uint64_t guid = - fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); - - if (guid == target_guid) - return (nvpp[i]); - } - - return (NULL); -} - -static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) -{ - nvlist_t **newdev = NULL; - - if (count > 1) - newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); - - for (int i = 0, j = 0; i < count; i++) { - if (dev[i] == dev_to_remove) - continue; - VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); - } - - VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); - - for (int i = 0; i < count - 1; i++) - nvlist_free(newdev[i]); - - if (count > 1) - kmem_free(newdev, (count - 1) * sizeof (void *)); -} - -static spa_vdev_removal_t * -spa_vdev_removal_create(vdev_t *vd) -{ - spa_vdev_removal_t *svr = 
kmem_zalloc(sizeof (*svr), KM_SLEEP); - mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = range_tree_create(NULL, NULL); - svr->svr_vdev_id = vd->vdev_id; - - for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = range_tree_create(NULL, NULL); - list_create(&svr->svr_new_segments[i], - sizeof (vdev_indirect_mapping_entry_t), - offsetof(vdev_indirect_mapping_entry_t, vime_node)); - } - - return (svr); -} - -void -spa_vdev_removal_destroy(spa_vdev_removal_t *svr) -{ - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(svr->svr_bytes_done[i]); - ASSERT0(svr->svr_max_offset_to_sync[i]); - range_tree_destroy(svr->svr_frees[i]); - list_destroy(&svr->svr_new_segments[i]); - } - - range_tree_destroy(svr->svr_allocd_segs); - mutex_destroy(&svr->svr_lock); - cv_destroy(&svr->svr_cv); - kmem_free(svr, sizeof (*svr)); -} - -/* - * This is called as a synctask in the txg in which we will mark this vdev - * as removing (in the config stored in the MOS). 
- * - * It begins the evacuation of a toplevel vdev by: - * - initializing the spa_removing_phys which tracks this removal - * - computing the amount of space to remove for accounting purposes - * - dirtying all dbufs in the spa_config_object - * - creating the spa_vdev_removal - * - starting the spa_vdev_remove_thread - */ -static void -vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) -{ - int vdev_id = (uintptr_t)arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = vdev_lookup_top(spa, vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; - spa_vdev_removal_t *svr = NULL; - uint64_t txg = dmu_tx_get_txg(tx); - - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); - svr = spa_vdev_removal_create(vd); - - ASSERT(vd->vdev_removing); - ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); - - spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); - if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - /* - * By activating the OBSOLETE_COUNTS feature, we prevent - * the pool from being downgraded and ensure that the - * refcounts are precise. 
- */ - spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - uint64_t one = 1; - VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, - &one, tx)); - ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); - } - - vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); - vd->vdev_indirect_mapping = - vdev_indirect_mapping_open(mos, vic->vic_mapping_object); - vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); - vd->vdev_indirect_births = - vdev_indirect_births_open(mos, vic->vic_births_object); - spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; - spa->spa_removing_phys.sr_start_time = gethrestime_sec(); - spa->spa_removing_phys.sr_end_time = 0; - spa->spa_removing_phys.sr_state = DSS_SCANNING; - spa->spa_removing_phys.sr_to_copy = 0; - spa->spa_removing_phys.sr_copied = 0; - - /* - * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because - * there may be space in the defer tree, which is free, but still - * counted in vs_alloc. - */ - for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { - metaslab_t *ms = vd->vdev_ms[i]; - if (ms->ms_sm == NULL) - continue; - - spa->spa_removing_phys.sr_to_copy += - metaslab_allocated_space(ms); - - /* - * Space which we are freeing this txg does not need to - * be copied. - */ - spa->spa_removing_phys.sr_to_copy -= - range_tree_space(ms->ms_freeing); - - ASSERT0(range_tree_space(ms->ms_freed)); - for (int t = 0; t < TXG_SIZE; t++) - ASSERT0(range_tree_space(ms->ms_allocating[t])); - } - - /* - * Sync tasks are called before metaslab_sync(), so there should - * be no already-synced metaslabs in the TXG_CLEAN list. - */ - ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); - - spa_sync_removing_state(spa, tx); - - /* - * All blocks that we need to read the most recent mapping must be - * stored on concrete vdevs. Therefore, we must dirty anything that - * is read before spa_remove_init(). 
Specifically, the - * spa_config_object. (Note that although we already modified the - * spa_config_object in spa_sync_removing_state, that may not have - * modified all blocks of the object.) - */ - dmu_object_info_t doi; - VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); - for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { - dmu_buf_t *dbuf; - VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, - offset, FTAG, &dbuf, 0)); - dmu_buf_will_dirty(dbuf, tx); - offset += dbuf->db_size; - dmu_buf_rele(dbuf, FTAG); - } - - /* - * Now that we've allocated the im_object, dirty the vdev to ensure - * that the object gets written to the config on disk. - */ - vdev_config_dirty(vd); - - zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " - "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), - vic->vic_mapping_object); - - spa_history_log_internal(spa, "vdev remove started", tx, - "%s vdev %llu %s", spa_name(spa), vd->vdev_id, - (vd->vdev_path != NULL) ? vd->vdev_path : "-"); - /* - * Setting spa_vdev_removal causes subsequent frees to call - * free_from_removing_vdev(). Note that we don't need any locking - * because we are the sync thread, and metaslab_free_impl() is only - * called from syncing context (potentially from a zio taskq thread, - * but in any case only when there are outstanding free i/os, which - * there are not). - */ - ASSERT3P(spa->spa_vdev_removal, ==, NULL); - spa->spa_vdev_removal = svr; - svr->svr_thread = thread_create(NULL, 0, - spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); -} - -/* - * When we are opening a pool, we must read the mapping for each - * indirect vdev in order from most recently removed to least - * recently removed. We do this because the blocks for the mapping - * of older indirect vdevs may be stored on more recently removed vdevs. - * In order to read each indirect mapping object, we must have - * initialized all more recently removed vdevs. 
- */ -int -spa_remove_init(spa_t *spa) -{ - int error; - - error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_REMOVING, sizeof (uint64_t), - sizeof (spa->spa_removing_phys) / sizeof (uint64_t), - &spa->spa_removing_phys); - - if (error == ENOENT) { - spa->spa_removing_phys.sr_state = DSS_NONE; - spa->spa_removing_phys.sr_removing_vdev = -1; - spa->spa_removing_phys.sr_prev_indirect_vdev = -1; - spa->spa_indirect_vdevs_loaded = B_TRUE; - return (0); - } else if (error != 0) { - return (error); - } - - if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { - /* - * We are currently removing a vdev. Create and - * initialize a spa_vdev_removal_t from the bonus - * buffer of the removing vdevs vdev_im_object, and - * initialize its partial mapping. - */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - vdev_t *vd = vdev_lookup_top(spa, - spa->spa_removing_phys.sr_removing_vdev); - - if (vd == NULL) { - spa_config_exit(spa, SCL_STATE, FTAG); - return (EINVAL); - } - - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - ASSERT(vdev_is_concrete(vd)); - spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); - ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); - ASSERT(vd->vdev_removing); - - vd->vdev_indirect_mapping = vdev_indirect_mapping_open( - spa->spa_meta_objset, vic->vic_mapping_object); - vd->vdev_indirect_births = vdev_indirect_births_open( - spa->spa_meta_objset, vic->vic_births_object); - spa_config_exit(spa, SCL_STATE, FTAG); - - spa->spa_vdev_removal = svr; - } - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - uint64_t indirect_vdev_id = - spa->spa_removing_phys.sr_prev_indirect_vdev; - while (indirect_vdev_id != UINT64_MAX) { - vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - vd->vdev_indirect_mapping = vdev_indirect_mapping_open( - spa->spa_meta_objset, vic->vic_mapping_object); - 
vd->vdev_indirect_births = vdev_indirect_births_open( - spa->spa_meta_objset, vic->vic_births_object); - - indirect_vdev_id = vic->vic_prev_indirect_vdev; - } - spa_config_exit(spa, SCL_STATE, FTAG); - - /* - * Now that we've loaded all the indirect mappings, we can allow - * reads from other blocks (e.g. via predictive prefetch). - */ - spa->spa_indirect_vdevs_loaded = B_TRUE; - return (0); -} - -void -spa_restart_removal(spa_t *spa) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - - if (svr == NULL) - return; - - /* - * In general when this function is called there is no - * removal thread running. The only scenario where this - * is not true is during spa_import() where this function - * is called twice [once from spa_import_impl() and - * spa_async_resume()]. Thus, in the scenario where we - * import a pool that has an ongoing removal we don't - * want to spawn a second thread. - */ - if (svr->svr_thread != NULL) - return; - - if (!spa_writeable(spa)) - return; - - zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); - svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, - 0, &p0, TS_RUN, minclsyspri); -} - -/* - * Process freeing from a device which is in the middle of being removed. - * We must handle this carefully so that we attempt to copy freed data, - * and we correctly free already-copied data. - */ -void -free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) -{ - spa_t *spa = vd->vdev_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t txg = spa_syncing_txg(spa); - uint64_t max_offset_yet = 0; - - ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, - vdev_indirect_mapping_object(vim)); - ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); - - mutex_enter(&svr->svr_lock); - - /* - * Remove the segment from the removing vdev's spacemap. 
This - * ensures that we will not attempt to copy this space (if the - * removal thread has not yet visited it), and also ensures - * that we know what is actually allocated on the new vdevs - * (needed if we cancel the removal). - * - * Note: we must do the metaslab_free_concrete() with the svr_lock - * held, so that the remove_thread can not load this metaslab and then - * visit this offset between the time that we metaslab_free_concrete() - * and when we check to see if it has been visited. - * - * Note: The checkpoint flag is set to false as having/taking - * a checkpoint and removing a device can't happen at the same - * time. - */ - ASSERT(!spa_has_checkpoint(spa)); - metaslab_free_concrete(vd, offset, size, B_FALSE); - - uint64_t synced_size = 0; - uint64_t synced_offset = 0; - uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); - if (offset < max_offset_synced) { - /* - * The mapping for this offset is already on disk. - * Free from the new location. - * - * Note that we use svr_max_synced_offset because it is - * updated atomically with respect to the in-core mapping. - * By contrast, vim_max_offset is not. - * - * This block may be split between a synced entry and an - * in-flight or unvisited entry. Only process the synced - * portion of it here. - */ - synced_size = MIN(size, max_offset_synced - offset); - synced_offset = offset; - - ASSERT3U(max_offset_yet, <=, max_offset_synced); - max_offset_yet = max_offset_synced; - - DTRACE_PROBE3(remove__free__synced, - spa_t *, spa, - uint64_t, offset, - uint64_t, synced_size); - - size -= synced_size; - offset += synced_size; - } - - /* - * Look at all in-flight txgs starting from the currently syncing one - * and see if a section of this free is being copied. By starting from - * this txg and iterating forward, we might find that this region - * was copied in two different txgs and handle it appropriately. 
- */ - for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { - int txgoff = (txg + i) & TXG_MASK; - if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { - /* - * The mapping for this offset is in flight, and - * will be synced in txg+i. - */ - uint64_t inflight_size = MIN(size, - svr->svr_max_offset_to_sync[txgoff] - offset); - - DTRACE_PROBE4(remove__free__inflight, - spa_t *, spa, - uint64_t, offset, - uint64_t, inflight_size, - uint64_t, txg + i); - - /* - * We copy data in order of increasing offset. - * Therefore the max_offset_to_sync[] must increase - * (or be zero, indicating that nothing is being - * copied in that txg). - */ - if (svr->svr_max_offset_to_sync[txgoff] != 0) { - ASSERT3U(svr->svr_max_offset_to_sync[txgoff], - >=, max_offset_yet); - max_offset_yet = - svr->svr_max_offset_to_sync[txgoff]; - } - - /* - * We've already committed to copying this segment: - * we have allocated space elsewhere in the pool for - * it and have an IO outstanding to copy the data. We - * cannot free the space before the copy has - * completed, or else the copy IO might overwrite any - * new data. To free that space, we record the - * segment in the appropriate svr_frees tree and free - * the mapped space later, in the txg where we have - * completed the copy and synced the mapping (see - * vdev_mapping_sync). - */ - range_tree_add(svr->svr_frees[txgoff], - offset, inflight_size); - size -= inflight_size; - offset += inflight_size; - - /* - * This space is already accounted for as being - * done, because it is being copied in txg+i. - * However, if i!=0, then it is being copied in - * a future txg. If we crash after this txg - * syncs but before txg+i syncs, then the space - * will be free. Therefore we must account - * for the space being done in *this* txg - * (when it is freed) rather than the future txg - * (when it will be copied). 
- */ - ASSERT3U(svr->svr_bytes_done[txgoff], >=, - inflight_size); - svr->svr_bytes_done[txgoff] -= inflight_size; - svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; - } - } - ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); - - if (size > 0) { - /* - * The copy thread has not yet visited this offset. Ensure - * that it doesn't. - */ - - DTRACE_PROBE3(remove__free__unvisited, - spa_t *, spa, - uint64_t, offset, - uint64_t, size); - - if (svr->svr_allocd_segs != NULL) - range_tree_clear(svr->svr_allocd_segs, offset, size); - - /* - * Since we now do not need to copy this data, for - * accounting purposes we have done our job and can count - * it as completed. - */ - svr->svr_bytes_done[txg & TXG_MASK] += size; - } - mutex_exit(&svr->svr_lock); - - /* - * Now that we have dropped svr_lock, process the synced portion - * of this free. - */ - if (synced_size > 0) { - vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); - - /* - * Note: this can only be called from syncing context, - * and the vdev_indirect_mapping is only changed from the - * sync thread, so we don't need svr_lock while doing - * metaslab_free_impl_cb. - */ - boolean_t checkpoint = B_FALSE; - vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, - metaslab_free_impl_cb, &checkpoint); - } -} - -/* - * Stop an active removal and update the spa_removing phys. - */ -static void -spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); - - /* Ensure the removal thread has completed before we free the svr. 
*/ - spa_vdev_remove_suspend(spa); - - ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); - - if (state == DSS_FINISHED) { - spa_removing_phys_t *srp = &spa->spa_removing_phys; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - if (srp->sr_prev_indirect_vdev != UINT64_MAX) { - vdev_t *pvd = vdev_lookup_top(spa, - srp->sr_prev_indirect_vdev); - ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); - } - - vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; - srp->sr_prev_indirect_vdev = vd->vdev_id; - } - spa->spa_removing_phys.sr_state = state; - spa->spa_removing_phys.sr_end_time = gethrestime_sec(); - - spa->spa_vdev_removal = NULL; - spa_vdev_removal_destroy(svr); - - spa_sync_removing_state(spa, tx); - - vdev_config_dirty(spa->spa_root_vdev); -} - -static void -free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) -{ - vdev_t *vd = arg; - vdev_indirect_mark_obsolete(vd, offset, size); - boolean_t checkpoint = B_FALSE; - vdev_indirect_ops.vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &checkpoint); -} - -/* - * On behalf of the removal thread, syncs an incremental bit more of - * the indirect mapping to disk and updates the in-memory mapping. - * Called as a sync task in every txg that the removal thread makes progress. 
- */ -static void -vdev_mapping_sync(void *arg, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - uint64_t txg = dmu_tx_get_txg(tx); - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT(vic->vic_mapping_object != 0); - ASSERT3U(txg, ==, spa_syncing_txg(spa)); - - vdev_indirect_mapping_add_entries(vim, - &svr->svr_new_segments[txg & TXG_MASK], tx); - vdev_indirect_births_add_entry(vd->vdev_indirect_births, - vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); - - /* - * Free the copied data for anything that was freed while the - * mapping entries were in flight. - */ - mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_frees[txg & TXG_MASK], - free_mapped_segment_cb, vd); - ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, - vdev_indirect_mapping_max_offset(vim)); - svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; - mutex_exit(&svr->svr_lock); - - spa_sync_removing_state(spa, tx); -} - -typedef struct vdev_copy_segment_arg { - spa_t *vcsa_spa; - dva_t *vcsa_dest_dva; - uint64_t vcsa_txg; - range_tree_t *vcsa_obsolete_segs; -} vdev_copy_segment_arg_t; - -static void -unalloc_seg(void *arg, uint64_t start, uint64_t size) -{ - vdev_copy_segment_arg_t *vcsa = arg; - spa_t *spa = vcsa->vcsa_spa; - blkptr_t bp = { 0 }; - - BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); - BP_SET_LSIZE(&bp, size); - BP_SET_PSIZE(&bp, size); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(&bp, DMU_OT_NONE); - BP_SET_LEVEL(&bp, 0); - BP_SET_DEDUP(&bp, 0); - BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); - - DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); - DVA_SET_OFFSET(&bp.blk_dva[0], - DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); - DVA_SET_ASIZE(&bp.blk_dva[0], size); - - zio_free(spa, vcsa->vcsa_txg, &bp); -} - -/* - * All reads 
and writes associated with a call to spa_vdev_copy_segment() - * are done. - */ -static void -spa_vdev_copy_segment_done(zio_t *zio) -{ - vdev_copy_segment_arg_t *vcsa = zio->io_private; - - range_tree_vacate(vcsa->vcsa_obsolete_segs, - unalloc_seg, vcsa); - range_tree_destroy(vcsa->vcsa_obsolete_segs); - kmem_free(vcsa, sizeof (*vcsa)); - - spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); -} - -/* - * The write of the new location is done. - */ -static void -spa_vdev_copy_segment_write_done(zio_t *zio) -{ - vdev_copy_arg_t *vca = zio->io_private; - - abd_free(zio->io_abd); - - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes -= zio->io_size; - cv_signal(&vca->vca_cv); - mutex_exit(&vca->vca_lock); -} - -/* - * The read of the old location is done. The parent zio is the write to - * the new location. Allow it to start. - */ -static void -spa_vdev_copy_segment_read_done(zio_t *zio) -{ - zio_nowait(zio_unique_parent(zio)); -} - -/* - * If the old and new vdevs are mirrors, we will read both sides of the old - * mirror, and write each copy to the corresponding side of the new mirror. - * If the old and new vdevs have a different number of children, we will do - * this as best as possible. Since we aren't verifying checksums, this - * ensures that as long as there's a good copy of the data, we'll have a - * good copy after the removal, even if there's silent damage to one side - * of the mirror. If we're removing a mirror that has some silent damage, - * we'll have exactly the same damage in the new location (assuming that - * the new location is also a mirror). - * - * We accomplish this by creating a tree of zio_t's, with as many writes as - * there are "children" of the new vdev (a non-redundant vdev counts as one - * child, a 2-way mirror has 2 children, etc). Each write has an associated - * read from a child of the old vdev. Typically there will be the same - * number of children of the old and new vdevs. 
However, if there are more - * children of the new vdev, some child(ren) of the old vdev will be issued - * multiple reads. If there are more children of the old vdev, some copies - * will be dropped. - * - * For example, the tree of zio_t's for a 2-way mirror is: - * - * null - * / \ - * write(new vdev, child 0) write(new vdev, child 1) - * | | - * read(old vdev, child 0) read(old vdev, child 1) - * - * Child zio's complete before their parents complete. However, zio's - * created with zio_vdev_child_io() may be issued before their children - * complete. In this case we need to make sure that the children (reads) - * complete before the parents (writes) are *issued*. We do this by not - * calling zio_nowait() on each write until its corresponding read has - * completed. - * - * The spa_config_lock must be held while zio's created by - * zio_vdev_child_io() are in progress, to ensure that the vdev tree does - * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" - * zio is needed to release the spa_config_lock after all the reads and - * writes complete. (Note that we can't grab the config lock for each read, - * because it is not reentrant - we could deadlock with a thread waiting - * for a write lock.) - */ -static void -spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, - vdev_t *source_vd, uint64_t source_offset, - vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) -{ - ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); - - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes += size; - mutex_exit(&vca->vca_lock); - - abd_t *abd = abd_alloc_for_io(size, B_FALSE); - - vdev_t *source_child_vd; - if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { - /* - * Source and dest are both mirrors. Copy from the same - * child id as we are copying to (wrapping around if there - * are more dest children than source children). 
- */ - source_child_vd = - source_vd->vdev_child[dest_id % source_vd->vdev_children]; - } else { - source_child_vd = source_vd; - } - - zio_t *write_zio = zio_vdev_child_io(nzio, NULL, - dest_child_vd, dest_offset, abd, size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, - ZIO_FLAG_CANFAIL, - spa_vdev_copy_segment_write_done, vca); - - zio_nowait(zio_vdev_child_io(write_zio, NULL, - source_child_vd, source_offset, abd, size, - ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, - ZIO_FLAG_CANFAIL, - spa_vdev_copy_segment_read_done, vca)); -} - -/* - * Allocate a new location for this segment, and create the zio_t's to - * read from the old location and write to the new location. - */ -static int -spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, - uint64_t maxalloc, uint64_t txg, - vdev_copy_arg_t *vca, zio_alloc_list_t *zal) -{ - metaslab_group_t *mg = vd->vdev_mg; - spa_t *spa = vd->vdev_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_indirect_mapping_entry_t *entry; - dva_t dst = { 0 }; - uint64_t start = range_tree_min(segs); - - ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); - - uint64_t size = range_tree_span(segs); - if (range_tree_span(segs) > maxalloc) { - /* - * We can't allocate all the segments. Prefer to end - * the allocation at the end of a segment, thus avoiding - * additional split blocks. - */ - range_seg_t search; - avl_index_t where; - search.rs_start = start + maxalloc; - search.rs_end = search.rs_start; - range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); - if (rs == NULL) { - rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); - } else { - rs = AVL_PREV(&segs->rt_root, rs); - } - if (rs != NULL) { - size = rs->rs_end - start; - } else { - /* - * There are no segments that end before maxalloc. - * I.e. the first segment is larger than maxalloc, - * so we must split it. 
- */ - size = maxalloc; - } - } - ASSERT3U(size, <=, maxalloc); - - /* - * An allocation class might not have any remaining vdevs or space - */ - metaslab_class_t *mc = mg->mg_class; - if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) - mc = spa_normal_class(spa); - int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, - zal, 0); - if (error == ENOSPC && mc != spa_normal_class(spa)) { - error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, - &dst, 0, NULL, txg, 0, zal, 0); - } - if (error != 0) - return (error); - - /* - * Determine the ranges that are not actually needed. Offsets are - * relative to the start of the range to be copied (i.e. relative to the - * local variable "start"). - */ - range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); - - range_seg_t *rs = avl_first(&segs->rt_root); - ASSERT3U(rs->rs_start, ==, start); - uint64_t prev_seg_end = rs->rs_end; - while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { - if (rs->rs_start >= start + size) { - break; - } else { - range_tree_add(obsolete_segs, - prev_seg_end - start, - rs->rs_start - prev_seg_end); - } - prev_seg_end = rs->rs_end; - } - /* We don't end in the middle of an obsolete range */ - ASSERT3U(start + size, <=, prev_seg_end); - - range_tree_clear(segs, start, size); - - /* - * We can't have any padding of the allocated size, otherwise we will - * misunderstand what's allocated, and the size of the mapping. - * The caller ensures this will be true by passing in a size that is - * aligned to the worst (highest) ashift in the pool. 
- */ - ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); - - entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); - DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); - entry->vime_mapping.vimep_dst = dst; - if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - entry->vime_obsolete_count = range_tree_space(obsolete_segs); - } - - vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); - vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; - vcsa->vcsa_obsolete_segs = obsolete_segs; - vcsa->vcsa_spa = spa; - vcsa->vcsa_txg = txg; - - /* - * See comment before spa_vdev_copy_one_child(). - */ - spa_config_enter(spa, SCL_STATE, spa, RW_READER); - zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, - spa_vdev_copy_segment_done, vcsa, 0); - vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); - if (dest_vd->vdev_ops == &vdev_mirror_ops) { - for (int i = 0; i < dest_vd->vdev_children; i++) { - vdev_t *child = dest_vd->vdev_child[i]; - spa_vdev_copy_one_child(vca, nzio, vd, start, - child, DVA_GET_OFFSET(&dst), i, size); - } - } else { - spa_vdev_copy_one_child(vca, nzio, vd, start, - dest_vd, DVA_GET_OFFSET(&dst), -1, size); - } - zio_nowait(nzio); - - list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); - ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); - vdev_dirty(vd, 0, NULL, txg); - - return (0); -} - -/* - * Complete the removal of a toplevel vdev. This is called as a - * synctask in the same txg that we will sync out the new config (to the - * MOS object) which indicates that this vdev is indirect. 
- */ -static void -vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(svr->svr_bytes_done[i]); - } - - ASSERT3U(spa->spa_removing_phys.sr_copied, ==, - spa->spa_removing_phys.sr_to_copy); - - vdev_destroy_spacemaps(vd, tx); - - /* destroy leaf zaps, if any */ - ASSERT3P(svr->svr_zaplist, !=, NULL); - for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); - pair != NULL; - pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { - vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); - } - fnvlist_free(svr->svr_zaplist); - - spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); - /* vd->vdev_path is not available here */ - spa_history_log_internal(spa, "vdev remove completed", tx, - "%s vdev %llu", spa_name(spa), vd->vdev_id); -} - -static void -vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) -{ - ASSERT3P(zlist, !=, NULL); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); - - if (vd->vdev_leaf_zap != 0) { - char zkey[32]; - (void) snprintf(zkey, sizeof (zkey), "%s-%ju", - VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap); - fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); - } - - for (uint64_t id = 0; id < vd->vdev_children; id++) { - vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); - } -} - -static void -vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) -{ - vdev_t *ivd; - dmu_tx_t *tx; - spa_t *spa = vd->vdev_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - - /* - * First, build a list of leaf zaps to be destroyed. - * This is passed to the sync context thread, - * which does the actual unlinking. 
- */ - svr->svr_zaplist = fnvlist_alloc(); - vdev_remove_enlist_zaps(vd, svr->svr_zaplist); - - ivd = vdev_add_parent(vd, &vdev_indirect_ops); - ivd->vdev_removing = 0; - - vd->vdev_leaf_zap = 0; - - vdev_remove_child(ivd, vd); - vdev_compact_children(ivd); - - ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, - 0, ZFS_SPACE_CHECK_NONE, tx); - dmu_tx_commit(tx); - - /* - * Indicate that this thread has exited. - * After this, we can not use svr. - */ - mutex_enter(&svr->svr_lock); - svr->svr_thread = NULL; - cv_broadcast(&svr->svr_cv); - mutex_exit(&svr->svr_lock); -} - -/* - * Complete the removal of a toplevel vdev. This is called in open - * context by the removal thread after we have copied all vdev's data. - */ -static void -vdev_remove_complete(spa_t *spa) -{ - uint64_t txg; - - /* - * Wait for any deferred frees to be synced before we call - * vdev_metaslab_fini() - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - txg = spa_vdev_enter(spa); - vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - - sysevent_t *ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_DEV); - - zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", - vd->vdev_id, txg); - - /* - * Discard allocation state. - */ - if (vd->vdev_mg != NULL) { - vdev_metaslab_fini(vd); - metaslab_group_destroy(vd->vdev_mg); - vd->vdev_mg = NULL; - } - ASSERT0(vd->vdev_stat.vs_space); - ASSERT0(vd->vdev_stat.vs_dspace); - - vdev_remove_replace_with_indirect(vd, txg); - - /* - * We now release the locks, allowing spa_sync to run and finish the - * removal via vdev_remove_complete_sync in syncing context. - * - * Note that we hold on to the vdev_t that has been replaced. 
Since - * it isn't part of the vdev tree any longer, it can't be concurrently - * manipulated, even while we don't have the config lock. - */ - (void) spa_vdev_exit(spa, NULL, txg, 0); - - /* - * Top ZAP should have been transferred to the indirect vdev in - * vdev_remove_replace_with_indirect. - */ - ASSERT0(vd->vdev_top_zap); - - /* - * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. - */ - ASSERT0(vd->vdev_leaf_zap); - - txg = spa_vdev_enter(spa); - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - /* - * Request to update the config and the config cachefile. - */ - vdev_config_dirty(spa->spa_root_vdev); - (void) spa_vdev_exit(spa, vd, txg, 0); - - spa_event_post(ev); -} - -/* - * Evacuates a segment of size at most max_alloc from the vdev - * via repeated calls to spa_vdev_copy_segment. If an allocation - * fails, the pool is probably too fragmented to handle such a - * large size, so decrease max_alloc so that the caller will not try - * this size again this txg. - */ -static void -spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, - uint64_t *max_alloc, dmu_tx_t *tx) -{ - uint64_t txg = dmu_tx_get_txg(tx); - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - mutex_enter(&svr->svr_lock); - - /* - * Determine how big of a chunk to copy. We can allocate up - * to max_alloc bytes, and we can span up to vdev_removal_max_span - * bytes of unallocated space at a time. "segs" will track the - * allocated segments that we are copying. We may also be copying - * free segments (of up to vdev_removal_max_span bytes). 
- */ - range_tree_t *segs = range_tree_create(NULL, NULL); - for (;;) { - range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); - if (rs == NULL) - break; - - uint64_t seg_length; - - if (range_tree_is_empty(segs)) { - /* need to truncate the first seg based on max_alloc */ - seg_length = - MIN(rs->rs_end - rs->rs_start, *max_alloc); - } else { - if (rs->rs_start - range_tree_max(segs) > - vdev_removal_max_span) { - /* - * Including this segment would cause us to - * copy a larger unneeded chunk than is allowed. - */ - break; - } else if (rs->rs_end - range_tree_min(segs) > - *max_alloc) { - /* - * This additional segment would extend past - * max_alloc. Rather than splitting this - * segment, leave it for the next mapping. - */ - break; - } else { - seg_length = rs->rs_end - rs->rs_start; - } - } - - range_tree_add(segs, rs->rs_start, seg_length); - range_tree_remove(svr->svr_allocd_segs, - rs->rs_start, seg_length); - } - - if (range_tree_is_empty(segs)) { - mutex_exit(&svr->svr_lock); - range_tree_destroy(segs); - return; - } - - if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { - dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, - svr, 0, ZFS_SPACE_CHECK_NONE, tx); - } - - svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); - - /* - * Note: this is the amount of *allocated* space - * that we are taking care of each txg. - */ - svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); - - mutex_exit(&svr->svr_lock); - - zio_alloc_list_t zal; - metaslab_trace_init(&zal); - uint64_t thismax = SPA_MAXBLOCKSIZE; - while (!range_tree_is_empty(segs)) { - int error = spa_vdev_copy_segment(vd, - segs, thismax, txg, vca, &zal); - - if (error == ENOSPC) { - /* - * Cut our segment in half, and don't try this - * segment size again this txg. 
Note that the - * allocation size must be aligned to the highest - * ashift in the pool, so that the allocation will - * not be padded out to a multiple of the ashift, - * which could cause us to think that this mapping - * is larger than we intended. - */ - ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); - uint64_t attempted = - MIN(range_tree_span(segs), thismax); - thismax = P2ROUNDUP(attempted / 2, - 1 << spa->spa_max_ashift); - /* - * The minimum-size allocation can not fail. - */ - ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); - *max_alloc = attempted - (1 << spa->spa_max_ashift); - } else { - ASSERT0(error); - - /* - * We've performed an allocation, so reset the - * alloc trace list. - */ - metaslab_trace_fini(&zal); - metaslab_trace_init(&zal); - } - } - metaslab_trace_fini(&zal); - range_tree_destroy(segs); -} - -/* - * The removal thread operates in open context. It iterates over all - * allocated space in the vdev, by loading each metaslab's spacemap. - * For each contiguous segment of allocated space (capping the segment - * size at SPA_MAXBLOCKSIZE), we: - * - Allocate space for it on another vdev. - * - Create a new mapping from the old location to the new location - * (as a record in svr_new_segments). - * - Initiate a logical read zio to get the data off the removing disk. - * - In the read zio's done callback, initiate a logical write zio to - * write it to the new vdev. - * Note that all of this will take effect when a particular TXG syncs. - * The sync thread ensures that all the phys reads and writes for the syncing - * TXG have completed (see spa_txg_zio) and writes the new mappings to disk - * (see vdev_mapping_sync()). 
- */ -static void -spa_vdev_remove_thread(void *arg) -{ - spa_t *spa = arg; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_copy_arg_t vca; - uint64_t max_alloc = zfs_remove_max_segment; - uint64_t last_txg = 0; - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); - - ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); - ASSERT(vdev_is_concrete(vd)); - ASSERT(vd->vdev_removing); - ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT(vim != NULL); - - mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); - vca.vca_outstanding_bytes = 0; - - mutex_enter(&svr->svr_lock); - - /* - * Start from vim_max_offset so we pick up where we left off - * if we are restarting the removal after opening the pool. - */ - uint64_t msi; - for (msi = start_offset >> vd->vdev_ms_shift; - msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { - metaslab_t *msp = vd->vdev_ms[msi]; - ASSERT3U(msi, <=, vd->vdev_ms_count); - - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - - mutex_enter(&msp->ms_sync_lock); - mutex_enter(&msp->ms_lock); - - /* - * Assert nothing in flight -- ms_*tree is empty. - */ - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(range_tree_space(msp->ms_allocating[i])); - } - - /* - * If the metaslab has ever been allocated from (ms_sm!=NULL), - * read the allocated segments from the space map object - * into svr_allocd_segs. Since we do this while holding - * svr_lock and ms_sync_lock, concurrent frees (which - * would have modified the space map) will wait for us - * to finish loading the spacemap, and then take the - * appropriate action (see free_from_removing_vdev()). 
- */ - if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); - - range_tree_walk(msp->ms_freeing, - range_tree_remove, svr->svr_allocd_segs); - - /* - * When we are resuming from a paused removal (i.e. - * when importing a pool with a removal in progress), - * discard any state that we have already processed. - */ - range_tree_clear(svr->svr_allocd_segs, 0, start_offset); - } - mutex_exit(&msp->ms_lock); - mutex_exit(&msp->ms_sync_lock); - - vca.vca_msp = msp; - zfs_dbgmsg("copying %llu segments for metaslab %llu", - avl_numnodes(&svr->svr_allocd_segs->rt_root), - msp->ms_id); - - while (!svr->svr_thread_exit && - !range_tree_is_empty(svr->svr_allocd_segs)) { - - mutex_exit(&svr->svr_lock); - - /* - * We need to periodically drop the config lock so that - * writers can get in. Additionally, we can't wait - * for a txg to sync while holding a config lock - * (since a waiting writer could cause a 3-way deadlock - * with the sync thread, which also gets a config - * lock for reader). So we can't hold the config lock - * while calling dmu_tx_assign(). - */ - spa_config_exit(spa, SCL_CONFIG, FTAG); - - /* - * This delay will pause the removal around the point - * specified by zfs_remove_max_bytes_pause. We do this - * solely from the test suite or during debugging. - */ - uint64_t bytes_copied = - spa->spa_removing_phys.sr_copied; - for (int i = 0; i < TXG_SIZE; i++) - bytes_copied += svr->svr_bytes_done[i]; - while (zfs_remove_max_bytes_pause <= bytes_copied && - !svr->svr_thread_exit) - delay(hz); - - mutex_enter(&vca.vca_lock); - while (vca.vca_outstanding_bytes > - zfs_remove_max_copy_bytes) { - cv_wait(&vca.vca_cv, &vca.vca_lock); - } - mutex_exit(&vca.vca_lock); - - dmu_tx_t *tx = - dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - uint64_t txg = dmu_tx_get_txg(tx); - - /* - * Reacquire the vdev_config lock. The vdev_t - * that we're removing may have changed, e.g. 
due - * to a vdev_attach or vdev_detach. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vd = vdev_lookup_top(spa, svr->svr_vdev_id); - - if (txg != last_txg) - max_alloc = zfs_remove_max_segment; - last_txg = txg; - - spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); - - dmu_tx_commit(tx); - mutex_enter(&svr->svr_lock); - } - } - - mutex_exit(&svr->svr_lock); - - spa_config_exit(spa, SCL_CONFIG, FTAG); - - /* - * Wait for all copies to finish before cleaning up the vca. - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - ASSERT0(vca.vca_outstanding_bytes); - - mutex_destroy(&vca.vca_lock); - cv_destroy(&vca.vca_cv); - - if (svr->svr_thread_exit) { - mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); - svr->svr_thread = NULL; - cv_broadcast(&svr->svr_cv); - mutex_exit(&svr->svr_lock); - } else { - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - vdev_remove_complete(spa); - } - thread_exit(); -} - -void -spa_vdev_remove_suspend(spa_t *spa) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - - if (svr == NULL) - return; - - mutex_enter(&svr->svr_lock); - svr->svr_thread_exit = B_TRUE; - while (svr->svr_thread != NULL) - cv_wait(&svr->svr_cv, &svr->svr_lock); - svr->svr_thread_exit = B_FALSE; - mutex_exit(&svr->svr_lock); -} - -/* ARGSUSED */ -static int -spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (spa->spa_vdev_removal == NULL) - return (ESRCH); - return (0); -} - -/* - * Cancel a removal by freeing all entries from the partial mapping - * and marking the vdev as no longer being removing. 
- */ -/* ARGSUSED */ -static void -spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - objset_t *mos = spa->spa_meta_objset; - - ASSERT3P(svr->svr_thread, ==, NULL); - - spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); - if (vdev_obsolete_counts_are_precise(vd)) { - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); - } - - if (vdev_obsolete_sm_object(vd) != 0) { - ASSERT(vd->vdev_obsolete_sm != NULL); - ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - - space_map_free(vd->vdev_obsolete_sm, tx); - VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); - space_map_close(vd->vdev_obsolete_sm); - vd->vdev_obsolete_sm = NULL; - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - } - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT(list_is_empty(&svr->svr_new_segments[i])); - ASSERT3U(svr->svr_max_offset_to_sync[i], <=, - vdev_indirect_mapping_max_offset(vim)); - } - - for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { - metaslab_t *msp = vd->vdev_ms[msi]; - - if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) - break; - - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - - mutex_enter(&msp->ms_lock); - - /* - * Assert nothing in flight -- ms_*tree is empty. 
- */ - for (int i = 0; i < TXG_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_allocating[i])); - for (int i = 0; i < TXG_DEFER_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_defer[i])); - ASSERT0(range_tree_space(msp->ms_freed)); - - if (msp->ms_sm != NULL) { - mutex_enter(&svr->svr_lock); - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); - range_tree_walk(msp->ms_freeing, - range_tree_remove, svr->svr_allocd_segs); - - /* - * Clear everything past what has been synced, - * because we have not allocated mappings for it yet. - */ - uint64_t syncd = vdev_indirect_mapping_max_offset(vim); - uint64_t sm_end = msp->ms_sm->sm_start + - msp->ms_sm->sm_size; - if (sm_end > syncd) - range_tree_clear(svr->svr_allocd_segs, - syncd, sm_end - syncd); - - mutex_exit(&svr->svr_lock); - } - mutex_exit(&msp->ms_lock); - - mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_allocd_segs, - free_mapped_segment_cb, vd); - mutex_exit(&svr->svr_lock); - } - - /* - * Note: this must happen after we invoke free_mapped_segment_cb, - * because it adds to the obsolete_segments. - */ - range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); - - ASSERT3U(vic->vic_mapping_object, ==, - vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); - vdev_indirect_mapping_close(vd->vdev_indirect_mapping); - vd->vdev_indirect_mapping = NULL; - vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); - vic->vic_mapping_object = 0; - - ASSERT3U(vic->vic_births_object, ==, - vdev_indirect_births_object(vd->vdev_indirect_births)); - vdev_indirect_births_close(vd->vdev_indirect_births); - vd->vdev_indirect_births = NULL; - vdev_indirect_births_free(mos, vic->vic_births_object, tx); - vic->vic_births_object = 0; - - /* - * We may have processed some frees from the removing vdev in this - * txg, thus increasing svr_bytes_done; discard that here to - * satisfy the assertions in spa_vdev_removal_destroy(). 
- * Note that future txg's can not have any bytes_done, because - * future TXG's are only modified from open context, and we have - * already shut down the copying thread. - */ - svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; - spa_finish_removal(spa, DSS_CANCELED, tx); - - vd->vdev_removing = B_FALSE; - vdev_config_dirty(vd); - - zfs_dbgmsg("canceled device removal for vdev %llu in %llu", - vd->vdev_id, dmu_tx_get_txg(tx)); - spa_history_log_internal(spa, "vdev remove canceled", tx, - "%s vdev %llu %s", spa_name(spa), - vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); -} - -int -spa_vdev_remove_cancel(spa_t *spa) -{ - spa_vdev_remove_suspend(spa); - - if (spa->spa_vdev_removal == NULL) - return (ESRCH); - - uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; - - int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, - spa_vdev_remove_cancel_sync, NULL, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED); - - if (error == 0) { - spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); - vdev_t *vd = vdev_lookup_top(spa, vdid); - metaslab_group_activate(vd->vdev_mg); - spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); - } - - return (error); -} - -void -svr_sync(spa_t *spa, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - - /* - * This check is necessary so that we do not dirty the - * DIRECTORY_OBJECT via spa_sync_removing_state() when there - * is nothing to do. Dirtying it every time would prevent us - * from syncing-to-convergence. - */ - if (svr->svr_bytes_done[txgoff] == 0) - return; - - /* - * Update progress accounting. 
- */ - spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; - svr->svr_bytes_done[txgoff] = 0; - - spa_sync_removing_state(spa, tx); -} - -static void -vdev_remove_make_hole_and_free(vdev_t *vd) -{ - uint64_t id = vd->vdev_id; - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - vdev_free(vd); - - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } - vdev_config_dirty(rvd); - - /* - * Reassess the health of our root vdev. - */ - vdev_reopen(rvd); -} - -/* - * Remove a log device. The config lock is held for the specified TXG. - */ -static int -spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) -{ - metaslab_group_t *mg = vd->vdev_mg; - spa_t *spa = vd->vdev_spa; - int error = 0; - - ASSERT(vd->vdev_islog); - ASSERT(vd == vd->vdev_top); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - /* - * Stop allocating from this vdev. - */ - metaslab_group_passivate(mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - - /* - * Evacuate the device. We don't hold the config lock as - * writer since we need to do I/O but we do keep the - * spa_namespace_lock held. Once this completes the device - * should no longer have any blocks allocated on it. - */ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (vd->vdev_stat.vs_alloc != 0) - error = spa_reset_logs(spa); - - *txg = spa_vdev_config_enter(spa); - - if (error != 0) { - metaslab_group_activate(mg); - return (error); - } - ASSERT0(vd->vdev_stat.vs_alloc); - - /* - * The evacuation succeeded. 
Remove any remaining MOS metadata - * associated with this vdev, and wait for these changes to sync. - */ - vd->vdev_removing = B_TRUE; - - vdev_dirty_leaves(vd, VDD_DTL, *txg); - vdev_config_dirty(vd); - - vdev_metaslab_fini(vd); - - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, - (vd->vdev_path != NULL) ? vd->vdev_path : "-"); - - /* Make sure these changes are sync'ed */ - spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); - - /* Stop initializing */ - (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); - - *txg = spa_vdev_config_enter(spa); - - sysevent_t *ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_DEV); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - /* The top ZAP should have been destroyed by vdev_remove_empty. */ - ASSERT0(vd->vdev_top_zap); - /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ - ASSERT0(vd->vdev_leaf_zap); - - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - if (list_link_active(&vd->vdev_state_dirty_node)) - vdev_state_clean(vd); - if (list_link_active(&vd->vdev_config_dirty_node)) - vdev_config_clean(vd); - - ASSERT0(vd->vdev_stat.vs_alloc); - - /* - * Clean up the vdev namespace. - */ - vdev_remove_make_hole_and_free(vd); - - if (ev != NULL) - spa_event_post(ev); - - return (0); -} - -static int -spa_vdev_remove_top_check(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd != vd->vdev_top) - return (SET_ERROR(ENOTSUP)); - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) - return (SET_ERROR(ENOTSUP)); - - /* available space in the pool's normal class */ - uint64_t available = dsl_dir_space_available( - spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); - - metaslab_class_t *mc = vd->vdev_mg->mg_class; - - /* - * When removing a vdev from an allocation class that has - * remaining vdevs, include available space from the class. 
- */ - if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { - uint64_t class_avail = metaslab_class_get_space(mc) - - metaslab_class_get_alloc(mc); - - /* add class space, adjusted for overhead */ - available += (class_avail * 94) / 100; - } - - /* - * There has to be enough free space to remove the - * device and leave double the "slop" space (i.e. we - * must leave at least 3% of the pool free, in addition to - * the normal slop space). - */ - if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { - return (SET_ERROR(ENOSPC)); - } - - /* - * There can not be a removal in progress. - */ - if (spa->spa_removing_phys.sr_state == DSS_SCANNING) - return (SET_ERROR(EBUSY)); - - /* - * The device must have all its data. - */ - if (!vdev_dtl_empty(vd, DTL_MISSING) || - !vdev_dtl_empty(vd, DTL_OUTAGE)) - return (SET_ERROR(EBUSY)); - - /* - * The device must be healthy. - */ - if (!vdev_readable(vd)) - return (SET_ERROR(EIO)); - - /* - * All vdevs in normal class must have the same ashift. - */ - if (spa->spa_max_ashift != spa->spa_min_ashift) { - return (SET_ERROR(EINVAL)); - } - - /* - * All vdevs in normal class must have the same ashift - * and not be raidz. 
- */ - vdev_t *rvd = spa->spa_root_vdev; - int num_indirect = 0; - for (uint64_t id = 0; id < rvd->vdev_children; id++) { - vdev_t *cvd = rvd->vdev_child[id]; - if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) - ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); - if (cvd->vdev_ops == &vdev_indirect_ops) - num_indirect++; - if (!vdev_is_concrete(cvd)) - continue; - if (cvd->vdev_ops == &vdev_raidz_ops) - return (SET_ERROR(EINVAL)); - /* - * Need the mirror to be mirror of leaf vdevs only - */ - if (cvd->vdev_ops == &vdev_mirror_ops) { - for (uint64_t cid = 0; - cid < cvd->vdev_children; cid++) { - vdev_t *tmp = cvd->vdev_child[cid]; - if (!tmp->vdev_ops->vdev_op_leaf) - return (SET_ERROR(EINVAL)); - } - } - } - - return (0); -} - -/* - * Initiate removal of a top-level vdev, reducing the total space in the pool. - * The config lock is held for the specified TXG. Once initiated, - * evacuation of all allocated space (copying it to other vdevs) happens - * in the background (see spa_vdev_remove_thread()), and can be canceled - * (see spa_vdev_remove_cancel()). If successful, the vdev will - * be transformed to an indirect vdev (see spa_vdev_remove_complete()). - */ -static int -spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) -{ - spa_t *spa = vd->vdev_spa; - int error; - - /* - * Check for errors up-front, so that we don't waste time - * passivating the metaslab group and clearing the ZIL if there - * are errors. - */ - error = spa_vdev_remove_top_check(vd); - if (error != 0) - return (error); - - /* - * Stop allocating from this vdev. Note that we must check - * that this is not the only device in the pool before - * passivating, otherwise we will not be able to make - * progress because we can't allocate from any vdevs. - * The above check for sufficient free space serves this - * purpose. 
- */ - metaslab_group_t *mg = vd->vdev_mg; - metaslab_group_passivate(mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - - /* - * We must ensure that no "stubby" log blocks are allocated - * on the device to be removed. These blocks could be - * written at any time, including while we are in the middle - * of copying them. - */ - error = spa_reset_logs(spa); - - /* - * We stop any initializing that is currently in progress but leave - * the state as "active". This will allow the initializing to resume - * if the removal is canceled sometime later. - */ - vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); - - *txg = spa_vdev_config_enter(spa); - - /* - * Things might have changed while the config lock was dropped - * (e.g. space usage). Check for errors again. - */ - if (error == 0) - error = spa_vdev_remove_top_check(vd); - - if (error != 0) { - metaslab_group_activate(mg); - spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); - return (error); - } - - vd->vdev_removing = B_TRUE; - - vdev_dirty_leaves(vd, VDD_DTL, *txg); - vdev_config_dirty(vd); - dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); - dsl_sync_task_nowait(spa->spa_dsl_pool, - vdev_remove_initiate_sync, - (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); - dmu_tx_commit(tx); - - return (0); -} - -/* - * Remove a device from the pool. - * - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. 
- */ -int -spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) -{ - vdev_t *vd; - nvlist_t **spares, **l2cache, *nv; - uint64_t txg = 0; - uint_t nspares, nl2cache; - int error = 0; - boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - sysevent_t *ev = NULL; - - ASSERT(spa_writeable(spa)); - - if (!locked) - txg = spa_vdev_enter(spa); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - - if (!locked) - return (spa_vdev_exit(spa, NULL, txg, error)); - - return (error); - } - - vd = spa_lookup_by_guid(spa, guid, B_FALSE); - - if (spa->spa_spares.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && - (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { - /* - * Only remove the hot spare if it's not currently in use - * in this pool. - */ - if (vd == NULL || unspare) { - char *nvstr = fnvlist_lookup_string(nv, - ZPOOL_CONFIG_PATH); - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), - VDEV_TYPE_SPARE, nvstr); - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - } else { - error = SET_ERROR(EBUSY); - } - } else if (spa->spa_l2cache.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && - (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { - char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); - /* - * Cache devices can always be 
removed. - */ - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); - spa_load_l2cache(spa); - spa->spa_l2cache.sav_sync = B_TRUE; - } else if (vd != NULL && vd->vdev_islog) { - ASSERT(!locked); - error = spa_vdev_remove_log(vd, &txg); - } else if (vd != NULL) { - ASSERT(!locked); - error = spa_vdev_remove_top(vd, &txg); - } else { - /* - * There is no vdev of any kind with the specified guid. - */ - error = SET_ERROR(ENOENT); - } - - if (!locked) - error = spa_vdev_exit(spa, NULL, txg, error); - - if (ev != NULL) { - if (error != 0) { - spa_event_discard(ev); - } else { - spa_event_post(ev); - } - } - - return (error); -} - -int -spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) -{ - prs->prs_state = spa->spa_removing_phys.sr_state; - - if (prs->prs_state == DSS_NONE) - return (SET_ERROR(ENOENT)); - - prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; - prs->prs_start_time = spa->spa_removing_phys.sr_start_time; - prs->prs_end_time = spa->spa_removing_phys.sr_end_time; - prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; - prs->prs_copied = spa->spa_removing_phys.sr_copied; - - if (spa->spa_vdev_removal != NULL) { - for (int i = 0; i < TXG_SIZE; i++) { - prs->prs_copied += - spa->spa_vdev_removal->svr_bytes_done[i]; - } - } - - prs->prs_mapping_memory = 0; - uint64_t indirect_vdev_id = - spa->spa_removing_phys.sr_prev_indirect_vdev; - while (indirect_vdev_id != -1) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); - indirect_vdev_id = vic->vic_prev_indirect_vdev; - } - - return (0); -} diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c deleted file mode 100644 index a03d18704dfc..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include - -/* - * Virtual device vector for the pool's root vdev. - */ - -static uint64_t -vdev_root_core_tvds(vdev_t *vd) -{ - uint64_t tvds = 0; - - for (uint64_t c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - if (!cvd->vdev_ishole && !cvd->vdev_islog && - cvd->vdev_ops != &vdev_indirect_ops) { - tvds++; - } - } - - return (tvds); -} - -/* - * We should be able to tolerate one failure with absolutely no damage - * to our metadata. Two failures will take out space maps, a bunch of - * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy - * place to live. 
When we get smarter, we can liberalize this policy. - * e.g. If we haven't lost two consecutive top-level vdevs, then we are - * probably fine. Adding bean counters during alloc/free can make this - * future guesswork more accurate. - */ -static boolean_t -too_many_errors(vdev_t *vd, uint64_t numerrors) -{ - uint64_t tvds; - - if (numerrors == 0) - return (B_FALSE); - - tvds = vdev_root_core_tvds(vd); - ASSERT3U(numerrors, <=, tvds); - - if (numerrors == tvds) - return (B_TRUE); - - return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa)); -} - -static int -vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - spa_t *spa = vd->vdev_spa; - int lasterror = 0; - int numerrors = 0; - - if (vd->vdev_children == 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - vdev_open_children(vd); - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - if (cvd->vdev_open_error && !cvd->vdev_islog) { - lasterror = cvd->vdev_open_error; - numerrors++; - } - } - - if (spa_load_state(spa) != SPA_LOAD_NONE) - spa_set_missing_tvds(spa, numerrors); - - if (too_many_errors(vd, numerrors)) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - *asize = 0; - *max_asize = 0; - *logical_ashift = 0; - *physical_ashift = 0; - - return (0); -} - -static void -vdev_root_close(vdev_t *vd) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -static void -vdev_root_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (too_many_errors(vd, faulted)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - } else if (degraded || faulted) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - } else { - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); - } -} - -vdev_ops_t vdev_root_ops = { - vdev_root_open, - vdev_root_close, - 
vdev_default_asize, - NULL, /* io_start - not applicable to the root */ - NULL, /* io_done - not applicable to the root */ - vdev_root_state_change, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_ROOT, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c deleted file mode 100644 index a0cadaae949d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ /dev/null @@ -1,1378 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -/* - * This file contains the top half of the zfs directory structure - * implementation. The bottom half is in zap_leaf.c. - * - * The zdir is an extendable hash data structure. There is a table of - * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are - * each a constant size and hold a variable number of directory entries. 
- * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. - * - * The pointer table holds a power of 2 number of pointers. - * (1<zd_data->zd_phys->zd_prefix_len). The bucket pointed to - * by the pointer at index i in the table holds entries whose hash value - * has a zd_prefix_len - bit prefix - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object - * (all leaf blocks) when we start iterating over it. - * - * For zap_cursor_init(), the callers all intend to iterate through all the - * entries. There are a few cases where an error (typically i/o error) could - * cause it to bail out early. - * - * For zap_cursor_init_serialized(), there are callers that do the iteration - * outside of ZFS. Typically they would iterate over everything, but we - * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), - * zcp_snapshots_iter(), and other iterators over things in the MOS - these - * are called by /sbin/zfs and channel programs. The other example is - * zfs_readdir() which iterates over directory entries for the getdents() - * syscall. /sbin/ls iterates to the end (unless it receives a signal), but - * userland doesn't have to. - * - * Given that the ZAP entries aren't returned in a specific order, the only - * legitimate use cases for partial iteration would be: - * - * 1. Pagination: e.g. you only want to display 100 entries at a time, so you - * get the first 100 and then wait for the user to hit "next page", which - * they may never do). - * - * 2. You want to know if there are more than X entries, without relying on - * the zfs-specific implementation of the directory's st_size (which is - * the number of entries). 
- */ -boolean_t zap_iterate_prefetch = B_TRUE; - -int fzap_default_block_shift = 14; /* 16k blocksize */ - -extern inline zap_phys_t *zap_f_phys(zap_t *zap); - -static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); - -void -fzap_byteswap(void *vbuf, size_t size) -{ - uint64_t block_type = *(uint64_t *)vbuf; - - if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) - zap_leaf_byteswap(vbuf, size); - else { - /* it's a ptrtbl block */ - byteswap_uint64_array(vbuf, size); - } -} - -void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - zap->zap_ismicro = FALSE; - - zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; - zap->zap_dbu.dbu_evict_func_async = NULL; - - mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); - zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - - zap_phys_t *zp = zap_f_phys(zap); - /* - * explicitly zero it since it might be coming from an - * initialized microzap - */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); - zp->zap_block_type = ZBT_HEADER; - zp->zap_magic = ZAP_MAGIC; - - zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); - - zp->zap_freeblk = 2; /* block 1 will be the first leaf */ - zp->zap_num_leafs = 1; - zp->zap_num_entries = 0; - zp->zap_salt = zap->zap_salt; - zp->zap_normflags = zap->zap_normflags; - zp->zap_flags = flags; - - /* block 1 will be the first leaf */ - for (int i = 0; i < (1<zap_ptrtbl.zt_shift); i++) - ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; - - /* - * set up block 1 - the first leaf - */ - dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - 1<l_dbuf = db; - - zap_leaf_init(l, zp->zap_normflags != 0); - - kmem_free(l, sizeof (zap_leaf_t)); - dmu_buf_rele(db, FTAG); -} - -static int -zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) -{ - if (RW_WRITE_HELD(&zap->zap_rwlock)) - return (1); - if (rw_tryupgrade(&zap->zap_rwlock)) { - dmu_buf_will_dirty(zap->zap_dbuf, tx); - return (1); - } - 
return (0); -} - -/* - * Generic routines for dealing with the pointer & cookie tables. - */ - -static int -zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, - void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), - dmu_tx_t *tx) -{ - uint64_t newblk; - int bs = FZAP_BLOCK_SHIFT(zap); - int hepb = 1<<(bs-4); - /* hepb = half the number of entries in a block */ - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - ASSERT(tbl->zt_numblks > 0); - - if (tbl->zt_nextblk != 0) { - newblk = tbl->zt_nextblk; - } else { - newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); - tbl->zt_nextblk = newblk; - ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, - tbl->zt_blk << bs, tbl->zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - } - - /* - * Copy the ptrtbl from the old to new location. - */ - - uint64_t b = tbl->zt_blks_copied; - dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - - /* first half of entries in old[b] go to new[2*b+0] */ - dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - dmu_buf_rele(db_old, FTAG); - - tbl->zt_blks_copied++; - - dprintf("copied block %llu of %llu\n", - tbl->zt_blks_copied, tbl->zt_numblks); - - if (tbl->zt_blks_copied == tbl->zt_numblks) { - (void) dmu_free_range(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, 
tbl->zt_numblks << bs, tx); - - tbl->zt_blk = newblk; - tbl->zt_numblks *= 2; - tbl->zt_shift++; - tbl->zt_nextblk = 0; - tbl->zt_blks_copied = 0; - - dprintf("finished; numblocks now %llu (%lluk entries)\n", - tbl->zt_numblks, 1<<(tbl->zt_shift-10)); - } - - return (0); -} - -static int -zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, - dmu_tx_t *tx) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - - dprintf("storing %llx at index %llx\n", val, idx); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db, tx); - - if (tbl->zt_nextblk != 0) { - uint64_t idx2 = idx * 2; - uint64_t blk2 = idx2 >> (bs-3); - uint64_t off2 = idx2 & ((1<<(bs-3))-1); - dmu_buf_t *db2; - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, - DMU_READ_NO_PREFETCH); - if (err != 0) { - dmu_buf_rele(db, FTAG); - return (err); - } - dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; - dmu_buf_rele(db2, FTAG); - } - - ((uint64_t *)db->db_data)[off] = val; - dmu_buf_rele(db, FTAG); - - return (0); -} - -static int -zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. 
- */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); - if (err != 0) - return (err); - *valp = ((uint64_t *)db->db_data)[off]; - dmu_buf_rele(db, FTAG); - - if (tbl->zt_nextblk != 0) { - /* - * read the nextblk for the sake of i/o error checking, - * so that zap_table_load() will catch errors for - * zap_table_store. - */ - blk = (idx*2) >> (bs-3); - - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, - (tbl->zt_nextblk + blk) << bs, FTAG, &db, - DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); - if (err == 0) - dmu_buf_rele(db, FTAG); - } - return (err); -} - -/* - * Routines for growing the ptrtbl. - */ - -static void -zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) -{ - for (int i = 0; i < n; i++) { - uint64_t lb = src[i]; - dst[2 * i + 0] = lb; - dst[2 * i + 1] = lb; - } -} - -static int -zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) -{ - /* - * The pointer table should never use more hash bits than we - * have (otherwise we'd be using useless zero bits to index it). - * If we are within 2 bits of running out, stop growing, since - * this is already an aberrant condition. - */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) - return (SET_ERROR(ENOSPC)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* - * We are outgrowing the "embedded" ptrtbl (the one - * stored in the header block). Give it its own entire - * block, which will double the size of the ptrtbl. 
- */ - ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); - - uint64_t newblk = zap_allocate_blocks(zap, 1); - dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, - DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db_new, tx); - zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - dmu_buf_rele(db_new, FTAG); - - zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; - zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; - zap_f_phys(zap)->zap_ptrtbl.zt_shift++; - - ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << - (FZAP_BLOCK_SHIFT(zap)-3)); - - return (0); - } else { - return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, - zap_ptrtbl_transfer, tx)); - } -} - -static void -zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) -{ - dmu_buf_will_dirty(zap->zap_dbuf, tx); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); - ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); - zap_f_phys(zap)->zap_num_entries += delta; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); -} - -static uint64_t -zap_allocate_blocks(zap_t *zap, int nblocks) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - uint64_t newblk = zap_f_phys(zap)->zap_freeblk; - zap_f_phys(zap)->zap_freeblk += nblocks; - return (newblk); -} - -static void -zap_leaf_evict_sync(void *dbu) -{ - zap_leaf_t *l = dbu; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - -static zap_leaf_t * -zap_create_leaf(zap_t *zap, dmu_tx_t *tx) -{ - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - rw_init(&l->l_rwlock, 0, 0, 0); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = zap_allocate_blocks(zap, 1); - l->l_dbuf = NULL; - - 
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, - DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); - dmu_buf_will_dirty(l->l_dbuf, tx); - - zap_leaf_init(l, zap->zap_normflags != 0); - - zap_f_phys(zap)->zap_num_leafs++; - - return (l); -} - -int -fzap_count(zap_t *zap, uint64_t *count) -{ - ASSERT(!zap->zap_ismicro); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ - *count = zap_f_phys(zap)->zap_num_entries; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); - return (0); -} - -/* - * Routines for obtaining zap_leaf_t's - */ - -void -zap_put_leaf(zap_leaf_t *l) -{ - rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf, NULL); -} - -static zap_leaf_t * -zap_open_leaf(uint64_t blkid, dmu_buf_t *db) -{ - ASSERT(blkid != 0); - - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - rw_init(&l->l_rwlock, 0, 0, 0); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = blkid; - l->l_bs = highbit64(db->db_size) - 1; - l->l_dbuf = db; - - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); - - rw_exit(&l->l_rwlock); - if (winner != NULL) { - /* someone else set it first */ - zap_leaf_evict_sync(&l->l_dbu); - l = winner; - } - - /* - * lhr_pad was previously used for the next leaf in the leaf - * chain. There should be no chained leafs (as we have removed - * support for them). 
- */ - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); - - /* - * There should be more hash entries than there can be - * chunks to put in the hash table - */ - ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); - - /* The chunks should begin at the end of the hash table */ - ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, - &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); - - /* The chunks should end at the end of the block */ - ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); - - return (l); -} - -static int -zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, - zap_leaf_t **lp) -{ - dmu_buf_t *db; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, - blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); - if (err != 0) - return (err); - - ASSERT3U(db->db_object, ==, zap->zap_object); - ASSERT3U(db->db_offset, ==, blkid << bs); - ASSERT3U(db->db_size, ==, 1 << bs); - ASSERT(blkid != 0); - - zap_leaf_t *l = dmu_buf_get_user(db); - - if (l == NULL) - l = zap_open_leaf(blkid, db); - - rw_enter(&l->l_rwlock, lt); - /* - * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, - * causing ASSERT below to fail. 
- */ - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - ASSERT3U(l->l_blkid, ==, blkid); - ASSERT3P(l->l_dbuf, ==, db); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - *lp = l; - return (0); -} - -static int -zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) -{ - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - ASSERT3U(idx, <, - (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); - *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); - return (0); - } else { - return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, valp)); - } -} - -static int -zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) -{ - ASSERT(tx != NULL); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { - ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; - return (0); - } else { - return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, blk, tx)); - } -} - -static int -zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) -{ - uint64_t blk; - - ASSERT(zap->zap_dbuf == NULL || - zap_f_phys(zap) == zap->zap_dbuf->db_data); - - /* Reality check for corrupt zap objects (leaf or header). 
*/ - if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && - zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || - zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { - return (SET_ERROR(EIO)); - } - - uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - int err = zap_idx_to_blk(zap, idx, &blk); - if (err != 0) - return (err); - err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - - ASSERT(err || - ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == - zap_leaf_phys(*lp)->l_hdr.lh_prefix); - return (err); -} - -static int -zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx, zap_leaf_t **lp) -{ - zap_t *zap = zn->zn_zap; - uint64_t hash = zn->zn_hash; - int err; - int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - zap_leaf_phys(l)->l_hdr.lh_prefix); - - if (zap_tryupgradedir(zap, tx) == 0 || - old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - /* We failed to upgrade, or need to grow the pointer table */ - objset_t *os = zap->zap_objset; - uint64_t object = zap->zap_object; - - zap_put_leaf(l); - zap_unlockdir(zap, tag); - err = zap_lockdir(os, object, tx, RW_WRITER, - FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return (err); - ASSERT(!zap->zap_ismicro); - - while (old_prefix_len == - zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - err = zap_grow_ptrtbl(zap, tx); - if (err != 0) - return (err); - } - - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { - /* it split while our locks were down */ - *lp = l; - return (0); - } - } - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - zap_leaf_phys(l)->l_hdr.lh_prefix); - - int 
prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - - (old_prefix_len + 1); - uint64_t sibling = - (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; - - /* check for i/o errors before doing zap_leaf_split */ - for (int i = 0; i < (1ULL << prefix_diff); i++) { - uint64_t blk; - err = zap_idx_to_blk(zap, sibling + i, &blk); - if (err != 0) - return (err); - ASSERT3U(blk, ==, l->l_blkid); - } - - zap_leaf_t *nl = zap_create_leaf(zap, tx); - zap_leaf_split(l, nl, zap->zap_normflags != 0); - - /* set sibling pointers */ - for (int i = 0; i < (1ULL << prefix_diff); i++) { - err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); - ASSERT0(err); /* we checked for i/o errors above */ - } - - if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { - /* we want the sibling */ - zap_put_leaf(l); - *lp = nl; - } else { - zap_put_leaf(nl); - *lp = l; - } - - return (0); -} - -static void -zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx) -{ - zap_t *zap = zn->zn_zap; - int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && - zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); - - zap_put_leaf(l); - - if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { - /* - * We are in the middle of growing the pointer table, or - * this leaf will soon make us grow it. 
- */ - if (zap_tryupgradedir(zap, tx) == 0) { - objset_t *os = zap->zap_objset; - uint64_t zapobj = zap->zap_object; - - zap_unlockdir(zap, tag); - int err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return; - } - - /* could have finished growing while our locks were down */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) - (void) zap_grow_ptrtbl(zap, tx); - } -} - -static int -fzap_checkname(zap_name_t *zn) -{ - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - return (0); -} - -static int -fzap_checksize(uint64_t integer_size, uint64_t num_integers) -{ - /* Only integer sizes supported by C */ - switch (integer_size) { - case 1: - case 2: - case 4: - case 8: - break; - default: - return (SET_ERROR(EINVAL)); - } - - if (integer_size * num_integers > ZAP_MAXVALUELEN) - return (E2BIG); - - return (0); -} - -static int -fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) -{ - int err = fzap_checkname(zn); - if (err != 0) - return (err); - return (fzap_checksize(integer_size, num_integers)); -} - -/* - * Routines for manipulating attributes. 
- */ -int -fzap_lookup(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, void *buf, - char *realname, int rn_len, boolean_t *ncp) -{ - zap_leaf_t *l; - zap_entry_handle_t zeh; - - int err = fzap_checkname(zn); - if (err != 0) - return (err); - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - if ((err = fzap_checksize(integer_size, num_integers)) != 0) { - zap_put_leaf(l); - return (err); - } - - err = zap_entry_read(&zeh, integer_size, num_integers, buf); - (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); - if (ncp) { - *ncp = zap_entry_normalization_conflict(&zeh, - zn, NULL, zn->zn_zap); - } - } - - zap_put_leaf(l); - return (err); -} - -int -fzap_add_cd(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(!zap->zap_ismicro); - ASSERT(fzap_check(zn, integer_size, num_integers) == 0); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - err = SET_ERROR(EEXIST); - goto out; - } - if (err != ENOENT) - goto out; - - err = zap_entry_create(l, zn, cd, - integer_size, num_integers, val, &zeh); - - if (err == 0) { - zap_increment_num_entries(zap, 1, tx); - } else if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - -out: - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - return (err); -} - -int -fzap_add(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, void *tag, dmu_tx_t *tx) -{ - int err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - 
- return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_NEED_CD, tag, tx)); -} - -int -fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, - void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - boolean_t create; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - create = (err == ENOENT); - ASSERT(err == 0 || err == ENOENT); - - if (create) { - err = zap_entry_create(l, zn, ZAP_NEED_CD, - integer_size, num_integers, val, &zeh); - if (err == 0) - zap_increment_num_entries(zap, 1, tx); - } else { - err = zap_entry_update(&zeh, integer_size, num_integers, val); - } - - if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - return (err); -} - -int -fzap_length(zap_name_t *zn, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err != 0) - goto out; - - if (integer_size != 0) - *integer_size = zeh.zeh_integer_size; - if (num_integers != 0) - *num_integers = zeh.zeh_num_integers; -out: - zap_put_leaf(l); - return (err); -} - -int -fzap_remove(zap_name_t *zn, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - zap_entry_remove(&zeh); - zap_increment_num_entries(zn->zn_zap, -1, tx); - } - zap_put_leaf(l); - 
return (err); -} - -void -fzap_prefetch(zap_name_t *zn) -{ - uint64_t blk; - zap_t *zap = zn->zn_zap; - - uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, - zap_f_phys(zap)->zap_ptrtbl.zt_shift); - if (zap_idx_to_blk(zap, idx, &blk) != 0) - return; - int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, - ZIO_PRIORITY_SYNC_READ); -} - -/* - * Helper functions for consumers. - */ - -uint64_t -zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, - const char *name, dmu_tx_t *tx) -{ - return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); -} - -uint64_t -zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, - const char *name, int dnodesize, dmu_tx_t *tx) -{ - uint64_t new_obj; - - VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, - dnodesize, tx)) > 0); - VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, - tx)); - - return (new_obj); -} - -int -zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, - char *name) -{ - zap_cursor_t zc; - int err; - - if (mask == 0) - mask = -1ULL; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, zapobj); - (err = zap_cursor_retrieve(&zc, za)) == 0; - zap_cursor_advance(&zc)) { - if ((za->za_first_integer & mask) == (value & mask)) { - (void) strcpy(name, za->za_name); - break; - } - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) -{ - zap_cursor_t zc; - int err = 0; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } - err = zap_add(os, intoobj, za->za_name, - 8, 1, &za->za_first_integer, tx); - if (err != 0) 
- break; - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - uint64_t value, dmu_tx_t *tx) -{ - zap_cursor_t zc; - int err = 0; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } - err = zap_add(os, intoobj, za->za_name, - 8, 1, &value, tx); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - dmu_tx_t *tx) -{ - zap_cursor_t zc; - int err = 0; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - uint64_t delta = 0; - - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } - - err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); - if (err != 0 && err != ENOENT) - break; - delta += za->za_first_integer; - err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return (zap_add(os, obj, name, 8, 1, &value, tx)); -} - -int -zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return (zap_remove(os, obj, name, tx)); -} - -int -zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", 
(longlong_t)value); - return (zap_lookup(os, obj, name, 8, 1, &value)); -} - -int -zap_add_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_add(os, obj, name, 8, 1, &value, tx)); -} - -int -zap_update_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_update(os, obj, name, 8, 1, &value, tx)); -} - -int -zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_lookup(os, obj, name, 8, 1, valuep)); -} - -int -zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, - dmu_tx_t *tx) -{ - uint64_t value = 0; - - if (delta == 0) - return (0); - - int err = zap_lookup(os, obj, name, 8, 1, &value); - if (err != 0 && err != ENOENT) - return (err); - value += delta; - if (value == 0) - err = zap_remove(os, obj, name, tx); - else - err = zap_update(os, obj, name, 8, 1, &value, tx); - return (err); -} - -int -zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_increment(os, obj, name, delta, tx)); -} - -/* - * Routines for iterating over the attributes. - */ - -int -fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) -{ - int err = ENOENT; - zap_entry_handle_t zeh; - zap_leaf_t *l; - - /* retrieve the next entry at or after zc_hash/zc_cd */ - /* if no entry, return ENOENT */ - - /* - * If we are reading from the beginning, we're almost - * certain to iterate over the entire ZAP object. If there are - * multiple leaf blocks (freeblk > 2), prefetch the whole - * object, so that we read the leaf blocks concurrently. 
- * (Unless noprefetch was requested via zap_cursor_init_noprefetch()). - */ - if (zc->zc_hash == 0 && zap_iterate_prefetch && - zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, - zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), - ZIO_PRIORITY_ASYNC_READ); - } - - if (zc->zc_leaf && - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - -again: - if (zc->zc_leaf == NULL) { - err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, - &zc->zc_leaf); - if (err != 0) - return (err); - } else { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - } - l = zc->zc_leaf; - - err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); - - if (err == ENOENT) { - uint64_t nocare = - (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; - zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; - zc->zc_cd = 0; - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || - zc->zc_hash == 0) { - zc->zc_hash = -1ULL; - } else { - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - goto again; - } - } - - if (err == 0) { - zc->zc_hash = zeh.zeh_hash; - zc->zc_cd = zeh.zeh_cd; - za->za_integer_length = zeh.zeh_integer_size; - za->za_num_integers = zeh.zeh_num_integers; - if (zeh.zeh_num_integers == 0) { - za->za_first_integer = 0; - } else { - err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); - ASSERT(err == 0 || err == EOVERFLOW); - } - err = zap_entry_read_name(zap, &zeh, - sizeof (za->za_name), za->za_name); - ASSERT(err == 0); - - za->za_normalization_conflict = - zap_entry_normalization_conflict(&zeh, - NULL, za->za_name, zap); - } - rw_exit(&zc->zc_leaf->l_rwlock); - return (err); -} - -static void -zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) -{ - uint64_t lastblk = 0; - - /* - * NB: if a leaf has more pointers 
than an entire ptrtbl block - * can hold, then it'll be accounted for more than once, since - * we won't have lastblk. - */ - for (int i = 0; i < len; i++) { - zap_leaf_t *l; - - if (tbl[i] == lastblk) - continue; - lastblk = tbl[i]; - - int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); - if (err == 0) { - zap_leaf_stats(zap, l, zs); - zap_put_leaf(l); - } - } -} - -int -fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) -{ - int err; - zap_leaf_t *l; - zap_entry_handle_t zeh; - - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - - err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - - err = zap_leaf_lookup(l, zn, &zeh); - if (err != 0) - return (err); - - zc->zc_leaf = l; - zc->zc_hash = zeh.zeh_hash; - zc->zc_cd = zeh.zeh_cd; - - return (err); -} - -void -fzap_get_stats(zap_t *zap, zap_stats_t *zs) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - zs->zs_blocksize = 1ULL << bs; - - /* - * Set zap_phys_t fields - */ - zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; - zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; - zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; - zs->zs_block_type = zap_f_phys(zap)->zap_block_type; - zs->zs_magic = zap_f_phys(zap)->zap_magic; - zs->zs_salt = zap_f_phys(zap)->zap_salt; - - /* - * Set zap_ptrtbl fields - */ - zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; - zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; - zs->zs_ptrtbl_blks_copied = - zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; - zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; - zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* the ptrtbl is entirely in the header block. 
*/ - zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); - } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, - zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - - for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - b++) { - dmu_buf_t *db; - int err; - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, - FTAG, &db, DMU_READ_NO_PREFETCH); - if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, - 1<<(bs-3), zs); - dmu_buf_rele(db, FTAG); - } - } - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c deleted file mode 100644 index 1c7c736d8e97..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ /dev/null @@ -1,849 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. 
- */ - -/* - * The 512-byte leaf is broken into 32 16-byte chunks. - * chunk number n means l_chunk[n], even though the header precedes it. - * the names are stored null-terminated. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); - -#define CHAIN_END 0xffff /* end of the chunk chain */ - -/* half the (current) minimum block size */ -#define MAX_ARRAY_BYTES (8<<10) - -#define LEAF_HASH(l, h) \ - ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ - ((h) >> \ - (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) - -#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) - -extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); - -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - -static void -stv(int len, void *addr, uint64_t value) -{ - switch (len) { - case 1: - *(uint8_t *)addr = value; - return; - case 2: - *(uint16_t *)addr = value; - return; - case 4: - *(uint32_t *)addr = value; - return; - case 8: - *(uint64_t *)addr = value; - return; - } - ASSERT(!"bad int len"); -} - -static uint64_t -ldv(int len, const void *addr) -{ - switch (len) { - case 1: - return (*(uint8_t *)addr); - case 2: - return (*(uint16_t *)addr); - case 4: - return (*(uint32_t *)addr); - case 8: - return (*(uint64_t *)addr); - } - ASSERT(!"bad int len"); - return (0xFEEDFACEDEADBEEFULL); -} - -void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) -{ - zap_leaf_t l; - dmu_buf_t l_dbuf; - - l_dbuf.db_data = buf; - l.l_bs = highbit64(size) - 1; - l.l_dbuf = &l_dbuf; - - buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); - buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); - buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); - buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); - buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); - 
buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); - buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) - buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { - zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); - struct zap_leaf_entry *le; - - switch (lc->l_free.lf_type) { - case ZAP_CHUNK_ENTRY: - le = &lc->l_entry; - - le->le_type = BSWAP_8(le->le_type); - le->le_value_intlen = BSWAP_8(le->le_value_intlen); - le->le_next = BSWAP_16(le->le_next); - le->le_name_chunk = BSWAP_16(le->le_name_chunk); - le->le_name_numints = BSWAP_16(le->le_name_numints); - le->le_value_chunk = BSWAP_16(le->le_value_chunk); - le->le_value_numints = BSWAP_16(le->le_value_numints); - le->le_cd = BSWAP_32(le->le_cd); - le->le_hash = BSWAP_64(le->le_hash); - break; - case ZAP_CHUNK_FREE: - lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); - lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); - break; - case ZAP_CHUNK_ARRAY: - lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); - lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); - /* la_array doesn't need swapping */ - break; - default: - ASSERT(!"bad leaf type"); - } - } -} - -void -zap_leaf_init(zap_leaf_t *l, boolean_t sort) -{ - l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, - sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, - 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { - ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; - ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; - } - ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; - zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; - zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; - zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); - if (sort) - zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; -} - -/* - * Routines 
which manipulate leaf chunks (l_chunk[]). - */ - -static uint16_t -zap_leaf_chunk_alloc(zap_leaf_t *l) -{ - ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); - - zap_leaf_phys(l)->l_hdr.lh_freelist = - ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; - - zap_leaf_phys(l)->l_hdr.lh_nfree--; - - return (chunk); -} - -static void -zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) -{ - struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); - - zlf->lf_type = ZAP_CHUNK_FREE; - zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; - bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ - zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; - - zap_leaf_phys(l)->l_hdr.lh_nfree++; -} - -/* - * Routines which manipulate leaf arrays (zap_leaf_array type chunks). 
- */ - -static uint16_t -zap_leaf_array_create(zap_leaf_t *l, const char *buf, - int integer_size, int num_integers) -{ - uint16_t chunk_head; - uint16_t *chunkp = &chunk_head; - int byten = 0; - uint64_t value = 0; - int shift = (integer_size - 1) * 8; - int len = num_integers; - - ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); - - while (len > 0) { - uint16_t chunk = zap_leaf_chunk_alloc(l); - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - - la->la_type = ZAP_CHUNK_ARRAY; - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); - la->la_array[i] = value >> shift; - value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; - if (--len == 0) - break; - } - } - - *chunkp = chunk; - chunkp = &la->la_next; - } - *chunkp = CHAIN_END; - - return (chunk_head); -} - -static void -zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) -{ - uint16_t chunk = *chunkp; - - *chunkp = CHAIN_END; - - while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; - ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, - ZAP_CHUNK_ARRAY); - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; - } -} - -/* array_len and buf_len are in integers, not bytes */ -static void -zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, - int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, - void *buf) -{ - int len = MIN(array_len, buf_len); - int byten = 0; - uint64_t value = 0; - char *p = buf; - - ASSERT3U(array_int_len, <=, buf_int_len); - - /* Fast path for one 8-byte integer */ - if (array_int_len == 8 && buf_int_len == 8 && len == 1) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - uint8_t *ip = la->la_array; - uint64_t *buf64 = buf; - - *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | - (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | - (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | - (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; 
- return; - } - - /* Fast path for an array of 1-byte integers (eg. the entry name) */ - if (array_int_len == 1 && buf_int_len == 1 && - buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { - while (chunk != CHAIN_END) { - struct zap_leaf_array *la = - &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); - p += ZAP_LEAF_ARRAY_BYTES; - chunk = la->la_next; - } - return; - } - - while (len > 0) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { - value = (value << 8) | la->la_array[i]; - byten++; - if (byten == array_int_len) { - stv(buf_int_len, p, value); - byten = 0; - len--; - if (len == 0) - return; - p += buf_int_len; - } - } - chunk = la->la_next; - } -} - -static boolean_t -zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) -{ - int bseen = 0; - - if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { - uint64_t *thiskey = - kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); - ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); - - zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, - sizeof (*thiskey), array_numints, thiskey); - boolean_t match = bcmp(thiskey, zn->zn_key_orig, - array_numints * sizeof (*thiskey)) == 0; - kmem_free(thiskey, array_numints * sizeof (*thiskey)); - return (match); - } - - ASSERT(zn->zn_key_intlen == 1); - if (zn->zn_matchtype & MT_NORMALIZE) { - char *thisname = kmem_alloc(array_numints, KM_SLEEP); - - zap_leaf_array_read(l, chunk, sizeof (char), array_numints, - sizeof (char), array_numints, thisname); - boolean_t match = zap_match(zn, thisname); - kmem_free(thisname, array_numints); - return (match); - } - - /* - * Fast path for exact matching. - * First check that the lengths match, so that we don't read - * past the end of the zn_key_orig array. 
- */ - if (array_numints != zn->zn_key_orig_numints) - return (B_FALSE); - while (bseen < array_numints) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) - break; - chunk = la->la_next; - bseen += toread; - } - return (bseen == array_numints); -} - -/* - * Routines which manipulate leaf entries. - */ - -int -zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) -{ - struct zap_leaf_entry *le; - - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); - *chunkp != CHAIN_END; chunkp = &le->le_next) { - uint16_t chunk = *chunkp; - le = ZAP_LEAF_ENTRY(l, chunk); - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (le->le_hash != zn->zn_hash) - continue; - - /* - * NB: the entry chain is always sorted by cd on - * normalized zap objects, so this will find the - * lowest-cd match for MT_NORMALIZE. - */ - ASSERT((zn->zn_matchtype == 0) || - (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); - if (zap_leaf_array_match(l, zn, le->le_name_chunk, - le->le_name_numints)) { - zeh->zeh_num_integers = le->le_value_numints; - zeh->zeh_integer_size = le->le_value_intlen; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_chunkp = chunkp; - zeh->zeh_leaf = l; - return (0); - } - } - - return (SET_ERROR(ENOENT)); -} - -/* Return (h1,cd1 >= h2,cd2) */ -#define HCD_GTEQ(h1, cd1, h2, cd2) \ - ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? 
TRUE : FALSE)) - -int -zap_leaf_lookup_closest(zap_leaf_t *l, - uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) -{ - uint64_t besth = -1ULL; - uint32_t bestcd = -1U; - uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; - struct zap_leaf_entry *le; - - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { - for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && - HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { - ASSERT3U(bestlh, >=, lh); - bestlh = lh; - besth = le->le_hash; - bestcd = le->le_cd; - - zeh->zeh_num_integers = le->le_value_numints; - zeh->zeh_integer_size = le->le_value_intlen; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_fakechunk = chunk; - zeh->zeh_chunkp = &zeh->zeh_fakechunk; - zeh->zeh_leaf = l; - } - } - } - - return (bestcd == -1U ? 
ENOENT : 0); -} - -int -zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf) -{ - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (le->le_value_intlen > integer_size) - return (SET_ERROR(EINVAL)); - - zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, - le->le_value_intlen, le->le_value_numints, - integer_size, num_integers, buf); - - if (zeh->zeh_num_integers > num_integers) - return (SET_ERROR(EOVERFLOW)); - return (0); - -} - -int -zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, - char *buf) -{ - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, - le->le_name_numints, 8, buflen / 8, buf); - } else { - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, - le->le_name_numints, 1, buflen, buf); - } - if (le->le_name_numints > buflen) - return (SET_ERROR(EOVERFLOW)); - return (0); -} - -int -zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf) -{ - zap_leaf_t *l = zeh->zeh_leaf; - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); - - int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); - - if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) - return (SET_ERROR(EAGAIN)); - - zap_leaf_array_free(l, &le->le_value_chunk); - le->le_value_chunk = - zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_numints = num_integers; - le->le_value_intlen = integer_size; - return (0); -} - -void -zap_entry_remove(zap_entry_handle_t *zeh) -{ - zap_leaf_t *l = zeh->zeh_leaf; - - ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); - - uint16_t 
entry_chunk = *zeh->zeh_chunkp; - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - zap_leaf_array_free(l, &le->le_name_chunk); - zap_leaf_array_free(l, &le->le_value_chunk); - - *zeh->zeh_chunkp = le->le_next; - zap_leaf_chunk_free(l, entry_chunk); - - zap_leaf_phys(l)->l_hdr.lh_nentries--; -} - -int -zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh) -{ - uint16_t chunk; - struct zap_leaf_entry *le; - uint64_t h = zn->zn_hash; - - uint64_t valuelen = integer_size * num_integers; - - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * - zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); - if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) - return (E2BIG); - - if (cd == ZAP_NEED_CD) { - /* find the lowest unused cd */ - if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { - cd = 0; - - for (chunk = *LEAF_HASH_ENTPTR(l, h); - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - if (le->le_cd > cd) - break; - if (le->le_hash == h) { - ASSERT3U(cd, ==, le->le_cd); - cd++; - } - } - } else { - /* old unsorted format; do it the O(n^2) way */ - for (cd = 0; ; cd++) { - for (chunk = *LEAF_HASH_ENTPTR(l, h); - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - if (le->le_hash == h && - le->le_cd == cd) { - break; - } - } - /* If this cd is not in use, we are good. */ - if (chunk == CHAIN_END) - break; - } - } - /* - * We would run out of space in a block before we could - * store enough entries to run out of CD values. 
- */ - ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); - } - - if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) - return (SET_ERROR(EAGAIN)); - - /* make the entry */ - chunk = zap_leaf_chunk_alloc(l); - le = ZAP_LEAF_ENTRY(l, chunk); - le->le_type = ZAP_CHUNK_ENTRY; - le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, - zn->zn_key_intlen, zn->zn_key_orig_numints); - le->le_name_numints = zn->zn_key_orig_numints; - le->le_value_chunk = - zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_numints = num_integers; - le->le_value_intlen = integer_size; - le->le_hash = h; - le->le_cd = cd; - - /* link it into the hash chain */ - /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); - - zap_leaf_phys(l)->l_hdr.lh_nentries++; - - zeh->zeh_leaf = l; - zeh->zeh_num_integers = num_integers; - zeh->zeh_integer_size = le->le_value_intlen; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_chunkp = chunkp; - - return (0); -} - -/* - * Determine if there is another entry with the same normalized form. - * For performance purposes, either zn or name must be provided (the - * other can be NULL). Note, there usually won't be any hash - * conflicts, in which case we don't need the concatenated/normalized - * form of the name. But all callers have one of these on hand anyway, - * so might as well take advantage. A cleaner but slower interface - * would accept neither argument, and compute the normalized name as - * needed (using zap_name_alloc(zap_entry_read_name(zeh))). 
- */ -boolean_t -zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, - const char *name, zap_t *zap) -{ - struct zap_leaf_entry *le; - boolean_t allocdzn = B_FALSE; - - if (zap->zap_normflags == 0) - return (B_FALSE); - - for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); - if (le->le_hash != zeh->zeh_hash) - continue; - if (le->le_cd == zeh->zeh_cd) - continue; - - if (zn == NULL) { - zn = zap_name_alloc(zap, name, MT_NORMALIZE); - allocdzn = B_TRUE; - } - if (zap_leaf_array_match(zeh->zeh_leaf, zn, - le->le_name_chunk, le->le_name_numints)) { - if (allocdzn) - zap_name_free(zn); - return (B_TRUE); - } - } - if (allocdzn) - zap_name_free(zn); - return (B_FALSE); -} - -/* - * Routines for transferring entries between leafs. - */ - -static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) -{ - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); - struct zap_leaf_entry *le2; - uint16_t *chunkp; - - /* - * keep the entry chain sorted by cd - * NB: this will not cause problems for unsorted leafs, though - * it is unnecessary there. 
- */ - for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); - *chunkp != CHAIN_END; chunkp = &le2->le_next) { - le2 = ZAP_LEAF_ENTRY(l, *chunkp); - if (le2->le_cd > le->le_cd) - break; - } - - le->le_next = *chunkp; - *chunkp = entry; - return (chunkp); -} - -static uint16_t -zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) -{ - uint16_t new_chunk; - uint16_t *nchunkp = &new_chunk; - - while (chunk != CHAIN_END) { - uint16_t nchunk = zap_leaf_chunk_alloc(nl); - struct zap_leaf_array *nla = - &ZAP_LEAF_CHUNK(nl, nchunk).l_array; - struct zap_leaf_array *la = - &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); - - *nla = *la; /* structure assignment */ - - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; - *nchunkp = nchunk; - nchunkp = &nla->la_next; - } - *nchunkp = CHAIN_END; - return (new_chunk); -} - -static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) -{ - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - uint16_t chunk = zap_leaf_chunk_alloc(nl); - struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); - *nle = *le; /* structure assignment */ - - (void) zap_leaf_rehash_entry(nl, chunk); - - nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); - nle->le_value_chunk = - zap_leaf_transfer_array(l, le->le_value_chunk, nl); - - zap_leaf_chunk_free(l, entry); - - zap_leaf_phys(l)->l_hdr.lh_nentries--; - zap_leaf_phys(nl)->l_hdr.lh_nentries++; -} - -/* - * Transfer the entries whose hash prefix ends in 1 to the new leaf. 
- */ -void -zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) -{ - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - /* set new prefix and prefix_len */ - zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; - zap_leaf_phys(l)->l_hdr.lh_prefix_len++; - zap_leaf_phys(nl)->l_hdr.lh_prefix = - zap_leaf_phys(l)->l_hdr.lh_prefix | 1; - zap_leaf_phys(nl)->l_hdr.lh_prefix_len = - zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - /* break existing hash chains */ - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, - 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - - if (sort) - zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; - - /* - * Transfer entries whose hash bit 'bit' is set to nl; rehash - * the remaining entries - * - * NB: We could find entries via the hashtable instead. That - * would be O(hashents+numents) rather than O(numblks+numents), - * but this accesses memory more sequentially, and when we're - * called, the block is usually pretty full. - */ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); - if (le->le_type != ZAP_CHUNK_ENTRY) - continue; - - if (le->le_hash & (1ULL << bit)) - zap_leaf_transfer_entry(l, i, nl); - else - (void) zap_leaf_rehash_entry(l, i); - } -} - -void -zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) -{ - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - - zap_leaf_phys(l)->l_hdr.lh_prefix_len; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_leafs_with_2n_pointers[n]++; - - - n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_blocks_with_n5_entries[n]++; - - n = ((1<l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / - (1<zs_blocks_n_tenths_full[n]++; - - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; - - while (chunk != CHAIN_END) { - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(l, chunk); - - n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + - 
ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * - le->le_value_intlen); - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_entries_using_n_chunks[n]++; - - chunk = le->le_next; - nentries++; - } - - n = nentries; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_buckets_with_n_entries[n]++; - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c deleted file mode 100644 index 133989eca324..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ /dev/null @@ -1,1609 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif - -extern inline mzap_phys_t *zap_m_phys(zap_t *zap); - -static int mzap_upgrade(zap_t **zapp, - void *tag, dmu_tx_t *tx, zap_flags_t flags); - -uint64_t -zap_getflags(zap_t *zap) -{ - if (zap->zap_ismicro) - return (0); - return (zap_f_phys(zap)->zap_flags); -} - -int -zap_hashbits(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return (48); - else - return (28); -} - -uint32_t -zap_maxcd(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return ((1<<16)-1); - else - return (-1U); -} - -static uint64_t -zap_hash(zap_name_t *zn) -{ - zap_t *zap = zn->zn_zap; - uint64_t h = 0; - - if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { - ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); - h = *(uint64_t *)zn->zn_key_orig; - } else { - h = zap->zap_salt; - ASSERT(h != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - - if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - const uint64_t *wp = zn->zn_key_norm; - - ASSERT(zn->zn_key_intlen == 8); - for (int i = 0; i < zn->zn_key_norm_numints; - wp++, i++) { - uint64_t word = *wp; - - for (int j = 0; j < zn->zn_key_intlen; j++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ word) & 0xFF]; - word >>= NBBY; - } - } - } else { - const uint8_t *cp = zn->zn_key_norm; - - /* - * We previously stored the terminating null on - * disk, but didn't hash it, so we need to - * continue to not hash it. (The - * zn_key_*_numints includes the terminating - * null for non-binary keys.) - */ - int len = zn->zn_key_norm_numints - 1; - - ASSERT(zn->zn_key_intlen == 1); - for (int i = 0; i < len; cp++, i++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ *cp) & 0xFF]; - } - } - } - /* - * Don't use all 64 bits, since we need some in the cookie for - * the collision differentiator. 
We MUST use the high bits, - * since those are the ones that we first pay attention to when - * chosing the bucket. - */ - h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - - return (h); -} - -static int -zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) -{ - ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); - - size_t inlen = strlen(name) + 1; - size_t outlen = ZAP_MAXNAMELEN; - - int err = 0; - (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, - U8_UNICODE_LATEST, &err); - - return (err); -} - -boolean_t -zap_match(zap_name_t *zn, const char *matchname) -{ - ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); - - if (zn->zn_matchtype & MT_NORMALIZE) { - char norm[ZAP_MAXNAMELEN]; - - if (zap_normalize(zn->zn_zap, matchname, norm, - zn->zn_normflags) != 0) - return (B_FALSE); - - return (strcmp(zn->zn_key_norm, norm) == 0); - } else { - return (strcmp(zn->zn_key_orig, matchname) == 0); - } -} - -void -zap_name_free(zap_name_t *zn) -{ - kmem_free(zn, sizeof (zap_name_t)); -} - -zap_name_t * -zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) -{ - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); - - zn->zn_zap = zap; - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = key; - zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; - zn->zn_matchtype = mt; - zn->zn_normflags = zap->zap_normflags; - - /* - * If we're dealing with a case sensitive lookup on a mixed or - * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup - * will fold case to all caps overriding the lookup request. - */ - if (mt & MT_MATCH_CASE) - zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; - - if (zap->zap_normflags) { - /* - * We *must* use zap_normflags because this normalization is - * what the hash is computed from. 
- */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } - zn->zn_key_norm = zn->zn_normbuf; - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } else { - if (mt != 0) { - zap_name_free(zn); - return (NULL); - } - zn->zn_key_norm = zn->zn_key_orig; - zn->zn_key_norm_numints = zn->zn_key_orig_numints; - } - - zn->zn_hash = zap_hash(zn); - - if (zap->zap_normflags != zn->zn_normflags) { - /* - * We *must* use zn_normflags because this normalization is - * what the matching is based on. (Not the hash!) - */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } - - return (zn); -} - -zap_name_t * -zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) -{ - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); - - ASSERT(zap->zap_normflags == 0); - zn->zn_zap = zap; - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = zn->zn_key_norm = key; - zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; - zn->zn_matchtype = 0; - - zn->zn_hash = zap_hash(zn); - return (zn); -} - -static void -mzap_byteswap(mzap_phys_t *buf, size_t size) -{ - buf->mz_block_type = BSWAP_64(buf->mz_block_type); - buf->mz_salt = BSWAP_64(buf->mz_salt); - buf->mz_normflags = BSWAP_64(buf->mz_normflags); - int max = (size / MZAP_ENT_LEN) - 1; - for (int i = 0; i < max; i++) { - buf->mz_chunk[i].mze_value = - BSWAP_64(buf->mz_chunk[i].mze_value); - buf->mz_chunk[i].mze_cd = - BSWAP_32(buf->mz_chunk[i].mze_cd); - } -} - -void -zap_byteswap(void *buf, size_t size) -{ - uint64_t block_type = *(uint64_t *)buf; - - if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { - /* ASSERT(magic == ZAP_LEAF_MAGIC); */ - mzap_byteswap(buf, size); - } else { - fzap_byteswap(buf, size); - } -} - -static int -mze_compare(const void *arg1, const void *arg2) -{ - const mzap_ent_t *mze1 
= arg1; - const mzap_ent_t *mze2 = arg2; - - int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); -} - -static int -mze_insert(zap_t *zap, int chunkid, uint64_t hash) -{ - avl_index_t idx; - - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); - mze->mze_chunkid = chunkid; - mze->mze_hash = hash; - mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; - ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); - if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { - kmem_free(mze, sizeof (mzap_ent_t)); - return (EEXIST); - } - avl_insert(&zap->zap_m.zap_avl, mze, idx); - return (0); -} - -static mzap_ent_t * -mze_find(zap_name_t *zn) -{ - mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; - - ASSERT(zn->zn_zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - - mze_tofind.mze_hash = zn->zn_hash; - mze_tofind.mze_cd = 0; - - mze = avl_find(avl, &mze_tofind, &idx); - if (mze == NULL) - mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { - ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); - if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) - return (mze); - } - - return (NULL); -} - -static uint32_t -mze_find_unused_cd(zap_t *zap, uint64_t hash) -{ - mzap_ent_t mze_tofind; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; - - ASSERT(zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - mze_tofind.mze_hash = hash; - mze_tofind.mze_cd = 0; - - uint32_t cd = 0; - for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_cd != cd) - break; - cd++; - } - - return (cd); -} - -static void -mze_remove(zap_t *zap, mzap_ent_t *mze) -{ - ASSERT(zap->zap_ismicro); - 
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - avl_remove(&zap->zap_m.zap_avl, mze); - kmem_free(mze, sizeof (mzap_ent_t)); -} - -static void -mze_destroy(zap_t *zap) -{ - mzap_ent_t *mze; - void *avlcookie = NULL; - - while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) - kmem_free(mze, sizeof (mzap_ent_t)); - avl_destroy(&zap->zap_m.zap_avl); -} - -static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) -{ - zap_t *winner; - uint64_t *zap_hdr = (uint64_t *)db->db_data; - uint64_t zap_block_type = zap_hdr[0]; - uint64_t zap_magic = zap_hdr[1]; - - ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); - - zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); - rw_init(&zap->zap_rwlock, 0, 0, 0); - rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; - zap->zap_dbuf = db; - - if (zap_block_type != ZBT_MICRO) { - mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); - zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; - if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { - winner = NULL; /* No actual winner here... */ - goto handle_winner; - } - } else { - zap->zap_ismicro = TRUE; - } - - /* - * Make sure that zap_ismicro is set before we let others see - * it, because zap_lockdir() checks zap_ismicro without the lock - * held. 
- */ - dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); - winner = dmu_buf_set_user(db, &zap->zap_dbu); - - if (winner != NULL) - goto handle_winner; - - if (zap->zap_ismicro) { - zap->zap_salt = zap_m_phys(zap)->mz_salt; - zap->zap_normflags = zap_m_phys(zap)->mz_normflags; - zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; - avl_create(&zap->zap_m.zap_avl, mze_compare, - sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = - &zap_m_phys(zap)->mz_chunk[i]; - if (mze->mze_name[0]) { - zap_name_t *zn; - - zn = zap_name_alloc(zap, mze->mze_name, 0); - if (mze_insert(zap, i, zn->zn_hash) == 0) - zap->zap_m.zap_num_entries++; - else { - printf("ZFS WARNING: Duplicated ZAP " - "entry detected (%s).\n", - mze->mze_name); - } - zap_name_free(zn); - } - } - } else { - zap->zap_salt = zap_f_phys(zap)->zap_salt; - zap->zap_normflags = zap_f_phys(zap)->zap_normflags; - - ASSERT3U(sizeof (struct zap_leaf_header), ==, - 2*ZAP_LEAF_CHUNKSIZE); - - /* - * The embedded pointer table should not overlap the - * other members. - */ - ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, - &zap_f_phys(zap)->zap_salt); - - /* - * The embedded pointer table should end at the end of - * the block - */ - ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, - 1<zap_dbuf->db_size); - } - rw_exit(&zap->zap_rwlock); - return (zap); - -handle_winner: - rw_exit(&zap->zap_rwlock); - rw_destroy(&zap->zap_rwlock); - if (!zap->zap_ismicro) - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - kmem_free(zap, sizeof (zap_t)); - return (winner); -} - -/* - * This routine "consumes" the caller's hold on the dbuf, which must - * have the specified tag. 
- */ -static int -zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) -{ - ASSERT0(db->db_offset); - objset_t *os = dmu_buf_get_objset(db); - uint64_t obj = db->db_object; - - *zapp = NULL; - - zap_t *zap = dmu_buf_get_user(db); - if (zap == NULL) { - zap = mzap_open(os, obj, db); - if (zap == NULL) { - /* - * mzap_open() didn't like what it saw on-disk. - * Check for corruption! - */ - return (SET_ERROR(EIO)); - } - } - - /* - * We're checking zap_ismicro without the lock held, in order to - * tell what type of lock we want. Once we have some sort of - * lock, see if it really is the right type. In practice this - * can only be different if it was upgraded from micro to fat, - * and micro wanted WRITER but fat only needs READER. - */ - krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; - rw_enter(&zap->zap_rwlock, lt); - if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { - /* it was upgraded, now we only need reader */ - ASSERT(lt == RW_WRITER); - ASSERT(RW_READER == - (!zap->zap_ismicro && fatreader) ? 
RW_READER : lti); - rw_downgrade(&zap->zap_rwlock); - lt = RW_READER; - } - - zap->zap_objset = os; - - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - - ASSERT3P(zap->zap_dbuf, ==, db); - - ASSERT(!zap->zap_ismicro || - zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); - if (zap->zap_ismicro && tx && adding && - zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { - uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > MZAP_MAX_BLKSZ) { - dprintf("upgrading obj %llu: num_entries=%u\n", - obj, zap->zap_m.zap_num_entries); - *zapp = zap; - int err = mzap_upgrade(zapp, tag, tx, 0); - if (err != 0) - rw_exit(&zap->zap_rwlock); - return (err); - } - VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); - zap->zap_m.zap_num_chunks = - db->db_size / MZAP_ENT_LEN - 1; - } - - *zapp = zap; - return (0); -} - -static int -zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) -{ - dmu_buf_t *db; - - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { - return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { - dmu_buf_rele(db, tag); - } - return (err); -} - -int -zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) -{ - dmu_buf_t *db; - - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) - dmu_buf_rele(db, tag); - return (err); -} - -void -zap_unlockdir(zap_t *zap, 
void *tag) -{ - rw_exit(&zap->zap_rwlock); - dmu_buf_rele(zap->zap_dbuf, tag); -} - -static int -mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) -{ - int err = 0; - zap_t *zap = *zapp; - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - int sz = zap->zap_dbuf->db_size; - mzap_phys_t *mzp = zio_buf_alloc(sz); - bcopy(zap->zap_dbuf->db_data, mzp, sz); - int nchunks = zap->zap_m.zap_num_chunks; - - if (!flags) { - err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, - 1ULL << fzap_default_block_shift, 0, tx); - if (err != 0) { - zio_buf_free(mzp, sz); - return (err); - } - } - - dprintf("upgrading obj=%llu with %u chunks\n", - zap->zap_object, nchunks); - /* XXX destroy the avl later, so we can use the stored hash value */ - mze_destroy(zap); - - fzap_upgrade(zap, tx, flags); - - for (int i = 0; i < nchunks; i++) { - mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; - if (mze->mze_name[0] == 0) - continue; - dprintf("adding %s=%llu\n", - mze->mze_name, mze->mze_value); - zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); - err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, - tag, tx); - zap = zn->zn_zap; /* fzap_add_cd() may change zap */ - zap_name_free(zn); - if (err != 0) - break; - } - zio_buf_free(mzp, sz); - *zapp = zap; - return (err); -} - -/* - * The "normflags" determine the behavior of the matchtype_t which is - * passed to zap_lookup_norm(). Names which have the same normalized - * version will be stored with the same hash value, and therefore we can - * perform normalization-insensitive lookups. We can be Unicode form- - * insensitive and/or case-insensitive. The following flags are valid for - * "normflags": - * - * U8_TEXTPREP_NFC - * U8_TEXTPREP_NFD - * U8_TEXTPREP_NFKC - * U8_TEXTPREP_NFKD - * U8_TEXTPREP_TOUPPER - * - * The *_NF* (Normalization Form) flags are mutually exclusive; at most one - * of them may be supplied. 
- */ -void -mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, - dmu_tx_t *tx) -{ - dmu_buf_t *db; - - VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); - - dmu_buf_will_dirty(db, tx); - mzap_phys_t *zp = db->db_data; - zp->mz_block_type = ZBT_MICRO; - zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; - zp->mz_normflags = normflags; - - if (flags != 0) { - zap_t *zap; - /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, - B_FALSE, B_FALSE, &zap)); - VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); - zap_unlockdir(zap, FTAG); - } else { - dmu_buf_rele(db, FTAG); - } -} - -int -zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, - 0, tx)); -} - -int -zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_claim_norm_dnsize(os, obj, - 0, ot, bonustype, bonuslen, dnodesize, tx)); -} - -int -zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, - bonuslen, 0, tx)); -} - -int -zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - int err; - - err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, - dnodesize, tx); - if (err != 0) - return (err); - mzap_create_impl(os, obj, normflags, 0, tx); - return (0); -} - -uint64_t -zap_create(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); -} - 
-uint64_t -zap_create_dnsize(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, - dnodesize, tx)); -} - -uint64_t -zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, - 0, tx)); -} - -uint64_t -zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, - dnodesize, tx); - - mzap_create_impl(os, obj, normflags, 0, tx); - return (obj); -} - -uint64_t -zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - return (zap_create_flags_dnsize(os, normflags, flags, ot, - leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); -} - -uint64_t -zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, - dnodesize, tx); - - ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && - leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && - indirect_blockshift >= SPA_MINBLOCKSHIFT && - indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); - - VERIFY(dmu_object_set_blocksize(os, obj, - 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); - - mzap_create_impl(os, obj, normflags, flags, tx); - return (obj); -} - -int -zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) -{ - /* - * 
dmu_object_free will free the object number and free the - * data. Freeing the data will cause our pageout function to be - * called, which will destroy our data (zap_leaf_t's and zap_t). - */ - - return (dmu_object_free(os, zapobj, tx)); -} - -void -zap_evict_sync(void *dbu) -{ - zap_t *zap = dbu; - - rw_destroy(&zap->zap_rwlock); - - if (zap->zap_ismicro) - mze_destroy(zap); - else - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - - kmem_free(zap, sizeof (zap_t)); -} - -int -zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - if (!zap->zap_ismicro) { - err = fzap_count(zap, count); - } else { - *count = zap->zap_m.zap_num_entries; - } - zap_unlockdir(zap, FTAG); - return (err); -} - -/* - * zn may be NULL; if not specified, it will be computed if needed. - * See also the comment above zap_entry_normalization_conflict(). - */ -static boolean_t -mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) -{ - int direction = AVL_BEFORE; - boolean_t allocdzn = B_FALSE; - - if (zap->zap_normflags == 0) - return (B_FALSE); - -again: - for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); - other && other->mze_hash == mze->mze_hash; - other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { - - if (zn == NULL) { - zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, - MT_NORMALIZE); - allocdzn = B_TRUE; - } - if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { - if (allocdzn) - zap_name_free(zn); - return (B_TRUE); - } - } - - if (direction == AVL_BEFORE) { - direction = AVL_AFTER; - goto again; - } - - if (allocdzn) - zap_name_free(zn); - return (B_FALSE); -} - -/* - * Routines for manipulating attributes. 
- */ - -int -zap_lookup(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm(os, zapobj, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -static int -zap_lookup_impl(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - - if (!zap->zap_ismicro) { - err = fzap_lookup(zn, integer_size, num_integers, buf, - realname, rn_len, ncp); - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (num_integers < 1) { - err = SET_ERROR(EOVERFLOW); - } else if (integer_size != 8) { - err = SET_ERROR(EINVAL); - } else { - *(uint64_t *)buf = - MZE_PHYS(zap, mze)->mze_value; - (void) strlcpy(realname, - MZE_PHYS(zap, mze)->mze_name, rn_len); - if (ncp) { - *ncp = mzap_normalization_conflict(zap, - zn, mze); - } - } - } - } - zap_name_free(zn); - return (err); -} - -int -zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_lookup_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm_by_dnode(dn, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -int -zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - 
boolean_t *ncp) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - fzap_prefetch(zn); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - err = fzap_lookup(zn, integer_size, num_integers, buf, - NULL, 0, NULL); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_contains(objset_t *os, uint64_t zapobj, const char *name) -{ - int err = zap_lookup_norm(os, zapobj, name, 0, - 0, NULL, 0, NULL, 0, NULL); - if (err == EOVERFLOW || err == EINVAL) - err = 0; /* found, but skipped reading the value */ - return (err); -} - -int -zap_length(objset_t *os, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if 
(!zap->zap_ismicro) { - err = fzap_length(zn, integer_size, num_integers); - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (integer_size) - *integer_size = 8; - if (num_integers) - *num_integers = 1; - } - } - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_length(zn, integer_size, num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -static void -mzap_addent(zap_name_t *zn, uint64_t value) -{ - zap_t *zap = zn->zn_zap; - int start = zap->zap_m.zap_alloc_next; - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - -#ifdef ZFS_DEBUG - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; - ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); - } -#endif - - uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); - /* given the limited size of the microzap, this can't happen */ - ASSERT(cd < zap_maxcd(zap)); - -again: - for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; - if (mze->mze_name[0] == 0) { - mze->mze_value = value; - mze->mze_cd = cd; - (void) strcpy(mze->mze_name, zn->zn_key_orig); - zap->zap_m.zap_num_entries++; - zap->zap_m.zap_alloc_next = i+1; - if (zap->zap_m.zap_alloc_next == - zap->zap_m.zap_num_chunks) - zap->zap_m.zap_alloc_next = 0; - VERIFY(0 == mze_insert(zap, i, zn->zn_hash)); - return; - } - } - if (start != 0) { - start = 0; - goto again; - } - ASSERT(!"out of entries!"); -} - -static int 
-zap_add_impl(zap_t *zap, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, void *tag) -{ - const uint64_t *intval = val; - int err = 0; - - zap_name_t *zn = zap_name_alloc(zap, key, 0); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(key) >= MZAP_NAME_LEN) { - err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); - if (err == 0) { - err = fzap_add(zn, integer_size, num_integers, val, - tag, tx); - } - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else { - if (mze_find(zn) != NULL) { - err = SET_ERROR(EEXIST); - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_add(objset_t *os, uint64_t zapobj, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_add_by_dnode(dnode_t *dn, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t 
*zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_update(objset_t *os, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - uint64_t oldval; - const uint64_t *intval = val; - -#ifdef ZFS_DEBUG - /* - * If there is an old value, it shouldn't change across the - * lockdir (eg, due to bprewrite's xlation). - */ - if (integer_size == 8 && num_integers == 1) - (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); -#endif - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_update(zn, integer_size, num_integers, val, - FTAG, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); - if (err == 0) { - err = fzap_update(zn, integer_size, num_integers, - val, FTAG, tx); - } - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze != NULL) { - ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); - MZE_PHYS(zap, mze)->mze_value = *intval; - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - 
zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) -{ - return (zap_remove_norm(os, zapobj, name, 0, tx)); -} - -static int -zap_remove_impl(zap_t *zap, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - if (!zap->zap_ismicro) { - err = fzap_remove(zn, tx); - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - zap->zap_m.zap_num_entries--; - bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], - sizeof (mzap_ent_phys_t)); - mze_remove(zap, mze); - } - } - zap_name_free(zn); - return (err); -} - -int -zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, mt, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir_by_dnode(dn, tx, 
RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, 0, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -/* - * Routines for iterating over the attributes. - */ - -static void -zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized, boolean_t prefetch) -{ - zc->zc_objset = os; - zc->zc_zap = NULL; - zc->zc_leaf = NULL; - zc->zc_zapobj = zapobj; - zc->zc_serialized = serialized; - zc->zc_hash = 0; - zc->zc_cd = 0; - zc->zc_prefetch = prefetch; -} -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) -{ - zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); -} - -/* - * Initialize a cursor at the beginning of the ZAP object. The entire - * ZAP object will be prefetched. - */ -void -zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); -} - -/* - * Initialize a cursor at the beginning, but request that we not prefetch - * the entire ZAP object. 
- */ -void -zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); -} - -void -zap_cursor_fini(zap_cursor_t *zc) -{ - if (zc->zc_zap) { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlockdir(zc->zc_zap, NULL); - zc->zc_zap = NULL; - } - if (zc->zc_leaf) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - zc->zc_objset = NULL; -} - -uint64_t -zap_cursor_serialize(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return (-1ULL); - if (zc->zc_zap == NULL) - return (zc->zc_serialized); - ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); - ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); - - /* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So usually use a small - * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits - * of the cursor. - * - * [ collision differentiator | zap_hashbits()-bit hash value ] - */ - return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | - ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); -} - -int -zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) -{ - int err; - - if (zc->zc_hash == -1ULL) - return (SET_ERROR(ENOENT)); - - if (zc->zc_zap == NULL) { - int hb; - err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); - if (err != 0) - return (err); - - /* - * To support zap_cursor_init_serialized, advance, retrieve, - * we must add to the existing zc_cd, which may already - * be 1 due to the zap_cursor_advance. 
- */ - ASSERT(zc->zc_hash == 0); - hb = zap_hashbits(zc->zc_zap); - zc->zc_hash = zc->zc_serialized << (64 - hb); - zc->zc_cd += zc->zc_serialized >> hb; - if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ - zc->zc_cd = 0; - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } - if (!zc->zc_zap->zap_ismicro) { - err = fzap_cursor_retrieve(zc->zc_zap, zc, za); - } else { - avl_index_t idx; - mzap_ent_t mze_tofind; - - mze_tofind.mze_hash = zc->zc_hash; - mze_tofind.mze_cd = zc->zc_cd; - - mzap_ent_t *mze = - avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); - if (mze == NULL) { - mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, - idx, AVL_AFTER); - } - if (mze) { - mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); - ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); - za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, mze); - za->za_integer_length = 8; - za->za_num_integers = 1; - za->za_first_integer = mzep->mze_value; - (void) strcpy(za->za_name, mzep->mze_name); - zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_cd; - err = 0; - } else { - zc->zc_hash = -1ULL; - err = SET_ERROR(ENOENT); - } - } - rw_exit(&zc->zc_zap->zap_rwlock); - return (err); -} - -void -zap_cursor_advance(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return; - zc->zc_cd++; -} - -int -zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) -{ - int err = 0; - mzap_ent_t *mze; - zap_name_t *zn; - - if (zc->zc_zap == NULL) { - err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap); - if (err) - return (err); - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } - - zn = zap_name_alloc(zc->zc_zap, name, mt); - if (zn == NULL) { - rw_exit(&zc->zc_zap->zap_rwlock); - return (SET_ERROR(ENOTSUP)); - } - - if (!zc->zc_zap->zap_ismicro) { - err = fzap_cursor_move_to_key(zc, zn); - } else { - mze = mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - goto out; - } - 
zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_cd; - } - -out: - zap_name_free(zn); - rw_exit(&zc->zc_zap->zap_rwlock); - return (err); -} - -int -zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - - bzero(zs, sizeof (zap_stats_t)); - - if (zap->zap_ismicro) { - zs->zs_blocksize = zap->zap_dbuf->db_size; - zs->zs_num_entries = zap->zap_m.zap_num_entries; - zs->zs_num_blocks = 1; - } else { - fzap_get_stats(zap, zs); - } - zap_unlockdir(zap, FTAG); - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c deleted file mode 100644 index cf8b0a58d3ae..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c +++ /dev/null @@ -1,1432 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. - */ - -/* - * ZFS Channel Programs (ZCP) - * - * The ZCP interface allows various ZFS commands and operations ZFS - * administrative operations (e.g. creating and destroying snapshots, typically - * performed via an ioctl to /dev/zfs by the zfs(1M) command and - * libzfs/libzfs_core) to be run * programmatically as a Lua script. A ZCP - * script is run as a dsl_sync_task and fully executed during one transaction - * group sync. This ensures that no other changes can be written concurrently - * with a running Lua script. 
Combining multiple calls to the exposed ZFS - * functions into one script gives a number of benefits: - * - * 1. Atomicity. For some compound or iterative operations, it's useful to be - * able to guarantee that the state of a pool has not changed between calls to - * ZFS. - * - * 2. Performance. If a large number of changes need to be made (e.g. deleting - * many filesystems), there can be a significant performance penalty as a - * result of the need to wait for a transaction group sync to pass for every - * single operation. When expressed as a single ZCP script, all these changes - * can be performed at once in one txg sync. - * - * A modified version of the Lua 5.2 interpreter is used to run channel program - * scripts. The Lua 5.2 manual can be found at: - * - * http://www.lua.org/manual/5.2/ - * - * If being run by a user (via an ioctl syscall), executing a ZCP script - * requires root privileges in the global zone. - * - * Scripts are passed to zcp_eval() as a string, then run in a synctask by - * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist, - * which will be converted to a Lua table. Similarly, values returned from - * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl() - * for details on exact allowed types and conversion. - * - * ZFS functionality is exposed to a ZCP script as a library of function calls. - * These calls are sorted into submodules, such as zfs.list and zfs.sync, for - * iterators and synctasks, respectively. Each of these submodules resides in - * its own source file, with a zcp_*_info structure describing each library - * call in the submodule. - * - * Error handling in ZCP scripts is handled by a number of different methods - * based on severity: - * - * 1. Memory and time limits are in place to prevent a channel program from - * consuming excessive system or running forever. 
If one of these limits is - * hit, the channel program will be stopped immediately and return from - * zcp_eval() with an error code. No attempt will be made to roll back or undo - * any changes made by the channel program before the error occured. - * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time - * limit of 0, disabling the time limit. - * - * 2. Internal Lua errors can occur as a result of a syntax error, calling a - * library function with incorrect arguments, invoking the error() function, - * failing an assert(), or other runtime errors. In these cases the channel - * program will stop executing and return from zcp_eval() with an error code. - * In place of a return value, an error message will also be returned in the - * 'result' nvlist containing information about the error. No attempt will be - * made to roll back or undo any changes made by the channel program before the - * error occured. - * - * 3. If an error occurs inside a ZFS library call which returns an error code, - * the error is returned to the Lua script to be handled as desired. - * - * In the first two cases, Lua's error-throwing mechanism is used, which - * longjumps out of the script execution with luaL_error() and returns with the - * error. - * - * See zfs-program(1M) for more information on high level usage. 
- */ - -#include "lua.h" -#include "lualib.h" -#include "lauxlib.h" - -#include -#include -#include -#include -#include -#include -#include -#ifdef illumos -#include -#endif - -#ifdef __FreeBSD__ -#define ECHRNG EDOM -#define ETIME ETIMEDOUT -#endif - -#define ZCP_NVLIST_MAX_DEPTH 20 - -uint64_t zfs_lua_check_instrlimit_interval = 100; -uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; -uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; - -/* - * Forward declarations for mutually recursive functions - */ -static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int); -static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *, - int); - -/* - * The outer-most error callback handler for use with lua_pcall(). On - * error Lua will call this callback with a single argument that - * represents the error value. In most cases this will be a string - * containing an error message, but channel programs can use Lua's - * error() function to return arbitrary objects as errors. This callback - * returns (on the Lua stack) the original error object along with a traceback. - * - * Fatal Lua errors can occur while resources are held, so we also call any - * registered cleanup function here. - */ -static int -zcp_error_handler(lua_State *state) -{ - const char *msg; - - zcp_cleanup(state); - - VERIFY3U(1, ==, lua_gettop(state)); - msg = lua_tostring(state, 1); - luaL_traceback(state, state, msg, 1); - return (1); -} - -int -zcp_argerror(lua_State *state, int narg, const char *msg, ...) -{ - va_list alist; - - va_start(alist, msg); - const char *buf = lua_pushvfstring(state, msg, alist); - va_end(alist); - - return (luaL_argerror(state, narg, buf)); -} - -/* - * Install a new cleanup function, which will be invoked with the given - * opaque argument if a fatal error causes the Lua interpreter to longjump out - * of a function call. - * - * If an error occurs, the cleanup function will be invoked exactly once and - * then unreigstered. 
- * - * Returns the registered cleanup handler so the caller can deregister it - * if no error occurs. - */ -zcp_cleanup_handler_t * -zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg) -{ - zcp_run_info_t *ri = zcp_run_info(state); - - zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP); - zch->zch_cleanup_func = cleanfunc; - zch->zch_cleanup_arg = cleanarg; - list_insert_head(&ri->zri_cleanup_handlers, zch); - - return (zch); -} - -void -zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch) -{ - zcp_run_info_t *ri = zcp_run_info(state); - list_remove(&ri->zri_cleanup_handlers, zch); - kmem_free(zch, sizeof (*zch)); -} - -/* - * Execute the currently registered cleanup handlers then free them and - * destroy the handler list. - */ -void -zcp_cleanup(lua_State *state) -{ - zcp_run_info_t *ri = zcp_run_info(state); - - for (zcp_cleanup_handler_t *zch = - list_remove_head(&ri->zri_cleanup_handlers); zch != NULL; - zch = list_remove_head(&ri->zri_cleanup_handlers)) { - zch->zch_cleanup_func(zch->zch_cleanup_arg); - kmem_free(zch, sizeof (*zch)); - } -} - -/* - * Convert the lua table at the given index on the Lua stack to an nvlist - * and return it. - * - * If the table can not be converted for any reason, NULL is returned and - * an error message is pushed onto the Lua stack. - */ -static nvlist_t * -zcp_table_to_nvlist(lua_State *state, int index, int depth) -{ - nvlist_t *nvl; - /* - * Converting a Lua table to an nvlist with key uniqueness checking is - * O(n^2) in the number of keys in the nvlist, which can take a long - * time when we return a large table from a channel program. - * Furthermore, Lua's table interface *almost* guarantees unique keys - * on its own (details below). Therefore, we don't use fnvlist_alloc() - * here to avoid the built-in uniqueness checking. - * - * The *almost* is because it's possible to have key collisions between - * e.g. 
the string "1" and the number 1, or the string "true" and the - * boolean true, so we explicitly check that when we're looking at a - * key which is an integer / boolean or a string that can be parsed as - * one of those types. In the worst case this could still devolve into - * O(n^2), so we only start doing these checks on boolean/integer keys - * once we've seen a string key which fits this weird usage pattern. - * - * Ultimately, we still want callers to know that the keys in this - * nvlist are unique, so before we return this we set the nvlist's - * flags to reflect that. - */ - VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP)); - - /* - * Push an empty stack slot where lua_next() will store each - * table key. - */ - lua_pushnil(state); - boolean_t saw_str_could_collide = B_FALSE; - while (lua_next(state, index) != 0) { - /* - * The next key-value pair from the table at index is - * now on the stack, with the key at stack slot -2 and - * the value at slot -1. - */ - int err = 0; - char buf[32]; - const char *key = NULL; - boolean_t key_could_collide = B_FALSE; - - switch (lua_type(state, -2)) { - case LUA_TSTRING: - key = lua_tostring(state, -2); - - /* check if this could collide with a number or bool */ - long long tmp; - int parselen; - if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 && - parselen == strlen(key)) || - strcmp(key, "true") == 0 || - strcmp(key, "false") == 0) { - key_could_collide = B_TRUE; - saw_str_could_collide = B_TRUE; - } - break; - case LUA_TBOOLEAN: - key = (lua_toboolean(state, -2) == B_TRUE ? 
- "true" : "false"); - if (saw_str_could_collide) { - key_could_collide = B_TRUE; - } - break; - case LUA_TNUMBER: - VERIFY3U(sizeof (buf), >, - snprintf(buf, sizeof (buf), "%lld", - (longlong_t)lua_tonumber(state, -2))); - key = buf; - if (saw_str_could_collide) { - key_could_collide = B_TRUE; - } - break; - default: - fnvlist_free(nvl); - (void) lua_pushfstring(state, "Invalid key " - "type '%s' in table", - lua_typename(state, lua_type(state, -2))); - return (NULL); - } - /* - * Check for type-mismatched key collisions, and throw an error. - */ - if (key_could_collide && nvlist_exists(nvl, key)) { - fnvlist_free(nvl); - (void) lua_pushfstring(state, "Collision of " - "key '%s' in table", key); - return (NULL); - } - /* - * Recursively convert the table value and insert into - * the new nvlist with the parsed key. To prevent - * stack overflow on circular or heavily nested tables, - * we track the current nvlist depth. - */ - if (depth >= ZCP_NVLIST_MAX_DEPTH) { - fnvlist_free(nvl); - (void) lua_pushfstring(state, "Maximum table " - "depth (%d) exceeded for table", - ZCP_NVLIST_MAX_DEPTH); - return (NULL); - } - err = zcp_lua_to_nvlist_impl(state, -1, nvl, key, - depth + 1); - if (err != 0) { - fnvlist_free(nvl); - /* - * Error message has been pushed to the lua - * stack by the recursive call. - */ - return (NULL); - } - /* - * Pop the value pushed by lua_next(). - */ - lua_pop(state, 1); - } - - /* - * Mark the nvlist as having unique keys. This is a little ugly, but we - * ensured above that there are no duplicate keys in the nvlist. - */ - nvl->nvl_nvflag |= NV_UNIQUE_NAME; - - return (nvl); -} - -/* - * Convert a value from the given index into the lua stack to an nvpair, adding - * it to an nvlist with the given key. - * - * Values are converted as follows: - * - * string -> string - * number -> int64 - * boolean -> boolean - * nil -> boolean (no value) - * - * Lua tables are converted to nvlists and then inserted. 
The table's keys - * are converted to strings then used as keys in the nvlist to store each table - * element. Keys are converted as follows: - * - * string -> no change - * number -> "%lld" - * boolean -> "true" | "false" - * nil -> error - * - * In the case of a key collision, an error is thrown. - * - * If an error is encountered, a nonzero error code is returned, and an error - * string will be pushed onto the Lua stack. - */ -static int -zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, - const char *key, int depth) -{ - /* - * Verify that we have enough remaining space in the lua stack to parse - * a key-value pair and push an error. - */ - if (!lua_checkstack(state, 3)) { - (void) lua_pushstring(state, "Lua stack overflow"); - return (1); - } - - index = lua_absindex(state, index); - - switch (lua_type(state, index)) { - case LUA_TNIL: - fnvlist_add_boolean(nvl, key); - break; - case LUA_TBOOLEAN: - fnvlist_add_boolean_value(nvl, key, - lua_toboolean(state, index)); - break; - case LUA_TNUMBER: - fnvlist_add_int64(nvl, key, lua_tonumber(state, index)); - break; - case LUA_TSTRING: - fnvlist_add_string(nvl, key, lua_tostring(state, index)); - break; - case LUA_TTABLE: { - nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth); - if (value_nvl == NULL) - return (EINVAL); - - fnvlist_add_nvlist(nvl, key, value_nvl); - fnvlist_free(value_nvl); - break; - } - default: - (void) lua_pushfstring(state, - "Invalid value type '%s' for key '%s'", - lua_typename(state, lua_type(state, index)), key); - return (EINVAL); - } - - return (0); -} - -/* - * Convert a lua value to an nvpair, adding it to an nvlist with the given key. - */ -static void -zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) -{ - /* - * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua - * stack before returning with a nonzero error code. If an error is - * returned, throw a fatal lua error with the given string. 
- */ - if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0) - (void) lua_error(state); -} - -static int -zcp_lua_to_nvlist_helper(lua_State *state) -{ - nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2); - const char *key = (const char *)lua_touserdata(state, 1); - zcp_lua_to_nvlist(state, 3, nv, key); - return (0); -} - -static void -zcp_convert_return_values(lua_State *state, nvlist_t *nvl, - const char *key, int *result) -{ - int err; - VERIFY3U(1, ==, lua_gettop(state)); - lua_pushcfunction(state, zcp_lua_to_nvlist_helper); - lua_pushlightuserdata(state, (char *)key); - lua_pushlightuserdata(state, nvl); - lua_pushvalue(state, 1); - lua_remove(state, 1); - err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */ - if (err != 0) { - zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR); - *result = SET_ERROR(ECHRNG); - } -} - -/* - * Push a Lua table representing nvl onto the stack. If it can't be - * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may - * be specified as NULL, in which case no error string will be output. - * - * Most nvlists are converted as simple key->value Lua tables, but we make - * an exception for the case where all nvlist entries are BOOLEANs (a string - * key without a value). In Lua, a table key pointing to a value of Nil - * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist - * entry can't be directly converted to a Lua table entry. Nvlists of entirely - * BOOLEAN entries are frequently used to pass around lists of datasets, so for - * convenience we check for this case, and convert it to a simple Lua array of - * strings. - */ -int -zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl, - char *errbuf, int errbuf_len) -{ - nvpair_t *pair; - lua_newtable(state); - boolean_t has_values = B_FALSE; - /* - * If the list doesn't have any values, just convert it to a string - * array. 
- */ - for (pair = nvlist_next_nvpair(nvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { - if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) { - has_values = B_TRUE; - break; - } - } - if (!has_values) { - int i = 1; - for (pair = nvlist_next_nvpair(nvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { - (void) lua_pushinteger(state, i); - (void) lua_pushstring(state, nvpair_name(pair)); - (void) lua_settable(state, -3); - i++; - } - } else { - for (pair = nvlist_next_nvpair(nvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { - int err = zcp_nvpair_value_to_lua(state, pair, - errbuf, errbuf_len); - if (err != 0) { - lua_pop(state, 1); - return (err); - } - (void) lua_setfield(state, -2, nvpair_name(pair)); - } - } - return (0); -} - -/* - * Push a Lua object representing the value of "pair" onto the stack. - * - * Only understands boolean_value, string, int64, nvlist, - * string_array, and int64_array type values. For other - * types, returns EINVAL, fills in errbuf, and pushes nothing. 
- */ -static int -zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair, - char *errbuf, int errbuf_len) -{ - int err = 0; - - if (pair == NULL) { - lua_pushnil(state); - return (0); - } - - switch (nvpair_type(pair)) { - case DATA_TYPE_BOOLEAN_VALUE: - (void) lua_pushboolean(state, - fnvpair_value_boolean_value(pair)); - break; - case DATA_TYPE_STRING: - (void) lua_pushstring(state, fnvpair_value_string(pair)); - break; - case DATA_TYPE_INT64: - (void) lua_pushinteger(state, fnvpair_value_int64(pair)); - break; - case DATA_TYPE_NVLIST: - err = zcp_nvlist_to_lua(state, - fnvpair_value_nvlist(pair), errbuf, errbuf_len); - break; - case DATA_TYPE_STRING_ARRAY: { - char **strarr; - uint_t nelem; - (void) nvpair_value_string_array(pair, &strarr, &nelem); - lua_newtable(state); - for (int i = 0; i < nelem; i++) { - (void) lua_pushinteger(state, i + 1); - (void) lua_pushstring(state, strarr[i]); - (void) lua_settable(state, -3); - } - break; - } - case DATA_TYPE_UINT64_ARRAY: { - uint64_t *intarr; - uint_t nelem; - (void) nvpair_value_uint64_array(pair, &intarr, &nelem); - lua_newtable(state); - for (int i = 0; i < nelem; i++) { - (void) lua_pushinteger(state, i + 1); - (void) lua_pushinteger(state, intarr[i]); - (void) lua_settable(state, -3); - } - break; - } - case DATA_TYPE_INT64_ARRAY: { - int64_t *intarr; - uint_t nelem; - (void) nvpair_value_int64_array(pair, &intarr, &nelem); - lua_newtable(state); - for (int i = 0; i < nelem; i++) { - (void) lua_pushinteger(state, i + 1); - (void) lua_pushinteger(state, intarr[i]); - (void) lua_settable(state, -3); - } - break; - } - default: { - if (errbuf != NULL) { - (void) snprintf(errbuf, errbuf_len, - "Unhandled nvpair type %d for key '%s'", - nvpair_type(pair), nvpair_name(pair)); - } - return (EINVAL); - } - } - return (err); -} - -int -zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname, - int error) -{ - if (error == ENOENT) { - (void) zcp_argerror(state, 1, "no such dataset '%s'", dsname); - 
return (0); /* not reached; zcp_argerror will longjmp */ - } else if (error == EXDEV) { - (void) zcp_argerror(state, 1, - "dataset '%s' is not in the target pool '%s'", - dsname, spa_name(dp->dp_spa)); - return (0); /* not reached; zcp_argerror will longjmp */ - } else if (error == EIO) { - (void) luaL_error(state, - "I/O error while accessing dataset '%s'", dsname); - return (0); /* not reached; luaL_error will longjmp */ - } else if (error != 0) { - (void) luaL_error(state, - "unexpected error %d while accessing dataset '%s'", - error, dsname); - return (0); /* not reached; luaL_error will longjmp */ - } - return (0); -} - -/* - * Note: will longjmp (via lua_error()) on error. - * Assumes that the dsname is argument #1 (for error reporting purposes). - */ -dsl_dataset_t * -zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, - void *tag) -{ - dsl_dataset_t *ds; - int error = dsl_dataset_hold(dp, dsname, tag, &ds); - (void) zcp_dataset_hold_error(state, dp, dsname, error); - return (ds); -} - -static int zcp_debug(lua_State *); -static zcp_lib_info_t zcp_debug_info = { - .name = "debug", - .func = zcp_debug, - .pargs = { - { .za_name = "debug string", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_debug(lua_State *state) -{ - const char *dbgstring; - zcp_run_info_t *ri = zcp_run_info(state); - zcp_lib_info_t *libinfo = &zcp_debug_info; - - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - - dbgstring = lua_tostring(state, 1); - - zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring); - - return (0); -} - -static int zcp_exists(lua_State *); -static zcp_lib_info_t zcp_exists_info = { - .name = "exists", - .func = zcp_exists, - .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_exists(lua_State *state) -{ - zcp_run_info_t *ri = zcp_run_info(state); - dsl_pool_t *dp = ri->zri_pool; - 
zcp_lib_info_t *libinfo = &zcp_exists_info; - - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - - const char *dsname = lua_tostring(state, 1); - - dsl_dataset_t *ds; - int error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error == 0) { - dsl_dataset_rele(ds, FTAG); - lua_pushboolean(state, B_TRUE); - } else if (error == ENOENT) { - lua_pushboolean(state, B_FALSE); - } else if (error == EXDEV) { - return (luaL_error(state, "dataset '%s' is not in the " - "target pool", dsname)); - } else if (error == EIO) { - return (luaL_error(state, "I/O error opening dataset '%s'", - dsname)); - } else if (error != 0) { - return (luaL_error(state, "unexpected error %d", error)); - } - - return (1); -} - -/* - * Allocate/realloc/free a buffer for the lua interpreter. - * - * When nsize is 0, behaves as free() and returns NULL. - * - * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size - * at least nsize. - * - * Otherwise, behaves as realloc(), changing the allocation from osize to nsize. - * Shrinking the buffer size never fails. - * - * The original allocated buffer size is stored as a uint64 at the beginning of - * the buffer to avoid actually reallocating when shrinking a buffer, since lua - * requires that this operation never fail. - */ -static void * -zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) -{ - zcp_alloc_arg_t *allocargs = ud; - int flags = (allocargs->aa_must_succeed) ? 
- KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI); - - if (nsize == 0) { - if (ptr != NULL) { - int64_t *allocbuf = (int64_t *)ptr - 1; - int64_t allocsize = *allocbuf; - ASSERT3S(allocsize, >, 0); - ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=, - allocargs->aa_alloc_limit); - allocargs->aa_alloc_remaining += allocsize; - kmem_free(allocbuf, allocsize); - } - return (NULL); - } else if (ptr == NULL) { - int64_t *allocbuf; - int64_t allocsize = nsize + sizeof (int64_t); - - if (!allocargs->aa_must_succeed && - (allocsize <= 0 || - allocsize > allocargs->aa_alloc_remaining)) { - return (NULL); - } - - allocbuf = kmem_alloc(allocsize, flags); - if (allocbuf == NULL) { - return (NULL); - } - allocargs->aa_alloc_remaining -= allocsize; - - *allocbuf = allocsize; - return (allocbuf + 1); - } else if (nsize <= osize) { - /* - * If shrinking the buffer, lua requires that the reallocation - * never fail. - */ - return (ptr); - } else { - ASSERT3U(nsize, >, osize); - - uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize); - if (luabuf == NULL) { - return (NULL); - } - (void) memcpy(luabuf, ptr, osize); - VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL); - return (luabuf); - } -} - -/* ARGSUSED */ -static void -zcp_lua_counthook(lua_State *state, lua_Debug *ar) -{ - lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); - zcp_run_info_t *ri = lua_touserdata(state, -1); - - /* - * Check if we were canceled while waiting for the - * txg to sync or from our open context thread - */ - if (ri->zri_canceled || - (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { - ri->zri_canceled = B_TRUE; - (void) lua_pushstring(state, "Channel program was canceled."); - (void) lua_error(state); - } - - /* - * Check how many instructions the channel program has - * executed so far, and compare against the limit. 
- */ - ri->zri_curinstrs += zfs_lua_check_instrlimit_interval; - if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) { - ri->zri_timed_out = B_TRUE; - (void) lua_pushstring(state, - "Channel program timed out."); - (void) lua_error(state); - } -} - -static int -zcp_panic_cb(lua_State *state) -{ - panic("unprotected error in call to Lua API (%s)\n", - lua_tostring(state, -1)); - return (0); -} - -static void -zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri) -{ - int err; - lua_State *state = ri->zri_state; - - VERIFY3U(3, ==, lua_gettop(state)); - - /* finish initializing our runtime state */ - ri->zri_pool = dmu_tx_pool(tx); - ri->zri_tx = tx; - list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t), - offsetof(zcp_cleanup_handler_t, zch_node)); - - /* - * Store the zcp_run_info_t struct for this run in the Lua registry. - * Registry entries are not directly accessible by the Lua scripts but - * can be accessed by our callbacks. - */ - lua_pushlightuserdata(state, ri); - lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); - VERIFY3U(3, ==, lua_gettop(state)); - - /* - * Tell the Lua interpreter to call our handler every count - * instructions. Channel programs that execute too many instructions - * should die with ETIMEDOUT. - */ - (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT, - zfs_lua_check_instrlimit_interval); - - /* - * Tell the Lua memory allocator to stop using KM_SLEEP before handing - * off control to the channel program. Channel programs that use too - * much memory should die with ENOSPC. - */ - ri->zri_allocargs->aa_must_succeed = B_FALSE; - - /* - * Call the Lua function that open-context passed us. This pops the - * function and its input from the stack and pushes any return - * or error values. - */ - err = lua_pcall(state, 1, LUA_MULTRET, 1); - - /* - * Let Lua use KM_SLEEP while we interpret the return values. 
- */ - ri->zri_allocargs->aa_must_succeed = B_TRUE; - - /* - * Remove the error handler callback from the stack. At this point, - * there shouldn't be any cleanup handler registered in the handler - * list (zri_cleanup_handlers), regardless of whether it ran or not. - */ - list_destroy(&ri->zri_cleanup_handlers); - lua_remove(state, 1); - - switch (err) { - case LUA_OK: { - /* - * Lua supports returning multiple values in a single return - * statement. Return values will have been pushed onto the - * stack: - * 1: Return value 1 - * 2: Return value 2 - * 3: etc... - * To simplify the process of retrieving a return value from a - * channel program, we disallow returning more than one value - * to ZFS from the Lua script, yielding a singleton return - * nvlist of the form { "return": Return value 1 }. - */ - int return_count = lua_gettop(state); - - if (return_count == 1) { - ri->zri_result = 0; - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_RETURN, &ri->zri_result); - } else if (return_count > 1) { - ri->zri_result = SET_ERROR(ECHRNG); - lua_settop(state, 0); - (void) lua_pushfstring(state, "Multiple return " - "values not supported"); - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - } - break; - } - case LUA_ERRRUN: - case LUA_ERRGCMM: { - /* - * The channel program encountered a fatal error within the - * script, such as failing an assertion, or calling a function - * with incompatible arguments. The error value and the - * traceback generated by zcp_error_handler() should be on the - * stack. 
- */ - VERIFY3U(1, ==, lua_gettop(state)); - if (ri->zri_timed_out) { - ri->zri_result = SET_ERROR(ETIME); - } else if (ri->zri_canceled) { - ri->zri_result = SET_ERROR(EINTR); - } else { - ri->zri_result = SET_ERROR(ECHRNG); - } - - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - break; - } - case LUA_ERRERR: { - /* - * The channel program encountered a fatal error within the - * script, and we encountered another error while trying to - * compute the traceback in zcp_error_handler(). We can only - * return the error message. - */ - VERIFY3U(1, ==, lua_gettop(state)); - if (ri->zri_timed_out) { - ri->zri_result = SET_ERROR(ETIME); - } else if (ri->zri_canceled) { - ri->zri_result = SET_ERROR(EINTR); - } else { - ri->zri_result = SET_ERROR(ECHRNG); - } - - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - break; - } - case LUA_ERRMEM: - /* - * Lua ran out of memory while running the channel program. - * There's not much we can do. - */ - ri->zri_result = SET_ERROR(ENOSPC); - break; - default: - VERIFY0(err); - } -} - -static void -zcp_pool_error(zcp_run_info_t *ri, const char *poolname) -{ - ri->zri_result = SET_ERROR(ECHRNG); - lua_settop(ri->zri_state, 0); - (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s", - poolname); - zcp_convert_return_values(ri->zri_state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - -} - -/* - * This callback is called when txg_wait_synced_sig encountered a signal. - * The txg_wait_synced_sig will continue to wait for the txg to complete - * after calling this callback. 
- */ -/* ARGSUSED */ -static void -zcp_eval_sig(void *arg, dmu_tx_t *tx) -{ - zcp_run_info_t *ri = arg; - - ri->zri_canceled = B_TRUE; -} - -static void -zcp_eval_sync(void *arg, dmu_tx_t *tx) -{ - zcp_run_info_t *ri = arg; - - /* - * Open context should have setup the stack to contain: - * 1: Error handler callback - * 2: Script to run (converted to a Lua function) - * 3: nvlist input to function (converted to Lua table or nil) - */ - VERIFY3U(3, ==, lua_gettop(ri->zri_state)); - - zcp_eval_impl(tx, ri); -} - -static void -zcp_eval_open(zcp_run_info_t *ri, const char *poolname) -{ - int error; - dsl_pool_t *dp; - dmu_tx_t *tx; - - /* - * See comment from the same assertion in zcp_eval_sync(). - */ - VERIFY3U(3, ==, lua_gettop(ri->zri_state)); - - error = dsl_pool_hold(poolname, FTAG, &dp); - if (error != 0) { - zcp_pool_error(ri, poolname); - return; - } - - /* - * As we are running in open-context, we have no transaction associated - * with the channel program. At the same time, functions from the - * zfs.check submodule need to be associated with a transaction as - * they are basically dry-runs of their counterparts in the zfs.sync - * submodule. These functions should be able to run in open-context. - * Therefore we create a new transaction that we later abort once - * the channel program has been evaluated. 
- */ - tx = dmu_tx_create_dd(dp->dp_mos_dir); - - zcp_eval_impl(tx, ri); - - dmu_tx_abort(tx); - - dsl_pool_rele(dp, FTAG); -} - -int -zcp_eval(const char *poolname, const char *program, boolean_t sync, - uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl) -{ - int err; - lua_State *state; - zcp_run_info_t runinfo; - - if (instrlimit > zfs_lua_max_instrlimit) - return (SET_ERROR(EINVAL)); - if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) - return (SET_ERROR(EINVAL)); - - zcp_alloc_arg_t allocargs = { - .aa_must_succeed = B_TRUE, - .aa_alloc_remaining = (int64_t)memlimit, - .aa_alloc_limit = (int64_t)memlimit, - }; - - /* - * Creates a Lua state with a memory allocator that uses KM_SLEEP. - * This should never fail. - */ - state = lua_newstate(zcp_lua_alloc, &allocargs); - VERIFY(state != NULL); - (void) lua_atpanic(state, zcp_panic_cb); - - /* - * Load core Lua libraries we want access to. - */ - VERIFY3U(1, ==, luaopen_base(state)); - lua_pop(state, 1); - VERIFY3U(1, ==, luaopen_coroutine(state)); - lua_setglobal(state, LUA_COLIBNAME); - VERIFY0(lua_gettop(state)); - VERIFY3U(1, ==, luaopen_string(state)); - lua_setglobal(state, LUA_STRLIBNAME); - VERIFY0(lua_gettop(state)); - VERIFY3U(1, ==, luaopen_table(state)); - lua_setglobal(state, LUA_TABLIBNAME); - VERIFY0(lua_gettop(state)); - - /* - * Load globally visible variables such as errno aliases. - */ - zcp_load_globals(state); - VERIFY0(lua_gettop(state)); - - /* - * Load ZFS-specific modules. 
- */ - lua_newtable(state); - VERIFY3U(1, ==, zcp_load_list_lib(state)); - lua_setfield(state, -2, "list"); - VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE)); - lua_setfield(state, -2, "check"); - VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE)); - lua_setfield(state, -2, "sync"); - VERIFY3U(1, ==, zcp_load_get_lib(state)); - lua_pushcclosure(state, zcp_debug_info.func, 0); - lua_setfield(state, -2, zcp_debug_info.name); - lua_pushcclosure(state, zcp_exists_info.func, 0); - lua_setfield(state, -2, zcp_exists_info.name); - lua_setglobal(state, "zfs"); - VERIFY0(lua_gettop(state)); - - /* - * Push the error-callback that calculates Lua stack traces on - * unexpected failures. - */ - lua_pushcfunction(state, zcp_error_handler); - VERIFY3U(1, ==, lua_gettop(state)); - - /* - * Load the actual script as a function onto the stack as text ("t"). - * The only valid error condition is a syntax error in the script. - * ERRMEM should not be possible because our allocator is using - * KM_SLEEP. ERRGCMM should not be possible because we have not added - * any objects with __gc metamethods to the interpreter that could - * fail. - */ - err = luaL_loadbufferx(state, program, strlen(program), - "channel program", "t"); - if (err == LUA_ERRSYNTAX) { - fnvlist_add_string(outnvl, ZCP_RET_ERROR, - lua_tostring(state, -1)); - lua_close(state); - return (SET_ERROR(EINVAL)); - } - VERIFY0(err); - VERIFY3U(2, ==, lua_gettop(state)); - - /* - * Convert the input nvlist to a Lua object and put it on top of the - * stack. 
- */ - char errmsg[128]; - err = zcp_nvpair_value_to_lua(state, nvarg, - errmsg, sizeof (errmsg)); - if (err != 0) { - fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg); - lua_close(state); - return (SET_ERROR(EINVAL)); - } - VERIFY3U(3, ==, lua_gettop(state)); - - runinfo.zri_state = state; - runinfo.zri_allocargs = &allocargs; - runinfo.zri_outnvl = outnvl; - runinfo.zri_result = 0; - runinfo.zri_cred = CRED(); - runinfo.zri_timed_out = B_FALSE; - runinfo.zri_canceled = B_FALSE; - runinfo.zri_sync = sync; - runinfo.zri_space_used = 0; - runinfo.zri_curinstrs = 0; - runinfo.zri_maxinstrs = instrlimit; - - if (sync) { - err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync, - zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL); - if (err != 0) - zcp_pool_error(&runinfo, poolname); - } else { - zcp_eval_open(&runinfo, poolname); - } - lua_close(state); - - return (runinfo.zri_result); -} - -/* - * Retrieve metadata about the currently running channel program. - */ -zcp_run_info_t * -zcp_run_info(lua_State *state) -{ - zcp_run_info_t *ri; - - lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); - ri = lua_touserdata(state, -1); - lua_pop(state, 1); - return (ri); -} - -/* - * Argument Parsing - * ================ - * - * The Lua language allows methods to be called with any number - * of arguments of any type. When calling back into ZFS we need to sanitize - * arguments from channel programs to make sure unexpected arguments or - * arguments of the wrong type result in clear error messages. To do this - * in a uniform way all callbacks from channel programs should use the - * zcp_parse_args() function to interpret inputs. - * - * Positional vs Keyword Arguments - * =============================== - * - * Every callback function takes a fixed set of required positional arguments - * and optional keyword arguments. 
For example, the destroy function takes - * a single positional string argument (the name of the dataset to destroy) - * and an optional "defer" keyword boolean argument. When calling lua functions - * with parentheses, only positional arguments can be used: - * - * zfs.sync.snapshot("rpool@snap") - * - * To use keyword arguments functions should be called with a single argument - * that is a lua table containing mappings of integer -> positional arguments - * and string -> keyword arguments: - * - * zfs.sync.snapshot({1="rpool@snap", defer=true}) - * - * The lua language allows curly braces to be used in place of parenthesis as - * syntactic sugar for this calling convention: - * - * zfs.sync.snapshot{"rpool@snap", defer=true} - */ - -/* - * Throw an error and print the given arguments. If there are too many - * arguments to fit in the output buffer, only the error format string is - * output. - */ -static void -zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs, - const zcp_arg_t *kwargs, const char *fmt, ...) -{ - int i; - char errmsg[512]; - size_t len = sizeof (errmsg); - size_t msglen = 0; - va_list argp; - - va_start(argp, fmt); - VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp)); - va_end(argp); - - /* - * Calculate the total length of the final string, including extra - * formatting characters. If the argument dump would be too large, - * only print the error string. 
- */ - msglen = strlen(errmsg); - msglen += strlen(fname) + 4; /* : + {} + null terminator */ - for (i = 0; pargs[i].za_name != NULL; i++) { - msglen += strlen(pargs[i].za_name); - msglen += strlen(lua_typename(state, pargs[i].za_lua_type)); - if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) - msglen += 5; /* < + ( + )> + , */ - else - msglen += 4; /* < + ( + )> */ - } - for (i = 0; kwargs[i].za_name != NULL; i++) { - msglen += strlen(kwargs[i].za_name); - msglen += strlen(lua_typename(state, kwargs[i].za_lua_type)); - if (kwargs[i + 1].za_name != NULL) - msglen += 4; /* =( + ) + , */ - else - msglen += 3; /* =( + ) */ - } - - if (msglen >= len) - (void) luaL_error(state, errmsg); - - VERIFY3U(len, >, strlcat(errmsg, ": ", len)); - VERIFY3U(len, >, strlcat(errmsg, fname, len)); - VERIFY3U(len, >, strlcat(errmsg, "{", len)); - for (i = 0; pargs[i].za_name != NULL; i++) { - VERIFY3U(len, >, strlcat(errmsg, "<", len)); - VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len)); - VERIFY3U(len, >, strlcat(errmsg, "(", len)); - VERIFY3U(len, >, strlcat(errmsg, - lua_typename(state, pargs[i].za_lua_type), len)); - VERIFY3U(len, >, strlcat(errmsg, ")>", len)); - if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) { - VERIFY3U(len, >, strlcat(errmsg, ", ", len)); - } - } - for (i = 0; kwargs[i].za_name != NULL; i++) { - VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len)); - VERIFY3U(len, >, strlcat(errmsg, "=(", len)); - VERIFY3U(len, >, strlcat(errmsg, - lua_typename(state, kwargs[i].za_lua_type), len)); - VERIFY3U(len, >, strlcat(errmsg, ")", len)); - if (kwargs[i + 1].za_name != NULL) { - VERIFY3U(len, >, strlcat(errmsg, ", ", len)); - } - } - VERIFY3U(len, >, strlcat(errmsg, "}", len)); - - (void) luaL_error(state, errmsg); - panic("unreachable code"); -} - -static void -zcp_parse_table_args(lua_State *state, const char *fname, - const zcp_arg_t *pargs, const zcp_arg_t *kwargs) -{ - int i; - int type; - - for (i = 0; pargs[i].za_name != 
NULL; i++) { - /* - * Check the table for this positional argument, leaving it - * on the top of the stack once we finish validating it. - */ - lua_pushinteger(state, i + 1); - lua_gettable(state, 1); - - type = lua_type(state, -1); - if (type == LUA_TNIL) { - zcp_args_error(state, fname, pargs, kwargs, - "too few arguments"); - panic("unreachable code"); - } else if (type != pargs[i].za_lua_type) { - zcp_args_error(state, fname, pargs, kwargs, - "arg %d wrong type (is '%s', expected '%s')", - i + 1, lua_typename(state, type), - lua_typename(state, pargs[i].za_lua_type)); - panic("unreachable code"); - } - - /* - * Remove the positional argument from the table. - */ - lua_pushinteger(state, i + 1); - lua_pushnil(state); - lua_settable(state, 1); - } - - for (i = 0; kwargs[i].za_name != NULL; i++) { - /* - * Check the table for this keyword argument, which may be - * nil if it was omitted. Leave the value on the top of - * the stack after validating it. - */ - lua_getfield(state, 1, kwargs[i].za_name); - - type = lua_type(state, -1); - if (type != LUA_TNIL && type != kwargs[i].za_lua_type) { - zcp_args_error(state, fname, pargs, kwargs, - "kwarg '%s' wrong type (is '%s', expected '%s')", - kwargs[i].za_name, lua_typename(state, type), - lua_typename(state, kwargs[i].za_lua_type)); - panic("unreachable code"); - } - - /* - * Remove the keyword argument from the table. - */ - lua_pushnil(state); - lua_setfield(state, 1, kwargs[i].za_name); - } - - /* - * Any entries remaining in the table are invalid inputs, print - * an error message based on what the entry is. 
- */ - lua_pushnil(state); - if (lua_next(state, 1)) { - if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) { - zcp_args_error(state, fname, pargs, kwargs, - "too many positional arguments"); - } else if (lua_isstring(state, -2)) { - zcp_args_error(state, fname, pargs, kwargs, - "invalid kwarg '%s'", lua_tostring(state, -2)); - } else { - zcp_args_error(state, fname, pargs, kwargs, - "kwarg keys must be strings"); - } - panic("unreachable code"); - } - - lua_remove(state, 1); -} - -static void -zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, - const zcp_arg_t *kwargs) -{ - int i; - int type; - - for (i = 0; pargs[i].za_name != NULL; i++) { - type = lua_type(state, i + 1); - if (type == LUA_TNONE) { - zcp_args_error(state, fname, pargs, kwargs, - "too few arguments"); - panic("unreachable code"); - } else if (type != pargs[i].za_lua_type) { - zcp_args_error(state, fname, pargs, kwargs, - "arg %d wrong type (is '%s', expected '%s')", - i + 1, lua_typename(state, type), - lua_typename(state, pargs[i].za_lua_type)); - panic("unreachable code"); - } - } - if (lua_gettop(state) != i) { - zcp_args_error(state, fname, pargs, kwargs, - "too many positional arguments"); - panic("unreachable code"); - } - - for (i = 0; kwargs[i].za_name != NULL; i++) { - lua_pushnil(state); - } -} - -/* - * Checks the current Lua stack against an expected set of positional and - * keyword arguments. If the stack does not match the expected arguments - * aborts the current channel program with a useful error message, otherwise - * it re-arranges the stack so that it contains the positional arguments - * followed by the keyword argument values in declaration order. Any missing - * keyword argument will be represented by a nil value on the stack. - * - * If the stack contains exactly one argument of type LUA_TTABLE the curly - * braces calling convention is assumed, otherwise the stack is parsed for - * positional arguments only. 
- * - * This function should be used by every function callback. It should be called - * before the callback manipulates the Lua stack as it assumes the stack - * represents the function arguments. - */ -void -zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, - const zcp_arg_t *kwargs) -{ - if (lua_gettop(state) == 1 && lua_istable(state, 1)) { - zcp_parse_table_args(state, fname, pargs, kwargs); - } else { - zcp_parse_pos_args(state, fname, pargs, kwargs); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c deleted file mode 100644 index dcba02c508b0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c +++ /dev/null @@ -1,865 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
- */ - -#include "lua.h" -#include "lualib.h" -#include "lauxlib.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif - -static int -get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) -{ - int error; - objset_t *os; - error = dmu_objset_from_ds(ds, &os); - if (error != 0) - return (error); - if (ds->ds_is_snapshot) { - *type = ZFS_TYPE_SNAPSHOT; - } else { - switch (os->os_phys->os_type) { - case DMU_OST_ZFS: - *type = ZFS_TYPE_FILESYSTEM; - break; - case DMU_OST_ZVOL: - *type = ZFS_TYPE_VOLUME; - break; - default: - return (EINVAL); - } - } - return (0); -} - -/* - * Returns the string name of ds's type in str (a buffer which should be - * at least 12 bytes long). - */ -static int -get_objset_type_name(dsl_dataset_t *ds, char *str) -{ - int error; - zfs_type_t type; - error = get_objset_type(ds, &type); - if (error != 0) - return (error); - switch (type) { - case ZFS_TYPE_SNAPSHOT: - (void) strcpy(str, "snapshot"); - break; - case ZFS_TYPE_FILESYSTEM: - (void) strcpy(str, "filesystem"); - break; - case ZFS_TYPE_VOLUME: - (void) strcpy(str, "volume"); - break; - default: - return (EINVAL); - } - return (0); -} - -/* - * Determines the source of a property given its setpoint and - * property type. It pushes the source to the lua stack. - */ -static void -get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop) -{ - if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) { - lua_pushnil(state); - } else { - const char *src; - if (strcmp("", setpoint) == 0) { - src = "default"; - } else { - src = setpoint; - } - (void) lua_pushstring(state, src); - } -} - -/* - * Given an error encountered while getting properties, either longjmp's for - * a fatal error or pushes nothing to the stack for a non fatal one. 
- */ -static int -zcp_handle_error(lua_State *state, const char *dataset_name, - const char *property_name, int error) -{ - ASSERT3S(error, !=, 0); - if (error == ENOENT) { - return (0); - } else if (error == EINVAL) { - return (luaL_error(state, - "property '%s' is not a valid property on dataset '%s'", - property_name, dataset_name)); - } else if (error == EIO) { - return (luaL_error(state, - "I/O error while retrieving property '%s' on dataset '%s'", - property_name, dataset_name)); - } else { - return (luaL_error(state, "unexpected error %d while " - "retrieving property '%s' on dataset '%s'", - error, property_name, dataset_name)); - } -} - -/* - * Look up a user defined property in the zap object. If it exists, push it - * and the setpoint onto the stack, otherwise don't push anything. - */ -static int -zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, - const char *property_name) -{ - int error; - char *buf; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - /* - * zcp_dataset_hold will either successfully return the requested - * dataset or throw a lua error and longjmp out of the zfs.get_prop call - * without returning. - */ - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN, - buf, setpoint); - dsl_dataset_rele(ds, FTAG); - - if (error != 0) { - kmem_free(buf, ZAP_MAXVALUELEN); - return (zcp_handle_error(state, dataset_name, property_name, - error)); - } - (void) lua_pushstring(state, buf); - (void) lua_pushstring(state, setpoint); - kmem_free(buf, ZAP_MAXVALUELEN); - return (2); -} - -/* - * Check if the property we're looking for is stored in the ds_dir. If so, - * return it in the 'val' argument. Return 0 on success and ENOENT and if - * the property is not present. 
- */ -static int -get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, - uint64_t *val) -{ - dsl_dir_t *dd = ds->ds_dir; - mutex_enter(&dd->dd_lock); - switch (zfs_prop) { - case ZFS_PROP_USEDSNAP: - *val = dsl_dir_get_usedsnap(dd); - break; - case ZFS_PROP_USEDCHILD: - *val = dsl_dir_get_usedchild(dd); - break; - case ZFS_PROP_USEDDS: - *val = dsl_dir_get_usedds(dd); - break; - case ZFS_PROP_USEDREFRESERV: - *val = dsl_dir_get_usedrefreserv(dd); - break; - case ZFS_PROP_LOGICALUSED: - *val = dsl_dir_get_logicalused(dd); - break; - default: - mutex_exit(&dd->dd_lock); - return (ENOENT); - } - mutex_exit(&dd->dd_lock); - return (0); -} - -/* - * Takes a dataset, a property, a value and that value's setpoint as - * found in the ZAP. Checks if the property has been changed in the vfs. - * If so, val and setpoint will be overwritten with updated content. - * Otherwise, they are left unchanged. - */ -static int -get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, - char *setpoint) -{ -#ifndef _KERNEL - return (0); -#else - int error; -#ifdef illumos - zfsvfs_t *zfvp; -#endif - vfs_t *vfsp; - objset_t *os; - uint64_t tmp = *val; - - error = dmu_objset_from_ds(ds, &os); - if (error != 0) - return (error); - - error = getzfsvfs_impl(os, &vfsp); - if (error != 0) - return (error); -#ifdef illumos - vfsp = zfvp->z_vfs; -#endif - switch (zfs_prop) { - case ZFS_PROP_ATIME: - if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) - tmp = 1; - break; - case ZFS_PROP_DEVICES: - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) - tmp = 1; - break; - case ZFS_PROP_EXEC: - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) - tmp = 1; - break; - case ZFS_PROP_SETUID: - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) - tmp = 1; 
- break; - case ZFS_PROP_READONLY: - if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) - tmp = 1; - break; - case ZFS_PROP_XATTR: - if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) - tmp = 1; - break; - case ZFS_PROP_NBMAND: - if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) - tmp = 1; - break; - default: -#ifdef illumos - VFS_RELE(vfsp); -#else - vfs_rel(vfsp); -#endif - return (ENOENT); - } - -#ifdef illumos - VFS_RELE(vfsp); -#else - vfs_rel(vfsp); -#endif - if (tmp != *val) { - (void) strcpy(setpoint, "temporary"); - *val = tmp; - } - return (0); -#endif -} - -/* - * Check if the property we're looking for is stored at the dsl_dataset or - * dsl_dir level. If so, push the property value and source onto the lua stack - * and return 0. If it is not present or a failure occurs in lookup, return a - * non-zero error value. 
- */ -static int -get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, - zfs_prop_t zfs_prop) -{ - int error = 0; - objset_t *os; - uint64_t numval; - char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - char setpoint[ZFS_MAX_DATASET_NAME_LEN] = - "Internal error - setpoint not determined"; - zfs_type_t ds_type; - zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); - (void) get_objset_type(ds, &ds_type); - - switch (zfs_prop) { - case ZFS_PROP_REFRATIO: - numval = dsl_get_refratio(ds); - break; - case ZFS_PROP_USED: - numval = dsl_get_used(ds); - break; - case ZFS_PROP_CLONES: { - nvlist_t *clones = fnvlist_alloc(); - error = get_clones_stat_impl(ds, clones); - if (error == 0) { - /* push list to lua stack */ - VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0)); - /* source */ - (void) lua_pushnil(state); - } - nvlist_free(clones); - kmem_free(strval, ZAP_MAXVALUELEN); - return (error); - } - case ZFS_PROP_COMPRESSRATIO: - numval = dsl_get_compressratio(ds); - break; - case ZFS_PROP_CREATION: - numval = dsl_get_creation(ds); - break; - case ZFS_PROP_REFERENCED: - numval = dsl_get_referenced(ds); - break; - case ZFS_PROP_AVAILABLE: - numval = dsl_get_available(ds); - break; - case ZFS_PROP_LOGICALREFERENCED: - numval = dsl_get_logicalreferenced(ds); - break; - case ZFS_PROP_CREATETXG: - numval = dsl_get_creationtxg(ds); - break; - case ZFS_PROP_GUID: - numval = dsl_get_guid(ds); - break; - case ZFS_PROP_UNIQUE: - numval = dsl_get_unique(ds); - break; - case ZFS_PROP_OBJSETID: - numval = dsl_get_objsetid(ds); - break; - case ZFS_PROP_ORIGIN: - dsl_dir_get_origin(ds->ds_dir, strval); - break; - case ZFS_PROP_USERACCOUNTING: - error = dmu_objset_from_ds(ds, &os); - if (error == 0) - numval = dmu_objset_userspace_present(os); - break; - case ZFS_PROP_WRITTEN: - error = dsl_get_written(ds, &numval); - break; - case ZFS_PROP_TYPE: - error = get_objset_type_name(ds, strval); - break; - case ZFS_PROP_PREV_SNAP: - error = dsl_get_prev_snap(ds, 
strval); - break; - case ZFS_PROP_NAME: - dsl_dataset_name(ds, strval); - break; - case ZFS_PROP_MOUNTPOINT: - error = dsl_get_mountpoint(ds, dsname, strval, setpoint); - break; - case ZFS_PROP_VERSION: - /* should be a snapshot or filesystem */ - ASSERT(ds_type != ZFS_TYPE_VOLUME); - error = dmu_objset_from_ds(ds, &os); - /* look in the master node for the version */ - if (error == 0) { - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - sizeof (numval), 1, &numval); - } - break; - case ZFS_PROP_DEFER_DESTROY: - numval = dsl_get_defer_destroy(ds); - break; - case ZFS_PROP_USERREFS: - numval = dsl_get_userrefs(ds); - break; - case ZFS_PROP_FILESYSTEM_COUNT: - error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); - (void) strcpy(setpoint, ""); - break; - case ZFS_PROP_SNAPSHOT_COUNT: - error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); - (void) strcpy(setpoint, ""); - break; - case ZFS_PROP_REMAPTXG: - error = dsl_dir_get_remaptxg(ds->ds_dir, &numval); - break; - case ZFS_PROP_NUMCLONES: - numval = dsl_get_numclones(ds); - break; - case ZFS_PROP_INCONSISTENT: - numval = dsl_get_inconsistent(ds); - break; - case ZFS_PROP_RECEIVE_RESUME_TOKEN: { - char *token = get_receive_resume_stats_impl(ds); - VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <, - ZAP_MAXVALUELEN); - strfree(token); - if (strcmp(strval, "") == 0) { - token = get_child_receive_stats(ds); - VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <, - ZAP_MAXVALUELEN); - strfree(token); - if (strcmp(strval, "") == 0) - error = ENOENT; - } - break; - } - case ZFS_PROP_VOLSIZE: - ASSERT(ds_type == ZFS_TYPE_VOLUME); - error = dmu_objset_from_ds(ds, &os); - if (error == 0) { - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", - sizeof (numval), 1, &numval); - } - if (error == 0) - (void) strcpy(setpoint, dsname); - - break; - case ZFS_PROP_VOLBLOCKSIZE: { - ASSERT(ds_type == ZFS_TYPE_VOLUME); - dmu_object_info_t doi; - error = dmu_objset_from_ds(ds, &os); - if (error == 0) { - error = 
dmu_object_info(os, ZVOL_OBJ, &doi); - if (error == 0) - numval = doi.doi_data_block_size; - } - break; - } - default: - /* Did not match these props, check in the dsl_dir */ - error = get_dsl_dir_prop(ds, zfs_prop, &numval); - } - if (error != 0) { - kmem_free(strval, ZAP_MAXVALUELEN); - return (error); - } - - switch (prop_type) { - case PROP_TYPE_NUMBER: { - (void) lua_pushnumber(state, numval); - break; - } - case PROP_TYPE_STRING: { - (void) lua_pushstring(state, strval); - break; - } - case PROP_TYPE_INDEX: { - const char *propval; - error = zfs_prop_index_to_string(zfs_prop, numval, &propval); - if (error != 0) { - kmem_free(strval, ZAP_MAXVALUELEN); - return (error); - } - (void) lua_pushstring(state, propval); - break; - } - } - kmem_free(strval, ZAP_MAXVALUELEN); - - /* Push the source to the stack */ - get_prop_src(state, setpoint, zfs_prop); - return (0); -} - -/* - * Look up a property and its source in the zap object. If the value is - * present and successfully retrieved, push the value and source on the - * lua stack and return 0. On failure, return a non-zero error value. 
- */ -static int -get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) -{ - int error = 0; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - uint64_t numval; - const char *prop_name = zfs_prop_to_name(zfs_prop); - zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); - - if (prop_type == PROP_TYPE_STRING) { - /* Push value to lua stack */ - error = dsl_prop_get_ds(ds, prop_name, 1, - ZAP_MAXVALUELEN, strval, setpoint); - if (error == 0) - (void) lua_pushstring(state, strval); - } else { - error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), - 1, &numval, setpoint); - - /* Fill in temorary value for prop, if applicable */ - (void) get_temporary_prop(ds, zfs_prop, &numval, setpoint); - - /* Push value to lua stack */ - if (prop_type == PROP_TYPE_INDEX) { - const char *propval; - error = zfs_prop_index_to_string(zfs_prop, numval, - &propval); - if (error == 0) - (void) lua_pushstring(state, propval); - } else { - if (error == 0) - (void) lua_pushnumber(state, numval); - } - } - kmem_free(strval, ZAP_MAXVALUELEN); - if (error == 0) - get_prop_src(state, setpoint, zfs_prop); - return (error); -} - -/* - * Determine whether property is valid for a given dataset - */ -boolean_t -prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) -{ - int error; - zfs_type_t zfs_type; - - /* properties not supported */ - if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) || - (zfs_prop == ZFS_PROP_MOUNTED)) - return (B_FALSE); - - /* if we want the origin prop, ds must be a clone */ - if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir))) - return (B_FALSE); - - error = get_objset_type(ds, &zfs_type); - if (error != 0) - return (B_FALSE); - return (zfs_prop_valid_for_type(zfs_prop, zfs_type)); -} - -/* - * Look up a given dataset property. On success return 2, the number of - * values pushed to the lua stack (property value and source). On a fatal - * error, longjmp. On a non fatal error push nothing. 
- */ -static int -zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, - zfs_prop_t zfs_prop) -{ - int error; - /* - * zcp_dataset_hold will either successfully return the requested - * dataset or throw a lua error and longjmp out of the zfs.get_prop call - * without returning. - */ - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - /* Check that the property is valid for the given dataset */ - const char *prop_name = zfs_prop_to_name(zfs_prop); - if (!prop_valid_for_ds(ds, zfs_prop)) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - /* Check if the property can be accessed directly */ - error = get_special_prop(state, ds, dataset_name, zfs_prop); - if (error == 0) { - dsl_dataset_rele(ds, FTAG); - /* The value and source have been pushed by get_special_prop */ - return (2); - } - if (error != ENOENT) { - dsl_dataset_rele(ds, FTAG); - return (zcp_handle_error(state, dataset_name, - prop_name, error)); - } - - /* If we were unable to find it, look in the zap object */ - error = get_zap_prop(state, ds, zfs_prop); - dsl_dataset_rele(ds, FTAG); - if (error != 0) { - return (zcp_handle_error(state, dataset_name, - prop_name, error)); - } - /* The value and source have been pushed by get_zap_prop */ - return (2); -} - -static zfs_userquota_prop_t -get_userquota_prop(const char *prop_name) -{ - zfs_userquota_prop_t type; - /* Figure out the property type ({user|group}{quota|used}) */ - for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { - if (strncmp(prop_name, zfs_userquota_prop_prefixes[type], - strlen(zfs_userquota_prop_prefixes[type])) == 0) - break; - } - return (type); -} - -#ifdef _KERNEL -/* - * Given the name of a zfs_userquota_prop, this function determines the - * prop type as well as the numeric group/user ids based on the string - * following the '@' in the property name. On success, returns 0. 
On failure, - * returns a non-zero error. - * 'domain' must be free'd by caller using strfree() - */ -static int -parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, - char **domain, uint64_t *rid) -{ - char *cp, *end, *domain_val; - - *type = get_userquota_prop(prop_name); - if (*type >= ZFS_NUM_USERQUOTA_PROPS) - return (EINVAL); - - *rid = 0; - cp = strchr(prop_name, '@') + 1; - if (strncmp(cp, "S-1-", 4) == 0) { - /* - * It's a numeric SID (eg "S-1-234-567-89") and we want to - * seperate the domain id and the rid - */ - int domain_len = strrchr(cp, '-') - cp; - domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); - (void) strncpy(domain_val, cp, domain_len); - domain_val[domain_len] = '\0'; - cp += domain_len + 1; - - (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); - if (*end != '\0') { - strfree(domain_val); - return (EINVAL); - } - } else { - /* It's only a user/group ID (eg "12345"), just get the rid */ - domain_val = NULL; - (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); - if (*end != '\0') - return (EINVAL); - } - *domain = domain_val; - return (0); -} - -/* - * Look up {user|group}{quota|used} property for given dataset. On success - * push the value (quota or used amount) and the setpoint. On failure, push - * a lua error. 
- */ -static int -zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, - const char *dataset_name, const char *prop_name) -{ - zfsvfs_t *zfvp; - zfsvfs_t *zfsvfs; - int error; - zfs_userquota_prop_t type; - char *domain; - uint64_t rid, value; - objset_t *os; - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - error = parse_userquota_prop(prop_name, &type, &domain, &rid); - if (error == 0) { - error = dmu_objset_from_ds(ds, &os); - if (error == 0) { - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - error = zfsvfs_create_impl(&zfvp, zfsvfs, os); - if (error == 0) { - error = zfs_userspace_one(zfvp, type, domain, - rid, &value); - zfsvfs_free(zfvp); - } - } - if (domain != NULL) - strfree(domain); - } - dsl_dataset_rele(ds, FTAG); - - if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) || - (type == ZFS_PROP_GROUPQUOTA))) - error = ENOENT; - if (error != 0) { - return (zcp_handle_error(state, dataset_name, - prop_name, error)); - } - - (void) lua_pushnumber(state, value); - (void) lua_pushstring(state, dataset_name); - return (2); -} -#endif - -/* - * Determines the name of the snapshot referenced in the written property - * name. Returns snapshot name in snap_name, a buffer that must be at least - * as large as ZFS_MAX_DATASET_NAME_LEN - */ -static void -parse_written_prop(const char *dataset_name, const char *prop_name, - char *snap_name) -{ - ASSERT(zfs_prop_written(prop_name)); - const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; - if (strchr(name, '@') == NULL) { - (void) sprintf(snap_name, "%s@%s", dataset_name, name); - } else { - (void) strcpy(snap_name, name); - } -} - -/* - * Look up written@ property for given dataset. On success - * push the value and the setpoint. If error is fatal, we will - * longjmp, otherwise push nothing. 
- */ -static int -zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, - const char *dataset_name, const char *prop_name) -{ - char snap_name[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t used, comp, uncomp; - dsl_dataset_t *old; - int error = 0; - - parse_written_prop(dataset_name, prop_name, snap_name); - dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (new == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - error = dsl_dataset_hold(dp, snap_name, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); - return (zcp_dataset_hold_error(state, dp, snap_name, - error)); - } - error = dsl_dataset_space_written(old, new, - &used, &comp, &uncomp); - - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); - - if (error != 0) { - return (zcp_handle_error(state, dataset_name, - snap_name, error)); - } - (void) lua_pushnumber(state, used); - (void) lua_pushstring(state, dataset_name); - return (2); -} - -static int zcp_get_prop(lua_State *state); -static zcp_lib_info_t zcp_get_prop_info = { - .name = "get_prop", - .func = zcp_get_prop, - .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - { .za_name = "property", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_get_prop(lua_State *state) -{ - const char *dataset_name; - const char *property_name; - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_lib_info_t *libinfo = &zcp_get_prop_info; - - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - - dataset_name = lua_tostring(state, 1); - property_name = lua_tostring(state, 2); - - /* User defined property */ - if (zfs_prop_user(property_name)) { - return (zcp_get_user_prop(state, dp, - dataset_name, property_name)); - } - /* userspace property */ - if (zfs_prop_userquota(property_name)) { -#ifdef _KERNEL - return (zcp_get_userquota_prop(state, dp, - dataset_name, property_name)); -#else - return (luaL_error(state, - "user quota 
properties only supported in kernel mode", - property_name)); -#endif - } - /* written@ property */ - if (zfs_prop_written(property_name)) { - return (zcp_get_written_prop(state, dp, - dataset_name, property_name)); - } - - zfs_prop_t zfs_prop = zfs_name_to_prop(property_name); - /* Valid system property */ - if (zfs_prop != ZPROP_INVAL) { - return (zcp_get_system_prop(state, dp, dataset_name, - zfs_prop)); - } - - /* Invalid property name */ - return (luaL_error(state, - "'%s' is not a valid property", property_name)); -} - -int -zcp_load_get_lib(lua_State *state) -{ - lua_pushcclosure(state, zcp_get_prop_info.func, 0); - lua_setfield(state, -2, zcp_get_prop_info.name); - - return (1); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c deleted file mode 100644 index c25431fd6703..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
- */ - -#include - -#include "lua.h" -#include "lauxlib.h" - -typedef struct zcp_errno_global { - const char *zeg_name; - int zeg_errno; -} zcp_errno_global_t; - -static const zcp_errno_global_t errno_globals[] = { - {"EPERM", EPERM}, - {"ENOENT", ENOENT}, - {"ESRCH", ESRCH}, - {"EINTR", EINTR}, - {"EIO", EIO}, - {"ENXIO", ENXIO}, - {"E2BIG", E2BIG}, - {"ENOEXEC", ENOEXEC}, - {"EBADF", EBADF}, - {"ECHILD", ECHILD}, - {"EAGAIN", EAGAIN}, - {"ENOMEM", ENOMEM}, - {"EACCES", EACCES}, - {"EFAULT", EFAULT}, - {"ENOTBLK", ENOTBLK}, - {"EBUSY", EBUSY}, - {"EEXIST", EEXIST}, - {"EXDEV", EXDEV}, - {"ENODEV", ENODEV}, - {"ENOTDIR", ENOTDIR}, - {"EISDIR", EISDIR}, - {"EINVAL", EINVAL}, - {"ENFILE", ENFILE}, - {"EMFILE", EMFILE}, - {"ENOTTY", ENOTTY}, - {"ETXTBSY", ETXTBSY}, - {"EFBIG", EFBIG}, - {"ENOSPC", ENOSPC}, - {"ESPIPE", ESPIPE}, - {"EROFS", EROFS}, - {"EMLINK", EMLINK}, - {"EPIPE", EPIPE}, - {"EDOM", EDOM}, - {"ERANGE", ERANGE}, - {"EDEADLK", EDEADLK}, - {"ENOLCK", ENOLCK}, - {"ECANCELED", ECANCELED}, - {"ENOTSUP", ENOTSUP}, - {"EDQUOT", EDQUOT}, - {"ENAMETOOLONG", ENAMETOOLONG}, - {NULL, 0} -}; - -static void -zcp_load_errno_globals(lua_State *state) -{ - const zcp_errno_global_t *global = errno_globals; - while (global->zeg_name != NULL) { - lua_pushnumber(state, (lua_Number)global->zeg_errno); - lua_setglobal(state, global->zeg_name); - global++; - } -} - -void -zcp_load_globals(lua_State *state) -{ - zcp_load_errno_globals(state); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c deleted file mode 100644 index 0236c6474ef6..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c +++ /dev/null @@ -1,531 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#include "lua.h" -#include "lauxlib.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -typedef int (zcp_list_func_t)(lua_State *); -typedef struct zcp_list_info { - const char *name; - zcp_list_func_t *func; - zcp_list_func_t *gc; - const zcp_arg_t pargs[4]; - const zcp_arg_t kwargs[2]; -} zcp_list_info_t; - -static int -zcp_clones_iter(lua_State *state) -{ - int err; - char clonename[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); - uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - dsl_dataset_t *ds, *clone; - zap_attribute_t za; - zap_cursor_t zc; - - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - if (err == ENOENT) { - return (0); - } else if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dsl_dataset_hold_obj(dsobj)", - err)); - } - - if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - zap_cursor_init_serialized(&zc, dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj, cursor); - dsl_dataset_rele(ds, FTAG); - - err = zap_cursor_retrieve(&zc, &za); - if (err != 0) { - zap_cursor_fini(&zc); - if (err != ENOENT) { - return (luaL_error(state, - "unexpected error %d from zap_cursor_retrieve()", - err)); - } - return (0); - } - zap_cursor_advance(&zc); - cursor = zap_cursor_serialize(&zc); - zap_cursor_fini(&zc); - - err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone); - if (err != 0) { - return (luaL_error(state, - "unexpected 
error %d from " - "dsl_dataset_hold_obj(za_first_integer)", err)); - } - - dsl_dir_name(clone->ds_dir, clonename); - dsl_dataset_rele(clone, FTAG); - - lua_pushnumber(state, cursor); - lua_replace(state, lua_upvalueindex(2)); - - (void) lua_pushstring(state, clonename); - return (1); -} - -static int zcp_clones_list(lua_State *); -static zcp_list_info_t zcp_clones_list_info = { - .name = "clones", - .func = zcp_clones_list, - .gc = NULL, - .pargs = { - { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_clones_list(lua_State *state) -{ - const char *snapname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - boolean_t issnap; - uint64_t dsobj, cursor; - - /* - * zcp_dataset_hold will either successfully return the requested - * dataset or throw a lua error and longjmp out of the zfs.list.clones - * call without returning. - */ - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - cursor = 0; - issnap = ds->ds_is_snapshot; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - if (!issnap) { - return (zcp_argerror(state, 1, "%s is not a snapshot", - snapname)); - } - - lua_pushnumber(state, dsobj); - lua_pushnumber(state, cursor); - lua_pushcclosure(state, &zcp_clones_iter, 2); - return (1); -} - -static int -zcp_snapshots_iter(lua_State *state) -{ - int err; - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); - uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - dsl_dataset_t *ds; - objset_t *os; - char *p; - - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dsl_dataset_hold_obj(dsobj)", - err)); - } - - dsl_dataset_name(ds, snapname); - VERIFY3U(sizeof (snapname), >, - 
strlcat(snapname, "@", sizeof (snapname))); - - p = strchr(snapname, '\0'); - VERIFY0(dmu_objset_from_ds(ds, &os)); - err = dmu_snapshot_list_next(os, - sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL); - dsl_dataset_rele(ds, FTAG); - - if (err == ENOENT) { - return (0); - } else if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dmu_snapshot_list_next()", err)); - } - - lua_pushnumber(state, cursor); - lua_replace(state, lua_upvalueindex(2)); - - (void) lua_pushstring(state, snapname); - return (1); -} - -static int zcp_snapshots_list(lua_State *); -static zcp_list_info_t zcp_snapshots_list_info = { - .name = "snapshots", - .func = zcp_snapshots_list, - .gc = NULL, - .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_snapshots_list(lua_State *state) -{ - const char *fsname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - boolean_t issnap; - uint64_t dsobj; - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - issnap = ds->ds_is_snapshot; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - if (issnap) { - return (zcp_argerror(state, 1, - "argument %s cannot be a snapshot", fsname)); - } - - lua_pushnumber(state, dsobj); - lua_pushnumber(state, 0); - lua_pushcclosure(state, &zcp_snapshots_iter, 2); - return (1); -} - -/* - * Note: channel programs only run in the global zone, so all datasets - * are visible to this zone. 
- */ -static boolean_t -dataset_name_hidden(const char *name) -{ - if (strchr(name, '$') != NULL) - return (B_TRUE); - if (strchr(name, '%') != NULL) - return (B_TRUE); - return (B_FALSE); -} - -static int -zcp_children_iter(lua_State *state) -{ - int err; - char childname[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); - uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); - zcp_run_info_t *ri = zcp_run_info(state); - dsl_pool_t *dp = ri->zri_pool; - dsl_dataset_t *ds; - objset_t *os; - char *p; - - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dsl_dataset_hold_obj(dsobj)", - err)); - } - - dsl_dataset_name(ds, childname); - VERIFY3U(sizeof (childname), >, - strlcat(childname, "/", sizeof (childname))); - p = strchr(childname, '\0'); - - VERIFY0(dmu_objset_from_ds(ds, &os)); - do { - err = dmu_dir_list_next(os, - sizeof (childname) - (p - childname), p, NULL, &cursor); - } while (err == 0 && dataset_name_hidden(childname)); - dsl_dataset_rele(ds, FTAG); - - if (err == ENOENT) { - return (0); - } else if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dmu_dir_list_next()", - err)); - } - - lua_pushnumber(state, cursor); - lua_replace(state, lua_upvalueindex(2)); - - (void) lua_pushstring(state, childname); - return (1); -} - -static int zcp_children_list(lua_State *); -static zcp_list_info_t zcp_children_list_info = { - .name = "children", - .func = zcp_children_list, - .gc = NULL, - .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_children_list(lua_State *state) -{ - const char *fsname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - boolean_t issnap; - uint64_t dsobj; - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); - if (ds == NULL) - return (1); /* not reached; 
zcp_dataset_hold() longjmp'd */ - - issnap = ds->ds_is_snapshot; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - if (issnap) { - return (zcp_argerror(state, 1, - "argument %s cannot be a snapshot", fsname)); - } - - lua_pushnumber(state, dsobj); - lua_pushnumber(state, 0); - lua_pushcclosure(state, &zcp_children_iter, 2); - return (1); -} - -static int -zcp_props_list_gc(lua_State *state) -{ - nvlist_t **props = lua_touserdata(state, 1); - if (*props != NULL) - fnvlist_free(*props); - return (0); -} - -static int -zcp_props_iter(lua_State *state) -{ - char *source, *val; - nvlist_t *nvprop; - nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1)); - nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2)); - - do { - pair = nvlist_next_nvpair(*props, pair); - if (pair == NULL) { - fnvlist_free(*props); - *props = NULL; - return (0); - } - } while (!zfs_prop_user(nvpair_name(pair))); - - lua_pushlightuserdata(state, pair); - lua_replace(state, lua_upvalueindex(2)); - - nvprop = fnvpair_value_nvlist(pair); - val = fnvlist_lookup_string(nvprop, ZPROP_VALUE); - source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE); - - (void) lua_pushstring(state, nvpair_name(pair)); - (void) lua_pushstring(state, val); - (void) lua_pushstring(state, source); - return (3); -} - -static int zcp_props_list(lua_State *); -static zcp_list_info_t zcp_props_list_info = { - .name = "properties", - .func = zcp_props_list, - .gc = zcp_props_list_gc, - .pargs = { - { .za_name = "filesystem | snapshot | volume", - .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_props_list(lua_State *state) -{ - const char *dsname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - objset_t *os; - nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *)); - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - 
VERIFY0(dmu_objset_from_ds(ds, &os)); - VERIFY0(dsl_prop_get_all(os, props)); - dsl_dataset_rele(ds, FTAG); - - /* - * Set the metatable for the properties list to free it on completion. - */ - luaL_getmetatable(state, zcp_props_list_info.name); - (void) lua_setmetatable(state, -2); - - lua_pushlightuserdata(state, NULL); - lua_pushcclosure(state, &zcp_props_iter, 2); - return (1); -} - - -/* - * Populate nv with all valid properties and their values for the given - * dataset. - */ -static void -zcp_dataset_props(dsl_dataset_t *ds, nvlist_t *nv) -{ - for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) { - /* Do not display hidden props */ - if (!zfs_prop_visible(prop)) - continue; - /* Do not display props not valid for this dataset */ - if (!prop_valid_for_ds(ds, prop)) - continue; - fnvlist_add_boolean(nv, zfs_prop_to_name(prop)); - } -} - -static int zcp_system_props_list(lua_State *); -static zcp_list_info_t zcp_system_props_list_info = { - .name = "system_properties", - .func = zcp_system_props_list, - .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -/* - * Get a list of all visble properties and their values for a given dataset. - * Returned on the stack as a Lua table. 
- */ -static int -zcp_system_props_list(lua_State *state) -{ - int error; - char errbuf[128]; - const char *dataset_name; - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_list_info_t *libinfo = &zcp_system_props_list_info; - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - dataset_name = lua_tostring(state, 1); - nvlist_t *nv = fnvlist_alloc(); - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - /* Get the names of all valid properties for this dataset */ - zcp_dataset_props(ds, nv); - dsl_dataset_rele(ds, FTAG); - - /* push list as lua table */ - error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf)); - nvlist_free(nv); - if (error != 0) { - return (luaL_error(state, - "Error returning nvlist: %s", errbuf)); - } - return (1); -} - -static int -zcp_list_func(lua_State *state) -{ - zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); - - zcp_parse_args(state, info->name, info->pargs, info->kwargs); - - return (info->func(state)); -} - -int -zcp_load_list_lib(lua_State *state) -{ - int i; - zcp_list_info_t *zcp_list_funcs[] = { - &zcp_children_list_info, - &zcp_snapshots_list_info, - &zcp_props_list_info, - &zcp_clones_list_info, - &zcp_system_props_list_info, - NULL - }; - - lua_newtable(state); - - for (i = 0; zcp_list_funcs[i] != NULL; i++) { - zcp_list_info_t *info = zcp_list_funcs[i]; - - if (info->gc != NULL) { - /* - * If the function requires garbage collection, create - * a metatable with its name and register the __gc - * function. 
- */ - (void) luaL_newmetatable(state, info->name); - (void) lua_pushstring(state, "__gc"); - lua_pushcfunction(state, info->gc); - lua_settable(state, -3); - lua_pop(state, 1); - } - - lua_pushlightuserdata(state, info); - lua_pushcclosure(state, &zcp_list_func, 1); - lua_setfield(state, -2, info->name); - info++; - } - - return (1); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c deleted file mode 100644 index 25d970ec0888..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. - */ - -#include "lua.h" -#include "lauxlib.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DST_AVG_BLKSHIFT 14 - -typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *); -typedef struct zcp_synctask_info { - const char *name; - zcp_synctask_func_t *func; - const zcp_arg_t pargs[4]; - const zcp_arg_t kwargs[2]; - zfs_space_check_t space_check; - int blocks_modified; -} zcp_synctask_info_t; - -/* - * Generic synctask interface for channel program syncfuncs. 
- * - * To perform some action in syncing context, we'd generally call - * dsl_sync_task(), but since the Lua script is already running inside a - * synctask we need to leave out some actions (such as acquiring the config - * rwlock and performing space checks). - * - * If 'sync' is false, executes a dry run and returns the error code. - * - * If we are not running in syncing context and we are not doing a dry run - * (meaning we are running a zfs.sync function in open-context) then we - * return a Lua error. - * - * This function also handles common fatal error cases for channel program - * library functions. If a fatal error occurs, err_dsname will be the dataset - * name reported in error messages, if supplied. - */ -static int -zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname) -{ - int err; - zcp_run_info_t *ri = zcp_run_info(state); - - err = checkfunc(arg, ri->zri_tx); - if (!sync) - return (err); - - if (!ri->zri_sync) { - return (luaL_error(state, "running functions from the zfs.sync " - "submodule requires passing sync=TRUE to " - "lzc_channel_program() (i.e. 
do not specify the \"-n\" " - "command line argument)")); - } - - if (err == 0) { - syncfunc(arg, ri->zri_tx); - } else if (err == EIO) { - if (err_dsname != NULL) { - return (luaL_error(state, - "I/O error while accessing dataset '%s'", - err_dsname)); - } else { - return (luaL_error(state, - "I/O error while accessing dataset.")); - } - } - - return (err); -} - - -static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_destroy_info = { - .name = "destroy", - .func = zcp_synctask_destroy, - .pargs = { - {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, - {NULL, 0} - }, - .space_check = ZFS_SPACE_CHECK_DESTROY, - .blocks_modified = 0 -}; - -/* ARGSUSED */ -static int -zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - const char *dsname = lua_tostring(state, 1); - - boolean_t issnap = (strchr(dsname, '@') != NULL); - - if (!issnap && !lua_isnil(state, 2)) { - return (luaL_error(state, - "'deferred' kwarg only supported for snapshots: %s", - dsname)); - } - - if (issnap) { - dsl_destroy_snapshot_arg_t ddsa = { 0 }; - ddsa.ddsa_name = dsname; - if (!lua_isnil(state, 2)) { - ddsa.ddsa_defer = lua_toboolean(state, 2); - } else { - ddsa.ddsa_defer = B_FALSE; - } - - err = zcp_sync_task(state, dsl_destroy_snapshot_check, - dsl_destroy_snapshot_sync, &ddsa, sync, dsname); - } else { - dsl_destroy_head_arg_t ddha = { 0 }; - ddha.ddha_name = dsname; - - err = zcp_sync_task(state, dsl_destroy_head_check, - dsl_destroy_head_sync, &ddha, sync, dsname); - } - - return (err); -} - -static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_promote_info = { - .name = "promote", - .func = zcp_synctask_promote, - .pargs = { - {.za_name = "clone", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - }, - .space_check = 
ZFS_SPACE_CHECK_RESERVED, - .blocks_modified = 3 -}; - -static int -zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - dsl_dataset_promote_arg_t ddpa = { 0 }; - const char *dsname = lua_tostring(state, 1); - zcp_run_info_t *ri = zcp_run_info(state); - - ddpa.ddpa_clonename = dsname; - ddpa.err_ds = err_details; - ddpa.cr = ri->zri_cred; - - /* - * If there was a snapshot name conflict, then err_ds will be filled - * with a list of conflicting snapshot names. - */ - err = zcp_sync_task(state, dsl_dataset_promote_check, - dsl_dataset_promote_sync, &ddpa, sync, dsname); - - return (err); -} - -static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_rollback_info = { - .name = "rollback", - .func = zcp_synctask_rollback, - .space_check = ZFS_SPACE_CHECK_RESERVED, - .blocks_modified = 1, - .pargs = { - {.za_name = "filesystem", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - const char *dsname = lua_tostring(state, 1); - dsl_dataset_rollback_arg_t ddra = { 0 }; - - ddra.ddra_fsname = dsname; - ddra.ddra_result = err_details; - - err = zcp_sync_task(state, dsl_dataset_rollback_check, - dsl_dataset_rollback_sync, &ddra, sync, dsname); - - return (err); -} - -static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_snapshot_info = { - .name = "snapshot", - .func = zcp_synctask_snapshot, - .pargs = { - {.za_name = "filesystem@snapname | volume@snapname", - .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - }, - .space_check = ZFS_SPACE_CHECK_NORMAL, - .blocks_modified = 3 -}; - -/* ARGSUSED */ -static int -zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - dsl_dataset_snapshot_arg_t ddsa = { 0 }; - const char 
*dsname = lua_tostring(state, 1); - zcp_run_info_t *ri = zcp_run_info(state); - - /* - * On old pools, the ZIL must not be active when a snapshot is created, - * but we can't suspend the ZIL because we're already in syncing - * context. - */ - if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) { - return (ENOTSUP); - } - - /* - * We only allow for a single snapshot rather than a list, so the - * error list output is unnecessary. - */ - ddsa.ddsa_errors = NULL; - ddsa.ddsa_props = NULL; - ddsa.ddsa_cr = ri->zri_cred; - ddsa.ddsa_snaps = fnvlist_alloc(); - fnvlist_add_boolean(ddsa.ddsa_snaps, dsname); - - zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps); - - err = zcp_sync_task(state, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, &ddsa, sync, dsname); - - zcp_deregister_cleanup(state, zch); - fnvlist_free(ddsa.ddsa_snaps); - - return (err); -} - -static int -zcp_synctask_wrapper(lua_State *state) -{ - int err; - zcp_cleanup_handler_t *zch; - int num_ret = 1; - nvlist_t *err_details = fnvlist_alloc(); - - /* - * Make sure err_details is properly freed, even if a fatal error is - * thrown during the synctask. - */ - zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, err_details); - - zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); - boolean_t sync = lua_toboolean(state, lua_upvalueindex(2)); - - zcp_run_info_t *ri = zcp_run_info(state); - dsl_pool_t *dp = ri->zri_pool; - - /* MOS space is triple-dittoed, so we multiply by 3. 
*/ - uint64_t funcspace = (info->blocks_modified << DST_AVG_BLKSHIFT) * 3; - - zcp_parse_args(state, info->name, info->pargs, info->kwargs); - - err = 0; - if (info->space_check != ZFS_SPACE_CHECK_NONE) { - uint64_t quota = dsl_pool_unreserved_space(dp, - info->space_check); - uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes + - ri->zri_space_used; - - if (used + funcspace > quota) { - err = SET_ERROR(ENOSPC); - } - } - - if (err == 0) { - err = info->func(state, sync, err_details); - } - - if (err == 0) { - ri->zri_space_used += funcspace; - } - - lua_pushnumber(state, (lua_Number)err); - if (fnvlist_num_pairs(err_details) > 0) { - (void) zcp_nvlist_to_lua(state, err_details, NULL, 0); - num_ret++; - } - - zcp_deregister_cleanup(state, zch); - fnvlist_free(err_details); - - return (num_ret); -} - -int -zcp_load_synctask_lib(lua_State *state, boolean_t sync) -{ - int i; - zcp_synctask_info_t *zcp_synctask_funcs[] = { - &zcp_synctask_destroy_info, - &zcp_synctask_promote_info, - &zcp_synctask_rollback_info, - &zcp_synctask_snapshot_info, - NULL - }; - - lua_newtable(state); - - for (i = 0; zcp_synctask_funcs[i] != NULL; i++) { - zcp_synctask_info_t *info = zcp_synctask_funcs[i]; - lua_pushlightuserdata(state, info); - lua_pushboolean(state, sync); - lua_pushcclosure(state, &zcp_synctask_wrapper, 2); - lua_setfield(state, -2, info->name); - info++; - } - - return (1); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c deleted file mode 100644 index f78414ae10ec..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c +++ /dev/null @@ -1,505 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include "zfeature_common.h" -#include - -/* - * ZFS Feature Flags - * ----------------- - * - * ZFS feature flags are used to provide fine-grained versioning to the ZFS - * on-disk format. Once enabled on a pool feature flags replace the old - * spa_version() number. - * - * Each new on-disk format change will be given a uniquely identifying string - * guid rather than a version number. This avoids the problem of different - * organizations creating new on-disk formats with the same version number. To - * keep feature guids unique they should consist of the reverse dns name of the - * organization which implemented the feature and a short name for the feature, - * separated by a colon (e.g. com.delphix:async_destroy). - * - * Reference Counts - * ---------------- - * - * Within each pool features can be in one of three states: disabled, enabled, - * or active. These states are differentiated by a reference count stored on - * disk for each feature: - * - * 1) If there is no reference count stored on disk the feature is disabled. - * 2) If the reference count is 0 a system administrator has enabled the - * feature, but the feature has not been used yet, so no on-disk - * format changes have been made. 
- * 3) If the reference count is greater than 0 the feature is active. - * The format changes required by the feature are currently on disk. - * Note that if the feature's format changes are reversed the feature - * may choose to set its reference count back to 0. - * - * Feature flags makes no differentiation between non-zero reference counts - * for an active feature (e.g. a reference count of 1 means the same thing as a - * reference count of 27834721), but feature implementations may choose to use - * the reference count to store meaningful information. For example, a new RAID - * implementation might set the reference count to the number of vdevs using - * it. If all those disks are removed from the pool the feature goes back to - * having a reference count of 0. - * - * It is the responsibility of the individual features to maintain a non-zero - * reference count as long as the feature's format changes are present on disk. - * - * Dependencies - * ------------ - * - * Each feature may depend on other features. The only effect of this - * relationship is that when a feature is enabled all of its dependencies are - * automatically enabled as well. Any future work to support disabling of - * features would need to ensure that features cannot be disabled if other - * enabled features depend on them. - * - * On-disk Format - * -------------- - * - * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES - * (5000). In order for this to work the pool is automatically upgraded to - * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk - * format changes will be in use. - * - * Information about features is stored in 3 ZAP objects in the pool's MOS. - * These objects are linked to by the following names in the pool directory - * object: - * - * 1) features_for_read: feature guid -> reference count - * Features needed to open the pool for reading. 
- * 2) features_for_write: feature guid -> reference count - * Features needed to open the pool for writing. - * 3) feature_descriptions: feature guid -> descriptive string - * A human readable string. - * - * All enabled features appear in either features_for_read or - * features_for_write, but not both. - * - * To open a pool in read-only mode only the features listed in - * features_for_read need to be supported. - * - * To open the pool in read-write mode features in both features_for_read and - * features_for_write need to be supported. - * - * Some features may be required to read the ZAP objects containing feature - * information. To allow software to check for compatibility with these features - * before the pool is opened their names must be stored in the label in a - * new "features_for_read" entry (note that features that are only required - * to write to a pool never need to be stored in the label since the - * features_for_write ZAP object can be read before the pool is written to). - * To save space in the label features must be explicitly marked as needing to - * be written to the label. Also, reference counts are not stored in the label, - * instead any feature whose reference count drops to 0 is removed from the - * label. - * - * Adding New Features - * ------------------- - * - * Features must be registered in zpool_feature_init() function in - * zfeature_common.c using the zfeature_register() function. This function - * has arguments to specify if the feature should be stored in the - * features_for_read or features_for_write ZAP object and if it needs to be - * written to the label when active. - * - * Once a feature is registered it will appear as a "feature@" - * property which can be set by an administrator. 
Feature implementors should - * use the spa_feature_is_enabled() and spa_feature_is_active() functions to - * query the state of a feature and the spa_feature_incr() and - * spa_feature_decr() functions to change an enabled feature's reference count. - * Reference counts may only be updated in the syncing context. - * - * Features may not perform enable-time initialization. Instead, any such - * initialization should occur when the feature is first used. This design - * enforces that on-disk changes be made only when features are used. Code - * should only check if a feature is enabled using spa_feature_is_enabled(), - * not by relying on any feature specific metadata existing. If a feature is - * enabled, but the feature's metadata is not on disk yet then it should be - * created as needed. - * - * As an example, consider the com.delphix:async_destroy feature. This feature - * relies on the existence of a bptree in the MOS that store blocks for - * asynchronous freeing. This bptree is not created when async_destroy is - * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is - * called to check if async_destroy is enabled. If it is and the bptree object - * does not exist yet, the bptree object is created as part of the dataset - * destroy and async_destroy's reference count is incremented to indicate it - * has made an on-disk format change. Later, after the destroyed dataset's - * blocks have all been asynchronously freed there is no longer any use for the - * bptree object, so it is destroyed and async_destroy's reference count is - * decremented back to 0 to indicate that it has undone its on-disk format - * changes. - */ - -typedef enum { - FEATURE_ACTION_INCR, - FEATURE_ACTION_DECR, -} feature_action_t; - -/* - * Checks that the active features in the pool are supported by - * this software. Adds each unsupported feature (name -> description) to - * the supplied nvlist. 
- */ -boolean_t -spa_features_check(spa_t *spa, boolean_t for_write, - nvlist_t *unsup_feat, nvlist_t *enabled_feat) -{ - objset_t *os = spa->spa_meta_objset; - boolean_t supported; - zap_cursor_t zc; - zap_attribute_t za; - uint64_t obj = for_write ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - supported = B_TRUE; - for (zap_cursor_init(&zc, os, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == sizeof (uint64_t) && - za.za_num_integers == 1); - - if (NULL != enabled_feat) { - fnvlist_add_uint64(enabled_feat, za.za_name, - za.za_first_integer); - } - - if (za.za_first_integer != 0 && - !zfeature_is_supported(za.za_name)) { - supported = B_FALSE; - - if (NULL != unsup_feat) { - char *desc = ""; - char buf[MAXPATHLEN]; - - if (zap_lookup(os, spa->spa_feat_desc_obj, - za.za_name, 1, sizeof (buf), buf) == 0) - desc = buf; - - VERIFY(nvlist_add_string(unsup_feat, za.za_name, - desc) == 0); - } - } - } - zap_cursor_fini(&zc); - - return (supported); -} - -/* - * Use an in-memory cache of feature refcounts for quick retrieval. - * - * Note: well-designed features will not need to use this; they should - * use spa_feature_is_enabled() and spa_feature_is_active() instead. - * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). - */ -int -feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) -{ - ASSERT(VALID_FEATURE_FID(feature->fi_feature)); - if (spa->spa_feat_refcount_cache[feature->fi_feature] == - SPA_FEATURE_DISABLED) { - return (SET_ERROR(ENOTSUP)); - } - *res = spa->spa_feat_refcount_cache[feature->fi_feature]; - return (0); -} - -/* - * Note: well-designed features will not need to use this; they should - * use spa_feature_is_enabled() and spa_feature_is_active() instead. - * However, this is non-static for zdb and zhack. 
- */ -int -feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, - uint64_t *res) -{ - int err; - uint64_t refcount; - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - /* - * If the pool is currently being created, the feature objects may not - * have been allocated yet. Act as though all features are disabled. - */ - if (zapobj == 0) - return (SET_ERROR(ENOTSUP)); - - err = zap_lookup(spa->spa_meta_objset, zapobj, - feature->fi_guid, sizeof (uint64_t), 1, &refcount); - if (err != 0) { - if (err == ENOENT) - return (SET_ERROR(ENOTSUP)); - else - return (err); - } - *res = refcount; - return (0); -} - - -static int -feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) -{ - uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj; - - ASSERT(zfeature_depends_on(feature->fi_feature, - SPA_FEATURE_ENABLED_TXG)); - - if (!spa_feature_is_enabled(spa, feature->fi_feature)) { - return (SET_ERROR(ENOTSUP)); - } - - ASSERT(enabled_txg_obj != 0); - - VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, - feature->fi_guid, sizeof (uint64_t), 1, res)); - - return (0); -} - -/* - * This function is non-static for zhack; it should otherwise not be used - * outside this file. - */ -void -feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, - dmu_tx_t *tx) -{ - ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, - sizeof (uint64_t), 1, &refcount, tx)); - - /* - * feature_sync is called directly from zhack, allowing the - * creation of arbitrary features whose fi_feature field may - * be greater than SPA_FEATURES. When called from zhack, the - * zfeature_info_t object's fi_feature field will be set to - * SPA_FEATURE_NONE. 
- */ - if (feature->fi_feature != SPA_FEATURE_NONE) { - uint64_t *refcount_cache = - &spa->spa_feat_refcount_cache[feature->fi_feature]; - VERIFY3U(*refcount_cache, ==, - atomic_swap_64(refcount_cache, refcount)); - } - - if (refcount == 0) - spa_deactivate_mos_feature(spa, feature->fi_guid); - else if (feature->fi_flags & ZFEATURE_FLAG_MOS) - spa_activate_mos_feature(spa, feature->fi_guid, tx); -} - -/* - * This function is non-static for zhack; it should otherwise not be used - * outside this file. - */ -void -feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) -{ - uint64_t initial_refcount = - (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - ASSERT(0 != zapobj); - ASSERT(zfeature_is_valid_guid(feature->fi_guid)); - ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - - /* - * If the feature is already enabled, ignore the request. 
- */ - if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) - return; - - for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) - spa_feature_enable(spa, feature->fi_depends[i], tx); - - VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, - feature->fi_guid, 1, strlen(feature->fi_desc) + 1, - feature->fi_desc, tx)); - - feature_sync(spa, feature, initial_refcount, tx); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { - uint64_t enabling_txg = dmu_tx_get_txg(tx); - - if (spa->spa_feat_enabled_txg_obj == 0ULL) { - spa->spa_feat_enabled_txg_obj = - zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURE_ENABLED_TXG, tx); - } - spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); - - VERIFY0(zap_add(spa->spa_meta_objset, - spa->spa_feat_enabled_txg_obj, feature->fi_guid, - sizeof (uint64_t), 1, &enabling_txg, tx)); - } -} - -static void -feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, - dmu_tx_t *tx) -{ - uint64_t refcount; - zfeature_info_t *feature = &spa_feature_table[fid]; - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - ASSERT(VALID_FEATURE_FID(fid)); - ASSERT(0 != zapobj); - ASSERT(zfeature_is_valid_guid(feature->fi_guid)); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - - VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); - - switch (action) { - case FEATURE_ACTION_INCR: - VERIFY3U(refcount, !=, UINT64_MAX); - refcount++; - break; - case FEATURE_ACTION_DECR: - VERIFY3U(refcount, !=, 0); - refcount--; - break; - default: - ASSERT(0); - break; - } - - feature_sync(spa, feature, refcount, tx); -} - -void -spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) -{ - /* - * We create feature flags ZAP objects in two instances: during pool - * creation and during pool upgrade. 
- */ - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && - tx->tx_txg == TXG_INITIAL)); - - spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURES_FOR_READ, tx); - spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURES_FOR_WRITE, tx); - spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURE_DESCRIPTIONS, tx); -} - -/* - * Enable any required dependencies, then enable the requested feature. - */ -void -spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) -{ - ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - ASSERT(VALID_FEATURE_FID(fid)); - feature_enable_sync(spa, &spa_feature_table[fid], tx); -} - -void -spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) -{ - feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx); -} - -void -spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) -{ - feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx); -} - -boolean_t -spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) -{ - int err; - uint64_t refcount; - - ASSERT(VALID_FEATURE_FID(fid)); - if (spa_version(spa) < SPA_VERSION_FEATURES) - return (B_FALSE); - - err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); - ASSERT(err == 0 || err == ENOTSUP); - return (err == 0); -} - -boolean_t -spa_feature_is_active(spa_t *spa, spa_feature_t fid) -{ - int err; - uint64_t refcount; - - ASSERT(VALID_FEATURE_FID(fid)); - if (spa_version(spa) < SPA_VERSION_FEATURES) - return (B_FALSE); - - err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); - ASSERT(err == 0 || err == ENOTSUP); - return (err == 0 && refcount > 0); -} - -/* - * For the feature specified by fid (which must depend on - * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the - * 
OUT txg argument. - * - * Returns B_TRUE if the feature is enabled, in which case txg will be filled - * with the transaction group in which the specified feature was enabled. - * Returns B_FALSE otherwise (i.e. if the feature is not enabled). - */ -boolean_t -spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) -{ - int err; - - ASSERT(VALID_FEATURE_FID(fid)); - if (spa_version(spa) < SPA_VERSION_FEATURES) - return (B_FALSE); - - err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); - ASSERT(err == 0 || err == ENOTSUP); - - return (err == 0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf deleted file mode 100644 index 09881909b804..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf +++ /dev/null @@ -1,28 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# -# -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. 
-# -# ident "%Z%%M% %I% %E% SMI" -# -name="zfs" parent="pseudo"; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c deleted file mode 100644 index a588c59b491c..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ /dev/null @@ -1,2778 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE -#define DENY ACE_ACCESS_DENIED_ACE_TYPE -#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE -#define MIN_ACE_TYPE ALLOW - -#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) -#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) -#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) - -#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ - ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ - ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ - ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) - -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ - ACE_DELETE|ACE_DELETE_CHILD) -#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) - -#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) - -#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) - -#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ - ZFS_ACL_PROTECTED) - -#define ZFS_ACL_WIDE_FLAGS 
(V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ - ZFS_ACL_OBJ_ACE) - -#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) - -static uint16_t -zfs_ace_v0_get_type(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_v0_get_flags(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_v0_get_mask(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_v0_get_who(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_fuid); -} - -static void -zfs_ace_v0_set_type(void *acep, uint16_t type) -{ - ((zfs_oldace_t *)acep)->z_type = type; -} - -static void -zfs_ace_v0_set_flags(void *acep, uint16_t flags) -{ - ((zfs_oldace_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_v0_set_mask(void *acep, uint32_t mask) -{ - ((zfs_oldace_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_v0_set_who(void *acep, uint64_t who) -{ - ((zfs_oldace_t *)acep)->z_fuid = who; -} - -/*ARGSUSED*/ -static size_t -zfs_ace_v0_size(void *acep) -{ - return (sizeof (zfs_oldace_t)); -} - -static size_t -zfs_ace_v0_abstract_size(void) -{ - return (sizeof (zfs_oldace_t)); -} - -static int -zfs_ace_v0_mask_off(void) -{ - return (offsetof(zfs_oldace_t, z_access_mask)); -} - -/*ARGSUSED*/ -static int -zfs_ace_v0_data(void *acep, void **datap) -{ - *datap = NULL; - return (0); -} - -static acl_ops_t zfs_acl_v0_ops = { - zfs_ace_v0_get_mask, - zfs_ace_v0_set_mask, - zfs_ace_v0_get_flags, - zfs_ace_v0_set_flags, - zfs_ace_v0_get_type, - zfs_ace_v0_set_type, - zfs_ace_v0_get_who, - zfs_ace_v0_set_who, - zfs_ace_v0_size, - zfs_ace_v0_abstract_size, - zfs_ace_v0_mask_off, - zfs_ace_v0_data -}; - -static uint16_t -zfs_ace_fuid_get_type(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_fuid_get_flags(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_fuid_get_mask(void *acep) -{ - return (((zfs_ace_hdr_t 
*)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_fuid_get_who(void *args) -{ - uint16_t entry_type; - zfs_ace_t *acep = args; - - entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; - - if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return (-1); - return (((zfs_ace_t *)acep)->z_fuid); -} - -static void -zfs_ace_fuid_set_type(void *acep, uint16_t type) -{ - ((zfs_ace_hdr_t *)acep)->z_type = type; -} - -static void -zfs_ace_fuid_set_flags(void *acep, uint16_t flags) -{ - ((zfs_ace_hdr_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_fuid_set_mask(void *acep, uint32_t mask) -{ - ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_fuid_set_who(void *arg, uint64_t who) -{ - zfs_ace_t *acep = arg; - - uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; - - if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return; - acep->z_fuid = who; -} - -static size_t -zfs_ace_fuid_size(void *acep) -{ - zfs_ace_hdr_t *zacep = acep; - uint16_t entry_type; - - switch (zacep->z_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - return (sizeof (zfs_object_ace_t)); - case ALLOW: - case DENY: - entry_type = - (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); - if (entry_type == ACE_OWNER || - entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return (sizeof (zfs_ace_hdr_t)); - /*FALLTHROUGH*/ - default: - return (sizeof (zfs_ace_t)); - } -} - -static size_t -zfs_ace_fuid_abstract_size(void) -{ - return (sizeof (zfs_ace_hdr_t)); -} - -static int -zfs_ace_fuid_mask_off(void) -{ - return (offsetof(zfs_ace_hdr_t, z_access_mask)); -} - -static int -zfs_ace_fuid_data(void *acep, void **datap) -{ - zfs_ace_t *zacep = acep; - zfs_object_ace_t *zobjp; - - switch (zacep->z_hdr.z_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - 
case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - zobjp = acep; - *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); - return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); - default: - *datap = NULL; - return (0); - } -} - -static acl_ops_t zfs_acl_fuid_ops = { - zfs_ace_fuid_get_mask, - zfs_ace_fuid_set_mask, - zfs_ace_fuid_get_flags, - zfs_ace_fuid_set_flags, - zfs_ace_fuid_get_type, - zfs_ace_fuid_set_type, - zfs_ace_fuid_get_who, - zfs_ace_fuid_set_who, - zfs_ace_fuid_size, - zfs_ace_fuid_abstract_size, - zfs_ace_fuid_mask_off, - zfs_ace_fuid_data -}; - -/* - * The following three functions are provided for compatibility with - * older ZPL version in order to determine if the file use to have - * an external ACL and what version of ACL previously existed on the - * file. Would really be nice to not need this, sigh. - */ -uint64_t -zfs_external_acl(znode_t *zp) -{ - zfs_acl_phys_t acl_phys; - int error; - - if (zp->z_is_sa) - return (0); - - /* - * Need to deal with a potential - * race where zfs_sa_upgrade could cause - * z_isa_sa to change. - * - * If the lookup fails then the state of z_is_sa should have - * changed. - */ - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), - &acl_phys, sizeof (acl_phys))) == 0) - return (acl_phys.z_acl_extern_obj); - else { - /* - * after upgrade the SA_ZPL_ZNODE_ACL should have been - * removed - */ - VERIFY(zp->z_is_sa && error == ENOENT); - return (0); - } -} - -/* - * Determine size of ACL in bytes - * - * This is more complicated than it should be since we have to deal - * with old external ACLs. 
- */ -static int -zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, - zfs_acl_phys_t *aclphys) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t acl_count; - int size; - int error; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if (zp->z_is_sa) { - if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), - &size)) != 0) - return (error); - *aclsize = size; - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), - &acl_count, sizeof (acl_count))) != 0) - return (error); - *aclcount = acl_count; - } else { - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), - aclphys, sizeof (*aclphys))) != 0) - return (error); - - if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { - *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); - *aclcount = aclphys->z_acl_size; - } else { - *aclsize = aclphys->z_acl_size; - *aclcount = aclphys->z_acl_count; - } - } - return (0); -} - -int -zfs_znode_acl_version(znode_t *zp) -{ - zfs_acl_phys_t acl_phys; - - if (zp->z_is_sa) - return (ZFS_ACL_VERSION_FUID); - else { - int error; - - /* - * Need to deal with a potential - * race where zfs_sa_upgrade could cause - * z_isa_sa to change. - * - * If the lookup fails then the state of z_is_sa should have - * changed. - */ - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), - &acl_phys, sizeof (acl_phys))) == 0) - return (acl_phys.z_acl_version); - else { - /* - * After upgrade SA_ZPL_ZNODE_ACL should have - * been removed. 
- */ - VERIFY(zp->z_is_sa && error == ENOENT); - return (ZFS_ACL_VERSION_FUID); - } - } -} - -static int -zfs_acl_version(int version) -{ - if (version < ZPL_VERSION_FUID) - return (ZFS_ACL_VERSION_INITIAL); - else - return (ZFS_ACL_VERSION_FUID); -} - -static int -zfs_acl_version_zp(znode_t *zp) -{ - return (zfs_acl_version(zp->z_zfsvfs->z_version)); -} - -zfs_acl_t * -zfs_acl_alloc(int vers) -{ - zfs_acl_t *aclp; - - aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); - list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), - offsetof(zfs_acl_node_t, z_next)); - aclp->z_version = vers; - if (vers == ZFS_ACL_VERSION_FUID) - aclp->z_ops = zfs_acl_fuid_ops; - else - aclp->z_ops = zfs_acl_v0_ops; - return (aclp); -} - -zfs_acl_node_t * -zfs_acl_node_alloc(size_t bytes) -{ - zfs_acl_node_t *aclnode; - - aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); - if (bytes) { - aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); - aclnode->z_allocdata = aclnode->z_acldata; - aclnode->z_allocsize = bytes; - aclnode->z_size = bytes; - } - - return (aclnode); -} - -static void -zfs_acl_node_free(zfs_acl_node_t *aclnode) -{ - if (aclnode->z_allocsize) - kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); - kmem_free(aclnode, sizeof (zfs_acl_node_t)); -} - -static void -zfs_acl_release_nodes(zfs_acl_t *aclp) -{ - zfs_acl_node_t *aclnode; - - while (aclnode = list_head(&aclp->z_acl)) { - list_remove(&aclp->z_acl, aclnode); - zfs_acl_node_free(aclnode); - } - aclp->z_acl_count = 0; - aclp->z_acl_bytes = 0; -} - -void -zfs_acl_free(zfs_acl_t *aclp) -{ - zfs_acl_release_nodes(aclp); - list_destroy(&aclp->z_acl); - kmem_free(aclp, sizeof (zfs_acl_t)); -} - -static boolean_t -zfs_acl_valid_ace_type(uint_t type, uint_t flags) -{ - uint16_t entry_type; - - switch (type) { - case ALLOW: - case DENY: - case ACE_SYSTEM_AUDIT_ACE_TYPE: - case ACE_SYSTEM_ALARM_ACE_TYPE: - entry_type = flags & ACE_TYPE_FLAGS; - return (entry_type == ACE_OWNER || - entry_type == OWNING_GROUP || - entry_type == 
ACE_EVERYONE || entry_type == 0 || - entry_type == ACE_IDENTIFIER_GROUP); - default: - if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) - return (B_TRUE); - } - return (B_FALSE); -} - -static boolean_t -zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) -{ - /* - * first check type of entry - */ - - if (!zfs_acl_valid_ace_type(type, iflags)) - return (B_FALSE); - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (aclp->z_version < ZFS_ACL_VERSION_FUID) - return (B_FALSE); - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - } - - /* - * next check inheritance level flags - */ - - if (obj_type == VDIR && - (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) == 0) { - return (B_FALSE); - } - } - - return (B_TRUE); -} - -static void * -zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, - uint32_t *access_mask, uint16_t *iflags, uint16_t *type) -{ - zfs_acl_node_t *aclnode; - - ASSERT(aclp); - - if (start == NULL) { - aclnode = list_head(&aclp->z_acl); - if (aclnode == NULL) - return (NULL); - - aclp->z_next_ace = aclnode->z_acldata; - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - } - - aclnode = aclp->z_curr_node; - - if (aclnode == NULL) - return (NULL); - - if (aclnode->z_ace_idx >= aclnode->z_ace_count) { - aclnode = list_next(&aclp->z_acl, aclnode); - if (aclnode == NULL) - return (NULL); - else { - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - aclp->z_next_ace = aclnode->z_acldata; - } - } - - if (aclnode->z_ace_idx < aclnode->z_ace_count) { - void *acep = aclp->z_next_ace; - size_t ace_size; - - /* - * Make sure we don't overstep our bounds - */ - ace_size = aclp->z_ops.ace_size(acep); - - if 
(((caddr_t)acep + ace_size) > - ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { - return (NULL); - } - - *iflags = aclp->z_ops.ace_flags_get(acep); - *type = aclp->z_ops.ace_type_get(acep); - *access_mask = aclp->z_ops.ace_mask_get(acep); - *who = aclp->z_ops.ace_who_get(acep); - aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; - aclnode->z_ace_idx++; - - return ((void *)acep); - } - return (NULL); -} - -/*ARGSUSED*/ -static uint64_t -zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, - uint16_t *flags, uint16_t *type, uint32_t *mask) -{ - zfs_acl_t *aclp = datap; - zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; - uint64_t who; - - acep = zfs_acl_next_ace(aclp, acep, &who, mask, - flags, type); - return ((uint64_t)(uintptr_t)acep); -} - -static zfs_acl_node_t * -zfs_acl_curr_node(zfs_acl_t *aclp) -{ - ASSERT(aclp->z_curr_node); - return (aclp->z_curr_node); -} - -/* - * Copy ACE to internal ZFS format. - * While processing the ACL each ACE will be validated for correctness. - * ACE FUIDs will be created later. - */ -int -zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, - void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, - zfs_fuid_info_t **fuidp, cred_t *cr) -{ - int i; - uint16_t entry_type; - zfs_ace_t *aceptr = z_acl; - ace_t *acep = datap; - zfs_object_ace_t *zobjacep; - ace_object_t *aceobjp; - - for (i = 0; i != aclcnt; i++) { - aceptr->z_hdr.z_access_mask = acep->a_access_mask; - aceptr->z_hdr.z_flags = acep->a_flags; - aceptr->z_hdr.z_type = acep->a_type; - entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; - if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && - entry_type != ACE_EVERYONE) { - aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, - cr, (entry_type == 0) ? 
- ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); - } - - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, - aceptr->z_hdr.z_flags) != B_TRUE) - return (SET_ERROR(EINVAL)); - - switch (acep->a_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - zobjacep = (zfs_object_ace_t *)aceptr; - aceobjp = (ace_object_t *)acep; - - bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, - sizeof (aceobjp->a_obj_type)); - bcopy(aceobjp->a_inherit_obj_type, - zobjacep->z_inherit_type, - sizeof (aceobjp->a_inherit_obj_type)); - acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); - break; - default: - acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); - } - - aceptr = (zfs_ace_t *)((caddr_t)aceptr + - aclp->z_ops.ace_size(aceptr)); - } - - *size = (caddr_t)aceptr - (caddr_t)z_acl; - - return (0); -} - -/* - * Copy ZFS ACEs to fixed size ace_t layout - */ -static void -zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, - void *datap, int filter) -{ - uint64_t who; - uint32_t access_mask; - uint16_t iflags, type; - zfs_ace_hdr_t *zacep = NULL; - ace_t *acep = datap; - ace_object_t *objacep; - zfs_object_ace_t *zobjacep; - size_t ace_size; - uint16_t entry_type; - - while (zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type)) { - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (filter) { - continue; - } - zobjacep = (zfs_object_ace_t *)zacep; - objacep = (ace_object_t *)acep; - bcopy(zobjacep->z_object_type, - objacep->a_obj_type, - sizeof (zobjacep->z_object_type)); - bcopy(zobjacep->z_inherit_type, - objacep->a_inherit_obj_type, - sizeof (zobjacep->z_inherit_type)); - ace_size = sizeof (ace_object_t); - break; - default: - ace_size = sizeof 
(ace_t); - break; - } - - entry_type = (iflags & ACE_TYPE_FLAGS); - if ((entry_type != ACE_OWNER && - entry_type != OWNING_GROUP && - entry_type != ACE_EVERYONE)) { - acep->a_who = zfs_fuid_map_id(zfsvfs, who, - cr, (entry_type & ACE_IDENTIFIER_GROUP) ? - ZFS_ACE_GROUP : ZFS_ACE_USER); - } else { - acep->a_who = (uid_t)(int64_t)who; - } - acep->a_access_mask = access_mask; - acep->a_flags = iflags; - acep->a_type = type; - acep = (ace_t *)((caddr_t)acep + ace_size); - } -} - -static int -zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, - zfs_oldace_t *z_acl, int aclcnt, size_t *size) -{ - int i; - zfs_oldace_t *aceptr = z_acl; - - for (i = 0; i != aclcnt; i++, aceptr++) { - aceptr->z_access_mask = acep[i].a_access_mask; - aceptr->z_type = acep[i].a_type; - aceptr->z_flags = acep[i].a_flags; - aceptr->z_fuid = acep[i].a_who; - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, - aceptr->z_flags) != B_TRUE) - return (SET_ERROR(EINVAL)); - } - *size = (caddr_t)aceptr - (caddr_t)z_acl; - return (0); -} - -/* - * convert old ACL format to new - */ -void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) -{ - zfs_oldace_t *oldaclp; - int i; - uint16_t type, iflags; - uint32_t access_mask; - uint64_t who; - void *cookie = NULL; - zfs_acl_node_t *newaclnode; - - ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); - /* - * First create the ACE in a contiguous piece of memory - * for zfs_copy_ace_2_fuid(). - * - * We only convert an ACL once, so this won't happen - * everytime. 
- */ - oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, - KM_SLEEP); - i = 0; - while (cookie = zfs_acl_next_ace(aclp, cookie, &who, - &access_mask, &iflags, &type)) { - oldaclp[i].z_flags = iflags; - oldaclp[i].z_type = type; - oldaclp[i].z_fuid = who; - oldaclp[i++].z_access_mask = access_mask; - } - - newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * - sizeof (zfs_object_ace_t)); - aclp->z_ops = zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, - oldaclp, newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size, NULL, cr) == 0); - newaclnode->z_ace_count = aclp->z_acl_count; - aclp->z_version = ZFS_ACL_VERSION; - kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); - - /* - * Release all previous ACL nodes - */ - - zfs_acl_release_nodes(aclp); - - list_insert_head(&aclp->z_acl, newaclnode); - - aclp->z_acl_bytes = newaclnode->z_size; - aclp->z_acl_count = newaclnode->z_ace_count; - -} - -/* - * Convert unix access mask to v4 access mask - */ -static uint32_t -zfs_unix_to_v4(uint32_t access_mask) -{ - uint32_t new_mask = 0; - - if (access_mask & S_IXOTH) - new_mask |= ACE_EXECUTE; - if (access_mask & S_IWOTH) - new_mask |= ACE_WRITE_DATA; - if (access_mask & S_IROTH) - new_mask |= ACE_READ_DATA; - return (new_mask); -} - -static void -zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, - uint16_t access_type, uint64_t fuid, uint16_t entry_type) -{ - uint16_t type = entry_type & ACE_TYPE_FLAGS; - - aclp->z_ops.ace_mask_set(acep, access_mask); - aclp->z_ops.ace_type_set(acep, access_type); - aclp->z_ops.ace_flags_set(acep, entry_type); - if ((type != ACE_OWNER && type != OWNING_GROUP && - type != ACE_EVERYONE)) - aclp->z_ops.ace_who_set(acep, fuid); -} - -/* - * Determine mode of file based on ACL. 
- */ -uint64_t -zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, - uint64_t *pflags, uint64_t fuid, uint64_t fgid) -{ - int entry_type; - mode_t mode; - mode_t seen = 0; - zfs_ace_hdr_t *acep = NULL; - uint64_t who; - uint16_t iflags, type; - uint32_t access_mask; - boolean_t an_exec_denied = B_FALSE; - - mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); - - while (acep = zfs_acl_next_ace(aclp, acep, &who, - &access_mask, &iflags, &type)) { - - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - /* - * Skip over any inherit_only ACEs - */ - if (iflags & ACE_INHERIT_ONLY_ACE) - continue; - - if (entry_type == ACE_OWNER || (entry_type == 0 && - who == fuid)) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRUSR))) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWUSR))) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= S_IWUSR; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXUSR))) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - } else if (entry_type == OWNING_GROUP || - (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRGRP))) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWGRP))) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXGRP))) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - } else if (entry_type == ACE_EVERYONE) { - if ((access_mask & ACE_READ_DATA)) { - if (!(seen & S_IRUSR)) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if (!(seen & S_IRGRP)) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if (!(seen & S_IROTH)) { - seen |= S_IROTH; - if (type == ALLOW) { - mode |= S_IROTH; - } - } - } - 
if ((access_mask & ACE_WRITE_DATA)) { - if (!(seen & S_IWUSR)) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= S_IWUSR; - } - } - if (!(seen & S_IWGRP)) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if (!(seen & S_IWOTH)) { - seen |= S_IWOTH; - if (type == ALLOW) { - mode |= S_IWOTH; - } - } - } - if ((access_mask & ACE_EXECUTE)) { - if (!(seen & S_IXUSR)) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - if (!(seen & S_IXGRP)) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - if (!(seen & S_IXOTH)) { - seen |= S_IXOTH; - if (type == ALLOW) { - mode |= S_IXOTH; - } - } - } - } else { - /* - * Only care if this IDENTIFIER_GROUP or - * USER ACE denies execute access to someone, - * mode is not affected - */ - if ((access_mask & ACE_EXECUTE) && type == DENY) - an_exec_denied = B_TRUE; - } - } - - /* - * Failure to allow is effectively a deny, so execute permission - * is denied if it was never mentioned or if we explicitly - * weren't allowed it. - */ - if (!an_exec_denied && - ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || - (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) - an_exec_denied = B_TRUE; - - if (an_exec_denied) - *pflags &= ~ZFS_NO_EXECS_DENIED; - else - *pflags |= ZFS_NO_EXECS_DENIED; - - return (mode); -} - -/* - * Read an external acl object. If the intent is to modify, always - * create a new acl and leave any cached acl in place. 
- */ -static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) -{ - zfs_acl_t *aclp; - int aclsize; - int acl_count; - zfs_acl_node_t *aclnode; - zfs_acl_phys_t znode_acl; - int version; - int error; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - - if (zp->z_acl_cached && !will_modify) { - *aclpp = zp->z_acl_cached; - return (0); - } - - version = zfs_znode_acl_version(zp); - - if ((error = zfs_acl_znode_info(zp, &aclsize, - &acl_count, &znode_acl)) != 0) { - goto done; - } - - aclp = zfs_acl_alloc(version); - - aclp->z_acl_count = acl_count; - aclp->z_acl_bytes = aclsize; - - aclnode = zfs_acl_node_alloc(aclsize); - aclnode->z_ace_count = aclp->z_acl_count; - aclnode->z_size = aclsize; - - if (!zp->z_is_sa) { - if (znode_acl.z_acl_extern_obj) { - error = dmu_read(zp->z_zfsvfs->z_os, - znode_acl.z_acl_extern_obj, 0, aclnode->z_size, - aclnode->z_acldata, DMU_READ_PREFETCH); - } else { - bcopy(znode_acl.z_ace_data, aclnode->z_acldata, - aclnode->z_size); - } - } else { - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), - aclnode->z_acldata, aclnode->z_size); - } - - if (error != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - goto done; - } - - list_insert_head(&aclp->z_acl, aclnode); - - *aclpp = aclp; - if (!will_modify) - zp->z_acl_cached = aclp; -done: - return (error); -} - -/*ARGSUSED*/ -void -zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, - boolean_t start, void *userdata) -{ - zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; - - if (start) { - cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); - } else { - cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, - cb->cb_acl_node); - } - *dataptr = cb->cb_acl_node->z_acldata; - *length = cb->cb_acl_node->z_size; -} - -int -zfs_acl_chown_setattr(znode_t *zp) -{ - int error; - zfs_acl_t *aclp; - - 
ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT_VOP_IN_SEQC(ZTOV(zp)); - - if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) - zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, - &zp->z_pflags, zp->z_uid, zp->z_gid); - return (error); -} - -/* - * common code for setting ACLs. - * - * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. - * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's - * already checked the acl and knows whether to inherit. - */ -int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) -{ - int error; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_object_type_t otype; - zfs_acl_locator_cb_t locate = { 0 }; - uint64_t mode; - sa_bulk_attr_t bulk[5]; - uint64_t ctime[2]; - int count = 0; - zfs_acl_phys_t acl_phys; - - ASSERT_VOP_IN_SEQC(ZTOV(zp)); - - mode = zp->z_mode; - - mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, - zp->z_uid, zp->z_gid); - - zp->z_mode = mode; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &mode, sizeof (mode)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - /* - * Upgrade needed? - */ - if (!zfsvfs->z_use_fuids) { - otype = DMU_OT_OLDACL; - } else { - if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && - (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp, cr); - ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); - otype = DMU_OT_ACL; - } - - /* - * Arrgh, we have to handle old on disk format - * as well as newer (preferred) SA format. 
- */ - - if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ - locate.cb_aclp = aclp; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, aclp->z_acl_bytes); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), - NULL, &aclp->z_acl_count, sizeof (uint64_t)); - } else { /* Painful legacy way */ - zfs_acl_node_t *aclnode; - uint64_t off = 0; - uint64_t aoid; - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), - &acl_phys, sizeof (acl_phys))) != 0) - return (error); - - aoid = acl_phys.z_acl_extern_obj; - - if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - /* - * If ACL was previously external and we are now - * converting to new ACL format then release old - * ACL object and create a new one. - */ - if (aoid && - aclp->z_version != acl_phys.z_acl_version) { - error = dmu_object_free(zfsvfs->z_os, aoid, tx); - if (error) - return (error); - aoid = 0; - } - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - otype, aclp->z_acl_bytes, - otype == DMU_OT_ACL ? - DMU_OT_SYSACL : DMU_OT_NONE, - otype == DMU_OT_ACL ? - DN_OLD_MAX_BONUSLEN : 0, tx); - } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, - aoid, aclp->z_acl_bytes, 0, tx); - } - acl_phys.z_acl_extern_obj = aoid; - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); - off += aclnode->z_size; - } - } else { - void *start = acl_phys.z_ace_data; - /* - * Migrating back embedded? 
- */ - if (acl_phys.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - acl_phys.z_acl_extern_obj, tx); - if (error) - return (error); - acl_phys.z_acl_extern_obj = 0; - } - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - bcopy(aclnode->z_acldata, start, - aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - } - /* - * If Old version then swap count/bytes to match old - * layout of znode_acl_phys_t. - */ - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - acl_phys.z_acl_size = aclp->z_acl_count; - acl_phys.z_acl_count = aclp->z_acl_bytes; - } else { - acl_phys.z_acl_size = aclp->z_acl_bytes; - acl_phys.z_acl_count = aclp->z_acl_count; - } - acl_phys.z_acl_version = aclp->z_version; - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &acl_phys, sizeof (acl_phys)); - } - - /* - * Replace ACL wide bits, but first clear them. - */ - zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; - - zp->z_pflags |= aclp->z_hints; - - if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) - zp->z_pflags |= ZFS_ACL_TRIVIAL; - - zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); - return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); -} - -static void -zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, - zfs_acl_t *aclp) -{ - void *acep = NULL; - uint64_t who; - int new_count, new_bytes; - int ace_size; - int entry_type; - uint16_t iflags, type; - uint32_t access_mask; - zfs_acl_node_t *newnode; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; - boolean_t isdir; - trivial_acl_t masks; - - new_count = new_bytes = 0; - - isdir = (vtype == VDIR); - - acl_trivial_access_masks((mode_t)mode, isdir, &masks); - - newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); - - zacep = newnode->z_acldata; - if (masks.allow0) { - zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); - zacep 
= (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - if (masks.deny1) { - zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - if (masks.deny2) { - zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - - while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type)) { - entry_type = (iflags & ACE_TYPE_FLAGS); - /* - * ACEs used to represent the file mode may be divided - * into an equivalent pair of inherit-only and regular - * ACEs, if they are inheritable. - * Skip regular ACEs, which are replaced by the new mode. - */ - if (split && (entry_type == ACE_OWNER || - entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE)) { - if (!isdir || !(iflags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) - continue; - /* - * We preserve owner@, group@, or @everyone - * permissions, if they are inheritable, by - * copying them to inherit_only ACEs. This - * prevents inheritable permissions from being - * altered along with the file mode. - */ - iflags |= ACE_INHERIT_ONLY_ACE; - } - - /* - * If this ACL has any inheritable ACEs, mark that in - * the hints (which are later masked into the pflags) - * so create knows to do inheritance. - */ - if (isdir && (iflags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if ((type != ALLOW && type != DENY) || - (iflags & ACE_INHERIT_ONLY_ACE)) { - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - break; - } - } else { - /* - * Limit permissions granted by ACEs to be no greater - * than permissions of the requested group mode. 
- * Applies when the "aclmode" property is set to - * "groupmask". - */ - if ((type == ALLOW) && trim) - access_mask &= masks.group; - } - zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); - ace_size = aclp->z_ops.ace_size(acep); - zacep = (void *)((uintptr_t)zacep + ace_size); - new_count++; - new_bytes += ace_size; - } - zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); - zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); - - new_count += 3; - new_bytes += abstract_size * 3; - zfs_acl_release_nodes(aclp); - aclp->z_acl_count = new_count; - aclp->z_acl_bytes = new_bytes; - newnode->z_ace_count = new_count; - newnode->z_size = new_bytes; - list_insert_tail(&aclp->z_acl, newnode); -} - -int -zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) -{ - int error = 0; - - mutex_enter(&zp->z_acl_lock); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) - *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); - else - error = zfs_acl_node_read(zp, aclp, B_TRUE); - - if (error == 0) { - (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, - (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); - } - mutex_exit(&zp->z_acl_lock); - - return (error); -} - -/* - * Should ACE be inherited? 
- */ -static int -zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) -{ - int iflags = (acep_flags & 0xf); - - if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) - return (1); - else if (iflags & ACE_FILE_INHERIT_ACE) - return (!((vtype == VDIR) && - (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); - return (0); -} - -/* - * inherit inheritable ACEs from parent - */ -static zfs_acl_t * -zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, - uint64_t mode, boolean_t *need_chmod) -{ - void *pacep = NULL; - void *acep; - zfs_acl_node_t *aclnode; - zfs_acl_t *aclp = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t iflags, newflags, type; - size_t ace_size; - void *data1, *data2; - size_t data1sz, data2sz; - uint_t aclinherit; - boolean_t isdir = (vtype == VDIR); - boolean_t isreg = (vtype == VREG); - - *need_chmod = B_TRUE; - - aclp = zfs_acl_alloc(paclp->z_version); - aclinherit = zfsvfs->z_acl_inherit; - if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) - return (aclp); - - while (pacep = zfs_acl_next_ace(paclp, pacep, &who, - &access_mask, &iflags, &type)) { - - /* - * don't inherit bogus ACEs - */ - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - /* - * Check if ACE is inheritable by this vnode - */ - if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || - !zfs_ace_can_use(vtype, iflags)) - continue; - - /* - * If owner@, group@, or everyone@ inheritable - * then zfs_acl_chmod() isn't needed. 
- */ - if ((aclinherit == ZFS_ACL_PASSTHROUGH || - aclinherit == ZFS_ACL_PASSTHROUGH_X) && - ((iflags & (ACE_OWNER|ACE_EVERYONE)) || - ((iflags & OWNING_GROUP) == OWNING_GROUP)) && - (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) - *need_chmod = B_FALSE; - - /* - * Strip inherited execute permission from file if - * not in mode - */ - if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && - !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { - access_mask &= ~ACE_EXECUTE; - } - - /* - * Strip write_acl and write_owner from permissions - * when inheriting an ACE - */ - if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { - access_mask &= ~RESTRICTED_CLEAR; - } - - ace_size = aclp->z_ops.ace_size(pacep); - aclnode = zfs_acl_node_alloc(ace_size); - list_insert_tail(&aclp->z_acl, aclnode); - acep = aclnode->z_acldata; - - zfs_set_ace(aclp, acep, access_mask, type, - who, iflags|ACE_INHERITED_ACE); - - /* - * Copy special opaque data if any - */ - if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) { - VERIFY((data2sz = aclp->z_ops.ace_data(acep, - &data2)) == data1sz); - bcopy(data1, data2, data2sz); - } - - aclp->z_acl_count++; - aclnode->z_ace_count++; - aclp->z_acl_bytes += aclnode->z_size; - newflags = aclp->z_ops.ace_flags_get(acep); - - /* - * If ACE is not to be inherited further, or if the vnode is - * not a directory, remove all inheritance flags - */ - if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { - newflags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - continue; - } - - /* - * This directory has an inheritable ACE - */ - aclp->z_hints |= ZFS_INHERIT_ACE; - - /* - * If only FILE_INHERIT is set then turn on - * inherit_only - */ - if ((iflags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { - newflags |= ACE_INHERIT_ONLY_ACE; - aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - } else { - newflags &= ~ACE_INHERIT_ONLY_ACE; - 
aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - } - } - - return (aclp); -} - -/* - * Create file system object initial permissions - * including inheritable ACEs. - * Also, create FUIDs for owner and group. - */ -int -zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) -{ - int error; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_acl_t *paclp; - gid_t gid; - boolean_t need_chmod = B_TRUE; - boolean_t trim = B_FALSE; - boolean_t inherited = B_FALSE; - - if ((flag & IS_ROOT_NODE) == 0) - ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); - else - ASSERT(dzp->z_vnode == NULL); - bzero(acl_ids, sizeof (zfs_acl_ids_t)); - acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); - - if (vsecp) - if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, - &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) - return (error); - /* - * Determine uid and gid. - */ - if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || - ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - acl_ids->z_fuid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, - ZFS_OWNER, &acl_ids->z_fuidp); - acl_ids->z_fgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, cr, - ZFS_GROUP, &acl_ids->z_fuidp); - gid = vap->va_gid; - } else { - acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, - cr, &acl_ids->z_fuidp); - acl_ids->z_fgid = 0; - if (vap->va_mask & AT_GID) { - acl_ids->z_fgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, - cr, ZFS_GROUP, &acl_ids->z_fuidp); - gid = vap->va_gid; - if (acl_ids->z_fgid != dzp->z_gid && - !groupmember(vap->va_gid, cr) && - secpolicy_vnode_create_gid(cr) != 0) - acl_ids->z_fgid = 0; - } - if (acl_ids->z_fgid == 0) { -#ifndef __FreeBSD_kernel__ - if (dzp->z_mode & S_ISGID) { -#endif - char *domain; - uint32_t rid; - - acl_ids->z_fgid = dzp->z_gid; - gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, - cr, ZFS_GROUP); - - if (zfsvfs->z_use_fuids && - IS_EPHEMERAL(acl_ids->z_fgid)) { - domain = 
zfs_fuid_idx_domain( - &zfsvfs->z_fuid_idx, - FUID_INDEX(acl_ids->z_fgid)); - rid = FUID_RID(acl_ids->z_fgid); - zfs_fuid_node_add(&acl_ids->z_fuidp, - domain, rid, - FUID_INDEX(acl_ids->z_fgid), - acl_ids->z_fgid, ZFS_GROUP); - } -#ifndef __FreeBSD_kernel__ - } else { - acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, cr, &acl_ids->z_fuidp); - gid = crgetgid(cr); - } -#endif - } - } - - /* - * If we're creating a directory, and the parent directory has the - * set-GID bit set, set in on the new directory. - * Otherwise, if the user is neither privileged nor a member of the - * file's new group, clear the file's set-GID bit. - */ - - if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && - (vap->va_type == VDIR)) { - acl_ids->z_mode |= S_ISGID; - } else { - if ((acl_ids->z_mode & S_ISGID) && - secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) - acl_ids->z_mode &= ~S_ISGID; - } - - if (acl_ids->z_aclp == NULL) { - mutex_enter(&dzp->z_acl_lock); - if (!(flag & IS_ROOT_NODE) && - (dzp->z_pflags & ZFS_INHERIT_ACE) && - !(dzp->z_pflags & ZFS_XATTR)) { - VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); - acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, - vap->va_type, paclp, acl_ids->z_mode, &need_chmod); - inherited = B_TRUE; - } else { - acl_ids->z_aclp = - zfs_acl_alloc(zfs_acl_version_zp(dzp)); - acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; - } - mutex_exit(&dzp->z_acl_lock); - - if (need_chmod) { - if (vap->va_type == VDIR) - acl_ids->z_aclp->z_hints |= - ZFS_ACL_AUTO_INHERIT; - - if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && - zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && - zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) - trim = B_TRUE; - zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, - trim, acl_ids->z_aclp); - } - } - - if (inherited || vsecp) { - acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, - acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, - acl_ids->z_fuid, acl_ids->z_fgid); - if (ace_trivial_common(acl_ids->z_aclp, 0, 
zfs_ace_walk) == 0) - acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; - } - - return (0); -} - -/* - * Free ACL and fuid_infop, but not the acl_ids structure - */ -void -zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) -{ - if (acl_ids->z_aclp) - zfs_acl_free(acl_ids->z_aclp); - if (acl_ids->z_fuidp) - zfs_fuid_info_free(acl_ids->z_fuidp); - acl_ids->z_aclp = NULL; - acl_ids->z_fuidp = NULL; -} - -boolean_t -zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) -{ - return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || - zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); -} - -/* - * Retrieve a file's ACL - */ -int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfs_acl_t *aclp; - ulong_t mask; - int error; - int count = 0; - int largeace = 0; - - mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | - VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); - - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) - return (error); - - mutex_enter(&zp->z_acl_lock); - - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - /* - * Scan ACL to determine number of ACEs - */ - if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { - void *zacep = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t type, iflags; - - while (zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type)) { - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - largeace++; - continue; - default: - count++; - } - } - vsecp->vsa_aclcnt = count; - } else - count = (int)aclp->z_acl_count; - - if (mask & VSA_ACECNT) { - vsecp->vsa_aclcnt = count; - } - - if (mask & VSA_ACE) { - size_t aclsz; - - aclsz = count * sizeof (ace_t) + - sizeof 
(ace_object_t) * largeace; - - vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); - vsecp->vsa_aclentsz = aclsz; - - if (aclp->z_version == ZFS_ACL_VERSION_FUID) - zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, - vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); - else { - zfs_acl_node_t *aclnode; - void *start = vsecp->vsa_aclentp; - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - bcopy(aclnode->z_acldata, start, - aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == - aclp->z_acl_bytes); - } - } - if (mask & VSA_ACE_ACLFLAGS) { - vsecp->vsa_aclflags = 0; - if (zp->z_pflags & ZFS_ACL_DEFAULTED) - vsecp->vsa_aclflags |= ACL_DEFAULTED; - if (zp->z_pflags & ZFS_ACL_PROTECTED) - vsecp->vsa_aclflags |= ACL_PROTECTED; - if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) - vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; - } - - mutex_exit(&zp->z_acl_lock); - - return (0); -} - -int -zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, - vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) -{ - zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - int aclcnt = vsecp->vsa_aclcnt; - int error; - - if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) - return (SET_ERROR(EINVAL)); - - aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); - - aclp->z_hints = 0; - aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, - (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, - aclcnt, &aclnode->z_size)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } else { - if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, - vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size, fuidp, cr)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } - aclp->z_acl_bytes = 
aclnode->z_size; - aclnode->z_ace_count = aclcnt; - aclp->z_acl_count = aclcnt; - list_insert_head(&aclp->z_acl, aclnode); - - /* - * If flags are being set then add them to z_hints - */ - if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { - if (vsecp->vsa_aclflags & ACL_PROTECTED) - aclp->z_hints |= ZFS_ACL_PROTECTED; - if (vsecp->vsa_aclflags & ACL_DEFAULTED) - aclp->z_hints |= ZFS_ACL_DEFAULTED; - if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - } - - *zaclp = aclp; - - return (0); -} - -/* - * Set a file's ACL - */ -int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); - dmu_tx_t *tx; - int error; - zfs_acl_t *aclp; - zfs_fuid_info_t *fuidp = NULL; - boolean_t fuid_dirtied; - uint64_t acl_obj; - - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if (zp->z_pflags & ZFS_IMMUTABLE) - return (SET_ERROR(EPERM)); - - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) - return (error); - - error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, - &aclp); - if (error) - return (error); - - /* - * If ACL wide flags aren't being set then preserve any - * existing flags. 
- */ - if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { - aclp->z_hints |= - (zp->z_pflags & V4_ACL_WIDE_FLAGS); - } -top: - mutex_enter(&zp->z_acl_lock); - - tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - /* - * If old version and ACL won't fit in bonus and we aren't - * upgrading then take out necessary DMU holds - */ - - if ((acl_obj = zfs_external_acl(zp)) != 0) { - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); - } - - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_NOWAIT); - if (error) { - mutex_exit(&zp->z_acl_lock); - - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - zfs_acl_free(aclp); - return (error); - } - - error = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT(error == 0); - ASSERT(zp->z_acl_cached == NULL); - zp->z_acl_cached = aclp; - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - zfs_log_acl(zilog, tx, zp, vsecp, fuidp); - - if (fuidp) - zfs_fuid_info_free(fuidp); - dmu_tx_commit(tx); - mutex_exit(&zp->z_acl_lock); - - return (error); -} - -/* - * Check accesses of interest (AoI) against attributes of the dataset - * such as read-only. Returns zero if no AoI conflict with dataset - * attributes, otherwise an appropriate errno is returned. 
- */ -static int -zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) -{ - if ((v4_mode & WRITE_MASK) && - (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)) || - (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { - return (SET_ERROR(EROFS)); - } - - /* - * Intentionally allow ZFS_READONLY through here. - * See zfs_zaccess_common(). - */ - if ((v4_mode & WRITE_MASK_DATA) && - (zp->z_pflags & ZFS_IMMUTABLE)) { - return (SET_ERROR(EPERM)); - } - -#ifdef illumos - if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && - (zp->z_pflags & ZFS_NOUNLINK)) { - return (SET_ERROR(EPERM)); - } -#else - /* - * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK - * (sunlnk) is set. We just don't allow directory removal, which is - * handled in zfs_zaccess_delete(). - */ - if ((v4_mode & ACE_DELETE) && - (zp->z_pflags & ZFS_NOUNLINK)) { - return (EPERM); - } -#endif - - if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && - (zp->z_pflags & ZFS_AV_QUARANTINED))) { - return (SET_ERROR(EACCES)); - } - - return (0); -} - -/* - * The primary usage of this function is to loop through all of the - * ACEs in the znode, determining what accesses of interest (AoI) to - * the caller are allowed or denied. The AoI are expressed as bits in - * the working_mode parameter. As each ACE is processed, bits covered - * by that ACE are removed from the working_mode. This removal - * facilitates two things. The first is that when the working mode is - * empty (= 0), we know we've looked at all the AoI. The second is - * that the ACE interpretation rules don't allow a later ACE to undo - * something granted or denied by an earlier ACE. Removing the - * discovered access or denial enforces this rule. At the end of - * processing the ACEs, all AoI that were found to be denied are - * placed into the working_mode, giving the caller a mask of denied - * accesses. 
Returns: - * 0 if all AoI granted - * EACCESS if the denied mask is non-zero - * other error if abnormal failure (e.g., IO error) - * - * A secondary usage of the function is to determine if any of the - * AoI are granted. If an ACE grants any access in - * the working_mode, we immediately short circuit out of the function. - * This mode is chosen by setting anyaccess to B_TRUE. The - * working_mode is not a denied access mask upon exit if the function - * is used in this manner. - */ -static int -zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - boolean_t anyaccess, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zfs_acl_t *aclp; - int error; - uid_t uid = crgetuid(cr); - uint64_t who; - uint16_t type, iflags; - uint16_t entry_type; - uint32_t access_mask; - uint32_t deny_mask = 0; - zfs_ace_hdr_t *acep = NULL; - boolean_t checkit; - uid_t gowner; - uid_t fowner; - - zfs_fuid_map_ids(zp, cr, &fowner, &gowner); - - mutex_enter(&zp->z_acl_lock); - - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - ASSERT(zp->z_acl_cached); - - while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type)) { - uint32_t mask_matched; - - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) - continue; - - /* Skip ACE if it does not affect any AoI */ - mask_matched = (access_mask & *working_mode); - if (!mask_matched) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - checkit = B_FALSE; - - switch (entry_type) { - case ACE_OWNER: - if (uid == fowner) - checkit = B_TRUE; - break; - case OWNING_GROUP: - who = gowner; - /*FALLTHROUGH*/ - case ACE_IDENTIFIER_GROUP: - checkit = zfs_groupmember(zfsvfs, who, cr); - break; - case ACE_EVERYONE: - checkit = B_TRUE; - break; - - /* USER Entry */ - default: - if (entry_type == 0) { - uid_t newid; - - newid = 
zfs_fuid_map_id(zfsvfs, who, cr, - ZFS_ACE_USER); - if (newid != IDMAP_WK_CREATOR_OWNER_UID && - uid == newid) - checkit = B_TRUE; - break; - } else { - mutex_exit(&zp->z_acl_lock); - return (SET_ERROR(EIO)); - } - } - - if (checkit) { - if (type == DENY) { - DTRACE_PROBE3(zfs__ace__denies, - znode_t *, zp, - zfs_ace_hdr_t *, acep, - uint32_t, mask_matched); - deny_mask |= mask_matched; - } else { - DTRACE_PROBE3(zfs__ace__allows, - znode_t *, zp, - zfs_ace_hdr_t *, acep, - uint32_t, mask_matched); - if (anyaccess) { - mutex_exit(&zp->z_acl_lock); - return (0); - } - } - *working_mode &= ~mask_matched; - } - - /* Are we done? */ - if (*working_mode == 0) - break; - } - - mutex_exit(&zp->z_acl_lock); - - /* Put the found 'denies' back on the working mode */ - if (deny_mask) { - *working_mode |= deny_mask; - return (SET_ERROR(EACCES)); - } else if (*working_mode) { - return (-1); - } - - return (0); -} - -/* - * Return true if any access whatsoever granted, we don't actually - * care what access is granted. - */ -boolean_t -zfs_has_access(znode_t *zp, cred_t *cr) -{ - uint32_t have = ACE_ALL_PERMS; - - if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { - uid_t owner; - - owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); - return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); - } - return (B_TRUE); -} - -static int -zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int err; - - *working_mode = v4_mode; - *check_privs = B_TRUE; - - /* - * Short circuit empty requests - */ - if (v4_mode == 0 || zfsvfs->z_replay) { - *working_mode = 0; - return (0); - } - - if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { - *check_privs = B_FALSE; - return (err); - } - - /* - * The caller requested that the ACL check be skipped. 
This - * would only happen if the caller checked VOP_ACCESS() with a - * 32 bit ACE mask and already had the appropriate permissions. - */ - if (skipaclchk) { - *working_mode = 0; - return (0); - } - - /* - * Note: ZFS_READONLY represents the "DOS R/O" attribute. - * When that flag is set, we should behave as if write access - * were not granted by anything in the ACL. In particular: - * We _must_ allow writes after opening the file r/w, then - * setting the DOS R/O attribute, and writing some more. - * (Similar to how you can write after fchmod(fd, 0444).) - * - * Therefore ZFS_READONLY is ignored in the dataset check - * above, and checked here as if part of the ACL check. - * Also note: DOS R/O is ignored for directories. - */ - if ((v4_mode & WRITE_MASK_DATA) && - (ZTOV(zp)->v_type != VDIR) && - (zp->z_pflags & ZFS_READONLY)) { - return (SET_ERROR(EPERM)); - } - - return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); -} - -static int -zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - cred_t *cr) -{ - if (*working_mode != ACE_WRITE_DATA) - return (SET_ERROR(EACCES)); - - return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, - check_privs, B_FALSE, cr)); -} - -/* - * Check if VEXEC is allowed. - * - * This routine is based on zfs_fastaccesschk_execute which has slowpath - * calling zfs_zaccess. This would be incorrect on FreeBSD (see - * zfs_freebsd_access for the difference). Thus this variant let's the - * caller handle the slowpath (if necessary). - * - * We only check for ZFS_NO_EXECS_DENIED and fail early. This routine can - * be extended to cover more cases, but the flag covers the majority. 
- */ -int -zfs_freebsd_fastaccesschk_execute(struct vnode *vp, cred_t *cr) -{ - boolean_t is_attr; - znode_t *zdp = VTOZ(vp); - - ASSERT_VOP_LOCKED(vp, __func__); - - if (zdp->z_pflags & ZFS_AV_QUARANTINED) - return (1); - - is_attr = ((zdp->z_pflags & ZFS_XATTR) && - (ZTOV(zdp)->v_type == VDIR)); - if (is_attr) - return (1); - - if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) - return (0); - - return (1); -} - -#ifdef illumos -int -zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) -{ - boolean_t owner = B_FALSE; - boolean_t groupmbr = B_FALSE; - boolean_t is_attr; - uid_t uid = crgetuid(cr); - int error; - - if (zdp->z_pflags & ZFS_AV_QUARANTINED) - return (SET_ERROR(EACCES)); - - is_attr = ((zdp->z_pflags & ZFS_XATTR) && - (ZTOV(zdp)->v_type == VDIR)); - if (is_attr) - goto slow; - - - mutex_enter(&zdp->z_acl_lock); - - if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } - - if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - - if (uid == zdp->z_uid) { - owner = B_TRUE; - if (zdp->z_mode & S_IXUSR) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } else { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - } - if (groupmember(zdp->z_gid, cr)) { - groupmbr = B_TRUE; - if (zdp->z_mode & S_IXGRP) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } else { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - } - if (!owner && !groupmbr) { - if (zdp->z_mode & S_IXOTH) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } - } - - mutex_exit(&zdp->z_acl_lock); - -slow: - DTRACE_PROBE(zfs__fastpath__execute__access__miss); - ZFS_ENTER(zdp->z_zfsvfs); - error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); - ZFS_EXIT(zdp->z_zfsvfs); - return (error); -} -#endif - -/* - * Determine whether Access should be granted/denied. - * - * The least priv subsystem is always consulted as a basic privilege - * can define any form of access. 
- */ -int -zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) -{ - uint32_t working_mode; - int error; - int is_attr; - boolean_t check_privs; - znode_t *xzp; - znode_t *check_zp = zp; - mode_t needed_bits; - uid_t owner; - - is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); - -#ifdef __FreeBSD_kernel__ - /* - * In FreeBSD, we don't care about permissions of individual ADS. - * Note that not checking them is not just an optimization - without - * this shortcut, EA operations may bogusly fail with EACCES. - */ - if (zp->z_pflags & ZFS_XATTR) - return (0); -#else - /* - * If attribute then validate against base file - */ - if (is_attr) { - uint64_t parent; - - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_PARENT(zp->z_zfsvfs), &parent, - sizeof (parent))) != 0) - return (error); - - if ((error = zfs_zget(zp->z_zfsvfs, - parent, &xzp)) != 0) { - return (error); - } - - check_zp = xzp; - - /* - * fixup mode to map to xattr perms - */ - - if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { - mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mode |= ACE_WRITE_NAMED_ATTRS; - } - - if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { - mode &= ~(ACE_READ_DATA|ACE_EXECUTE); - mode |= ACE_READ_NAMED_ATTRS; - } - } -#endif - - owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); - /* - * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC - * in needed_bits. Map the bits mapped by working_mode (currently - * missing) in missing_bits. - * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), - * needed_bits. 
- */ - needed_bits = 0; - - working_mode = mode; - if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - owner == crgetuid(cr)) - working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); - - if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| - ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) - needed_bits |= VREAD; - if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| - ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) - needed_bits |= VWRITE; - if (working_mode & ACE_EXECUTE) - needed_bits |= VEXEC; - - if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, - &check_privs, skipaclchk, cr)) == 0) { - if (is_attr) - VN_RELE(ZTOV(xzp)); - return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, - needed_bits, needed_bits)); - } - - if (error && !check_privs) { - if (is_attr) - VN_RELE(ZTOV(xzp)); - return (error); - } - - if (error && (flags & V_APPEND)) { - error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); - } - - if (error && check_privs) { - mode_t checkmode = 0; - - /* - * First check for implicit owner permission on - * read_acl/read_attributes - */ - - error = 0; - ASSERT(working_mode != 0); - - if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && - owner == crgetuid(cr))) - working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); - - if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| - ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) - checkmode |= VREAD; - if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| - ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) - checkmode |= VWRITE; - if (working_mode & ACE_EXECUTE) - checkmode |= VEXEC; - - error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, - needed_bits & ~checkmode, needed_bits); - - if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); - if (error == 0 && (working_mode & ACE_WRITE_ACL)) - error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); - - if (error == 0 && (working_mode & 
- (ACE_DELETE|ACE_DELETE_CHILD))) - error = secpolicy_vnode_remove(ZTOV(check_zp), cr); - - if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); - } - if (error == 0) { - /* - * See if any bits other than those already checked - * for are still present. If so then return EACCES - */ - if (working_mode & ~(ZFS_CHECKED_MASKS)) { - error = SET_ERROR(EACCES); - } - } - } else if (error == 0) { - error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, - needed_bits, needed_bits); - } - - - if (is_attr) - VN_RELE(ZTOV(xzp)); - - return (error); -} - -/* - * Translate traditional unix VREAD/VWRITE/VEXEC mode into - * native ACL format and call zfs_zaccess() - */ -int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) -{ - return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); -} - -/* - * Access function for secpolicy_vnode_setattr - */ -int -zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) -{ - int v4_mode = zfs_unix_to_v4(mode >> 6); - - return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); -} - -static int -zfs_delete_final_check(znode_t *zp, znode_t *dzp, - mode_t available_perms, cred_t *cr) -{ - int error; - uid_t downer; - - downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); - - error = secpolicy_vnode_access2(cr, ZTOV(dzp), - downer, available_perms, VWRITE|VEXEC); - - if (error == 0) - error = zfs_sticky_remove_access(dzp, zp, cr); - - return (error); -} - -/* - * Determine whether Access should be granted/deny, without - * consulting least priv subsystem. - * - * The following chart is the recommended NFSv4 enforcement for - * ability to delete an object. 
- * - * ------------------------------------------------------- - * | Parent Dir | Target Object Permissions | - * | permissions | | - * ------------------------------------------------------- - * | | ACL Allows | ACL Denies| Delete | - * | | Delete | Delete | unspecified| - * ------------------------------------------------------- - * | ACL Allows | Permit | Permit | Permit | - * | DELETE_CHILD | | - * ------------------------------------------------------- - * | ACL Denies | Permit | Deny | Deny | - * | DELETE_CHILD | | | | - * ------------------------------------------------------- - * | ACL specifies | | | | - * | only allow | Permit | Permit | Permit | - * | write and | | | | - * | execute | | | | - * ------------------------------------------------------- - * | ACL denies | | | | - * | write and | Permit | Deny | Deny | - * | execute | | | | - * ------------------------------------------------------- - * ^ - * | - * No search privilege, can't even look up file? - * - */ -int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) -{ - uint32_t dzp_working_mode = 0; - uint32_t zp_working_mode = 0; - int dzp_error, zp_error; - mode_t available_perms; - boolean_t dzpcheck_privs = B_TRUE; - boolean_t zpcheck_privs = B_TRUE; - - /* - * We want specific DELETE permissions to - * take precedence over WRITE/EXECUTE. We don't - * want an ACL such as this to mess us up. - * user:joe:write_data:deny,user:joe:delete:allow - * - * However, deny permissions may ultimately be overridden - * by secpolicy_vnode_access(). - * - * We will ask for all of the necessary permissions and then - * look at the working modes from the directory and target object - * to determine what was found. - */ - - if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) - return (SET_ERROR(EPERM)); - - /* - * First row - * If the directory permissions allow the delete, we are done. 
- */ - if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - /* - * If target object has delete permission then we are done - */ - if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, - &zpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - ASSERT(dzp_error && zp_error); - - if (!dzpcheck_privs) - return (dzp_error); - if (!zpcheck_privs) - return (zp_error); - - /* - * Second row - * - * If directory returns EACCES then delete_child was denied - * due to deny delete_child. In this case send the request through - * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() - * since that *could* allow the delete based on write/execute permission - * and we want delete permissions to override write/execute. - */ - - if (dzp_error == EACCES) - return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */ - - /* - * Third Row - * only need to see if we have write/execute on directory. - */ - - dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); - - if (dzp_error != 0 && !dzpcheck_privs) - return (dzp_error); - - /* - * Fourth row - */ - - available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; - available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; - - return (zfs_delete_final_check(zp, dzp, available_perms, cr)); - -} - -int -zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) -{ - int add_perm; - int error; - - if (szp->z_pflags & ZFS_AV_QUARANTINED) - return (SET_ERROR(EACCES)); - - add_perm = (ZTOV(szp)->v_type == VDIR) ? - ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; - - /* - * Rename permissions are combination of delete permission + - * add file/subdir permission. - * - * BSD operating systems also require write permission - * on the directory being moved from one parent directory - * to another. 
- */ - if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { - if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)) - return (error); - } - - /* - * first make sure we do the delete portion. - * - * If that succeeds then check for add_file/add_subdir permissions - */ - - if (error = zfs_zaccess_delete(sdzp, szp, cr)) - return (error); - - /* - * If we have a tzp, see if we can delete it? - */ - if (tzp) { - if (error = zfs_zaccess_delete(tdzp, tzp, cr)) - return (error); - } - - /* - * Now check for add permissions - */ - error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); - - return (error); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c deleted file mode 100644 index 6048eb124525..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include -#include -#include -#include -#include -#include - -void -zfs_oldace_byteswap(ace_t *ace, int ace_cnt) -{ - int i; - - for (i = 0; i != ace_cnt; i++, ace++) { - ace->a_who = BSWAP_32(ace->a_who); - ace->a_access_mask = BSWAP_32(ace->a_access_mask); - ace->a_flags = BSWAP_16(ace->a_flags); - ace->a_type = BSWAP_16(ace->a_type); - } -} - -/* - * swap ace_t and ace_oject_t - */ -void -zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) -{ - caddr_t end; - caddr_t ptr; - zfs_ace_t *zacep = NULL; - ace_t *acep; - uint16_t entry_type; - size_t entry_size; - int ace_type; - - end = (caddr_t)buf + size; - ptr = buf; - - while (ptr < end) { - if (zfs_layout) { - /* - * Avoid overrun. Embedded aces can have one - * of several sizes. We don't know exactly - * how many our present, only the size of the - * buffer containing them. That size may be - * larger than needed to hold the aces - * present. As long as we do not do any - * swapping beyond the end of our block we are - * okay. It it safe to swap any non-ace data - * within the block since it is just zeros. - */ - if (ptr + sizeof (zfs_ace_hdr_t) > end) { - break; - } - zacep = (zfs_ace_t *)ptr; - zacep->z_hdr.z_access_mask = - BSWAP_32(zacep->z_hdr.z_access_mask); - zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags); - ace_type = zacep->z_hdr.z_type = - BSWAP_16(zacep->z_hdr.z_type); - entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; - } else { - /* Overrun avoidance */ - if (ptr + sizeof (ace_t) > end) { - break; - } - acep = (ace_t *)ptr; - acep->a_access_mask = BSWAP_32(acep->a_access_mask); - acep->a_flags = BSWAP_16(acep->a_flags); - ace_type = acep->a_type = BSWAP_16(acep->a_type); - acep->a_who = BSWAP_32(acep->a_who); - entry_type = acep->a_flags & ACE_TYPE_FLAGS; - } - switch (entry_type) { - case ACE_OWNER: - case ACE_EVERYONE: - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): - entry_size = zfs_layout ? 
- sizeof (zfs_ace_hdr_t) : sizeof (ace_t); - break; - case ACE_IDENTIFIER_GROUP: - default: - /* Overrun avoidance */ - if (zfs_layout) { - if (ptr + sizeof (zfs_ace_t) <= end) { - zacep->z_fuid = BSWAP_64(zacep->z_fuid); - } else { - entry_size = sizeof (zfs_ace_t); - break; - } - } - switch (ace_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - entry_size = zfs_layout ? - sizeof (zfs_object_ace_t) : - sizeof (ace_object_t); - break; - default: - entry_size = zfs_layout ? sizeof (zfs_ace_t) : - sizeof (ace_t); - break; - } - } - ptr = ptr + entry_size; - } -} - -/* ARGSUSED */ -void -zfs_oldacl_byteswap(void *buf, size_t size) -{ - int cnt; - - /* - * Arggh, since we don't know how many ACEs are in - * the array, we have to swap the entire block - */ - - cnt = size / sizeof (ace_t); - - zfs_oldace_byteswap((ace_t *)buf, cnt); -} - -/* ARGSUSED */ -void -zfs_acl_byteswap(void *buf, size_t size) -{ - zfs_ace_byteswap(buf, size, B_TRUE); -} - -void -zfs_znode_byteswap(void *buf, size_t size) -{ - znode_phys_t *zp = buf; - - ASSERT(size >= sizeof (znode_phys_t)); - - zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); - zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); - zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); - zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); - zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); - zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); - zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); - zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); - zp->zp_gen = BSWAP_64(zp->zp_gen); - zp->zp_mode = BSWAP_64(zp->zp_mode); - zp->zp_size = BSWAP_64(zp->zp_size); - zp->zp_parent = BSWAP_64(zp->zp_parent); - zp->zp_links = BSWAP_64(zp->zp_links); - zp->zp_xattr = BSWAP_64(zp->zp_xattr); - zp->zp_rdev = BSWAP_64(zp->zp_rdev); - zp->zp_flags = BSWAP_64(zp->zp_flags); - zp->zp_uid = BSWAP_64(zp->zp_uid); - zp->zp_gid = BSWAP_64(zp->zp_gid); - zp->zp_zap = 
BSWAP_64(zp->zp_zap); - zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); - zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); - zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); - - zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); - zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); - zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); - zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); - if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { - zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], - ZFS_ACE_SPACE); - } else { - zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], - ACE_SLOT_CNT); - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c deleted file mode 100644 index 9775d842f55e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ /dev/null @@ -1,1364 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. 
- */ - -/* - * ZFS control directory (a.k.a. ".zfs") - * - * This directory provides a common location for all ZFS meta-objects. - * Currently, this is only the 'snapshot' directory, but this may expand in the - * future. The elements are built using the GFS primitives, as the hierarchy - * does not actually exist on disk. - * - * For 'snapshot', we don't want to have all snapshots always mounted, because - * this would take up a huge amount of space in /etc/mnttab. We have three - * types of objects: - * - * ctldir ------> snapshotdir -------> snapshot - * | - * | - * V - * mounted fs - * - * The 'snapshot' node contains just enough information to lookup '..' and act - * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we - * perform an automount of the underlying filesystem and return the - * corresponding vnode. - * - * All mounts are handled automatically by the kernel, but unmounts are - * (currently) handled from user land. The main reason is that there is no - * reliable way to auto-unmount the filesystem when it's "no longer in use". - * When the user unmounts a filesystem, we call zfsctl_unmount(), which - * unmounts any snapshots within the snapshot directory. - * - * The '.zfs', '.zfs/snapshot', and all directories created under - * '.zfs/snapshot' (ie: '.zfs/snapshot/') are all GFS nodes and - * share the same vfs_t as the head filesystem (what '.zfs' lives under). - * - * File systems mounted ontop of the GFS nodes '.zfs/snapshot/' - * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. - * However, vnodes within these mounted on file systems have their v_vfsp - * fields set to the head filesystem to make NFS happy (see - * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t - * so that it cannot be freed until all snapshots have been unmounted. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_namecheck.h" - -/* Common access mode for all virtual directories under the ctldir */ -const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH; - -/* - * "Synthetic" filesystem implementation. - */ - -/* - * Assert that A implies B. - */ -#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); - -static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); - -typedef struct sfs_node { - char sn_name[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t sn_parent_id; - uint64_t sn_id; -} sfs_node_t; - -/* - * Check the parent's ID as well as the node's to account for a chance - * that IDs originating from different domains (snapshot IDs, artifical - * IDs, znode IDs) may clash. - */ -static int -sfs_compare_ids(struct vnode *vp, void *arg) -{ - sfs_node_t *n1 = vp->v_data; - sfs_node_t *n2 = arg; - bool equal; - - equal = n1->sn_id == n2->sn_id && - n1->sn_parent_id == n2->sn_parent_id; - - /* Zero means equality. 
*/ - return (!equal); -} - -static int -sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, - uint64_t id, struct vnode **vpp) -{ - sfs_node_t search; - int err; - - search.sn_id = id; - search.sn_parent_id = parent_id; - err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp, - sfs_compare_ids, &search); - return (err); -} - -static int -sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, - uint64_t id, struct vnode **vpp) -{ - int err; - - KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); - err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp, - sfs_compare_ids, vp->v_data); - return (err); -} - -static void -sfs_vnode_remove(struct vnode *vp) -{ - vfs_hash_remove(vp); -} - -typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); - -static int -sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, - const char *tag, struct vop_vector *vops, - sfs_vnode_setup_fn setup, void *arg, - struct vnode **vpp) -{ - struct vnode *vp; - int error; - - error = sfs_vnode_get(mp, flags, parent_id, id, vpp); - if (error != 0 || *vpp != NULL) { - KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, - "sfs vnode with no data"); - return (error); - } - - /* Allocate a new vnode/inode. */ - error = getnewvnode(tag, mp, vops, &vp); - if (error != 0) { - *vpp = NULL; - return (error); - } - - /* - * Exclusively lock the vnode vnode while it's being constructed. 
- */ - lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); - error = insmntque(vp, mp); - if (error != 0) { - *vpp = NULL; - return (error); - } - - setup(vp, arg); - - error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); - if (error != 0 || *vpp != NULL) { - KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, - "sfs vnode with no data"); - return (error); - } - - *vpp = vp; - return (0); -} - -static void -sfs_print_node(sfs_node_t *node) -{ - printf("\tname = %s\n", node->sn_name); - printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); - printf("\tid = %ju\n", (uintmax_t)node->sn_id); -} - -static sfs_node_t * -sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) -{ - struct sfs_node *node; - - KASSERT(strlen(name) < sizeof(node->sn_name), - ("sfs node name is too long")); - KASSERT(size >= sizeof(*node), ("sfs node size is too small")); - node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); - strlcpy(node->sn_name, name, sizeof(node->sn_name)); - node->sn_parent_id = parent_id; - node->sn_id = id; - - return (node); -} - -static void -sfs_destroy_node(sfs_node_t *node) -{ - free(node, M_SFSNODES); -} - -static void * -sfs_reclaim_vnode(vnode_t *vp) -{ - sfs_node_t *node; - void *data; - - sfs_vnode_remove(vp); - data = vp->v_data; - vp->v_data = NULL; - return (data); -} - -static int -sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, - uio_t *uio, off_t *offp) -{ - struct dirent entry; - int error; - - /* Reset ncookies for subsequent use of vfs_read_dirent. 
*/ - if (ap->a_ncookies != NULL) - *ap->a_ncookies = 0; - - if (uio->uio_resid < sizeof(entry)) - return (SET_ERROR(EINVAL)); - - if (uio->uio_offset < 0) - return (SET_ERROR(EINVAL)); - if (uio->uio_offset == 0) { - entry.d_fileno = id; - entry.d_type = DT_DIR; - entry.d_name[0] = '.'; - entry.d_namlen = 1; - entry.d_reclen = sizeof(entry); - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) - return (SET_ERROR(error)); - } - - if (uio->uio_offset < sizeof(entry)) - return (SET_ERROR(EINVAL)); - if (uio->uio_offset == sizeof(entry)) { - entry.d_fileno = parent_id; - entry.d_type = DT_DIR; - entry.d_name[0] = '.'; - entry.d_name[1] = '.'; - entry.d_namlen = 2; - entry.d_reclen = sizeof(entry); - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) - return (SET_ERROR(error)); - } - - if (offp != NULL) - *offp = 2 * sizeof(entry); - return (0); -} - - -/* - * .zfs inode namespace - * - * We need to generate unique inode numbers for all files and directories - * within the .zfs pseudo-filesystem. We use the following scheme: - * - * ENTRY ZFSCTL_INODE - * .zfs 1 - * .zfs/snapshot 2 - * .zfs/snapshot/ objectid(snap) - */ -#define ZFSCTL_INO_SNAP(id) (id) - -static struct vop_vector zfsctl_ops_root; -static struct vop_vector zfsctl_ops_snapdir; -static struct vop_vector zfsctl_ops_snapshot; -static struct vop_vector zfsctl_ops_shares_dir; - -void -zfsctl_init(void) -{ -} - -void -zfsctl_fini(void) -{ -} - -boolean_t -zfsctl_is_node(vnode_t *vp) -{ - return (vn_matchops(vp, zfsctl_ops_root) || - vn_matchops(vp, zfsctl_ops_snapdir) || - vn_matchops(vp, zfsctl_ops_snapshot) || - vn_matchops(vp, zfsctl_ops_shares_dir)); - -} - -typedef struct zfsctl_root { - sfs_node_t node; - sfs_node_t *snapdir; - timestruc_t cmtime; -} zfsctl_root_t; - - -/* - * Create the '.zfs' directory. 
- */ -void -zfsctl_create(zfsvfs_t *zfsvfs) -{ - zfsctl_root_t *dot_zfs; - sfs_node_t *snapdir; - vnode_t *rvp; - uint64_t crtime[2]; - - ASSERT(zfsvfs->z_ctldir == NULL); - - snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT, - ZFSCTL_INO_SNAPDIR); - dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0, - ZFSCTL_INO_ROOT); - dot_zfs->snapdir = snapdir; - - VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); - VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), - &crtime, sizeof(crtime))); - ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); - vput(rvp); - - zfsvfs->z_ctldir = dot_zfs; -} - -/* - * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. - * The nodes must not have any associated vnodes by now as they should be - * vflush-ed. - */ -void -zfsctl_destroy(zfsvfs_t *zfsvfs) -{ - sfs_destroy_node(zfsvfs->z_ctldir->snapdir); - sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); - zfsvfs->z_ctldir = NULL; -} - -static int -zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, - struct vnode **vpp) -{ - return (VFS_ROOT(mp, flags, vpp)); -} - -static void -zfsctl_common_vnode_setup(vnode_t *vp, void *arg) -{ - ASSERT_VOP_ELOCKED(vp, __func__); - - /* We support shared locking. 
*/ - VN_LOCK_ASHARE(vp); - vp->v_type = VDIR; - vp->v_data = arg; -} - -static int -zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, - struct vnode **vpp) -{ - void *node; - int err; - - node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir; - err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, - zfsctl_common_vnode_setup, node, vpp); - return (err); -} - -static int -zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, - struct vnode **vpp) -{ - void *node; - int err; - - node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir; - err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", - &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); - return (err); -} - -/* - * Given a root znode, retrieve the associated .zfs directory. - * Add a hold to the vnode and return it. - */ -int -zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) -{ - vnode_t *vp; - int error; - - error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); - return (error); -} - -/* - * Common open routine. Disallow any write access. - */ -static int -zfsctl_common_open(struct vop_open_args *ap) -{ - int flags = ap->a_mode; - - if (flags & FWRITE) - return (SET_ERROR(EACCES)); - - return (0); -} - -/* - * Common close routine. Nothing to do here. - */ -/* ARGSUSED */ -static int -zfsctl_common_close(struct vop_close_args *ap) -{ - return (0); -} - -/* - * Common access routine. Disallow writes. - */ -static int -zfsctl_common_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - accmode_t a_accmode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - accmode_t accmode = ap->a_accmode; - - if (accmode & VWRITE) - return (SET_ERROR(EACCES)); - return (0); -} - -/* - * Common getattr function. Fill in basic information. 
- */ -static void -zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) -{ - timestruc_t now; - sfs_node_t *node; - - node = vp->v_data; - - vap->va_uid = 0; - vap->va_gid = 0; - vap->va_rdev = 0; - /* - * We are a purely virtual object, so we have no - * blocksize or allocated blocks. - */ - vap->va_blksize = 0; - vap->va_nblocks = 0; - vap->va_seq = 0; - vn_fsid(vp, vap); - vap->va_mode = zfsctl_ctldir_mode; - vap->va_type = VDIR; - /* - * We live in the now (for atime). - */ - gethrestime(&now); - vap->va_atime = now; - /* FreeBSD: Reset chflags(2) flags. */ - vap->va_flags = 0; - - vap->va_nodeid = node->sn_id; - - /* At least '.' and '..'. */ - vap->va_nlink = 2; -} - -static int -zfsctl_common_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - fid_t *fidp = (void *)ap->a_fid; - sfs_node_t *node = vp->v_data; - uint64_t object = node->sn_id; - zfid_short_t *zfid; - int i; - - zfid = (zfid_short_t *)fidp; - zfid->zf_len = SHORT_FID_LEN; - - for (i = 0; i < sizeof(zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* .zfs nodes always have a generation number of 0 */ - for (i = 0; i < sizeof(zfid->zf_gen); i++) - zfid->zf_gen[i] = 0; - - return (0); -} - -static int -zfsctl_common_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - (void) sfs_reclaim_vnode(vp); - return (0); -} - -static int -zfsctl_common_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - sfs_print_node(ap->a_vp->v_data); - return (0); -} - -/* - * Get root directory attributes. 
- */ -static int -zfsctl_root_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - zfsctl_root_t *node = vp->v_data; - - zfsctl_common_getattr(vp, vap); - vap->va_ctime = node->cmtime; - vap->va_mtime = vap->va_ctime; - vap->va_birthtime = vap->va_ctime; - vap->va_nlink += 1; /* snapdir */ - vap->va_size = vap->va_nlink; - return (0); -} - -/* - * When we lookup "." we still can be asked to lock it - * differently, can't we? - */ -int -zfsctl_relock_dot(vnode_t *dvp, int ltype) -{ - vref(dvp); - if (ltype != VOP_ISLOCKED(dvp)) { - if (ltype == LK_EXCLUSIVE) - vn_lock(dvp, LK_UPGRADE | LK_RETRY); - else /* if (ltype == LK_SHARED) */ - vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); - - /* Relock for the "." case may left us with reclaimed vnode. */ - if (VN_IS_DOOMED(dvp)) { - vrele(dvp); - return (SET_ERROR(ENOENT)); - } - } - return (0); -} - -/* - * Special case the handling of "..". 
- */ -int -zfsctl_root_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - cred_t *cr = ap->a_cnp->cn_cred; - int flags = ap->a_cnp->cn_flags; - int lkflags = ap->a_cnp->cn_lkflags; - int nameiop = ap->a_cnp->cn_nameiop; - int err; - int ltype; - - ASSERT(dvp->v_type == VDIR); - - if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) - return (SET_ERROR(ENOTSUP)); - - if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { - err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); - if (err == 0) - *vpp = dvp; - } else if ((flags & ISDOTDOT) != 0) { - err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, - lkflags, vpp); - } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { - err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); - } else { - err = SET_ERROR(ENOENT); - } - if (err != 0) - *vpp = NULL; - return (err); -} - -static int -zfsctl_root_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *ncookies; - u_long **a_cookies; - } */ *ap; -{ - struct dirent entry; - vnode_t *vp = ap->a_vp; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_root_t *node = vp->v_data; - uio_t *uio = ap->a_uio; - int *eofp = ap->a_eofflag; - off_t dots_offset; - int error; - - ASSERT(vp->v_type == VDIR); - - error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, - &dots_offset); - if (error != 0) { - if (error == ENAMETOOLONG) /* ran out of destination space */ - error = 0; - return (error); - } - if (uio->uio_offset != dots_offset) - return (SET_ERROR(EINVAL)); - - CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name)); - entry.d_fileno = node->snapdir->sn_id; - entry.d_type = DT_DIR; - strcpy(entry.d_name, node->snapdir->sn_name); - entry.d_namlen = strlen(entry.d_name); - 
entry.d_reclen = sizeof(entry); - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) { - if (error == ENAMETOOLONG) - error = 0; - return (SET_ERROR(error)); - } - if (eofp != NULL) - *eofp = 1; - return (0); -} - -static int -zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) -{ - static const char dotzfs_name[4] = ".zfs"; - vnode_t *dvp; - int error; - - if (*ap->a_buflen < sizeof (dotzfs_name)) - return (SET_ERROR(ENOMEM)); - - error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, - LK_SHARED, &dvp); - if (error != 0) - return (SET_ERROR(error)); - - VOP_UNLOCK(dvp); - *ap->a_vpp = dvp; - *ap->a_buflen -= sizeof (dotzfs_name); - bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); - return (0); -} - -static int -zfsctl_common_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - int *a_retval; - } */ *ap; -{ - /* - * We care about ACL variables so that user land utilities like ls - * can display them correctly. Since the ctldir's st_dev is set to be - * the same as the parent dataset, we must support all variables that - * it supports. 
- */ - switch (ap->a_name) { - case _PC_LINK_MAX: - *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); - return (0); - - case _PC_FILESIZEBITS: - *ap->a_retval = 64; - return (0); - - case _PC_MIN_HOLE_SIZE: - *ap->a_retval = (int)SPA_MINBLOCKSIZE; - return (0); - - case _PC_ACL_NFS4: - *ap->a_retval = 1; - return (0); - - case _PC_ACL_PATH_MAX: - *ap->a_retval = ACL_MAX_ENTRIES; - return (0); - - case _PC_NAME_MAX: - *ap->a_retval = NAME_MAX; - return (0); - - default: - return (vop_stdpathconf(ap)); - } -} - -/** - * Returns a trivial ACL - */ -int -zfsctl_common_getacl(ap) - struct vop_getacl_args /* { - struct vnode *vp; - acl_type_t a_type; - struct acl *a_aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - int i; - - if (ap->a_type != ACL_TYPE_NFS4) - return (EINVAL); - - acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); - /* - * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify - * attributes. That is not the case for the ctldir, so we must clear - * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs - * aren't supported by the ctldir. 
- */ - for (i = 0; i < ap->a_aclp->acl_cnt; i++) { - struct acl_entry *entry; - entry = &(ap->a_aclp->acl_entry[i]); - uint32_t old_perm = entry->ae_perm; - entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | - ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | - ACL_READ_NAMED_ATTRS ); - } - - return (0); -} - -static struct vop_vector zfsctl_ops_root = { - .vop_default = &default_vnodeops, - .vop_open = zfsctl_common_open, - .vop_close = zfsctl_common_close, - .vop_ioctl = VOP_EINVAL, - .vop_getattr = zfsctl_root_getattr, - .vop_access = zfsctl_common_access, - .vop_readdir = zfsctl_root_readdir, - .vop_lookup = zfsctl_root_lookup, - .vop_inactive = VOP_NULL, - .vop_reclaim = zfsctl_common_reclaim, - .vop_fid = zfsctl_common_fid, - .vop_print = zfsctl_common_print, - .vop_vptocnp = zfsctl_root_vptocnp, - .vop_pathconf = zfsctl_common_pathconf, - .vop_getacl = zfsctl_common_getacl, -}; -VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root); - -static int -zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) -{ - objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - - dmu_objset_name(os, zname); - if (strlen(zname) + 1 + strlen(name) >= len) - return (SET_ERROR(ENAMETOOLONG)); - (void) strcat(zname, "@"); - (void) strcat(zname, name); - return (0); -} - -static int -zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) -{ - objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - int err; - - err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); - return (err); -} - -/* - * Given a vnode get a root vnode of a filesystem mounted on top of - * the vnode, if any. The root vnode is referenced and locked. - * If no filesystem is mounted then the orinal vnode remains referenced - * and locked. If any error happens the orinal vnode is unlocked and - * released. 
- */ -static int -zfsctl_mounted_here(vnode_t **vpp, int flags) -{ - struct mount *mp; - int err; - - ASSERT_VOP_LOCKED(*vpp, __func__); - ASSERT3S((*vpp)->v_type, ==, VDIR); - - if ((mp = (*vpp)->v_mountedhere) != NULL) { - err = vfs_busy(mp, 0); - KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); - KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); - vput(*vpp); - err = VFS_ROOT(mp, flags, vpp); - vfs_unbusy(mp); - return (err); - } - return (EJUSTRETURN); -} - -typedef struct { - const char *snap_name; - uint64_t snap_id; -} snapshot_setup_arg_t; - -static void -zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) -{ - snapshot_setup_arg_t *ssa = arg; - sfs_node_t *node; - - ASSERT_VOP_ELOCKED(vp, __func__); - - node = sfs_alloc_node(sizeof(sfs_node_t), - ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); - zfsctl_common_vnode_setup(vp, node); - - /* We have to support recursive locking. */ - VN_LOCK_AREC(vp); -} - -/* - * Lookup entry point for the 'snapshot' directory. Try to open the - * snapshot if it exist, creating the pseudo filesystem vnode as necessary. - * Perform a mount of the associated dataset on top of the vnode. - * There are four possibilities: - * - the snapshot node and vnode do not exist - * - the snapshot vnode is covered by the mounted snapshot - * - the snapshot vnode is not covered yet, the mount operation is in progress - * - the snapshot vnode is not covered, because the snapshot has been unmounted - * The last two states are transient and should be relatively short-lived. 
- */ -int -zfsctl_snapdir_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - struct componentname *cnp = ap->a_cnp; - char name[NAME_MAX + 1]; - char fullname[ZFS_MAX_DATASET_NAME_LEN]; - char *mountpoint; - size_t mountpoint_len; - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - uint64_t snap_id; - int nameiop = cnp->cn_nameiop; - int lkflags = cnp->cn_lkflags; - int flags = cnp->cn_flags; - int err; - - ASSERT(dvp->v_type == VDIR); - - if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) - return (SET_ERROR(ENOTSUP)); - - if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { - err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); - if (err == 0) - *vpp = dvp; - return (err); - } - if (flags & ISDOTDOT) { - err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, - vpp); - return (err); - } - - if (cnp->cn_namelen >= sizeof(name)) - return (SET_ERROR(ENAMETOOLONG)); - - strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); - err = zfsctl_snapshot_lookup(dvp, name, &snap_id); - if (err != 0) - return (SET_ERROR(ENOENT)); - - for (;;) { - snapshot_setup_arg_t ssa; - - ssa.snap_name = name; - ssa.snap_id = snap_id; - err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, - snap_id, "zfs", &zfsctl_ops_snapshot, - zfsctl_snapshot_vnode_setup, &ssa, vpp); - if (err != 0) - return (err); - - /* Check if a new vnode has just been created. */ - if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) - break; - - /* - * Check if a snapshot is already mounted on top of the vnode. - */ - err = zfsctl_mounted_here(vpp, lkflags); - if (err != EJUSTRETURN) - return (err); - - /* - * If the vnode is not covered, then either the mount operation - * is in progress or the snapshot has already been unmounted - * but the vnode hasn't been inactivated and reclaimed yet. - * We can try to re-use the vnode in the latter case. 
- */ - VI_LOCK(*vpp); - if (((*vpp)->v_iflag & VI_MOUNT) == 0) { - /* Upgrade to exclusive lock in order to: - * - avoid race conditions - * - satisfy the contract of mount_snapshot() - */ - err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); - if (err == 0) - break; - } else { - VI_UNLOCK(*vpp); - } - - /* - * In this state we can loop on uncontested locks and starve - * the thread doing the lengthy, non-trivial mount operation. - * So, yield to prevent that from happening. - */ - vput(*vpp); - kern_yield(PRI_USER); - } - - VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname)); - - mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + - strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; - mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); - (void) snprintf(mountpoint, mountpoint_len, - "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", - dvp->v_vfsp->mnt_stat.f_mntonname, name); - - err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); - kmem_free(mountpoint, mountpoint_len); - if (err == 0) { - /* - * Fix up the root vnode mounted on .zfs/snapshot/. - * - * This is where we lie about our v_vfsp in order to - * make .zfs/snapshot/ accessible over NFS - * without requiring manual mounts of . - */ - ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); - VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; - - /* Clear the root flag (set via VFS_ROOT) as well. 
*/ - (*vpp)->v_vflag &= ~VV_ROOT; - } - - if (err != 0) - *vpp = NULL; - return (err); -} - -static int -zfsctl_snapdir_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *ncookies; - u_long **a_cookies; - } */ *ap; -{ - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - struct dirent entry; - vnode_t *vp = ap->a_vp; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - uio_t *uio = ap->a_uio; - int *eofp = ap->a_eofflag; - off_t dots_offset; - int error; - - ASSERT(vp->v_type == VDIR); - - error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, - &dots_offset); - if (error != 0) { - if (error == ENAMETOOLONG) /* ran out of destination space */ - error = 0; - return (error); - } - - ZFS_ENTER(zfsvfs); - for (;;) { - uint64_t cookie; - uint64_t id; - - cookie = uio->uio_offset - dots_offset; - - dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); - error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), - snapname, &id, &cookie, NULL); - dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); - if (error != 0) { - if (error == ENOENT) { - if (eofp != NULL) - *eofp = 1; - error = 0; - } - ZFS_EXIT(zfsvfs); - return (error); - } - - entry.d_fileno = id; - entry.d_type = DT_DIR; - strcpy(entry.d_name, snapname); - entry.d_namlen = strlen(entry.d_name); - entry.d_reclen = sizeof(entry); - /* NOTE: d_off is the offset for the *next* entry. 
*/ - entry.d_off = cookie + dots_offset; - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) { - if (error == ENAMETOOLONG) - error = 0; - ZFS_EXIT(zfsvfs); - return (SET_ERROR(error)); - } - uio->uio_offset = cookie + dots_offset; - } - /* NOTREACHED */ -} - -static int -zfsctl_snapdir_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - vattr_t *vap = ap->a_vap; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - dsl_dataset_t *ds; - sfs_node_t *node = vp->v_data; - uint64_t snap_count; - int err; - - ZFS_ENTER(zfsvfs); - ds = dmu_objset_ds(zfsvfs->z_os); - zfsctl_common_getattr(vp, vap); - vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); - vap->va_mtime = vap->va_ctime; - vap->va_birthtime = vap->va_ctime; - if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { - err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); - if (err != 0) { - ZFS_EXIT(zfsvfs); - return (err); - } - vap->va_nlink += snap_count; - } - vap->va_size = vap->va_nlink; - - ZFS_EXIT(zfsvfs); - return (0); -} - -static struct vop_vector zfsctl_ops_snapdir = { - .vop_default = &default_vnodeops, - .vop_open = zfsctl_common_open, - .vop_close = zfsctl_common_close, - .vop_getattr = zfsctl_snapdir_getattr, - .vop_access = zfsctl_common_access, - .vop_readdir = zfsctl_snapdir_readdir, - .vop_lookup = zfsctl_snapdir_lookup, - .vop_reclaim = zfsctl_common_reclaim, - .vop_fid = zfsctl_common_fid, - .vop_print = zfsctl_common_print, - .vop_pathconf = zfsctl_common_pathconf, - .vop_getacl = zfsctl_common_getacl, -}; -VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir); - -static int -zfsctl_snapshot_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - VERIFY(vrecycle(vp) == 1); - return (0); -} - -static int 
-zfsctl_snapshot_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - void *data = vp->v_data; - - sfs_reclaim_vnode(vp); - sfs_destroy_node(data); - return (0); -} - -static int -zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) -{ - struct mount *mp; - vnode_t *dvp; - vnode_t *vp; - sfs_node_t *node; - size_t len; - enum vgetstate vs; - int locked; - int error; - - vp = ap->a_vp; - node = vp->v_data; - len = strlen(node->sn_name); - if (*ap->a_buflen < len) - return (SET_ERROR(ENOMEM)); - - /* - * Prevent unmounting of the snapshot while the vnode lock - * is not held. That is not strictly required, but allows - * us to assert that an uncovered snapshot vnode is never - * "leaked". - */ - mp = vp->v_mountedhere; - if (mp == NULL) - return (SET_ERROR(ENOENT)); - error = vfs_busy(mp, 0); - KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); - - /* - * We can vput the vnode as we can now depend on the reference owned - * by the busied mp. But we also need to hold the vnode, because - * the reference may go after vfs_unbusy() which has to be called - * before we can lock the vnode again. - */ - locked = VOP_ISLOCKED(vp); - vs = vget_prep(vp); - vput(vp); - - /* Look up .zfs/snapshot, our parent. */ - error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); - if (error == 0) { - VOP_UNLOCK(dvp); - *ap->a_vpp = dvp; - *ap->a_buflen -= len; - bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); - } - vfs_unbusy(mp); - vget_finish(vp, locked | LK_RETRY, vs); - return (error); -} - -/* - * These VP's should never see the light of day. They should always - * be covered. 
- */ -static struct vop_vector zfsctl_ops_snapshot = { - .vop_default = NULL, /* ensure very restricted access */ - .vop_inactive = zfsctl_snapshot_inactive, - .vop_need_inactive = vop_stdneed_inactive, - .vop_reclaim = zfsctl_snapshot_reclaim, - .vop_vptocnp = zfsctl_snapshot_vptocnp, - .vop_lock1 = vop_stdlock, - .vop_unlock = vop_stdunlock, - .vop_islocked = vop_stdislocked, - .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ - .vop_print = zfsctl_common_print, -}; -VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot); - -int -zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) -{ - struct mount *mp; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *vp; - int error; - - ASSERT(zfsvfs->z_ctldir != NULL); - *zfsvfsp = NULL; - error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, - ZFSCTL_INO_SNAPDIR, objsetid, &vp); - if (error == 0 && vp != NULL) { - /* - * XXX Probably need to at least reference, if not busy, the mp. - */ - if (vp->v_mountedhere != NULL) - *zfsvfsp = vp->v_mountedhere->mnt_data; - vput(vp); - } - if (*zfsvfsp == NULL) - return (SET_ERROR(EINVAL)); - return (0); -} - -/* - * Unmount any snapshots for the given filesystem. This is called from - * zfs_umount() - if we have a ctldir, then go through and unmount all the - * snapshots. 
- */ -int -zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) -{ - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - struct mount *mp; - vnode_t *dvp; - vnode_t *vp; - sfs_node_t *node; - sfs_node_t *snap; - uint64_t cookie; - int error; - - ASSERT(zfsvfs->z_ctldir != NULL); - - cookie = 0; - for (;;) { - uint64_t id; - - dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); - error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), - snapname, &id, &cookie, NULL); - dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); - if (error != 0) { - if (error == ENOENT) - error = 0; - break; - } - - for (;;) { - error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, - ZFSCTL_INO_SNAPDIR, id, &vp); - if (error != 0 || vp == NULL) - break; - - mp = vp->v_mountedhere; - - /* - * v_mountedhere being NULL means that the - * (uncovered) vnode is in a transient state - * (mounting or unmounting), so loop until it - * settles down. - */ - if (mp != NULL) - break; - vput(vp); - } - if (error != 0) - break; - if (vp == NULL) - continue; /* no mountpoint, nothing to do */ - - /* - * The mount-point vnode is kept locked to avoid spurious EBUSY - * from a concurrent umount. - * The vnode lock must have recursive locking enabled. 
- */ - vfs_ref(mp); - error = dounmount(mp, fflags, curthread); - KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, - ("extra references after unmount")); - vput(vp); - if (error != 0) - break; - } - KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, - ("force unmounting failed")); - return (error); -} - diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c deleted file mode 100644 index a9cbe4dfe392..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
- */ - -#include - -list_t zfs_dbgmsgs; -int zfs_dbgmsg_size; -kmutex_t zfs_dbgmsgs_lock; -int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ - -void -zfs_dbgmsg_init(void) -{ - list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), - offsetof(zfs_dbgmsg_t, zdm_node)); - mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); -} - -void -zfs_dbgmsg_fini(void) -{ - zfs_dbgmsg_t *zdm; - - while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { - int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); - kmem_free(zdm, size); - zfs_dbgmsg_size -= size; - } - mutex_destroy(&zfs_dbgmsgs_lock); - ASSERT0(zfs_dbgmsg_size); -} - -/* - * Print these messages by running: - * echo ::zfs_dbgmsg | mdb -k - * - * Monitor these messages by running: - * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' - * - * When used with libzpool, monitor with: - * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' - */ -void -zfs_dbgmsg(const char *fmt, ...) -{ - int size; - va_list adx; - zfs_dbgmsg_t *zdm; - - va_start(adx, fmt); - size = vsnprintf(NULL, 0, fmt, adx); - va_end(adx); - - /* - * There is one byte of string in sizeof (zfs_dbgmsg_t), used - * for the terminating null. 
- */ - zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); - zdm->zdm_timestamp = gethrestime_sec(); - - va_start(adx, fmt); - (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); - va_end(adx); - - DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); - - mutex_enter(&zfs_dbgmsgs_lock); - list_insert_tail(&zfs_dbgmsgs, zdm); - zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; - while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { - zdm = list_remove_head(&zfs_dbgmsgs); - size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); - kmem_free(zdm, size); - zfs_dbgmsg_size -= size; - } - mutex_exit(&zfs_dbgmsgs_lock); -} - -void -zfs_dbgmsg_print(const char *tag) -{ - zfs_dbgmsg_t *zdm; - - (void) printf("ZFS_DBGMSG(%s):\n", tag); - mutex_enter(&zfs_dbgmsgs_lock); - for (zdm = list_head(&zfs_dbgmsgs); zdm; - zdm = list_next(&zfs_dbgmsgs, zdm)) - (void) printf("%s\n", zdm->zdm_msg); - mutex_exit(&zfs_dbgmsgs_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c deleted file mode 100644 index c3621a24d137..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ /dev/null @@ -1,968 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups - * of names after deciding which is the appropriate lookup interface. - */ -static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, - matchtype_t mt, uint64_t *zoid) -{ - int error; - - if (zfsvfs->z_norm) { - - /* - * In the non-mixed case we only expect there would ever - * be one match, but we need to use the normalizing lookup. - */ - error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, - zoid, mt, NULL, 0, NULL); - } else { - error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); - } - *zoid = ZFS_DIRENT_OBJ(*zoid); - - return (error); -} - -/* - * Look up a directory entry under a locked vnode. - * dvp being locked gives us a guarantee that there are no concurrent - * modification of the directory and, thus, if a node can be found in - * the directory, then it must not be unlinked. - * - * Input arguments: - * dzp - znode for directory - * name - name of entry to lock - * flag - ZNEW: if the entry already exists, fail with EEXIST. - * ZEXISTS: if the entry does not exist, fail with ENOENT. 
- * ZXATTR: we want dzp's xattr directory - * - * Output arguments: - * zpp - pointer to the znode for the entry (NULL if there isn't one) - * - * Return value: 0 on success or errno on failure. - * - * NOTE: Always checks for, and rejects, '.' and '..'. - */ -int -zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - matchtype_t mt = 0; - uint64_t zoid; - vnode_t *vp = NULL; - int error = 0; - - ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - - *zpp = NULL; - - /* - * Verify that we are not trying to lock '.', '..', or '.zfs' - */ - if (name[0] == '.' && - (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || - zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) - return (SET_ERROR(EEXIST)); - - /* - * Case sensitivity and normalization preferences are set when - * the file system is created. These are stored in the - * zfsvfs->z_case and zfsvfs->z_norm fields. These choices - * affect how we perform zap lookups. - * - * When matching we may need to normalize & change case according to - * FS settings. - * - * Note that a normalized match is necessary for a case insensitive - * filesystem when the lookup request is not exact because normalization - * can fold case independent of normalizing code point sequences. - * - * See the table above zfs_dropname(). - */ - if (zfsvfs->z_norm != 0) { - mt = MT_NORMALIZE; - - /* - * Determine if the match needs to honor the case specified in - * lookup, and if so keep track of that so that during - * normalization we don't fold case. - */ - if (zfsvfs->z_case == ZFS_CASE_MIXED) { - mt |= MT_MATCH_CASE; - } - } - - /* - * Only look in or update the DNLC if we are looking for the - * name on a file system that does not require normalization - * or case folding. We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name. 
- * - * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE - * because in that case MT_EXACT and MT_FIRST should produce exactly - * the same result. - */ - - if (dzp->z_unlinked && !(flag & ZXATTR)) - return (ENOENT); - if (flag & ZXATTR) { - error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, - sizeof (zoid)); - if (error == 0) - error = (zoid == 0 ? ENOENT : 0); - } else { - error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); - } - if (error) { - if (error != ENOENT || (flag & ZEXISTS)) { - return (error); - } - } else { - if (flag & ZNEW) { - return (SET_ERROR(EEXIST)); - } - error = zfs_zget(zfsvfs, zoid, zpp); - if (error) - return (error); - ASSERT(!(*zpp)->z_unlinked); - } - - return (0); -} - -static int -zfs_dd_lookup(znode_t *dzp, znode_t **zpp) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - znode_t *zp; - uint64_t parent; - int error; - - ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - - if (dzp->z_unlinked) - return (ENOENT); - - if ((error = sa_lookup(dzp->z_sa_hdl, - SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) - return (error); - - error = zfs_zget(zfsvfs, parent, &zp); - if (error == 0) - *zpp = zp; - return (error); -} - -int -zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - znode_t *zp; - int error = 0; - - ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - - if (dzp->z_unlinked) - return (SET_ERROR(ENOENT)); - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - *zpp = dzp; - } else if (name[0] == '.' && name[1] == '.' 
&& name[2] == 0) { - error = zfs_dd_lookup(dzp, zpp); - } else { - error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); - if (error == 0) { - dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ - *zpp = zp; - } - } - return (error); -} - -/* - * unlinked Set (formerly known as the "delete queue") Error Handling - * - * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we - * don't specify the name of the entry that we will be manipulating. We - * also fib and say that we won't be adding any new entries to the - * unlinked set, even though we might (this is to lower the minimum file - * size that can be deleted in a full filesystem). So on the small - * chance that the nlink list is using a fat zap (ie. has more than - * 2000 entries), we *may* not pre-read a block that's needed. - * Therefore it is remotely possible for some of the assertions - * regarding the unlinked set below to fail due to i/o error. On a - * nondebug system, this will result in the space being leaked. - */ -void -zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ASSERT(zp->z_unlinked); - ASSERT(zp->z_links == 0); - - VERIFY3U(0, ==, - zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); -} - -/* - * Clean up any znodes that had no links when we either crashed or - * (force) umounted the file system. - */ -void -zfs_unlinked_drain(zfsvfs_t *zfsvfs) -{ - zap_cursor_t zc; - zap_attribute_t zap; - dmu_object_info_t doi; - znode_t *zp; - dmu_tx_t *tx; - int error; - - /* - * Interate over the contents of the unlinked set. 
- */ - for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); - zap_cursor_retrieve(&zc, &zap) == 0; - zap_cursor_advance(&zc)) { - - /* - * See what kind of object we have in list - */ - - error = dmu_object_info(zfsvfs->z_os, - zap.za_first_integer, &doi); - if (error != 0) - continue; - - ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || - (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); - /* - * We need to re-mark these list entries for deletion, - * so we pull them back into core and set zp->z_unlinked. - */ - error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); - - /* - * We may pick up znodes that are already marked for deletion. - * This could happen during the purge of an extended attribute - * directory. All we need to do is skip over them, since they - * are already in the system marked z_unlinked. - */ - if (error != 0) - continue; - - vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); -#if defined(__FreeBSD__) - /* - * Due to changes in zfs_rmnode we need to make sure the - * link count is set to zero here. - */ - if (zp->z_links != 0) { - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - vput(ZTOV(zp)); - continue; - } - zp->z_links = 0; - VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &zp->z_links, sizeof (zp->z_links), tx)); - dmu_tx_commit(tx); - } -#endif - zp->z_unlinked = B_TRUE; - vput(ZTOV(zp)); - } - zap_cursor_fini(&zc); -} - -/* - * Delete the entire contents of a directory. Return a count - * of the number of entries that could not be deleted. If we encounter - * an error, return a count of at least one so that the directory stays - * in the unlinked set. - * - * NOTE: this function assumes that the directory is inactive, - * so there is no need to lock its entries before deletion. - * Also, it assumes the directory contents is *only* regular - * files. 
- */ -static int -zfs_purgedir(znode_t *dzp) -{ - zap_cursor_t zc; - zap_attribute_t zap; - znode_t *xzp; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - int skipped = 0; - int error; - - for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); - (error = zap_cursor_retrieve(&zc, &zap)) == 0; - zap_cursor_advance(&zc)) { - error = zfs_zget(zfsvfs, - ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); - if (error) { - skipped += 1; - continue; - } - - vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); - ASSERT((ZTOV(xzp)->v_type == VREG) || - (ZTOV(xzp)->v_type == VLNK)); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - /* Is this really needed ? */ - zfs_sa_upgrade_txholds(tx, xzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - vput(ZTOV(xzp)); - skipped += 1; - continue; - } - - error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); - if (error) - skipped += 1; - dmu_tx_commit(tx); - - vput(ZTOV(xzp)); - } - zap_cursor_fini(&zc); - if (error != ENOENT) - skipped += 1; - return (skipped); -} - -#if defined(__FreeBSD__) -extern taskq_t *zfsvfs_taskq; -#endif - -void -zfs_rmnode(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; - dmu_tx_t *tx; - uint64_t acl_obj; - uint64_t xattr_obj; - int error; - - ASSERT(zp->z_links == 0); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - - /* - * If this is an attribute directory, purge its contents. - */ - if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && - (zp->z_pflags & ZFS_XATTR)) { - if (zfs_purgedir(zp) != 0) { - /* - * Not enough space to delete some xattrs. - * Leave it in the unlinked set. - */ - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - } else { - /* - * Free up all the data in the file. 
We don't do this for - * XATTR directories because we need truncate and remove to be - * in the same tx, like in zfs_znode_delete(). Otherwise, if - * we crash here we'll end up with an inconsistent truncated - * zap object in the delete queue. Note a truncated file is - * harmless since it only contains user data. - */ - error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); - if (error) { - /* - * Not enough space or we were interrupted by unmount. - * Leave the file in the unlinked set. - */ - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - } - - /* - * If the file has extended attributes, we're going to unlink - * the xattr dir. - */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error) - xattr_obj = 0; - - acl_obj = zfs_external_acl(zp); - - /* - * Set up the final transaction. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - if (xattr_obj) - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); - if (acl_obj) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - /* - * Not enough space to delete the file. Leave it in the - * unlinked set, leaking it until the fs is remounted (at - * which point we'll call zfs_unlinked_drain() to process it). - */ - dmu_tx_abort(tx); - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - -#if defined(__FreeBSD__) - /* - * FreeBSD's implemention of zfs_zget requires a vnode to back it. - * This means that we could end up calling into getnewvnode while - * calling zfs_rmnode as a result of a prior call to getnewvnode - * trying to clear vnodes out of the cache. If this repeats we can - * recurse enough that we overflow our stack. 
To avoid this, we - * avoid calling zfs_zget on the xattr znode and instead simply add - * it to the unlinked set and schedule a call to zfs_unlinked_drain. - */ - if (xattr_obj) { - /* Add extended attribute directory to the unlinked set. */ - VERIFY3U(0, ==, - zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx)); - } -#else - if (xzp) { - ASSERT(error == 0); - xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - xzp->z_links = 0; /* no more links to it */ - VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &xzp->z_links, sizeof (xzp->z_links), tx)); - zfs_unlinked_add(xzp, tx); - } -#endif - - /* Remove this znode from the unlinked set */ - VERIFY3U(0, ==, - zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); - - zfs_znode_delete(zp, tx); - - dmu_tx_commit(tx); - -#if defined(__FreeBSD__) - if (xattr_obj) { - /* - * We're using the FreeBSD taskqueue API here instead of - * the Solaris taskq API since the FreeBSD API allows for a - * task to be enqueued multiple times but executed once. - */ - taskqueue_enqueue(zfsvfs_taskq->tq_queue, - &zfsvfs->z_unlinked_drain_task); - } -#endif -} - -static uint64_t -zfs_dirent(znode_t *zp, uint64_t mode) -{ - uint64_t de = zp->z_id; - - if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) - de |= IFTODT(mode) << 60; - return (de); -} - -/* - * Link zp into dzp. Can only fail if zp has been unlinked. 
- */ -int -zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, - int flag) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - vnode_t *vp = ZTOV(zp); - uint64_t value; - int zp_is_dir = (vp->v_type == VDIR); - sa_bulk_attr_t bulk[5]; - uint64_t mtime[2], ctime[2]; - int count = 0; - int error; - - ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); -#ifdef __FreeBSD__ - if (zp_is_dir) { - if (dzp->z_links >= ZFS_LINK_MAX) - return (SET_ERROR(EMLINK)); - } -#endif - if (!(flag & ZRENAMING)) { - if (zp->z_unlinked) { /* no new links to unlinked zp */ - ASSERT(!(flag & (ZNEW | ZEXISTS))); - return (SET_ERROR(ENOENT)); - } -#ifdef __FreeBSD__ - if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { - return (SET_ERROR(EMLINK)); - } -#endif - zp->z_links++; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, sizeof (zp->z_links)); - - } else { - ASSERT(zp->z_unlinked == 0); - } - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, - &dzp->z_id, sizeof (dzp->z_id)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (!(flag & ZNEW)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, - ctime, B_TRUE); - } - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT0(error); - - dzp->z_size++; - dzp->z_links += zp_is_dir; - count = 0; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &dzp->z_size, sizeof (dzp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &dzp->z_links, sizeof (dzp->z_links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &dzp->z_pflags, sizeof (dzp->z_pflags)); - zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, 
ctime, B_TRUE); - error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT0(error); - - value = zfs_dirent(zp, zp->z_mode); - error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, - 8, 1, &value, tx); - VERIFY0(error); - - return (0); -} - -/* - * The match type in the code for this function should conform to: - * - * ------------------------------------------------------------------------ - * fs type | z_norm | lookup type | match type - * ---------|-------------|-------------|---------------------------------- - * CS !norm | 0 | 0 | 0 (exact) - * CS norm | formX | 0 | MT_NORMALIZE - * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE - * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE - * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE - * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE - * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE - * CM !norm | upper | ZCILOOK | MT_NORMALIZE - * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE - * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE - * - * Abbreviations: - * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed - * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) - * formX = unicode normalization form set on fs creation - */ -static int -zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, - int flag) -{ - int error; - - if (zp->z_zfsvfs->z_norm) { - matchtype_t mt = MT_NORMALIZE; - - if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { - mt |= MT_MATCH_CASE; - } - - error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, - name, mt, tx); - } else { - error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); - } - - return (error); -} - -/* - * Unlink zp from dzp, and mark zp for deletion if this was the last link. - * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). - * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. 
- * If it's non-NULL, we use it to indicate whether the znode needs deletion, - * and it's the caller's job to do it. - */ -int -zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, - int flag, boolean_t *unlinkedp) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - vnode_t *vp = ZTOV(zp); - int zp_is_dir = (vp->v_type == VDIR); - boolean_t unlinked = B_FALSE; - sa_bulk_attr_t bulk[5]; - uint64_t mtime[2], ctime[2]; - int count = 0; - int error; - - ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - - if (!(flag & ZRENAMING)) { - - if (zp_is_dir && !zfs_dirempty(zp)) { -#ifdef illumos - return (SET_ERROR(EEXIST)); -#else - return (SET_ERROR(ENOTEMPTY)); -#endif - } - - /* - * If we get here, we are going to try to remove the object. - * First try removing the name from the directory; if that - * fails, return the error. - */ - error = zfs_dropname(dzp, name, zp, tx, flag); - if (error != 0) { - return (error); - } - - if (zp->z_links <= zp_is_dir) { - zfs_panic_recover("zfs: link count on vnode %p is %u, " - "should be at least %u", zp->z_vnode, - (int)zp->z_links, - zp_is_dir + 1); - zp->z_links = zp_is_dir + 1; - } - if (--zp->z_links == zp_is_dir) { - zp->z_unlinked = B_TRUE; - zp->z_links = 0; - unlinked = B_TRUE; - } else { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, sizeof (zp->z_pflags)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, - B_TRUE); - } - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &zp->z_links, sizeof (zp->z_links)); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - count = 0; - ASSERT0(error); - } else { - ASSERT(zp->z_unlinked == 0); - error = zfs_dropname(dzp, name, zp, tx, flag); - if (error != 0) - return (error); - } - - dzp->z_size--; /* one dirent removed */ - dzp->z_links -= zp_is_dir; /* ".." 
link from zp */ - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &dzp->z_links, sizeof (dzp->z_links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), - NULL, &dzp->z_size, sizeof (dzp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), - NULL, ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), - NULL, mtime, sizeof (mtime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); - zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); - error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT0(error); - - if (unlinkedp != NULL) - *unlinkedp = unlinked; - else if (unlinked) - zfs_unlinked_add(zp, tx); - - return (0); -} - -/* - * Indicate whether the directory is empty. - */ -boolean_t -zfs_dirempty(znode_t *dzp) -{ - return (dzp->z_size == 2); -} - -int -zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - dmu_tx_t *tx; - int error; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - uint64_t parent; - - *xvpp = NULL; - - /* - * In FreeBSD, access checking for creating an EA is being done - * in zfs_setextattr(), - */ -#ifndef __FreeBSD_kernel__ - if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) - return (error); -#endif - - if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, - &acl_ids)) != 0) - return (error); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - return (SET_ERROR(EDQUOT)); - } - - getnewvnode_reserve(); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); 
- dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - return (error); - } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - -#ifdef DEBUG - error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent)); - ASSERT(error == 0 && parent == zp->z_id); -#endif - - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, - sizeof (xzp->z_id), tx)); - - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, acl_ids.z_fuidp, vap); - - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - - *xvpp = ZTOV(xzp); - - return (0); -} - -/* - * Return a znode for the extended attribute directory for zp. - * ** If the directory does not already exist, it is created ** - * - * IN: zp - znode to obtain attribute directory from - * cr - credentials of caller - * flags - flags from the VOP_LOOKUP call - * - * OUT: xzpp - pointer to extended attribute znode - * - * RETURN: 0 on success - * error number on failure - */ -int -zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - vattr_t va; - int error; -top: - error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); - if (error) - return (error); - - if (xzp != NULL) { - *xvpp = ZTOV(xzp); - return (0); - } - - - if (!(flags & CREATE_XATTR_DIR)) { -#ifdef illumos - return (SET_ERROR(ENOENT)); -#else - return (SET_ERROR(ENOATTR)); -#endif - } - - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - return (SET_ERROR(EROFS)); - } - - /* - * The ability to 'create' files in an attribute - * directory comes from the write_xattr permission on the base file. - * - * The ability to 'search' an attribute directory requires - * read_xattr permission on the base file. - * - * Once in a directory the ability to read/write attributes - * is controlled by the permissions on the attribute file. 
- */ - va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; - va.va_type = VDIR; - va.va_mode = S_IFDIR | S_ISVTX | 0777; - zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); - - error = zfs_make_xattrdir(zp, &va, xvpp, cr); - - if (error == ERESTART) { - /* NB: we already did dmu_tx_wait() if necessary */ - goto top; - } - if (error == 0) - VOP_UNLOCK(*xvpp); - - return (error); -} - -/* - * Decide whether it is okay to remove within a sticky directory. - * - * In sticky directories, write access is not sufficient; - * you can remove entries from a directory only if: - * - * you own the directory, - * you own the entry, - * the entry is a plain file and you have write access, - * or you are privileged (checked in secpolicy...). - * - * The function returns 0 if remove access is granted. - */ -int -zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) -{ - uid_t uid; - uid_t downer; - uid_t fowner; - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - - if (zdp->z_zfsvfs->z_replay) - return (0); - - if ((zdp->z_mode & S_ISVTX) == 0) - return (0); - - downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); - fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); - - if ((uid = crgetuid(cr)) == downer || uid == fowner || - (ZTOV(zp)->v_type == VREG && - zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) - return (0); - else - return (secpolicy_vnode_remove(ZTOV(zp), cr)); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c deleted file mode 100644 index 398a3d04aa6e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* - * This general routine is responsible for generating all the different ZFS - * ereports. The payload is dependent on the class, and which arguments are - * supplied to the function: - * - * EREPORT POOL VDEV IO - * block X X X - * data X X - * device X X - * pool X - * - * If we are in a loading state, all errors are chained together by the same - * SPA-wide ENA (Error Numeric Association). - * - * For isolated I/O requests, we get the ENA from the zio_t. The propagation - * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want - * to chain together all ereports associated with a logical piece of data. For - * read I/Os, there are basically three 'types' of I/O, which form a roughly - * layered diagram: - * - * +---------------+ - * | Aggregate I/O | No associated logical data or device - * +---------------+ - * | - * V - * +---------------+ Reads associated with a piece of logical data. - * | Read I/O | This includes reads on behalf of RAID-Z, - * +---------------+ mirrors, gang blocks, retries, etc. 
- * | - * V - * +---------------+ Reads associated with a particular device, but - * | Physical I/O | no logical data. Issued as part of vdev caching - * +---------------+ and I/O aggregation. - * - * Note that 'physical I/O' here is not the same terminology as used in the rest - * of ZIO. Typically, 'physical I/O' simply means that there is no attached - * blockpointer. But I/O with no associated block pointer can still be related - * to a logical piece of data (i.e. RAID-Z requests). - * - * Purely physical I/O always have unique ENAs. They are not related to a - * particular piece of logical data, and therefore cannot be chained together. - * We still generate an ereport, but the DE doesn't correlate it with any - * logical piece of data. When such an I/O fails, the delegated I/O requests - * will issue a retry, which will trigger the 'real' ereport with the correct - * ENA. - * - * We keep track of the ENA for a ZIO chain through the 'io_logical' member. - * When a new logical I/O is issued, we set this to point to itself. Child I/Os - * then inherit this pointer, so that when it is first set subsequent failures - * will use the same ENA. For vdev cache fill and queue aggregation I/O, - * this pointer is set to NULL, and no ereport will be generated (since it - * doesn't actually correspond to any particular device or piece of data, - * and the caller will always retry without caching or queueing anyway). - * - * For checksum errors, we want to include more information about the actual - * error which occurs. Accordingly, we build an ereport when the error is - * noticed, but instead of sending it in immediately, we hang it off of the - * io_cksum_report field of the logical IO. When the logical IO completes - * (successfully or not), zfs_ereport_finish_checksum() is called with the - * good and bad versions of the buffer (if available), and we annotate the - * ereport with information about the differences. 
- */ -#ifdef _KERNEL -static void -zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, - uint64_t stateoroffset, uint64_t size) -{ - nvlist_t *ereport, *detector; - - uint64_t ena; - char class[64]; - - /* - * If we are doing a spa_tryimport() or in recovery mode, - * ignore errors. - */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) - return; - - /* - * If we are in the middle of opening a pool, and the previous attempt - * failed, don't bother logging any new ereports - we're just going to - * get the same diagnosis anyway. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed) - return; - - if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; - - /* - * Ignore any errors from speculative I/Os, as failure is an - * expected result. - */ - if (zio->io_flags & ZIO_FLAG_SPECULATIVE) - return; - - /* - * If this I/O is not a retry I/O, don't post an ereport. - * Otherwise, we risk making bad diagnoses based on B_FAILFAST - * I/Os. - */ - if (zio->io_error == EIO && - !(zio->io_flags & ZIO_FLAG_IO_RETRY)) - return; - - if (vd != NULL) { - /* - * If the vdev has already been marked as failing due - * to a failed probe, then ignore any subsequent I/O - * errors, as the DE will automatically fault the vdev - * on the first such failure. This also catches cases - * where vdev_remove_wanted is set and the device has - * not yet been asynchronously placed into the REMOVED - * state. - */ - if (zio->io_vd == vd && !vdev_accessible(vd, zio)) - return; - - /* - * Ignore checksum errors for reads from DTL regions of - * leaf vdevs. 
- */ - if (zio->io_type == ZIO_TYPE_READ && - zio->io_error == ECKSUM && - vd->vdev_ops->vdev_op_leaf && - vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) - return; - } - } - - /* - * For probe failure, we want to avoid posting ereports if we've - * already removed the device in the meantime. - */ - if (vd != NULL && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && - (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) - return; - - if ((ereport = fm_nvlist_create(NULL)) == NULL) - return; - - if ((detector = fm_nvlist_create(NULL)) == NULL) { - fm_nvlist_destroy(ereport, FM_NVA_FREE); - return; - } - - /* - * Serialize ereport generation - */ - mutex_enter(&spa->spa_errlist_lock); - - /* - * Determine the ENA to use for this event. If we are in a loading - * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use - * a root zio-wide ENA. Otherwise, simply use a unique ENA. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE) { - if (spa->spa_ena == 0) - spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); - ena = spa->spa_ena; - } else if (zio != NULL && zio->io_logical != NULL) { - if (zio->io_logical->io_ena == 0) - zio->io_logical->io_ena = - fm_ena_generate(0, FM_ENA_FMT1); - ena = zio->io_logical->io_ena; - } else { - ena = fm_ena_generate(0, FM_ENA_FMT1); - } - - /* - * Construct the full class, detector, and other standard FMA fields. - */ - (void) snprintf(class, sizeof (class), "%s.%s", - ZFS_ERROR_CLASS, subclass); - - fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), - vd != NULL ? vd->vdev_guid : 0); - - fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); - - /* - * Construct the per-ereport payload, depending on which parameters are - * passed in. - */ - - /* - * Generic payload members common to all ereports. 
- */ - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, - DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, - DATA_TYPE_UINT64, spa_guid(spa), - FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, - spa_load_state(spa), NULL); - - if (spa != NULL) { - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, - DATA_TYPE_STRING, - spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? - FM_EREPORT_FAILMODE_WAIT : - spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? - FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, - NULL); - } - - if (vd != NULL) { - vdev_t *pvd = vd->vdev_parent; - - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, - DATA_TYPE_UINT64, vd->vdev_guid, - FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, - DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); - if (vd->vdev_path != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, - DATA_TYPE_STRING, vd->vdev_path, NULL); - if (vd->vdev_devid != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, - DATA_TYPE_STRING, vd->vdev_devid, NULL); - if (vd->vdev_fru != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, - DATA_TYPE_STRING, vd->vdev_fru, NULL); - - if (pvd != NULL) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, - DATA_TYPE_UINT64, pvd->vdev_guid, - FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, - DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, - NULL); - if (pvd->vdev_path) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, - DATA_TYPE_STRING, pvd->vdev_path, NULL); - if (pvd->vdev_devid) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, - DATA_TYPE_STRING, pvd->vdev_devid, NULL); - } - } - - if (zio != NULL) { - /* - * Payload common to all I/Os. 
- */ - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, - DATA_TYPE_INT32, zio->io_error, NULL); - - /* - * If the 'size' parameter is non-zero, it indicates this is a - * RAID-Z or other I/O where the physical offset and length are - * provided for us, instead of within the zio_t. - */ - if (vd != NULL) { - if (size) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - DATA_TYPE_UINT64, stateoroffset, - FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, - DATA_TYPE_UINT64, size, NULL); - else - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - DATA_TYPE_UINT64, zio->io_offset, - FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, - DATA_TYPE_UINT64, zio->io_size, NULL); - } - - /* - * Payload for I/Os with corresponding logical information. - */ - if (zio->io_logical != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_objset, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_object, - FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, - DATA_TYPE_INT64, - zio->io_logical->io_bookmark.zb_level, - FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_blkid, NULL); - } else if (vd != NULL) { - /* - * If we have a vdev but no zio, this is a device fault, and the - * 'stateoroffset' parameter indicates the previous state of the - * vdev. 
- */ - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, - DATA_TYPE_UINT64, stateoroffset, NULL); - } - - mutex_exit(&spa->spa_errlist_lock); - - *ereport_out = ereport; - *detector_out = detector; -} - -/* if it's <= 128 bytes, save the corruption directly */ -#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) - -#define MAX_RANGES 16 - -typedef struct zfs_ecksum_info { - /* histograms of set and cleared bits by bit number in a 64-bit word */ - uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; - - /* inline arrays of bits set and cleared. */ - uint64_t zei_bits_set[ZFM_MAX_INLINE]; - uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; - - /* - * for each range, the number of bits set and cleared. The Hamming - * distance between the good and bad buffers is the sum of them all. - */ - uint32_t zei_range_sets[MAX_RANGES]; - uint32_t zei_range_clears[MAX_RANGES]; - - struct zei_ranges { - uint32_t zr_start; - uint32_t zr_end; - } zei_ranges[MAX_RANGES]; - - size_t zei_range_count; - uint32_t zei_mingap; - uint32_t zei_allowed_mingap; - -} zfs_ecksum_info_t; - -static void -update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) -{ - size_t i; - size_t bits = 0; - uint64_t value = BE_64(value_arg); - - /* We store the bits in big-endian (largest-first) order */ - for (i = 0; i < 64; i++) { - if (value & (1ull << i)) { - hist[63 - i]++; - ++bits; - } - } - /* update the count of bits changed */ - *count += bits; -} - -/* - * We've now filled up the range array, and need to increase "mingap" and - * shrink the range list accordingly. zei_mingap is always the smallest - * distance between array entries, so we set the new_allowed_gap to be - * one greater than that. We then go through the list, joining together - * any ranges which are closer than the new_allowed_gap. - * - * By construction, there will be at least one. 
We also update zei_mingap - * to the new smallest gap, to prepare for our next invocation. - */ -static void -shrink_ranges(zfs_ecksum_info_t *eip) -{ - uint32_t mingap = UINT32_MAX; - uint32_t new_allowed_gap = eip->zei_mingap + 1; - - size_t idx, output; - size_t max = eip->zei_range_count; - - struct zei_ranges *r = eip->zei_ranges; - - ASSERT3U(eip->zei_range_count, >, 0); - ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); - - output = idx = 0; - while (idx < max - 1) { - uint32_t start = r[idx].zr_start; - uint32_t end = r[idx].zr_end; - - while (idx < max - 1) { - idx++; - - uint32_t nstart = r[idx].zr_start; - uint32_t nend = r[idx].zr_end; - - uint32_t gap = nstart - end; - if (gap < new_allowed_gap) { - end = nend; - continue; - } - if (gap < mingap) - mingap = gap; - break; - } - r[output].zr_start = start; - r[output].zr_end = end; - output++; - } - ASSERT3U(output, <, eip->zei_range_count); - eip->zei_range_count = output; - eip->zei_mingap = mingap; - eip->zei_allowed_mingap = new_allowed_gap; -} - -static void -add_range(zfs_ecksum_info_t *eip, int start, int end) -{ - struct zei_ranges *r = eip->zei_ranges; - size_t count = eip->zei_range_count; - - if (count >= MAX_RANGES) { - shrink_ranges(eip); - count = eip->zei_range_count; - } - if (count == 0) { - eip->zei_mingap = UINT32_MAX; - eip->zei_allowed_mingap = 1; - } else { - int gap = start - r[count - 1].zr_end; - - if (gap < eip->zei_allowed_mingap) { - r[count - 1].zr_end = end; - return; - } - if (gap < eip->zei_mingap) - eip->zei_mingap = gap; - } - r[count].zr_start = start; - r[count].zr_end = end; - eip->zei_range_count++; -} - -static size_t -range_total_size(zfs_ecksum_info_t *eip) -{ - struct zei_ranges *r = eip->zei_ranges; - size_t count = eip->zei_range_count; - size_t result = 0; - size_t idx; - - for (idx = 0; idx < count; idx++) - result += (r[idx].zr_end - r[idx].zr_start); - - return (result); -} - -static zfs_ecksum_info_t * -annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t 
*info, - const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, - boolean_t drop_if_identical) -{ - const uint64_t *good = (const uint64_t *)goodbuf; - const uint64_t *bad = (const uint64_t *)badbuf; - - uint64_t allset = 0; - uint64_t allcleared = 0; - - size_t nui64s = size / sizeof (uint64_t); - - size_t inline_size; - int no_inline = 0; - size_t idx; - size_t range; - - size_t offset = 0; - ssize_t start = -1; - - zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); - - /* don't do any annotation for injected checksum errors */ - if (info != NULL && info->zbc_injected) - return (eip); - - if (info != NULL && info->zbc_has_cksum) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_expected) / sizeof (uint64_t), - (uint64_t *)&info->zbc_expected, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_actual) / sizeof (uint64_t), - (uint64_t *)&info->zbc_actual, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, - DATA_TYPE_STRING, - info->zbc_checksum_name, - NULL); - - if (info->zbc_byteswapped) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, - DATA_TYPE_BOOLEAN, 1, - NULL); - } - } - - if (badbuf == NULL || goodbuf == NULL) - return (eip); - - ASSERT3U(nui64s, <=, UINT32_MAX); - ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(size, <=, UINT32_MAX); - - /* build up the range list by comparing the two buffers. 
*/ - for (idx = 0; idx < nui64s; idx++) { - if (good[idx] == bad[idx]) { - if (start == -1) - continue; - - add_range(eip, start, idx); - start = -1; - } else { - if (start != -1) - continue; - - start = idx; - } - } - if (start != -1) - add_range(eip, start, idx); - - /* See if it will fit in our inline buffers */ - inline_size = range_total_size(eip); - if (inline_size > ZFM_MAX_INLINE) - no_inline = 1; - - /* - * If there is no change and we want to drop if the buffers are - * identical, do so. - */ - if (inline_size == 0 && drop_if_identical) { - kmem_free(eip, sizeof (*eip)); - return (NULL); - } - - /* - * Now walk through the ranges, filling in the details of the - * differences. Also convert our uint64_t-array offsets to byte - * offsets. - */ - for (range = 0; range < eip->zei_range_count; range++) { - size_t start = eip->zei_ranges[range].zr_start; - size_t end = eip->zei_ranges[range].zr_end; - - for (idx = start; idx < end; idx++) { - uint64_t set, cleared; - - // bits set in bad, but not in good - set = ((~good[idx]) & bad[idx]); - // bits set in good, but not in bad - cleared = (good[idx] & (~bad[idx])); - - allset |= set; - allcleared |= cleared; - - if (!no_inline) { - ASSERT3U(offset, <, inline_size); - eip->zei_bits_set[offset] = set; - eip->zei_bits_cleared[offset] = cleared; - offset++; - } - - update_histogram(set, eip->zei_histogram_set, - &eip->zei_range_sets[range]); - update_histogram(cleared, eip->zei_histogram_cleared, - &eip->zei_range_clears[range]); - } - - /* convert to byte offsets */ - eip->zei_ranges[range].zr_start *= sizeof (uint64_t); - eip->zei_ranges[range].zr_end *= sizeof (uint64_t); - } - eip->zei_allowed_mingap *= sizeof (uint64_t); - inline_size *= sizeof (uint64_t); - - /* fill in ereport */ - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, - DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, - (uint32_t *)eip->zei_ranges, - FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, - DATA_TYPE_UINT32, 
eip->zei_allowed_mingap, - FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, - DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, - FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, - DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, - NULL); - - if (!no_inline) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, - DATA_TYPE_UINT8_ARRAY, - inline_size, (uint8_t *)eip->zei_bits_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, - DATA_TYPE_UINT8_ARRAY, - inline_size, (uint8_t *)eip->zei_bits_cleared, - NULL); - } else { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, - NULL); - } - return (eip); -} -#endif - -void -zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, - uint64_t stateoroffset, uint64_t size) -{ -#ifdef _KERNEL - nvlist_t *ereport = NULL; - nvlist_t *detector = NULL; - - zfs_ereport_start(&ereport, &detector, - subclass, spa, vd, zio, stateoroffset, size); - - if (ereport == NULL) - return; - - fm_ereport_post(ereport, EVCH_SLEEP); - - fm_nvlist_destroy(ereport, FM_NVA_FREE); - fm_nvlist_destroy(detector, FM_NVA_FREE); -#endif -} - -void -zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t offset, uint64_t length, void *arg, - zio_bad_cksum_t *info) -{ - zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); - - if (zio->io_vsd != NULL) - zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); - else - zio_vsd_default_cksum_report(zio, report, arg); - - /* copy the checksum failure information if it was provided */ - if (info != NULL) { - report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); - bcopy(info, report->zcr_ckinfo, sizeof (*info)); - } - - report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; - report->zcr_length = length; 
- -#ifdef _KERNEL - zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, - FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); - - if (report->zcr_ereport == NULL) { - report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); - if (report->zcr_ckinfo != NULL) { - kmem_free(report->zcr_ckinfo, - sizeof (*report->zcr_ckinfo)); - } - kmem_free(report, sizeof (*report)); - return; - } -#endif - - mutex_enter(&spa->spa_errlist_lock); - report->zcr_next = zio->io_logical->io_cksum_report; - zio->io_logical->io_cksum_report = report; - mutex_exit(&spa->spa_errlist_lock); -} - -void -zfs_ereport_finish_checksum(zio_cksum_report_t *report, - const void *good_data, const void *bad_data, boolean_t drop_if_identical) -{ -#ifdef _KERNEL - zfs_ecksum_info_t *info = NULL; - info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, - good_data, bad_data, report->zcr_length, drop_if_identical); - - if (info != NULL) - fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); - - fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); - fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); - report->zcr_ereport = report->zcr_detector = NULL; - - if (info != NULL) - kmem_free(info, sizeof (*info)); -#endif -} - -void -zfs_ereport_free_checksum(zio_cksum_report_t *rpt) -{ -#ifdef _KERNEL - if (rpt->zcr_ereport != NULL) { - fm_nvlist_destroy(rpt->zcr_ereport, - FM_NVA_FREE); - fm_nvlist_destroy(rpt->zcr_detector, - FM_NVA_FREE); - } -#endif - rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); - - if (rpt->zcr_ckinfo != NULL) - kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); - - kmem_free(rpt, sizeof (*rpt)); -} - -void -zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) -{ -#ifdef _KERNEL - fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); -#endif -} - -void -zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t offset, uint64_t length, - const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) -{ -#ifdef _KERNEL - nvlist_t 
*ereport = NULL; - nvlist_t *detector = NULL; - zfs_ecksum_info_t *info; - - zfs_ereport_start(&ereport, &detector, - FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); - - if (ereport == NULL) - return; - - info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, - B_FALSE); - - if (info != NULL) - fm_ereport_post(ereport, EVCH_SLEEP); - - fm_nvlist_destroy(ereport, FM_NVA_FREE); - fm_nvlist_destroy(detector, FM_NVA_FREE); - - if (info != NULL) - kmem_free(info, sizeof (*info)); -#endif -} - -static void -zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) -{ -#ifdef _KERNEL - nvlist_t *resource; - char class[64]; - - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) - return; - - if ((resource = fm_nvlist_create(NULL)) == NULL) - return; - - (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, name); - VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); - VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); - VERIFY(nvlist_add_uint64(resource, - FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); - if (vd) - VERIFY(nvlist_add_uint64(resource, - FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); - - fm_ereport_post(resource, EVCH_SLEEP); - - fm_nvlist_destroy(resource, FM_NVA_FREE); -#endif -} - -/* - * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev - * has been removed from the system. This will cause the DE to ignore any - * recent I/O errors, inferring that they are due to the asynchronous device - * removal. - */ -void -zfs_post_remove(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); -} - -/* - * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool - * has the 'autoreplace' property set, and therefore any broken vdevs will be - * handled by higher level logic, and no vdev fault should be generated. 
- */ -void -zfs_post_autoreplace(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); -} - -/* - * The 'resource.fs.zfs.statechange' event is an internal signal that the - * given vdev has transitioned its state to DEGRADED or HEALTHY. This will - * cause the retire agent to repair any outstanding fault management cases - * open because the device was not found (fault.fs.zfs.device). - */ -void -zfs_post_state_change(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c deleted file mode 100644 index 581b6b1bfb64..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ /dev/null @@ -1,762 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#include -#include -#endif -#include - -/* - * FUID Domain table(s). 
- * - * The FUID table is stored as a packed nvlist of an array - * of nvlists which contain an index, domain string and offset - * - * During file system initialization the nvlist(s) are read and - * two AVL trees are created. One tree is keyed by the index number - * and the other by the domain string. Nodes are never removed from - * trees, but new entries may be added. If a new entry is added then - * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then - * be responsible for calling zfs_fuid_sync() to sync the changes to disk. - * - */ - -#define FUID_IDX "fuid_idx" -#define FUID_DOMAIN "fuid_domain" -#define FUID_OFFSET "fuid_offset" -#define FUID_NVP_ARRAY "fuid_nvlist" - -typedef struct fuid_domain { - avl_node_t f_domnode; - avl_node_t f_idxnode; - ksiddomain_t *f_ksid; - uint64_t f_idx; -} fuid_domain_t; - -static char *nulldomain = ""; - -/* - * Compare two indexes. - */ -static int -idx_compare(const void *arg1, const void *arg2) -{ - const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; - const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - - return (AVL_CMP(node1->f_idx, node2->f_idx)); -} - -/* - * Compare two domain strings. - */ -static int -domain_compare(const void *arg1, const void *arg2) -{ - const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; - const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - int val; - - val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - - return (AVL_ISIGN(val)); -} - -void -zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) -{ - avl_create(idx_tree, idx_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); - avl_create(domain_tree, domain_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); -} - -/* - * load initial fuid domain and idx trees. This function is used by - * both the kernel and zdb. 
- */ -uint64_t -zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, - avl_tree_t *domain_tree) -{ - dmu_buf_t *db; - uint64_t fuid_size; - - ASSERT(fuid_obj != 0); - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, - FTAG, &db)); - fuid_size = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - if (fuid_size) { - nvlist_t **fuidnvp; - nvlist_t *nvp = NULL; - uint_t count; - char *packed; - int i; - - packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, - fuid_size, packed, DMU_READ_PREFETCH) == 0); - VERIFY(nvlist_unpack(packed, fuid_size, - &nvp, 0) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, - &fuidnvp, &count) == 0); - - for (i = 0; i != count; i++) { - fuid_domain_t *domnode; - char *domain; - uint64_t idx; - - VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, - &domain) == 0); - VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, - &idx) == 0); - - domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); - - domnode->f_idx = idx; - domnode->f_ksid = ksid_lookupdomain(domain); - avl_add(idx_tree, domnode); - avl_add(domain_tree, domnode); - } - nvlist_free(nvp); - kmem_free(packed, fuid_size); - } - return (fuid_size); -} - -void -zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) -{ - fuid_domain_t *domnode; - void *cookie; - - cookie = NULL; - while (domnode = avl_destroy_nodes(domain_tree, &cookie)) - ksiddomain_rele(domnode->f_ksid); - - avl_destroy(domain_tree); - cookie = NULL; - while (domnode = avl_destroy_nodes(idx_tree, &cookie)) - kmem_free(domnode, sizeof (fuid_domain_t)); - avl_destroy(idx_tree); -} - -char * -zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) -{ - fuid_domain_t searchnode, *findnode; - avl_index_t loc; - - searchnode.f_idx = idx; - - findnode = avl_find(idx_tree, &searchnode, &loc); - - return (findnode ? findnode->f_ksid->kd_name : nulldomain); -} - -#ifdef _KERNEL -/* - * Load the fuid table(s) into memory. 
- */ -static void -zfs_fuid_init(zfsvfs_t *zfsvfs) -{ - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - - if (zfsvfs->z_fuid_loaded) { - rw_exit(&zfsvfs->z_fuid_lock); - return; - } - - zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - - (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (zfsvfs->z_fuid_obj != 0) { - zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, - zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, - &zfsvfs->z_fuid_domain); - } - - zfsvfs->z_fuid_loaded = B_TRUE; - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * sync out AVL trees to persistent storage. - */ -void -zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) -{ - nvlist_t *nvp; - nvlist_t **fuids; - size_t nvsize = 0; - char *packed; - dmu_buf_t *db; - fuid_domain_t *domnode; - int numnodes; - int i; - - if (!zfsvfs->z_fuid_dirty) { - return; - } - - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - - /* - * First see if table needs to be created? - */ - if (zfsvfs->z_fuid_obj == 0) { - zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, sizeof (uint64_t), 1, - &zfsvfs->z_fuid_obj, tx) == 0); - } - - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); - fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); - for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, - domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { - VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); - } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, numnodes) == 0); - for (i = 0; i != numnodes; i++) - 
nvlist_free(fuids[i]); - kmem_free(fuids, numnodes * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); - nvlist_free(nvp); - zfsvfs->z_fuid_size = nvsize; - dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); - kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; - dmu_buf_rele(db, FTAG); - - zfsvfs->z_fuid_dirty = B_FALSE; - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * Query domain table for a given domain. - * - * If domain isn't found and addok is set, it is added to AVL trees and - * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be - * necessary for the caller or another thread to detect the dirty table - * and sync out the changes. - */ -int -zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, - char **retdomain, boolean_t addok) -{ - fuid_domain_t searchnode, *findnode; - avl_index_t loc; - krw_t rw = RW_READER; - - /* - * If the dummy "nobody" domain then return an index of 0 - * to cause the created FUID to be a standard POSIX id - * for the user nobody. 
- */ - if (domain[0] == '\0') { - if (retdomain) - *retdomain = nulldomain; - return (0); - } - - searchnode.f_ksid = ksid_lookupdomain(domain); - if (retdomain) - *retdomain = searchnode.f_ksid->kd_name; - if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs); - -retry: - rw_enter(&zfsvfs->z_fuid_lock, rw); - findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); - - if (findnode) { - rw_exit(&zfsvfs->z_fuid_lock); - ksiddomain_rele(searchnode.f_ksid); - return (findnode->f_idx); - } else if (addok) { - fuid_domain_t *domnode; - uint64_t retidx; - - if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { - rw_exit(&zfsvfs->z_fuid_lock); - rw = RW_WRITER; - goto retry; - } - - domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); - domnode->f_ksid = searchnode.f_ksid; - - retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; - - avl_add(&zfsvfs->z_fuid_domain, domnode); - avl_add(&zfsvfs->z_fuid_idx, domnode); - zfsvfs->z_fuid_dirty = B_TRUE; - rw_exit(&zfsvfs->z_fuid_lock); - return (retidx); - } else { - rw_exit(&zfsvfs->z_fuid_lock); - return (-1); - } -} - -/* - * Query domain table by index, returning domain string - * - * Returns a pointer from an avl node of the domain string. 
- * - */ -const char * -zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) -{ - char *domain; - - if (idx == 0 || !zfsvfs->z_use_fuids) - return (NULL); - - if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs); - - rw_enter(&zfsvfs->z_fuid_lock, RW_READER); - - if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty) - domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); - else - domain = nulldomain; - rw_exit(&zfsvfs->z_fuid_lock); - - ASSERT(domain); - return (domain); -} - -void -zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) -{ - *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); - *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP); -} - -uid_t -zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, - cred_t *cr, zfs_fuid_type_t type) -{ - uint32_t index = FUID_INDEX(fuid); - const char *domain; - uid_t id; - - if (index == 0) - return (fuid); - - domain = zfs_fuid_find_by_idx(zfsvfs, index); - ASSERT(domain != NULL); - -#ifdef illumos - if (type == ZFS_OWNER || type == ZFS_ACE_USER) { - (void) kidmap_getuidbysid(crgetzone(cr), domain, - FUID_RID(fuid), &id); - } else { - (void) kidmap_getgidbysid(crgetzone(cr), domain, - FUID_RID(fuid), &id); - } -#else - id = UID_NOBODY; -#endif - return (id); -} - -/* - * Add a FUID node to the list of fuid's being created for this - * ACL - * - * If ACL has multiple domains, then keep only one copy of each unique - * domain. - */ -void -zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, - uint64_t idx, uint64_t id, zfs_fuid_type_t type) -{ - zfs_fuid_t *fuid; - zfs_fuid_domain_t *fuid_domain; - zfs_fuid_info_t *fuidp; - uint64_t fuididx; - boolean_t found = B_FALSE; - - if (*fuidpp == NULL) - *fuidpp = zfs_fuid_info_alloc(); - - fuidp = *fuidpp; - /* - * First find fuid domain index in linked list - * - * If one isn't found then create an entry. 
- */ - - for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); - fuid_domain; fuid_domain = list_next(&fuidp->z_domains, - fuid_domain), fuididx++) { - if (idx == fuid_domain->z_domidx) { - found = B_TRUE; - break; - } - } - - if (!found) { - fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); - fuid_domain->z_domain = domain; - fuid_domain->z_domidx = idx; - list_insert_tail(&fuidp->z_domains, fuid_domain); - fuidp->z_domain_str_sz += strlen(domain) + 1; - fuidp->z_domain_cnt++; - } - - if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { - - /* - * Now allocate fuid entry and add it on the end of the list - */ - - fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); - fuid->z_id = id; - fuid->z_domidx = idx; - fuid->z_logfuid = FUID_ENCODE(fuididx, rid); - - list_insert_tail(&fuidp->z_fuids, fuid); - fuidp->z_fuid_cnt++; - } else { - if (type == ZFS_OWNER) - fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); - else - fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); - } -} - -/* - * Create a file system FUID, based on information in the users cred - * - * If cred contains KSID_OWNER then it should be used to determine - * the uid otherwise cred's uid will be used. By default cred's gid - * is used unless it's an ephemeral ID in which case KSID_GROUP will - * be used if it exists. - */ -uint64_t -zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, - cred_t *cr, zfs_fuid_info_t **fuidp) -{ - uint64_t idx; - ksid_t *ksid; - uint32_t rid; - char *kdomain; - const char *domain; - uid_t id; - - VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); - - ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); - - if (!zfsvfs->z_use_fuids || (ksid == NULL)) { - id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); - - if (IS_EPHEMERAL(id)) - return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); - - return ((uint64_t)id); - } - - /* - * ksid is present and FUID is supported - */ - id = (type == ZFS_OWNER) ? 
ksid_getid(ksid) : crgetgid(cr); - - if (!IS_EPHEMERAL(id)) - return ((uint64_t)id); - - if (type == ZFS_GROUP) - id = ksid_getid(ksid); - - rid = ksid_getrid(ksid); - domain = ksid_getdomain(ksid); - - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); - - zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); - - return (FUID_ENCODE(idx, rid)); -} - -/* - * Create a file system FUID for an ACL ace - * or a chown/chgrp of the file. - * This is similar to zfs_fuid_create_cred, except that - * we can't find the domain + rid information in the - * cred. Instead we have to query Winchester for the - * domain and rid. - * - * During replay operations the domain+rid information is - * found in the zfs_fuid_info_t that the replay code has - * attached to the zfsvfs of the file system. - */ -uint64_t -zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, - zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) -{ - const char *domain; - char *kdomain; - uint32_t fuid_idx = FUID_INDEX(id); - uint32_t rid; - idmap_stat status; - uint64_t idx = 0; - zfs_fuid_t *zfuid = NULL; - zfs_fuid_info_t *fuidp = NULL; - - /* - * If POSIX ID, or entry is already a FUID then - * just return the id - * - * We may also be handed an already FUID'ized id via - * chmod. - */ - - if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) - return (id); - - if (zfsvfs->z_replay) { - fuidp = zfsvfs->z_fuid_replay; - - /* - * If we are passed an ephemeral id, but no - * fuid_info was logged then return NOBODY. - * This is most likely a result of idmap service - * not being available. 
- */ - if (fuidp == NULL) - return (UID_NOBODY); - - VERIFY3U(type, >=, ZFS_OWNER); - VERIFY3U(type, <=, ZFS_ACE_GROUP); - - switch (type) { - case ZFS_ACE_USER: - case ZFS_ACE_GROUP: - zfuid = list_head(&fuidp->z_fuids); - rid = FUID_RID(zfuid->z_logfuid); - idx = FUID_INDEX(zfuid->z_logfuid); - break; - case ZFS_OWNER: - rid = FUID_RID(fuidp->z_fuid_owner); - idx = FUID_INDEX(fuidp->z_fuid_owner); - break; - case ZFS_GROUP: - rid = FUID_RID(fuidp->z_fuid_group); - idx = FUID_INDEX(fuidp->z_fuid_group); - break; - }; - domain = fuidp->z_domain_table[idx - 1]; - } else { - if (type == ZFS_OWNER || type == ZFS_ACE_USER) - status = kidmap_getsidbyuid(crgetzone(cr), id, - &domain, &rid); - else - status = kidmap_getsidbygid(crgetzone(cr), id, - &domain, &rid); - - if (status != 0) { - /* - * When returning nobody we will need to - * make a dummy fuid table entry for logging - * purposes. - */ - rid = UID_NOBODY; - domain = nulldomain; - } - } - - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); - - if (!zfsvfs->z_replay) - zfs_fuid_node_add(fuidpp, kdomain, - rid, idx, id, type); - else if (zfuid != NULL) { - list_remove(&fuidp->z_fuids, zfuid); - kmem_free(zfuid, sizeof (zfs_fuid_t)); - } - return (FUID_ENCODE(idx, rid)); -} - -void -zfs_fuid_destroy(zfsvfs_t *zfsvfs) -{ - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - if (!zfsvfs->z_fuid_loaded) { - rw_exit(&zfsvfs->z_fuid_lock); - return; - } - zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * Allocate zfs_fuid_info for tracking FUIDs created during - * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() - */ -zfs_fuid_info_t * -zfs_fuid_info_alloc(void) -{ - zfs_fuid_info_t *fuidp; - - fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); - list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), - offsetof(zfs_fuid_domain_t, z_next)); - list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), - offsetof(zfs_fuid_t, z_next)); - return (fuidp); 
-} - -/* - * Release all memory associated with zfs_fuid_info_t - */ -void -zfs_fuid_info_free(zfs_fuid_info_t *fuidp) -{ - zfs_fuid_t *zfuid; - zfs_fuid_domain_t *zdomain; - - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); - kmem_free(zfuid, sizeof (zfs_fuid_t)); - } - - if (fuidp->z_domain_table != NULL) - kmem_free(fuidp->z_domain_table, - (sizeof (char **)) * fuidp->z_domain_cnt); - - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); - kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } - - kmem_free(fuidp, sizeof (zfs_fuid_info_t)); -} - -/* - * Check to see if id is a groupmember. If cred - * has ksid info then sidlist is checked first - * and if still not found then POSIX groups are checked - * - * Will use a straight FUID compare when possible. - */ -boolean_t -zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) -{ -#ifdef illumos - ksid_t *ksid = crgetsid(cr, KSID_GROUP); - ksidlist_t *ksidlist = crgetsidlist(cr); -#endif - uid_t gid; - -#ifdef illumos - if (ksid && ksidlist) { - int i; - ksid_t *ksid_groups; - uint32_t idx = FUID_INDEX(id); - uint32_t rid = FUID_RID(id); - - ksid_groups = ksidlist->ksl_sids; - - for (i = 0; i != ksidlist->ksl_nsid; i++) { - if (idx == 0) { - if (id != IDMAP_WK_CREATOR_GROUP_GID && - id == ksid_groups[i].ks_id) { - return (B_TRUE); - } - } else { - const char *domain; - - domain = zfs_fuid_find_by_idx(zfsvfs, idx); - ASSERT(domain != NULL); - - if (strcmp(domain, - IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) - return (B_FALSE); - - if ((strcmp(domain, - ksid_groups[i].ks_domain->kd_name) == 0) && - rid == ksid_groups[i].ks_rid) - return (B_TRUE); - } - } - } -#endif /* illumos */ - - /* - * Not found in ksidlist, check posix groups - */ - gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); - return (groupmember(gid, cr)); -} - -void -zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) -{ - if (zfsvfs->z_fuid_obj == 0) { - 
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } -} -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c deleted file mode 100644 index a7e2aff6e683..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ /dev/null @@ -1,7692 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Copyright 2014 Xin Li . All rights reserved. - * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Toomas Soome - * Copyright 2017 RackTop Systems. - * Copyright (c) 2018, loli10K . All rights reserved. - * Copyright (c) 2019 Datto Inc. - */ - -/* - * ZFS ioctls. - * - * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage - * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. - * - * There are two ways that we handle ioctls: the legacy way where almost - * all of the logic is in the ioctl callback, and the new way where most - * of the marshalling is handled in the common entry point, zfsdev_ioctl(). - * - * Non-legacy ioctls should be registered by calling - * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked - * from userland by lzc_ioctl(). - * - * The registration arguments are as follows: - * - * const char *name - * The name of the ioctl. This is used for history logging. If the - * ioctl returns successfully (the callback returns 0), and allow_log - * is true, then a history log entry will be recorded with the input & - * output nvlists. The log entry can be printed with "zpool history -i". - * - * zfs_ioc_t ioc - * The ioctl request number, which userland will pass to ioctl(2). - * We want newer versions of libzfs and libzfs_core to run against - * existing zfs kernel modules (i.e. a deferred reboot after an update). - * Therefore the ioctl numbers cannot change from release to release. - * - * zfs_secpolicy_func_t *secpolicy - * This function will be called before the zfs_ioc_func_t, to - * determine if this operation is permitted. It should return EPERM - * on failure, and 0 on success. Checks include determining if the - * dataset is visible in this zone, and if the user has either all - * zfs privileges in the zone (SYS_MOUNT), or has been granted permission - * to do this operation on this dataset with "zfs allow". 
- * - * zfs_ioc_namecheck_t namecheck - * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool - * name, a dataset name, or nothing. If the name is not well-formed, - * the ioctl will fail and the callback will not be called. - * Therefore, the callback can assume that the name is well-formed - * (e.g. is null-terminated, doesn't have more than one '@' character, - * doesn't have invalid characters). - * - * zfs_ioc_poolcheck_t pool_check - * This specifies requirements on the pool state. If the pool does - * not meet them (is suspended or is readonly), the ioctl will fail - * and the callback will not be called. If any checks are specified - * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. - * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | - * POOL_CHECK_READONLY). - * - * zfs_ioc_key_t *nvl_keys - * The list of expected/allowable innvl input keys. This list is used - * to validate the nvlist input to the ioctl. - * - * boolean_t smush_outnvlist - * If smush_outnvlist is true, then the output is presumed to be a - * list of errors, and it will be "smushed" down to fit into the - * caller's buffer, by removing some entries and replacing them with a - * single "N_MORE_ERRORS" entry indicating how many were removed. See - * nvlist_smush() for details. If smush_outnvlist is false, and the - * outnvlist does not fit into the userland-provided buffer, then the - * ioctl will fail with ENOMEM. - * - * zfs_ioc_func_t *func - * The callback function that will perform the operation. - * - * The callback should return 0 on success, or an error number on - * failure. If the function fails, the userland ioctl will return -1, - * and errno will be set to the callback's return value. The callback - * will be called with the following arguments: - * - * const char *name - * The name of the pool or dataset to operate on, from - * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the - * expected type (pool, dataset, or none). 
- * - * nvlist_t *innvl - * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or - * NULL if no input nvlist was provided. Changes to this nvlist are - * ignored. If the input nvlist could not be deserialized, the - * ioctl will fail and the callback will not be called. - * - * nvlist_t *outnvl - * The output nvlist, initially empty. The callback can fill it in, - * and it will be returned to userland by serializing it into - * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization - * fails (e.g. because the caller didn't supply a large enough - * buffer), then the overall ioctl will fail. See the - * 'smush_nvlist' argument above for additional behaviors. - * - * There are two typical uses of the output nvlist: - * - To return state, e.g. property values. In this case, - * smush_outnvlist should be false. If the buffer was not large - * enough, the caller will reallocate a larger buffer and try - * the ioctl again. - * - * - To return multiple errors from an ioctl which makes on-disk - * changes. In this case, smush_outnvlist should be true. - * Ioctls which make on-disk modifications should generally not - * use the outnvl if they succeed, because the caller can not - * distinguish between the operation failing, and - * deserialization failing. 
- * - * - * IOCTL Interface Errors - * - * The following ioctl input errors can be returned: - * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel - * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel - * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing - * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type - */ - -#ifdef __FreeBSD__ -#include "opt_kstack_pages.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_namecheck.h" -#include "zfs_prop.h" -#include "zfs_deleg.h" -#include "zfs_comutil.h" -#include "zfs_ioctl_compat.h" - -#include "lua.h" -#include "lauxlib.h" - -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(x) nitems(x) -#endif - -static struct cdev *zfsdev; - -extern void zfs_init(void); -extern void zfs_fini(void); - -uint_t zfs_fsyncer_key; -extern uint_t rrw_tsd_key; -static uint_t zfs_allow_log_key; -extern uint_t zfs_geom_probe_vdev_key; - -typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); -typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); -typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); - -/* - * IOC Keys are used to document and validate user->kernel interface inputs. - * See zfs_keys_recv_new for an example declaration. Any key name that is not - * listed will be rejected as input. - * - * The keyname 'optional' is always allowed, and must be an nvlist if present. 
- * Arguments which older kernels can safely ignore can be placed under the - * "optional" key. - * - * When adding new keys to an existing ioc for new functionality, consider: - * - adding an entry into zfs_sysfs.c zfs_features[] list - * - updating the libzfs_input_check.c test utility - * - * Note: in the ZK_WILDCARDLIST case, the name serves as documentation - * for the expected name (bookmark, snapshot, property, etc) but there - * is no validation in the preflight zfs_check_input_nvpairs() check. - */ -typedef enum { - ZK_OPTIONAL = 1 << 0, /* pair is optional */ - ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */ -} ioc_key_flag_t; - -/* DATA_TYPE_ANY is used when zkey_type can vary. */ -#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN - -typedef struct zfs_ioc_key { - const char *zkey_name; - data_type_t zkey_type; - ioc_key_flag_t zkey_flags; -} zfs_ioc_key_t; - -typedef enum { - NO_NAME, - POOL_NAME, - DATASET_NAME, - ENTITY_NAME -} zfs_ioc_namecheck_t; - -typedef enum { - POOL_CHECK_NONE = 1 << 0, - POOL_CHECK_SUSPENDED = 1 << 1, - POOL_CHECK_READONLY = 1 << 2, -} zfs_ioc_poolcheck_t; - -typedef struct zfs_ioc_vec { - zfs_ioc_legacy_func_t *zvec_legacy_func; - zfs_ioc_func_t *zvec_func; - zfs_secpolicy_func_t *zvec_secpolicy; - zfs_ioc_namecheck_t zvec_namecheck; - boolean_t zvec_allow_log; - zfs_ioc_poolcheck_t zvec_pool_check; - boolean_t zvec_smush_outnvlist; - const char *zvec_name; - const zfs_ioc_key_t *zvec_nvl_keys; - size_t zvec_nvl_key_count; -} zfs_ioc_vec_t; - -/* This array is indexed by zfs_userquota_prop_t */ -static const char *userquota_perms[] = { - ZFS_DELEG_PERM_USERUSED, - ZFS_DELEG_PERM_USERQUOTA, - ZFS_DELEG_PERM_GROUPUSED, - ZFS_DELEG_PERM_GROUPQUOTA, -}; - -static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); -static int zfs_check_settable(const char *name, nvpair_t *property, - cred_t *cr); -static int zfs_check_clearable(char *dataset, nvlist_t *props, - nvlist_t **errors); -static int zfs_fill_zplprops_root(uint64_t, 
nvlist_t *, nvlist_t *, - boolean_t *); -int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); -static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); - -static void zfsdev_close(void *data); - -static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature); - -/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ -void -__dprintf(const char *file, const char *func, int line, const char *fmt, ...) -{ - const char *newfile; - char buf[512]; - va_list adx; - - /* - * Get rid of annoying "../common/" prefix to filename. - */ - newfile = strrchr(file, '/'); - if (newfile != NULL) { - newfile = newfile + 1; /* Get rid of leading / */ - } else { - newfile = file; - } - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - /* - * To get this data, use the zfs-dprintf probe as so: - * dtrace -q -n 'zfs-dprintf \ - * /stringof(arg0) == "dbuf.c"/ \ - * {printf("%s: %s", stringof(arg1), stringof(arg3))}' - * arg0 = file name - * arg1 = function name - * arg2 = line number - * arg3 = message - */ - DTRACE_PROBE4(zfs__dprintf, - char *, newfile, char *, func, int, line, char *, buf); -} - -static void -history_str_free(char *buf) -{ - kmem_free(buf, HIS_MAX_RECORD_LEN); -} - -static char * -history_str_get(zfs_cmd_t *zc) -{ - char *buf; - - if (zc->zc_history == 0) - return (NULL); - - buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); - if (copyinstr((void *)(uintptr_t)zc->zc_history, - buf, HIS_MAX_RECORD_LEN, NULL) != 0) { - history_str_free(buf); - return (NULL); - } - - buf[HIS_MAX_RECORD_LEN -1] = '\0'; - - return (buf); -} - -/* - * Check to see if the named dataset is currently defined as bootable - */ -static boolean_t -zfs_is_bootfs(const char *name) -{ - objset_t *os; - - if (dmu_objset_hold(name, FTAG, &os) == 0) { - boolean_t ret; - ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); - dmu_objset_rele(os, FTAG); - return (ret); - } - return 
(B_FALSE); -} - -/* - * Return non-zero if the spa version is less than requested version. - */ -static int -zfs_earlier_version(const char *name, int version) -{ - spa_t *spa; - - if (spa_open(name, &spa, FTAG) == 0) { - if (spa_version(spa) < version) { - spa_close(spa, FTAG); - return (1); - } - spa_close(spa, FTAG); - } - return (0); -} - -/* - * Return TRUE if the ZPL version is less than requested version. - */ -static boolean_t -zpl_earlier_version(const char *name, int version) -{ - objset_t *os; - boolean_t rc = B_TRUE; - - if (dmu_objset_hold(name, FTAG, &os) == 0) { - uint64_t zplversion; - - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (B_TRUE); - } - /* XXX reading from non-owned objset */ - if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) - rc = zplversion < version; - dmu_objset_rele(os, FTAG); - } - return (rc); -} - -static void -zfs_log_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *buf; - - if ((buf = history_str_get(zc)) == NULL) - return; - - if (spa_open(zc->zc_name, &spa, FTAG) == 0) { - if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) - (void) spa_history_log(spa, buf); - spa_close(spa, FTAG); - } - history_str_free(buf); -} - -/* - * Policy for top-level read operations (list pools). Requires no privileges, - * and can be used in the local zone, as there is no associated dataset. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (0); -} - -/* - * Policy for dataset read operations (list children, get statistics). Requires - * no privileges, but must be visible in the local zone. 
- */ -/* ARGSUSED */ -static int -zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (INGLOBALZONE(curthread) || - zone_dataset_visible(zc->zc_name, NULL)) - return (0); - - return (SET_ERROR(ENOENT)); -} - -static int -zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) -{ - int writable = 1; - - /* - * The dataset must be visible by this zone -- check this first - * so they don't see EPERM on something they shouldn't know about. - */ - if (!INGLOBALZONE(curthread) && - !zone_dataset_visible(dataset, &writable)) - return (SET_ERROR(ENOENT)); - - if (INGLOBALZONE(curthread)) { - /* - * If the fs is zoned, only root can access it from the - * global zone. - */ - if (secpolicy_zfs(cr) && zoned) - return (SET_ERROR(EPERM)); - } else { - /* - * If we are in a local zone, the 'zoned' property must be set. - */ - if (!zoned) - return (SET_ERROR(EPERM)); - - /* must be writable by this zone */ - if (!writable) - return (SET_ERROR(EPERM)); - } - return (0); -} - -static int -zfs_dozonecheck(const char *dataset, cred_t *cr) -{ - uint64_t zoned; - - if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) - return (SET_ERROR(ENOENT)); - - return (zfs_dozonecheck_impl(dataset, zoned, cr)); -} - -static int -zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) -{ - uint64_t zoned; - - if (dsl_prop_get_int_ds(ds, "jailed", &zoned)) - return (SET_ERROR(ENOENT)); - - return (zfs_dozonecheck_impl(dataset, zoned, cr)); -} - -static int -zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, - const char *perm, cred_t *cr) -{ - int error; - - error = zfs_dozonecheck_ds(name, ds, cr); - if (error == 0) { - error = secpolicy_zfs(cr); - if (error != 0) - error = dsl_deleg_access_impl(ds, perm, cr); - } - return (error); -} - -static int -zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) -{ - int error; - dsl_dataset_t *ds; - dsl_pool_t *dp; - - /* - * First do a quick check for root in the 
global zone, which - * is allowed to do all write_perms. This ensures that zfs_ioc_* - * will get to handle nonexistent datasets. - */ - if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0) - return (0); - - error = dsl_pool_hold(name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, name, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); - - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -#ifdef SECLABEL -/* - * Policy for setting the security label property. - * - * Returns 0 for success, non-zero for access and other errors. - */ -static int -zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) -{ - char ds_hexsl[MAXNAMELEN]; - bslabel_t ds_sl, new_sl; - boolean_t new_default = FALSE; - uint64_t zoned; - int needed_priv = -1; - int error; - - /* First get the existing dataset label. */ - error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), - 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error != 0) - return (SET_ERROR(EPERM)); - - if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) - new_default = TRUE; - - /* The label must be translatable */ - if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) - return (SET_ERROR(EINVAL)); - - /* - * In a non-global zone, disallow attempts to set a label that - * doesn't match that of the zone; otherwise no other checks - * are needed. - */ - if (!INGLOBALZONE(curproc)) { - if (new_default || !blequal(&new_sl, CR_SL(CRED()))) - return (SET_ERROR(EPERM)); - return (0); - } - - /* - * For global-zone datasets (i.e., those whose zoned property is - * "off", verify that the specified new label is valid for the - * global zone. 
- */ - if (dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) - return (SET_ERROR(EPERM)); - if (!zoned) { - if (zfs_check_global_label(name, strval) != 0) - return (SET_ERROR(EPERM)); - } - - /* - * If the existing dataset label is nondefault, check if the - * dataset is mounted (label cannot be changed while mounted). - * Get the zfsvfs; if there isn't one, then the dataset isn't - * mounted (or isn't a dataset, doesn't exist, ...). - */ - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { - objset_t *os; - static char *setsl_tag = "setsl_tag"; - - /* - * Try to own the dataset; abort if there is any error, - * (e.g., already mounted, in use, or other error). - */ - error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, - setsl_tag, &os); - if (error != 0) - return (SET_ERROR(EPERM)); - - dmu_objset_disown(os, setsl_tag); - - if (new_default) { - needed_priv = PRIV_FILE_DOWNGRADE_SL; - goto out_check; - } - - if (hexstr_to_label(strval, &new_sl) != 0) - return (SET_ERROR(EPERM)); - - if (blstrictdom(&ds_sl, &new_sl)) - needed_priv = PRIV_FILE_DOWNGRADE_SL; - else if (blstrictdom(&new_sl, &ds_sl)) - needed_priv = PRIV_FILE_UPGRADE_SL; - } else { - /* dataset currently has a default label */ - if (!new_default) - needed_priv = PRIV_FILE_UPGRADE_SL; - } - -out_check: - if (needed_priv != -1) - return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); - return (0); -} -#endif /* SECLABEL */ - -static int -zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - cred_t *cr) -{ - char *strval; - - /* - * Check permissions for special properties. - */ - switch (prop) { - case ZFS_PROP_ZONED: - /* - * Disallow setting of 'zoned' from within a local zone. 
- */ - if (!INGLOBALZONE(curthread)) - return (SET_ERROR(EPERM)); - break; - - case ZFS_PROP_QUOTA: - case ZFS_PROP_FILESYSTEM_LIMIT: - case ZFS_PROP_SNAPSHOT_LIMIT: - if (!INGLOBALZONE(curthread)) { - uint64_t zoned; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - /* - * Unprivileged users are allowed to modify the - * limit on things *under* (ie. contained by) - * the thing they own. - */ - if (dsl_prop_get_integer(dsname, "jailed", &zoned, - setpoint)) - return (SET_ERROR(EPERM)); - if (!zoned || strlen(dsname) <= strlen(setpoint)) - return (SET_ERROR(EPERM)); - } - break; - - case ZFS_PROP_MLSLABEL: -#ifdef SECLABEL - if (!is_system_labeled()) - return (SET_ERROR(EPERM)); - - if (nvpair_value_string(propval, &strval) == 0) { - int err; - - err = zfs_set_slabel_policy(dsname, strval, CRED()); - if (err != 0) - return (err); - } -#else - return (EOPNOTSUPP); -#endif - break; - } - - return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error; - - error = zfs_dozonecheck(zc->zc_name, cr); - if (error != 0) - return (error); - - /* - * permission to set permissions will be evaluated later in - * dsl_deleg_can_allow() - */ - return (0); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_ROLLBACK, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - char *cp; - int error; - - /* - * Generate the current snapshot name from the given objsetid, then - * use that name for the secpolicy/zone checks. 
- */ - cp = strchr(zc->zc_name, '@'); - if (cp == NULL) - return (SET_ERROR(EINVAL)); - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - dsl_dataset_name(ds, zc->zc_name); - - error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, - ZFS_DELEG_PERM_SEND, cr); - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - vnode_t *vp; - int error; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ - - if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (SET_ERROR(EPERM)); - } - - VN_RELE(vp); - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_SHARE, cr)); -} - -int -zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (!INGLOBALZONE(curthread)) - return (SET_ERROR(EPERM)); - - if (secpolicy_nfs(cr) == 0) { - return (0); - } else { - return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - } -} - -int -zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (!INGLOBALZONE(curthread)) - return (SET_ERROR(EPERM)); - - if (secpolicy_smb(cr) == 0) { - return (0); - } else { - return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - } -} - -static int -zfs_get_parent(const char *datasetname, char *parent, int parentsize) -{ - char *cp; - - /* - * Remove the @bla or /bla from the end of the name to get the parent. 
- */ - (void) strncpy(parent, datasetname, parentsize); - cp = strrchr(parent, '@'); - if (cp != NULL) { - cp[0] = '\0'; - } else { - cp = strrchr(parent, '/'); - if (cp == NULL) - return (SET_ERROR(ENOENT)); - cp[0] = '\0'; - } - - return (0); -} - -int -zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) -{ - int error; - - if ((error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); -} - -/* - * Destroying snapshots with delegated permissions requires - * descendant mount and destroy permissions. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvlist_t *snaps; - nvpair_t *pair, *nextpair; - int error = 0; - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nextpair) { - nextpair = nvlist_next_nvpair(snaps, pair); - error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); - if (error == ENOENT) { - /* - * Ignore any snapshots that don't exist (we consider - * them "already destroyed"). Remove the name from the - * nvl here in case the snapshot is created between - * now and when we try to destroy it (in which case - * we don't want to destroy it since we haven't - * checked for permission). 
- */ - fnvlist_remove_nvpair(snaps, pair); - error = 0; - } - if (error != 0) - break; - } - - return (error); -} - -int -zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) -{ - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - - if ((error = zfs_secpolicy_write_perms(from, - ZFS_DELEG_PERM_RENAME, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(from, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - if ((error = zfs_get_parent(to, parentname, - sizeof (parentname))) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_CREATE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - char *at = NULL; - char *pound; - int error; - - if ((pound = strchr(zc->zc_name, '#')) != NULL) { - *pound = '\0'; - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RENAME, cr); - if (error == 0) { - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_BOOKMARK, cr); - } - *pound = '#'; - return (error); - } - - if ((zc->zc_cookie & 1) != 0) { - /* - * This is recursive rename, so the starting snapshot might - * not exist. Check file system or volume permission instead. 
- */ - at = strchr(zc->zc_name, '@'); - if (at == NULL) - return (EINVAL); - *at = '\0'; - } - - error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr); - - if (at != NULL) - *at = '@'; - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - dsl_pool_t *dp; - dsl_dataset_t *clone; - int error; - - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_PROMOTE, cr); - if (error != 0) - return (error); - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); - - if (error == 0) { - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_t *origin = NULL; - dsl_dir_t *dd; - dd = clone->ds_dir; - - error = dsl_dataset_hold_obj(dd->dd_pool, - dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); - if (error != 0) { - dsl_dataset_rele(clone, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, - ZFS_DELEG_PERM_MOUNT, cr); - - dsl_dataset_name(origin, parentname); - if (error == 0) { - error = zfs_secpolicy_write_perms_ds(parentname, origin, - ZFS_DELEG_PERM_PROMOTE, cr); - } - dsl_dataset_rele(clone, FTAG); - dsl_dataset_rele(origin, FTAG); - } - dsl_pool_rele(dp, FTAG); - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error; - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RECEIVE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_CREATE, cr)); -} - -int -zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_SNAPSHOT, cr)); -} - -/* - * Check for permission to create each snapshot in the nvlist. 
- */ -/* ARGSUSED */ -static int -zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvlist_t *snaps; - int error; - nvpair_t *pair; - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - char *name = nvpair_name(pair); - char *atp = strchr(name, '@'); - - if (atp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - *atp = '\0'; - error = zfs_secpolicy_snapshot_perms(name, cr); - *atp = '@'; - if (error != 0) - break; - } - return (error); -} - -/* - * Check for permission to create each bookmark in the nvlist. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error = 0; - - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); - char *hashp = strchr(name, '#'); - - if (hashp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - *hashp = '\0'; - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_BOOKMARK, cr); - *hashp = '#'; - if (error != 0) - break; - } - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_REMAP, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvpair_t *pair, *nextpair; - int error = 0; - - for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; - pair = nextpair) { - char *name = nvpair_name(pair); - char *hashp = strchr(name, '#'); - nextpair = nvlist_next_nvpair(innvl, pair); - - if (hashp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - - *hashp = '\0'; - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_DESTROY, cr); - *hashp = '#'; - if (error == ENOENT) { - /* - * Ignore any filesystems that don't exist (we consider - * their bookmarks 
"already destroyed"). Remove - * the name from the nvl here in case the filesystem - * is created between now and when we try to destroy - * the bookmark (in which case we don't want to - * destroy it since we haven't checked for permission). - */ - fnvlist_remove_nvpair(innvl, pair); - error = 0; - } - if (error != 0) - break; - } - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - /* - * Even root must have a proper TSD so that we know what pool - * to log to. - */ - if (tsd_get(zfs_allow_log_key) == NULL) - return (SET_ERROR(EPERM)); - return (0); -} - -static int -zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - char *origin; - - if ((error = zfs_get_parent(zc->zc_name, parentname, - sizeof (parentname))) != 0) - return (error); - - if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && - (error = zfs_secpolicy_write_perms(origin, - ZFS_DELEG_PERM_CLONE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_CREATE, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_MOUNT, cr)); -} - -/* - * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires - * SYS_CONFIG privilege, which is not available in a local zone. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (secpolicy_sys_config(cr, B_FALSE) != 0) - return (SET_ERROR(EPERM)); - - return (0); -} - -/* - * Policy for object to name lookups. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error; - - if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) - return (0); - - error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); - return (error); -} - -/* - * Policy for fault injection. Requires all privileges. 
- */ -/* ARGSUSED */ -static int -zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (secpolicy_zinject(cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); - - if (prop == ZPROP_INVAL) { - if (!zfs_prop_user(zc->zc_value)) - return (SET_ERROR(EINVAL)); - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_USERPROP, cr)); - } else { - return (zfs_secpolicy_setprop(zc->zc_name, prop, - NULL, cr)); - } -} - -static int -zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) - return (err); - - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (SET_ERROR(EINVAL)); - - if (zc->zc_value[0] == 0) { - /* - * They are asking about a posix uid/gid. If it's - * themself, allow it. - */ - if (zc->zc_objset_type == ZFS_PROP_USERUSED || - zc->zc_objset_type == ZFS_PROP_USERQUOTA) { - if (zc->zc_guid == crgetuid(cr)) - return (0); - } else { - if (groupmember(zc->zc_guid, cr)) - return (0); - } - } - - return (zfs_secpolicy_write_perms(zc->zc_name, - userquota_perms[zc->zc_objset_type], cr)); -} - -static int -zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) - return (err); - - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (SET_ERROR(EINVAL)); - - return (zfs_secpolicy_write_perms(zc->zc_name, - userquota_perms[zc->zc_objset_type], cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, - NULL, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvpair_t *pair; - nvlist_t *holds; - int error; - - holds = fnvlist_lookup_nvlist(innvl, "holds"); - - for (pair = 
nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - error = dmu_fsname(nvpair_name(pair), fsname); - if (error != 0) - return (error); - error = zfs_secpolicy_write_perms(fsname, - ZFS_DELEG_PERM_HOLD, cr); - if (error != 0) - return (error); - } - return (0); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvpair_t *pair; - int error; - - for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; - pair = nvlist_next_nvpair(innvl, pair)) { - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - error = dmu_fsname(nvpair_name(pair), fsname); - if (error != 0) - return (error); - error = zfs_secpolicy_write_perms(fsname, - ZFS_DELEG_PERM_RELEASE, cr); - if (error != 0) - return (error); - } - return (0); -} - -/* - * Policy for allowing temporary snapshots to be taken or released - */ -static int -zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - /* - * A temporary snapshot is the same as a snapshot, - * hold, destroy and release all rolled into one. - * Delegated diff alone is sufficient that we allow this. - */ - int error; - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_DIFF, cr)) == 0) - return (0); - - error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); - if (innvl != NULL) { - if (error == 0) - error = zfs_secpolicy_hold(zc, innvl, cr); - if (error == 0) - error = zfs_secpolicy_release(zc, innvl, cr); - if (error == 0) - error = zfs_secpolicy_destroy(zc, innvl, cr); - } - return (error); -} - -/* - * Returns the nvlist as specified by the user in the zfs_cmd_t. - */ -static int -get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) -{ - char *packed; - int error; - nvlist_t *list = NULL; - - /* - * Read in and unpack the user-supplied nvlist. 
- */ - if (size == 0) - return (SET_ERROR(EINVAL)); - - packed = kmem_alloc(size, KM_SLEEP); - - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { - kmem_free(packed, size); - return (SET_ERROR(EFAULT)); - } - - if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { - kmem_free(packed, size); - return (error); - } - - kmem_free(packed, size); - - *nvp = list; - return (0); -} - -/* - * Reduce the size of this nvlist until it can be serialized in 'max' bytes. - * Entries will be removed from the end of the nvlist, and one int32 entry - * named "N_MORE_ERRORS" will be added indicating how many entries were - * removed. - */ -static int -nvlist_smush(nvlist_t *errors, size_t max) -{ - size_t size; - - size = fnvlist_size(errors); - - if (size > max) { - nvpair_t *more_errors; - int n = 0; - - if (max < 1024) - return (SET_ERROR(ENOMEM)); - - fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); - more_errors = nvlist_prev_nvpair(errors, NULL); - - do { - nvpair_t *pair = nvlist_prev_nvpair(errors, - more_errors); - fnvlist_remove_nvpair(errors, pair); - n++; - size = fnvlist_size(errors); - } while (size > max); - - fnvlist_remove_nvpair(errors, more_errors); - fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); - ASSERT3U(fnvlist_size(errors), <=, max); - } - - return (0); -} - -static int -put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) -{ - char *packed = NULL; - int error = 0; - size_t size; - - size = fnvlist_size(nvl); - - if (size > zc->zc_nvlist_dst_size) { - /* - * Solaris returns ENOMEM here, because even if an error is - * returned from an ioctl(2), new zc_nvlist_dst_size will be - * passed to the userland. This is not the case for FreeBSD. - * We need to return 0, so the kernel will copy the - * zc_nvlist_dst_size back and the userland can discover that a - * bigger buffer is needed. 
- */ - error = 0; - } else { - packed = fnvlist_pack(nvl, &size); - if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size, zc->zc_iflags) != 0) - error = SET_ERROR(EFAULT); - fnvlist_pack_free(packed, size); - } - - zc->zc_nvlist_dst_size = size; - zc->zc_nvlist_dst_filled = B_TRUE; - return (error); -} - -int -getzfsvfs_impl(objset_t *os, vfs_t **vfsp) -{ - zfsvfs_t *zfvp; - int error = 0; - - if (dmu_objset_type(os) != DMU_OST_ZFS) { - return (SET_ERROR(EINVAL)); - } - - mutex_enter(&os->os_user_ptr_lock); - zfvp = dmu_objset_get_user(os); - if (zfvp) { - *vfsp = zfvp->z_vfs; - vfs_ref(zfvp->z_vfs); - } else { - error = SET_ERROR(ESRCH); - } - mutex_exit(&os->os_user_ptr_lock); - return (error); -} - -int -getzfsvfs(const char *dsname, zfsvfs_t **zfvp) -{ - objset_t *os; - vfs_t *vfsp; - int error; - - error = dmu_objset_hold(dsname, FTAG, &os); - if (error != 0) - return (error); - error = getzfsvfs_impl(os, &vfsp); - dmu_objset_rele(os, FTAG); - if (error != 0) - return (error); - - error = vfs_busy(vfsp, 0); - vfs_rel(vfsp); - if (error != 0) { - *zfvp = NULL; - error = SET_ERROR(ESRCH); - } else { - *zfvp = vfsp->vfs_data; - } - return (error); -} - -/* - * Find a zfsvfs_t for a mounted filesystem, or create our own, in which - * case its z_vfs will be NULL, and it will be opened as the owner. - * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, - * which prevents all vnode ops from running. - */ -static int -zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) -{ - int error = 0; - - if (getzfsvfs(name, zfvp) != 0) - error = zfsvfs_create(name, zfvp); - if (error == 0) { - rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : - RW_READER, tag); -#ifdef illumos - if ((*zfvp)->z_unmounted) { - /* - * XXX we could probably try again, since the unmounting - * thread should be just about to disassociate the - * objset from the zfsvfs. 
- */ - rrm_exit(&(*zfvp)->z_teardown_lock, tag); - return (SET_ERROR(EBUSY)); - } -#else - /* - * vfs_busy() ensures that the filesystem is not and - * can not be unmounted. - */ - ASSERT(!(*zfvp)->z_unmounted); -#endif - } - return (error); -} - -static void -zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) -{ - rrm_exit(&zfsvfs->z_teardown_lock, tag); - - if (zfsvfs->z_vfs) { -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - dmu_objset_disown(zfsvfs->z_os, zfsvfs); - zfsvfs_free(zfsvfs); - } -} - -static int -zfs_ioc_pool_create(zfs_cmd_t *zc) -{ - int error; - nvlist_t *config, *props = NULL; - nvlist_t *rootprops = NULL; - nvlist_t *zplprops = NULL; - char *spa_name = zc->zc_name; - - if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) - return (error); - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props))) { - nvlist_free(config); - return (error); - } - - if (props) { - nvlist_t *nvl = NULL; - uint64_t version = SPA_VERSION; - char *tname; - - (void) nvlist_lookup_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); - if (!SPA_VERSION_IS_SUPPORTED(version)) { - error = SET_ERROR(EINVAL); - goto pool_props_bad; - } - (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); - if (nvl) { - error = nvlist_dup(nvl, &rootprops, KM_SLEEP); - if (error != 0) { - nvlist_free(config); - nvlist_free(props); - return (error); - } - (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); - } - VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); - error = zfs_fill_zplprops_root(version, rootprops, - zplprops, NULL); - if (error != 0) - goto pool_props_bad; - - if (nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) - spa_name = tname; - } - - error = spa_create(zc->zc_name, config, props, zplprops); - - /* - * Set the remaining root properties - */ - if 
(!error && (error = zfs_set_prop_nvlist(spa_name, - ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) - (void) spa_destroy(spa_name); - -pool_props_bad: - nvlist_free(rootprops); - nvlist_free(zplprops); - nvlist_free(config); - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_pool_destroy(zfs_cmd_t *zc) -{ - int error; - zfs_log_history(zc); - error = spa_destroy(zc->zc_name); -#ifndef __FreeBSD__ - if (error == 0) - zvol_remove_minors(zc->zc_name); -#endif - return (error); -} - -static int -zfs_ioc_pool_import(zfs_cmd_t *zc) -{ - nvlist_t *config, *props = NULL; - uint64_t guid; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) != 0) - return (error); - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props))) { - nvlist_free(config); - return (error); - } - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || - guid != zc->zc_guid) - error = SET_ERROR(EINVAL); - else - error = spa_import(zc->zc_name, config, props, zc->zc_cookie); - - if (zc->zc_nvlist_dst != 0) { - int err; - - if ((err = put_nvlist(zc, config)) != 0) - error = err; - } - - nvlist_free(config); - - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_pool_export(zfs_cmd_t *zc) -{ - int error; - boolean_t force = (boolean_t)zc->zc_cookie; - boolean_t hardforce = (boolean_t)zc->zc_guid; - - zfs_log_history(zc); - error = spa_export(zc->zc_name, NULL, force, hardforce); -#ifndef __FreeBSD__ - if (error == 0) - zvol_remove_minors(zc->zc_name); -#endif - return (error); -} - -static int -zfs_ioc_pool_configs(zfs_cmd_t *zc) -{ - nvlist_t *configs; - int error; - - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); - - error = put_nvlist(zc, configs); - - nvlist_free(configs); - - return (error); -} - -/* - * inputs: - * zc_name name of the pool - * - * outputs: - * zc_cookie real errno - * 
zc_nvlist_dst config nvlist - * zc_nvlist_dst_size size of config nvlist - */ -static int -zfs_ioc_pool_stats(zfs_cmd_t *zc) -{ - nvlist_t *config; - int error; - int ret = 0; - - error = spa_get_stats(zc->zc_name, &config, zc->zc_value, - sizeof (zc->zc_value)); - - if (config != NULL) { - ret = put_nvlist(zc, config); - nvlist_free(config); - - /* - * The config may be present even if 'error' is non-zero. - * In this case we return success, and preserve the real errno - * in 'zc_cookie'. - */ - zc->zc_cookie = error; - } else { - ret = error; - } - - return (ret); -} - -/* - * Try to import the given pool, returning pool stats as appropriate so that - * user land knows which devices are available and overall pool health. - */ -static int -zfs_ioc_pool_tryimport(zfs_cmd_t *zc) -{ - nvlist_t *tryconfig, *config; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &tryconfig)) != 0) - return (error); - - config = spa_tryimport(tryconfig); - - nvlist_free(tryconfig); - - if (config == NULL) - return (SET_ERROR(EINVAL)); - - error = put_nvlist(zc, config); - nvlist_free(config); - - return (error); -} - -/* - * inputs: - * zc_name name of the pool - * zc_cookie scan func (pool_scan_func_t) - * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) - */ -static int -zfs_ioc_pool_scan(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) - return (SET_ERROR(EINVAL)); - - if (zc->zc_flags == POOL_SCRUB_PAUSE) - error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); - else if (zc->zc_cookie == POOL_SCAN_NONE) - error = spa_scan_stop(spa); - else - error = spa_scan(spa, zc->zc_cookie); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_freeze(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error == 0) { - spa_freeze(spa); - spa_close(spa, 
FTAG); - } - return (error); -} - -static int -zfs_ioc_pool_upgrade(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (zc->zc_cookie < spa_version(spa) || - !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - - spa_upgrade(spa, zc->zc_cookie); - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *hist_buf; - uint64_t size; - int error; - - if ((size = zc->zc_history_len) == 0) - return (SET_ERROR(EINVAL)); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - hist_buf = kmem_alloc(size, KM_SLEEP); - if ((error = spa_history_get(spa, &zc->zc_history_offset, - &zc->zc_history_len, hist_buf)) == 0) { - error = ddi_copyout(hist_buf, - (void *)(uintptr_t)zc->zc_history, - zc->zc_history_len, zc->zc_iflags); - } - - spa_close(spa, FTAG); - kmem_free(hist_buf, size); - return (error); -} - -static int -zfs_ioc_pool_reguid(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error == 0) { - error = spa_change_guid(spa); - spa_close(spa, FTAG); - } - return (error); -} - -static int -zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) -{ - return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_obj object to find - * - * outputs: - * zc_value name of object - */ -static int -zfs_ioc_obj_to_path(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - - /* XXX reading from objset not owned */ - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) - return (error); - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(EINVAL)); - } - error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, - sizeof 
(zc->zc_value)); - dmu_objset_rele(os, FTAG); - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_obj object to find - * - * outputs: - * zc_stat stats on object - * zc_value path to object - */ -static int -zfs_ioc_obj_to_stats(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - - /* XXX reading from objset not owned */ - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) - return (error); - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(EINVAL)); - } - error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, - sizeof (zc->zc_value)); - dmu_objset_rele(os, FTAG); - - return (error); -} - -static int -zfs_ioc_vdev_add(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *config, **l2cache, **spares; - uint_t nl2cache = 0, nspares = 0; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config); - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache); - - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, - &spares, &nspares); - -#ifdef illumos - /* - * A root pool with concatenated devices is not supported. - * Thus, can not add a device to a root pool. - * - * Intent log device can not be added to a rootpool because - * during mountroot, zil is replayed, a seperated log device - * can not be accessed during the mountroot time. - * - * l2cache and spare devices are ok to be added to a rootpool. 
- */ - if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { - nvlist_free(config); - spa_close(spa, FTAG); - return (SET_ERROR(EDOM)); - } -#endif /* illumos */ - - if (error == 0) { - error = spa_vdev_add(spa, config); - nvlist_free(config); - } - spa_close(spa, FTAG); - return (error); -} - -/* - * inputs: - * zc_name name of the pool - * zc_guid guid of vdev to remove - * zc_cookie cancel removal - */ -static int -zfs_ioc_vdev_remove(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - if (zc->zc_cookie != 0) { - error = spa_vdev_remove_cancel(spa); - } else { - error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); - } - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_set_state(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - vdev_state_t newstate = VDEV_STATE_UNKNOWN; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - switch (zc->zc_cookie) { - case VDEV_STATE_ONLINE: - error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); - break; - - case VDEV_STATE_OFFLINE: - error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); - break; - - case VDEV_STATE_FAULTED: - if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && - zc->zc_obj != VDEV_AUX_EXTERNAL) - zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; - - error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); - break; - - case VDEV_STATE_DEGRADED: - if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && - zc->zc_obj != VDEV_AUX_EXTERNAL) - zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; - - error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); - break; - - default: - error = SET_ERROR(EINVAL); - } - zc->zc_cookie = newstate; - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_attach(zfs_cmd_t *zc) -{ - spa_t *spa; - int replacing = zc->zc_cookie; - nvlist_t *config; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if ((error = get_nvlist(zc->zc_nvlist_conf, 
zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); - nvlist_free(config); - } - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_detach(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_split(zfs_cmd_t *zc) -{ - spa_t *spa; - nvlist_t *config, *props = NULL; - int error; - boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) { - spa_close(spa, FTAG); - return (error); - } - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props))) { - spa_close(spa, FTAG); - nvlist_free(config); - return (error); - } - - error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); - - spa_close(spa, FTAG); - - nvlist_free(config); - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_vdev_setpath(zfs_cmd_t *zc) -{ - spa_t *spa; - char *path = zc->zc_value; - uint64_t guid = zc->zc_guid; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = spa_vdev_setpath(spa, guid, path); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_setfru(zfs_cmd_t *zc) -{ - spa_t *spa; - char *fru = zc->zc_value; - uint64_t guid = zc->zc_guid; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = spa_vdev_setfru(spa, guid, fru); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) -{ - int error = 0; - nvlist_t *nv; - - dmu_objset_fast_stat(os, 
&zc->zc_objset_stats); - - if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv)) == 0) { - dmu_objset_stats(os, nv); - /* - * NB: zvol_get_stats() will read the objset contents, - * which we aren't supposed to do with a - * DS_MODE_USER hold, because it could be - * inconsistent. So this is a bit of a workaround... - * XXX reading with out owning - */ - if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { - error = zvol_get_stats(os, nv); - if (error == EIO) - return (error); - VERIFY0(error); - } - error = put_nvlist(zc, nv); - nvlist_free(nv); - } - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ -static int -zfs_ioc_objset_stats(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); - dmu_objset_rele(os, FTAG); - } - - if (error == ENOMEM) - error = 0; - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_nvlist_dst received property nvlist - * zc_nvlist_dst_size size of received property nvlist - * - * Gets received properties (distinct from local properties on or after - * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from - * local property values. - */ -static int -zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) -{ - int error = 0; - nvlist_t *nv; - - /* - * Without this check, we would return local property values if the - * caller has not already received properties on or after - * SPA_VERSION_RECVD_PROPS. 
- */ - if (!dsl_prop_get_hasrecvd(zc->zc_name)) - return (SET_ERROR(ENOTSUP)); - - if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { - error = put_nvlist(zc, nv); - nvlist_free(nv); - } - - return (error); -} - -static int -nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) -{ - uint64_t value; - int error; - - /* - * zfs_get_zplprop() will either find a value or give us - * the default value (if there is one). - */ - if ((error = zfs_get_zplprop(os, prop, &value)) != 0) - return (error); - VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for zpl property nvlist - * - * outputs: - * zc_nvlist_dst zpl property nvlist - * zc_nvlist_dst_size size of zpl property nvlist - */ -static int -zfs_ioc_objset_zplprops(zfs_cmd_t *zc) -{ - objset_t *os; - int err; - - /* XXX reading without owning */ - if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) - return (err); - - dmu_objset_fast_stat(os, &zc->zc_objset_stats); - - /* - * NB: nvl_add_zplprop() will read the objset contents, - * which we aren't supposed to do with a DS_MODE_USER - * hold, because it could be inconsistent. 
- */ - if (zc->zc_nvlist_dst != 0 && - !zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZFS) { - nvlist_t *nv; - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) - err = put_nvlist(zc, nv); - nvlist_free(nv); - } else { - err = SET_ERROR(ENOENT); - } - dmu_objset_rele(os, FTAG); - return (err); -} - -boolean_t -dataset_name_hidden(const char *name) -{ - /* - * Skip over datasets that are not visible in this zone, - * internal datasets (which have a $ in their name), and - * temporary datasets (which have a % in their name). - */ - if (strchr(name, '$') != NULL) - return (B_TRUE); - if (strchr(name, '%') != NULL) - return (B_TRUE); - if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) - return (B_TRUE); - return (B_FALSE); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_nvlist_src iteration range nvlist - * zc_nvlist_src_size size of iteration range nvlist - * - * outputs: - * zc_name name of next filesystem - * zc_cookie zap cursor - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ -static int -zfs_ioc_dataset_list_next(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - char *p; - size_t orig_len = strlen(zc->zc_name); - -top: - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { - if (error == ENOENT) - error = SET_ERROR(ESRCH); - return (error); - } - - p = strrchr(zc->zc_name, '/'); - if (p == NULL || p[1] != '\0') - (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); - p = zc->zc_name + strlen(zc->zc_name); - - do { - error = dmu_dir_list_next(os, - sizeof (zc->zc_name) - (p - zc->zc_name), p, - NULL, &zc->zc_cookie); - if (error == ENOENT) - error = SET_ERROR(ESRCH); - } 
while (error == 0 && dataset_name_hidden(zc->zc_name)); - dmu_objset_rele(os, FTAG); - - /* - * If it's an internal dataset (ie. with a '$' in its name), - * don't try to get stats for it, otherwise we'll return ENOENT. - */ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) { - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - if (error == ENOENT) { - /* We lost a race with destroy, get the next one. */ - zc->zc_name[orig_len] = '\0'; - goto top; - } - } - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_nvlist_dst_size size of buffer for property nvlist - * zc_simple when set, only name is requested - * - * outputs: - * zc_name name of next snapshot - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ -static int -zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) -{ - int error; - objset_t *os, *ossnap; - dsl_dataset_t *ds; - uint64_t min_txg = 0, max_txg = 0; - - if (zc->zc_nvlist_src_size != 0) { - nvlist_t *props = NULL; - error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props); - if (error != 0) - return (error); - (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, - &min_txg); - (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, - &max_txg); - nvlist_free(props); - } - - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error != 0) { - return (error == ENOENT ? ESRCH : error); - } - - /* - * A dataset name of maximum length cannot have any snapshots, - * so exit immediately. 
- */ - if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= - ZFS_MAX_DATASET_NAME_LEN) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(ESRCH)); - } - - while (error == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - error = SET_ERROR(EINTR); - break; - } - - error = dmu_snapshot_list_next(os, - sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, - &zc->zc_cookie, NULL); - if (error == ENOENT) { - error = SET_ERROR(ESRCH); - break; - } else if (error != 0) { - break; - } - - error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, - FTAG, &ds); - if (error != 0) - break; - - if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || - (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { - dsl_dataset_rele(ds, FTAG); - /* undo snapshot name append */ - *(strchr(zc->zc_name, '@') + 1) = '\0'; - /* skip snapshot */ - continue; - } - - if (zc->zc_simple) { - dsl_dataset_rele(ds, FTAG); - break; - } - - if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { - dsl_dataset_rele(ds, FTAG); - break; - } - if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { - dsl_dataset_rele(ds, FTAG); - break; - } - dsl_dataset_rele(ds, FTAG); - break; - } - - dmu_objset_rele(os, FTAG); - /* if we failed, undo the @ that we tacked on to zc_name */ - if (error != 0) - *strchr(zc->zc_name, '@') = '\0'; - return (error); -} - -static int -zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) -{ - const char *propname = nvpair_name(pair); - uint64_t *valary; - unsigned int vallen; - const char *domain; - char *dash; - zfs_userquota_prop_t type; - uint64_t rid; - uint64_t quota; - zfsvfs_t *zfsvfs; - int err; - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) != 0) - return (SET_ERROR(EINVAL)); - } - - /* - * A correctly constructed propname is encoded as - * userquota@-. 
- */ - if ((dash = strchr(propname, '-')) == NULL || - nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || - vallen != 3) - return (SET_ERROR(EINVAL)); - - domain = dash + 1; - type = valary[0]; - rid = valary[1]; - quota = valary[2]; - - err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); - if (err == 0) { - err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); - zfsvfs_rele(zfsvfs, FTAG); - } - - return (err); -} - -/* - * If the named property is one that has a special function to set its value, - * return 0 on success and a positive error code on failure; otherwise if it is - * not one of the special properties handled by this function, return -1. - * - * XXX: It would be better for callers of the property interface if we handled - * these special cases in dsl_prop.c (in the dsl layer). - */ -static int -zfs_prop_set_special(const char *dsname, zprop_source_t source, - nvpair_t *pair) -{ - const char *propname = nvpair_name(pair); - zfs_prop_t prop = zfs_name_to_prop(propname); - uint64_t intval; - int err = -1; - - if (prop == ZPROP_INVAL) { - if (zfs_prop_userquota(propname)) - return (zfs_prop_set_userquota(dsname, pair)); - return (-1); - } - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); - } - - if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) - return (-1); - - VERIFY(0 == nvpair_value_uint64(pair, &intval)); - - switch (prop) { - case ZFS_PROP_QUOTA: - err = dsl_dir_set_quota(dsname, source, intval); - break; - case ZFS_PROP_REFQUOTA: - err = dsl_dataset_set_refquota(dsname, source, intval); - break; - case ZFS_PROP_FILESYSTEM_LIMIT: - case ZFS_PROP_SNAPSHOT_LIMIT: - if (intval == UINT64_MAX) { - /* clearing the limit, just do it */ - err = 0; - } else { - err = dsl_dir_activate_fs_ss_limit(dsname); - } - /* - * Set err to -1 to force the zfs_set_prop_nvlist code down the - * default path to set the value in the 
nvlist. - */ - if (err == 0) - err = -1; - break; - case ZFS_PROP_RESERVATION: - err = dsl_dir_set_reservation(dsname, source, intval); - break; - case ZFS_PROP_REFRESERVATION: - err = dsl_dataset_set_refreservation(dsname, source, intval); - break; - case ZFS_PROP_VOLSIZE: - err = zvol_set_volsize(dsname, intval); - break; - case ZFS_PROP_VERSION: - { - zfsvfs_t *zfsvfs; - - if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) - break; - - err = zfs_set_version(zfsvfs, intval); - zfsvfs_rele(zfsvfs, FTAG); - - if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { - zfs_cmd_t *zc; - - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - (void) strcpy(zc->zc_name, dsname); - (void) zfs_ioc_userspace_upgrade(zc); - kmem_free(zc, sizeof (zfs_cmd_t)); - } - break; - } - default: - err = -1; - } - - return (err); -} - -/* - * This function is best effort. If it fails to set any of the given properties, - * it continues to set as many as it can and returns the last error - * encountered. If the caller provides a non-NULL errlist, it will be filled in - * with the list of names of all the properties that failed along with the - * corresponding error numbers. - * - * If every property is set successfully, zero is returned and errlist is not - * modified. 
- */ -int -zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, - nvlist_t *errlist) -{ - nvpair_t *pair; - nvpair_t *propval; - int rv = 0; - uint64_t intval; - char *strval; - nvlist_t *genericnvl = fnvlist_alloc(); - nvlist_t *retrynvl = fnvlist_alloc(); - -retry: - pair = NULL; - while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - zfs_prop_t prop = zfs_name_to_prop(propname); - int err = 0; - - /* decode the property value */ - propval = pair; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - attrs = fnvpair_value_nvlist(pair); - if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &propval) != 0) - err = SET_ERROR(EINVAL); - } - - /* Validate value type */ - if (err == 0 && prop == ZPROP_INVAL) { - if (zfs_prop_user(propname)) { - if (nvpair_type(propval) != DATA_TYPE_STRING) - err = SET_ERROR(EINVAL); - } else if (zfs_prop_userquota(propname)) { - if (nvpair_type(propval) != - DATA_TYPE_UINT64_ARRAY) - err = SET_ERROR(EINVAL); - } else { - err = SET_ERROR(EINVAL); - } - } else if (err == 0) { - if (nvpair_type(propval) == DATA_TYPE_STRING) { - if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) - err = SET_ERROR(EINVAL); - } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { - const char *unused; - - intval = fnvpair_value_uint64(propval); - - switch (zfs_prop_get_type(prop)) { - case PROP_TYPE_NUMBER: - break; - case PROP_TYPE_STRING: - err = SET_ERROR(EINVAL); - break; - case PROP_TYPE_INDEX: - if (zfs_prop_index_to_string(prop, - intval, &unused) != 0) - err = SET_ERROR(EINVAL); - break; - default: - cmn_err(CE_PANIC, - "unknown property type"); - } - } else { - err = SET_ERROR(EINVAL); - } - } - - /* Validate permissions */ - if (err == 0) - err = zfs_check_settable(dsname, pair, CRED()); - - if (err == 0) { - err = zfs_prop_set_special(dsname, source, pair); - if (err == -1) { - /* - * For better performance we build up a list of - * properties to set in a single 
transaction. - */ - err = nvlist_add_nvpair(genericnvl, pair); - } else if (err != 0 && nvl != retrynvl) { - /* - * This may be a spurious error caused by - * receiving quota and reservation out of order. - * Try again in a second pass. - */ - err = nvlist_add_nvpair(retrynvl, pair); - } - } - - if (err != 0) { - if (errlist != NULL) - fnvlist_add_int32(errlist, propname, err); - rv = err; - } - } - - if (nvl != retrynvl && !nvlist_empty(retrynvl)) { - nvl = retrynvl; - goto retry; - } - - if (!nvlist_empty(genericnvl) && - dsl_props_set(dsname, source, genericnvl) != 0) { - /* - * If this fails, we still want to set as many properties as we - * can, so try setting them individually. - */ - pair = NULL; - while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - int err = 0; - - propval = pair; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - attrs = fnvpair_value_nvlist(pair); - propval = fnvlist_lookup_nvpair(attrs, - ZPROP_VALUE); - } - - if (nvpair_type(propval) == DATA_TYPE_STRING) { - strval = fnvpair_value_string(propval); - err = dsl_prop_set_string(dsname, propname, - source, strval); - } else { - intval = fnvpair_value_uint64(propval); - err = dsl_prop_set_int(dsname, propname, source, - intval); - } - - if (err != 0) { - if (errlist != NULL) { - fnvlist_add_int32(errlist, propname, - err); - } - rv = err; - } - } - } - nvlist_free(genericnvl); - nvlist_free(retrynvl); - - return (rv); -} - -/* - * Check that all the properties are valid user properties. 
- */ -static int -zfs_check_userprops(nvlist_t *nvl) -{ - nvpair_t *pair = NULL; - - while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - - if (!zfs_prop_user(propname) || - nvpair_type(pair) != DATA_TYPE_STRING) - return (SET_ERROR(EINVAL)); - - if (strlen(propname) >= ZAP_MAXNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - - if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) - return (E2BIG); - } - return (0); -} - -static void -props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) -{ - nvpair_t *pair; - - VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - pair = NULL; - while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { - if (nvlist_exists(skipped, nvpair_name(pair))) - continue; - - VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); - } -} - -static int -clear_received_props(const char *dsname, nvlist_t *props, - nvlist_t *skipped) -{ - int err = 0; - nvlist_t *cleared_props = NULL; - props_skip(props, skipped, &cleared_props); - if (!nvlist_empty(cleared_props)) { - /* - * Acts on local properties until the dataset has received - * properties at least once on or after SPA_VERSION_RECVD_PROPS. - */ - zprop_source_t flags = (ZPROP_SRC_NONE | - (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); - err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); - } - nvlist_free(cleared_props); - return (err); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value name of property to set - * zc_nvlist_src{_size} nvlist of properties to apply - * zc_cookie received properties flag - * - * outputs: - * zc_nvlist_dst{_size} error for each unapplied received property - */ -static int -zfs_ioc_set_prop(zfs_cmd_t *zc) -{ - nvlist_t *nvl; - boolean_t received = zc->zc_cookie; - zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : - ZPROP_SRC_LOCAL); - nvlist_t *errors; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &nvl)) != 0) - return (error); - - if (received) { - nvlist_t *origprops; - - if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { - (void) clear_received_props(zc->zc_name, - origprops, nvl); - nvlist_free(origprops); - } - - error = dsl_prop_set_hasrecvd(zc->zc_name); - } - - errors = fnvlist_alloc(); - if (error == 0) - error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); - - if (zc->zc_nvlist_dst != 0 && errors != NULL) { - (void) put_nvlist(zc, errors); - } - - nvlist_free(errors); - nvlist_free(nvl); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value name of property to inherit - * zc_cookie revert to received value if TRUE - * - * outputs: none - */ -static int -zfs_ioc_inherit_prop(zfs_cmd_t *zc) -{ - const char *propname = zc->zc_value; - zfs_prop_t prop = zfs_name_to_prop(propname); - boolean_t received = zc->zc_cookie; - zprop_source_t source = (received - ? ZPROP_SRC_NONE /* revert to received value, if any */ - : ZPROP_SRC_INHERITED); /* explicitly inherit */ - - if (received) { - nvlist_t *dummy; - nvpair_t *pair; - zprop_type_t type; - int err; - - /* - * zfs_prop_set_special() expects properties in the form of an - * nvpair with type info. 
- */ - if (prop == ZPROP_INVAL) { - if (!zfs_prop_user(propname)) - return (SET_ERROR(EINVAL)); - - type = PROP_TYPE_STRING; - } else if (prop == ZFS_PROP_VOLSIZE || - prop == ZFS_PROP_VERSION) { - return (SET_ERROR(EINVAL)); - } else { - type = zfs_prop_get_type(prop); - } - - VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - switch (type) { - case PROP_TYPE_STRING: - VERIFY(0 == nvlist_add_string(dummy, propname, "")); - break; - case PROP_TYPE_NUMBER: - case PROP_TYPE_INDEX: - VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); - break; - default: - nvlist_free(dummy); - return (SET_ERROR(EINVAL)); - } - - pair = nvlist_next_nvpair(dummy, NULL); - err = zfs_prop_set_special(zc->zc_name, source, pair); - nvlist_free(dummy); - if (err != -1) - return (err); /* special property already handled */ - } else { - /* - * Only check this in the non-received case. We want to allow - * 'inherit -S' to revert non-inheritable properties like quota - * and reservation to the received or default values even though - * they are not considered inheritable. - */ - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) - return (SET_ERROR(EINVAL)); - } - - /* property name has been validated by zfs_secpolicy_inherit_prop() */ - return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); -} - -static int -zfs_ioc_pool_set_props(zfs_cmd_t *zc) -{ - nvlist_t *props; - spa_t *spa; - int error; - nvpair_t *pair; - - if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props)) - return (error); - - /* - * If the only property is the configfile, then just do a spa_lookup() - * to handle the faulted case. 
- */ - pair = nvlist_next_nvpair(props, NULL); - if (pair != NULL && strcmp(nvpair_name(pair), - zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && - nvlist_next_nvpair(props, pair) == NULL) { - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(zc->zc_name)) != NULL) { - spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); - } - mutex_exit(&spa_namespace_lock); - if (spa != NULL) { - nvlist_free(props); - return (0); - } - } - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - nvlist_free(props); - return (error); - } - - error = spa_prop_set(spa, props); - - nvlist_free(props); - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_props(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *nvp = NULL; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - /* - * If the pool is faulted, there may be properties we can still - * get (such as altroot and cachefile), so attempt to get them - * anyway. - */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(zc->zc_name)) != NULL) - error = spa_prop_get(spa, &nvp); - mutex_exit(&spa_namespace_lock); - } else { - error = spa_prop_get(spa, &nvp); - spa_close(spa, FTAG); - } - - if (error == 0 && zc->zc_nvlist_dst != 0) - error = put_nvlist(zc, nvp); - else - error = SET_ERROR(EFAULT); - - nvlist_free(nvp); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_src{_size} nvlist of delegated permissions - * zc_perm_action allow/unallow flag - * - * outputs: none - */ -static int -zfs_ioc_set_fsacl(zfs_cmd_t *zc) -{ - int error; - nvlist_t *fsaclnv = NULL; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &fsaclnv)) != 0) - return (error); - - /* - * Verify nvlist is constructed correctly - */ - if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { - nvlist_free(fsaclnv); - return (SET_ERROR(EINVAL)); - } - - /* - * If we don't have PRIV_SYS_MOUNT, then validate - * 
that user is allowed to hand out each permission in - * the nvlist(s) - */ - - error = secpolicy_zfs(CRED()); - if (error != 0) { - if (zc->zc_perm_action == B_FALSE) { - error = dsl_deleg_can_allow(zc->zc_name, - fsaclnv, CRED()); - } else { - error = dsl_deleg_can_unallow(zc->zc_name, - fsaclnv, CRED()); - } - } - - if (error == 0) - error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); - - nvlist_free(fsaclnv); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * - * outputs: - * zc_nvlist_src{_size} nvlist of delegated permissions - */ -static int -zfs_ioc_get_fsacl(zfs_cmd_t *zc) -{ - nvlist_t *nvp; - int error; - - if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { - error = put_nvlist(zc, nvp); - nvlist_free(nvp); - } - - return (error); -} - -/* ARGSUSED */ -static void -zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) -{ - zfs_creat_t *zct = arg; - - zfs_create_fs(os, cr, zct->zct_zplprops, tx); -} - -#define ZFS_PROP_UNDEFINED ((uint64_t)-1) - -/* - * inputs: - * os parent objset pointer (NULL if root fs) - * fuids_ok fuids allowed in this version of the spa? - * sa_ok SAs allowed in this version of the spa? - * createprops list of properties requested by creator - * - * outputs: - * zplprops values for the zplprops we attach to the master node object - * is_ci true if requested file system will be purely case-insensitive - * - * Determine the settings for utf8only, normalization and - * casesensitivity. Specific values may have been requested by the - * creator and/or we can inherit values from the parent dataset. If - * the file system is of too early a vintage, a creator can not - * request settings for these properties, even if the requested - * setting is the default value. We don't actually want to create dsl - * properties for these, so remove them from the source nvlist after - * processing. 
- */ -static int -zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, - boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, - nvlist_t *zplprops, boolean_t *is_ci) -{ - uint64_t sense = ZFS_PROP_UNDEFINED; - uint64_t norm = ZFS_PROP_UNDEFINED; - uint64_t u8 = ZFS_PROP_UNDEFINED; - - ASSERT(zplprops != NULL); - - /* parent dataset must be a filesystem */ - if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - - /* - * Pull out creator prop choices, if any. - */ - if (createprops) { - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE)); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_CASE), &sense); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_CASE)); - } - - /* - * If the zpl version requested is whacky or the file system - * or pool is version is too "young" to support normalization - * and the creator tried to set a value for one of the props, - * error out. 
- */ - if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || - (zplver >= ZPL_VERSION_FUID && !fuids_ok) || - (zplver >= ZPL_VERSION_SA && !sa_ok) || - (zplver < ZPL_VERSION_NORMALIZATION && - (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || - sense != ZFS_PROP_UNDEFINED))) - return (SET_ERROR(ENOTSUP)); - - /* - * Put the version in the zplprops - */ - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); - - /* - * If we're normalizing, names must always be valid UTF-8 strings. - */ - if (norm) - u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); - - if (is_ci) - *is_ci = (sense == ZFS_CASE_INSENSITIVE); - - return (0); -} - -static int -zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, - nvlist_t *zplprops, boolean_t *is_ci) -{ - boolean_t fuids_ok, sa_ok; - uint64_t zplver = ZPL_VERSION; - objset_t *os = NULL; - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - spa_t *spa; - uint64_t spa_vers; - int error; - - zfs_get_parent(dataset, parentname, sizeof (parentname)); - - if ((error = spa_open(dataset, &spa, FTAG)) != 0) - return (error); - - spa_vers = spa_version(spa); - spa_close(spa, FTAG); - - zplver = zfs_zpl_version_map(spa_vers); - fuids_ok = (zplver >= ZPL_VERSION_FUID); - sa_ok = (zplver >= ZPL_VERSION_SA); - - /* - * Open parent object set so we can inherit zplprop values. 
- */ - if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) - return (error); - - error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, - zplprops, is_ci); - dmu_objset_rele(os, FTAG); - return (error); -} - -static int -zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, - nvlist_t *zplprops, boolean_t *is_ci) -{ - boolean_t fuids_ok; - boolean_t sa_ok; - uint64_t zplver = ZPL_VERSION; - int error; - - zplver = zfs_zpl_version_map(spa_vers); - fuids_ok = (zplver >= ZPL_VERSION_FUID); - sa_ok = (zplver >= ZPL_VERSION_SA); - - error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, - createprops, zplprops, is_ci); - return (error); -} - -/* - * innvl: { - * "type" -> dmu_objset_type_t (int32) - * (optional) "props" -> { prop -> value } - * } - * - * outnvl: propname -> error code (int32) - */ - -static const zfs_ioc_key_t zfs_keys_create[] = { - {"type", DATA_TYPE_INT32, 0}, - {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, - {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error = 0; - zfs_creat_t zct = { 0 }; - nvlist_t *nvprops = NULL; - void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - dmu_objset_type_t type; - boolean_t is_insensitive = B_FALSE; - - type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); - (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - - switch (type) { - case DMU_OST_ZFS: - cbfunc = zfs_create_cb; - break; - - case DMU_OST_ZVOL: - cbfunc = zvol_create_cb; - break; - - default: - cbfunc = NULL; - break; - } - if (strchr(fsname, '@') || - strchr(fsname, '%')) - return (SET_ERROR(EINVAL)); - - zct.zct_props = nvprops; - - if (cbfunc == NULL) - return (SET_ERROR(EINVAL)); - - if (type == DMU_OST_ZVOL) { - uint64_t volsize, volblocksize; - - if (nvprops == NULL) - return (SET_ERROR(EINVAL)); - if (nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) 
- return (SET_ERROR(EINVAL)); - - if ((error = nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize)) != 0 && error != ENOENT) - return (SET_ERROR(EINVAL)); - - if (error != 0) - volblocksize = zfs_prop_default_numeric( - ZFS_PROP_VOLBLOCKSIZE); - - if ((error = zvol_check_volblocksize( - volblocksize)) != 0 || - (error = zvol_check_volsize(volsize, - volblocksize)) != 0) - return (error); - } else if (type == DMU_OST_ZFS) { - int error; - - /* - * We have to have normalization and - * case-folding flags correct when we do the - * file system creation, so go figure them out - * now. - */ - VERIFY(nvlist_alloc(&zct.zct_zplprops, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - error = zfs_fill_zplprops(fsname, nvprops, - zct.zct_zplprops, &is_insensitive); - if (error != 0) { - nvlist_free(zct.zct_zplprops); - return (error); - } - } - - error = dmu_objset_create(fsname, type, - is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); - nvlist_free(zct.zct_zplprops); - - /* - * It would be nice to do this atomically. - */ - if (error == 0) { - error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, - nvprops, outnvl); -#if defined(__FreeBSD__) && defined(_KERNEL) - /* - * Wait for ZVOL operations to settle down before destroying. 
- */ - if (error != 0) { - spa_t *spa; - - if (spa_open(fsname, &spa, FTAG) == 0) { - taskqueue_drain_all( - spa->spa_zvol_taskq->tq_queue); - spa_close(spa, FTAG); - } - } -#endif - if (error != 0) - (void) dsl_destroy_head(fsname); - } - return (error); -} - -/* - * innvl: { - * "origin" -> name of origin snapshot - * (optional) "props" -> { prop -> value } - * } - * - * outnvl: propname -> error code (int32) - */ -static const zfs_ioc_key_t zfs_keys_clone[] = { - {"origin", DATA_TYPE_STRING, 0}, - {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, - {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error = 0; - nvlist_t *nvprops = NULL; - char *origin_name; - - origin_name = fnvlist_lookup_string(innvl, "origin"); - (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - - if (strchr(fsname, '@') || - strchr(fsname, '%')) - return (SET_ERROR(EINVAL)); - - if (dataset_namecheck(origin_name, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - error = dmu_objset_clone(fsname, origin_name); - if (error != 0) - return (error); - - /* - * It would be nice to do this atomically. 
- */ - if (error == 0) { - error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, - nvprops, outnvl); - if (error != 0) - (void) dsl_destroy_head(fsname); - } - return (error); -} - -static const zfs_ioc_key_t zfs_keys_remap[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - if (strchr(fsname, '@') || - strchr(fsname, '%')) - return (SET_ERROR(EINVAL)); - - return (dmu_objset_remap_indirects(fsname)); -} - -/* - * innvl: { - * "snaps" -> { snapshot1, snapshot2 } - * (optional) "props" -> { prop -> value (string) } - * } - * - * outnvl: snapshot -> error code (int32) - */ -static const zfs_ioc_key_t zfs_keys_snapshot[] = { - {"snaps", DATA_TYPE_NVLIST, 0}, - {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - nvlist_t *snaps; - nvlist_t *props = NULL; - int error, poollen; - nvpair_t *pair; - - (void) nvlist_lookup_nvlist(innvl, "props", &props); - if (!nvlist_empty(props) && - zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) - return (SET_ERROR(ENOTSUP)); - if ((error = zfs_check_userprops(props)) != 0) - return (error); - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - poollen = strlen(poolname); - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - const char *name = nvpair_name(pair); - char *cp = strchr(name, '@'); - - /* - * The snap name must contain an @, and the part after it must - * contain only valid characters. - */ - if (cp == NULL || - zfs_component_namecheck(cp + 1, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - - /* - * The snap must be in the specified pool. - */ - if (strncmp(name, poolname, poollen) != 0 || - (name[poollen] != '/' && name[poollen] != '@')) - return (SET_ERROR(EXDEV)); - - /* - * Check for permission to set the properties on the fs. 
- */ - if (!nvlist_empty(props)) { - *cp = '\0'; - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED()); - *cp = '@'; - if (error != 0) - return (error); - } - - /* This must be the only snap of this fs. */ - for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); - pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { - if (strncmp(name, nvpair_name(pair2), cp - name + 1) - == 0) { - return (SET_ERROR(EXDEV)); - } - } - } - - error = dsl_dataset_snapshot(snaps, props, outnvl); - return (error); -} - -/* - * innvl: "message" -> string - */ -static const zfs_ioc_key_t zfs_keys_log_history[] = { - {"message", DATA_TYPE_STRING, 0}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) -{ - char *message; - spa_t *spa; - int error; - char *poolname; - - /* - * The poolname in the ioctl is not set, we get it from the TSD, - * which was set at the end of the last successful ioctl that allows - * logging. The secpolicy func already checked that it is set. - * Only one log ioctl is allowed after each successful ioctl, so - * we clear the TSD here. - */ - poolname = tsd_get(zfs_allow_log_key); - (void) tsd_set(zfs_allow_log_key, NULL); - error = spa_open(poolname, &spa, FTAG); - strfree(poolname); - if (error != 0) - return (error); - - message = fnvlist_lookup_string(innvl, "message"); - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - error = spa_history_log(spa, message); - spa_close(spa, FTAG); - return (error); -} - -/* - * This ioctl is used to set the bootenv configuration on the current - * pool. This configuration is stored in the second padding area of the label, - * and it is used by the GRUB bootloader used on Linux to store the contents - * of the grubenv file. The file is stored as raw ASCII, and is protected by - * an embedded checksum. 
By default, GRUB will check if the boot filesystem - * supports storing the environment data in a special location, and if so, - * will invoke filesystem specific logic to retrieve it. This can be overriden - * by a variable, should the user so desire. - */ -/* ARGSUSED */ -static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { - {"envmap", DATA_TYPE_STRING, 0}, -}; - -static int -zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) -{ - char *envmap; - int error; - spa_t *spa; - - envmap = fnvlist_lookup_string(innvl, "envmap"); - if ((error = spa_open(name, &spa, FTAG)) != 0) - return (error); - spa_vdev_state_enter(spa, SCL_ALL); - error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap); - (void) spa_vdev_state_exit(spa, NULL, 0); - spa_close(spa, FTAG); - return (error); -} - -static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { - /* no nvl keys */ -}; - - /* ARGSUSED */ -static int -zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) -{ - spa_t *spa; - int error; - - if ((error = spa_open(name, &spa, FTAG)) != 0) - return (error); - spa_vdev_state_enter(spa, SCL_ALL); - error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); - (void) spa_vdev_state_exit(spa, NULL, 0); - spa_close(spa, FTAG); - return (error); -} - -#ifdef __FreeBSD__ -static const zfs_ioc_key_t zfs_keys_nextboot[] = { - {"command", DATA_TYPE_STRING, 0}, - {ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0}, - {ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0} -}; - -static int -zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) -{ - char name[MAXNAMELEN]; - spa_t *spa; - vdev_t *vd; - char *command; - uint64_t pool_guid; - uint64_t vdev_guid; - int error; - - if (nvlist_lookup_uint64(innvl, - ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) - return (EINVAL); - if (nvlist_lookup_uint64(innvl, - ZPOOL_CONFIG_GUID, &vdev_guid) != 0) - return (EINVAL); - command = fnvlist_lookup_string(innvl, "command"); - - mutex_enter(&spa_namespace_lock); - spa 
= spa_by_guid(pool_guid, vdev_guid); - if (spa != NULL) - strcpy(name, spa_name(spa)); - mutex_exit(&spa_namespace_lock); - if (spa == NULL) - return (ENOENT); - - if ((error = spa_open(name, &spa, FTAG)) != 0) - return (error); - spa_vdev_state_enter(spa, SCL_ALL); - vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); - if (vd == NULL) { - (void) spa_vdev_state_exit(spa, NULL, ENXIO); - spa_close(spa, FTAG); - return (ENODEV); - } - error = vdev_label_write_pad2(vd, command, strlen(command)); - (void) spa_vdev_state_exit(spa, NULL, 0); - txg_wait_synced(spa->spa_dsl_pool, 0); - spa_close(spa, FTAG); - return (error); -} -#endif - -/* - * The dp_config_rwlock must not be held when calling this, because the - * unmount may need to write out data. - * - * This function is best-effort. Callers must deal gracefully if it - * remains mounted (or is remounted after this call). - * - * Returns 0 if the argument is not a snapshot, or it is not currently a - * filesystem, or we were able to unmount it. Returns error code otherwise. - */ -void -zfs_unmount_snap(const char *snapname) -{ - vfs_t *vfsp = NULL; - zfsvfs_t *zfsvfs = NULL; - - if (strchr(snapname, '@') == NULL) - return; - - int err = getzfsvfs(snapname, &zfsvfs); - if (err != 0) { - ASSERT3P(zfsvfs, ==, NULL); - return; - } - vfsp = zfsvfs->z_vfs; - - ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); - -#ifdef illumos - err = vn_vfswlock(vfsp->vfs_vnodecovered); - VFS_RELE(vfsp); - if (err != 0) - return; -#endif - - /* - * Always force the unmount for snapshots. - */ -#ifdef illumos - (void) dounmount(vfsp, MS_FORCE, kcred); -#else - vfs_ref(vfsp); - vfs_unbusy(vfsp); - (void) dounmount(vfsp, MS_FORCE, curthread); -#endif -} - -/* ARGSUSED */ -static int -zfs_unmount_snap_cb(const char *snapname, void *arg) -{ - zfs_unmount_snap(snapname); - return (0); -} - -/* - * When a clone is destroyed, its origin may also need to be destroyed, - * in which case it must be unmounted. 
This routine will do that unmount - * if necessary. - */ -void -zfs_destroy_unmount_origin(const char *fsname) -{ - int error; - objset_t *os; - dsl_dataset_t *ds; - - error = dmu_objset_hold(fsname, FTAG, &os); - if (error != 0) - return; - ds = dmu_objset_ds(os); - if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { - char originname[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(ds->ds_prev, originname); - dmu_objset_rele(os, FTAG); - zfs_unmount_snap(originname); - } else { - dmu_objset_rele(os, FTAG); - } -} - -/* - * innvl: { - * "snaps" -> { snapshot1, snapshot2 } - * (optional boolean) "defer" - * } - * - * outnvl: snapshot -> error code (int32) - * - */ -static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { - {"snaps", DATA_TYPE_NVLIST, 0}, - {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error, poollen; - nvlist_t *snaps; - nvpair_t *pair; - boolean_t defer; - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - defer = nvlist_exists(innvl, "defer"); - - poollen = strlen(poolname); - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - const char *name = nvpair_name(pair); - - /* - * The snap must be in the specified pool to prevent the - * invalid removal of zvol minors below. - */ - if (strncmp(name, poolname, poollen) != 0 || - (name[poollen] != '/' && name[poollen] != '@')) - return (SET_ERROR(EXDEV)); - - zfs_unmount_snap(nvpair_name(pair)); - } - - return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); -} - -/* - * Create bookmarks. Bookmark names are of the form #. - * All bookmarks must be in the same pool. 
- * - * innvl: { - * bookmark1 -> snapshot1, bookmark2 -> snapshot2 - * } - * - * outnvl: bookmark -> error code (int32) - * - */ -static const zfs_ioc_key_t zfs_keys_bookmark[] = { - {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *snap_name; - - /* - * Verify the snapshot argument. - */ - if (nvpair_value_string(pair, &snap_name) != 0) - return (SET_ERROR(EINVAL)); - - - /* Verify that the keys (bookmarks) are unique */ - for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); - pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { - if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) - return (SET_ERROR(EINVAL)); - } - } - - return (dsl_bookmark_create(innvl, outnvl)); -} - -/* - * innvl: { - * property 1, property 2, ... - * } - * - * outnvl: { - * bookmark name 1 -> { property 1, property 2, ... }, - * bookmark name 2 -> { property 1, property 2, ... 
} - * } - * - */ -static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { - {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, -}; - -static int -zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - return (dsl_get_bookmarks(fsname, innvl, outnvl)); -} - -/* - * innvl: { - * bookmark name 1, bookmark name 2 - * } - * - * outnvl: bookmark -> error code (int32) - * - */ -static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { - {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, -}; - -static int -zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, - nvlist_t *outnvl) -{ - int error, poollen; - - poollen = strlen(poolname); - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - const char *name = nvpair_name(pair); - const char *cp = strchr(name, '#'); - - /* - * The bookmark name must contain an #, and the part after it - * must contain only valid characters. - */ - if (cp == NULL || - zfs_component_namecheck(cp + 1, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - - /* - * The bookmark must be in the specified pool. 
- */ - if (strncmp(name, poolname, poollen) != 0 || - (name[poollen] != '/' && name[poollen] != '#')) - return (SET_ERROR(EXDEV)); - } - - error = dsl_bookmark_destroy(innvl, outnvl); - return (error); -} - -static const zfs_ioc_key_t zfs_keys_channel_program[] = { - {"program", DATA_TYPE_STRING, 0}, - {"arg", DATA_TYPE_ANY, 0}, - {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, - {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, - {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, - nvlist_t *outnvl) -{ - char *program; - uint64_t instrlimit, memlimit; - boolean_t sync_flag; - nvpair_t *nvarg = NULL; - - program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); - if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { - sync_flag = B_TRUE; - } - if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { - instrlimit = ZCP_DEFAULT_INSTRLIMIT; - } - if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { - memlimit = ZCP_DEFAULT_MEMLIMIT; - } - nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); - - if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) - return (EINVAL); - if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) - return (EINVAL); - - return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, - nvarg, outnvl)); -} - -/* - * innvl: unused - * outnvl: empty - */ -static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - return (spa_checkpoint(poolname)); -} - -/* - * innvl: unused - * outnvl: empty - */ -static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, - nvlist_t *outnvl) -{ - return (spa_checkpoint_discard(poolname)); -} - -/* - * inputs: - * 
zc_name name of dataset to destroy - * zc_defer_destroy mark for deferred destroy - * - * outputs: none - */ -static int -zfs_ioc_destroy(zfs_cmd_t *zc) -{ - objset_t *os; - dmu_objset_type_t ost; - int err; - - err = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (err != 0) - return (err); - ost = dmu_objset_type(os); - dmu_objset_rele(os, FTAG); - - if (ost == DMU_OST_ZFS) - zfs_unmount_snap(zc->zc_name); - - if (strchr(zc->zc_name, '@')) - err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); - else - err = dsl_destroy_head(zc->zc_name); -#ifndef __FreeBSD__ - if (ost == DMU_OST_ZVOL && err == 0) - (void) zvol_remove_minor(zc->zc_name); -#endif - return (err); -} - -/* - * innvl: { - * vdevs: { - * guid 1, guid 2, ... - * }, - * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} - * } - * - * outnvl: { - * [func: EINVAL (if provided command type didn't make sense)], - * [vdevs: { - * guid1: errno, (see function body for possible errnos) - * ... - * }] - * } - * - */ -static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { - {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, - {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} -}; - -static int -zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - spa_t *spa; - int error; - - error = spa_open(poolname, &spa, FTAG); - if (error != 0) - return (error); - - uint64_t cmd_type; - if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, - &cmd_type) != 0) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - if (!(cmd_type == POOL_INITIALIZE_CANCEL || - cmd_type == POOL_INITIALIZE_DO || - cmd_type == POOL_INITIALIZE_SUSPEND)) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - - nvlist_t *vdev_guids; - if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, - &vdev_guids) != 0) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - - nvlist_t *vdev_errlist = fnvlist_alloc(); - int total_errors = 0; - - for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); - 
pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { - uint64_t vdev_guid = fnvpair_value_uint64(pair); - - error = spa_vdev_initialize(spa, vdev_guid, cmd_type); - if (error != 0) { - char guid_as_str[MAXNAMELEN]; - - (void) snprintf(guid_as_str, sizeof (guid_as_str), - "%llu", (unsigned long long)vdev_guid); - fnvlist_add_int64(vdev_errlist, guid_as_str, error); - total_errors++; - } - } - if (fnvlist_size(vdev_errlist) > 0) { - fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, - vdev_errlist); - } - fnvlist_free(vdev_errlist); - - spa_close(spa, FTAG); - return (total_errors > 0 ? EINVAL : 0); -} - -/* - * fsname is name of dataset to rollback (to most recent snapshot) - * - * innvl may contain name of expected target snapshot - * - * outnvl: "target" -> name of most recent snapshot - * } - */ -static const zfs_ioc_key_t zfs_keys_rollback[] = { - {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - zfsvfs_t *zfsvfs; - char *target = NULL; - int error; - - (void) nvlist_lookup_string(innvl, "target", &target); - if (target != NULL) { - const char *cp = strchr(target, '@'); - - /* - * The snap name must contain an @, and the part after it must - * contain only valid characters. - */ - if (cp == NULL || - zfs_component_namecheck(cp + 1, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - } - - if (getzfsvfs(fsname, &zfsvfs) == 0) { - dsl_dataset_t *ds; - - ds = dmu_objset_ds(zfsvfs->z_os); - error = zfs_suspend_fs(zfsvfs); - if (error == 0) { - int resume_err; - - error = dsl_dataset_rollback(fsname, target, zfsvfs, - outnvl); - resume_err = zfs_resume_fs(zfsvfs, ds); - error = error ? 
error : resume_err; - } -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - error = dsl_dataset_rollback(fsname, target, NULL, outnvl); - } - return (error); -} - -static int -recursive_unmount(const char *fsname, void *arg) -{ - const char *snapname = arg; - char fullname[ZFS_MAX_DATASET_NAME_LEN]; - - (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); - zfs_unmount_snap(fullname); - - return (0); -} - -/* - * inputs: - * zc_name old name of dataset or bookmark - * zc_value new name of dataset or bookmark - * zc_cookie recursive flag (only valid for snapshots) - * - * outputs: none - */ -static int -zfs_ioc_rename(zfs_cmd_t *zc) -{ - objset_t *os; - dmu_objset_type_t ost; - boolean_t recursive = zc->zc_cookie & 1; - char *pos, *pos2; - boolean_t allow_mounted = B_TRUE; - int err; - -#ifdef __FreeBSD__ - allow_mounted = (zc->zc_cookie & 2) != 0; -#endif - - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - - pos = strchr(zc->zc_name, '#'); - if (pos != NULL) { - /* Bookmarks must be in same fs. */ - pos2 = strchr(zc->zc_value, '#'); - if (pos2 == NULL) - return (SET_ERROR(EINVAL)); - - /* Recursive flag is not supported yet. 
*/ - if (recursive) - return (SET_ERROR(ENOTSUP)); - - *pos = '\0'; - *pos2 = '\0'; - if (strcmp(zc->zc_name, zc->zc_value) == 0) { - err = dsl_bookmark_rename(zc->zc_name, - pos + 1, pos2 + 1); - } else { - err = SET_ERROR(EXDEV); - } - *pos = '#'; - *pos2 = '#'; - return (err); - } - - /* "zfs rename" from and to ...%recv datasets should both fail */ - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || - dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) - return (SET_ERROR(EINVAL)); - - err = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (err != 0) - return (err); - ost = dmu_objset_type(os); - dmu_objset_rele(os, FTAG); - - pos = strchr(zc->zc_name, '@'); - if (pos != NULL) { - /* Snapshots must be in same fs. */ - pos2 = strchr(zc->zc_value, '@'); - if (pos2 == NULL) - return (SET_ERROR(EINVAL)); - *pos = '\0'; - *pos2 = '\0'; - if (strcmp(zc->zc_name, zc->zc_value) != 0) { - err = SET_ERROR(EXDEV); - } else { - if (ost == DMU_OST_ZFS && !allow_mounted) { - err = dmu_objset_find(zc->zc_name, - recursive_unmount, pos + 1, - recursive ? 
DS_FIND_CHILDREN : 0); - } - if (err == 0) { - err = dsl_dataset_rename_snapshot(zc->zc_name, - pos + 1, pos2 + 1, recursive); - } - } - *pos = '@'; - *pos2 = '@'; - return (err); - } else { -#ifdef illumos - if (ost == DMU_OST_ZVOL) - (void) zvol_remove_minor(zc->zc_name); -#endif - return (dsl_dir_rename(zc->zc_name, zc->zc_value)); - } -} - -static int -zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) -{ - const char *propname = nvpair_name(pair); - boolean_t issnap = (strchr(dsname, '@') != NULL); - zfs_prop_t prop = zfs_name_to_prop(propname); - uint64_t intval; - int err; - - if (prop == ZPROP_INVAL) { - if (zfs_prop_user(propname)) { - if (err = zfs_secpolicy_write_perms(dsname, - ZFS_DELEG_PERM_USERPROP, cr)) - return (err); - return (0); - } - - if (!issnap && zfs_prop_userquota(propname)) { - const char *perm = NULL; - const char *uq_prefix = - zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; - const char *gq_prefix = - zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; - - if (strncmp(propname, uq_prefix, - strlen(uq_prefix)) == 0) { - perm = ZFS_DELEG_PERM_USERQUOTA; - } else if (strncmp(propname, gq_prefix, - strlen(gq_prefix)) == 0) { - perm = ZFS_DELEG_PERM_GROUPQUOTA; - } else { - /* USERUSED and GROUPUSED are read-only */ - return (SET_ERROR(EINVAL)); - } - - if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) - return (err); - return (0); - } - - return (SET_ERROR(EINVAL)); - } - - if (issnap) - return (SET_ERROR(EINVAL)); - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - /* - * dsl_prop_get_all_impl() returns properties in this - * format. - */ - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); - } - - /* - * Check that this value is valid for this pool version - */ - switch (prop) { - case ZFS_PROP_COMPRESSION: - /* - * If the user specified gzip compression, make sure - * the SPA supports it. 
We ignore any errors here since - * we'll catch them later. - */ - if (nvpair_value_uint64(pair, &intval) == 0) { - if (intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9 && - zfs_earlier_version(dsname, - SPA_VERSION_GZIP_COMPRESSION)) { - return (SET_ERROR(ENOTSUP)); - } - - if (intval == ZIO_COMPRESS_ZLE && - zfs_earlier_version(dsname, - SPA_VERSION_ZLE_COMPRESSION)) - return (SET_ERROR(ENOTSUP)); - - if (intval == ZIO_COMPRESS_LZ4) { - spa_t *spa; - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_LZ4_COMPRESS)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - } - - /* - * If this is a bootable dataset then - * verify that the compression algorithm - * is supported for booting. We must return - * something other than ENOTSUP since it - * implies a downrev pool version. - */ - if (zfs_is_bootfs(dsname) && - !BOOTFS_COMPRESS_VALID(intval)) { - return (SET_ERROR(ERANGE)); - } - } - break; - - case ZFS_PROP_COPIES: - if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - break; - - case ZFS_PROP_RECORDSIZE: - /* Record sizes above 128k need the feature to be enabled */ - if (nvpair_value_uint64(pair, &intval) == 0 && - intval > SPA_OLD_MAXBLOCKSIZE) { - spa_t *spa; - - /* - * We don't allow setting the property above 1MB, - * unless the tunable has been changed. 
- */ - if (intval > zfs_max_recordsize || - intval > SPA_MAXBLOCKSIZE) - return (SET_ERROR(ERANGE)); - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_LARGE_BLOCKS)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - } - break; - - case ZFS_PROP_DNODESIZE: - /* Dnode sizes above 512 need the feature to be enabled */ - if (nvpair_value_uint64(pair, &intval) == 0 && - intval != ZFS_DNSIZE_LEGACY) { - spa_t *spa; - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_LARGE_DNODE)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - } - break; - - case ZFS_PROP_SPECIAL_SMALL_BLOCKS: - /* - * This property could require the allocation classes - * feature to be active for setting, however we allow - * it so that tests of settable properties succeed. - * The CLI will issue a warning in this case. 
- */ - break; - - case ZFS_PROP_SHARESMB: - if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) - return (SET_ERROR(ENOTSUP)); - break; - - case ZFS_PROP_ACLINHERIT: - if (nvpair_type(pair) == DATA_TYPE_UINT64 && - nvpair_value_uint64(pair, &intval) == 0) { - if (intval == ZFS_ACL_PASSTHROUGH_X && - zfs_earlier_version(dsname, - SPA_VERSION_PASSTHROUGH_X)) - return (SET_ERROR(ENOTSUP)); - } - break; - - case ZFS_PROP_CHECKSUM: - case ZFS_PROP_DEDUP: - { - spa_feature_t feature; - spa_t *spa; - - /* dedup feature version checks */ - if (prop == ZFS_PROP_DEDUP && - zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) - return (SET_ERROR(ENOTSUP)); - - if (nvpair_value_uint64(pair, &intval) != 0) - return (SET_ERROR(EINVAL)); - - /* check prop value is enabled in features */ - feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK); - if (feature == SPA_FEATURE_NONE) - break; - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, feature)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - break; - } - } - - return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); -} - -/* - * Checks for a race condition to make sure we don't increment a feature flag - * multiple times. - */ -static int -zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_feature_t *featurep = arg; - - if (!spa_feature_is_active(spa, *featurep)) - return (0); - else - return (SET_ERROR(EBUSY)); -} - -/* - * The callback invoked on feature activation in the sync task caused by - * zfs_prop_activate_feature. - */ -static void -zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_feature_t *featurep = arg; - - spa_feature_incr(spa, *featurep, tx); -} - -/* - * Activates a feature on a pool in response to a property setting. 
This - * creates a new sync task which modifies the pool to reflect the feature - * as being active. - */ -static int -zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) -{ - int err; - - /* EBUSY here indicates that the feature is already active */ - err = dsl_sync_task(spa_name(spa), - zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, - &feature, 2, ZFS_SPACE_CHECK_RESERVED); - - if (err != 0 && err != EBUSY) - return (err); - else - return (0); -} - -/* - * Removes properties from the given props list that fail permission checks - * needed to clear them and to restore them in case of a receive error. For each - * property, make sure we have both set and inherit permissions. - * - * Returns the first error encountered if any permission checks fail. If the - * caller provides a non-NULL errlist, it also gives the complete list of names - * of all the properties that failed a permission check along with the - * corresponding error numbers. The caller is responsible for freeing the - * returned errlist. - * - * If every property checks out successfully, zero is returned and the list - * pointed at by errlist is NULL. 
- */ -static int -zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) -{ - zfs_cmd_t *zc; - nvpair_t *pair, *next_pair; - nvlist_t *errors; - int err, rv = 0; - - if (props == NULL) - return (0); - - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); - (void) strcpy(zc->zc_name, dataset); - pair = nvlist_next_nvpair(props, NULL); - while (pair != NULL) { - next_pair = nvlist_next_nvpair(props, pair); - - (void) strcpy(zc->zc_value, nvpair_name(pair)); - if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || - (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { - VERIFY(nvlist_remove_nvpair(props, pair) == 0); - VERIFY(nvlist_add_int32(errors, - zc->zc_value, err) == 0); - } - pair = next_pair; - } - kmem_free(zc, sizeof (zfs_cmd_t)); - - if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { - nvlist_free(errors); - errors = NULL; - } else { - VERIFY(nvpair_value_int32(pair, &rv) == 0); - } - - if (errlist == NULL) - nvlist_free(errors); - else - *errlist = errors; - - return (rv); -} - -static boolean_t -propval_equals(nvpair_t *p1, nvpair_t *p2) -{ - if (nvpair_type(p1) == DATA_TYPE_NVLIST) { - /* dsl_prop_get_all_impl() format */ - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &p1) == 0); - } - - if (nvpair_type(p2) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &p2) == 0); - } - - if (nvpair_type(p1) != nvpair_type(p2)) - return (B_FALSE); - - if (nvpair_type(p1) == DATA_TYPE_STRING) { - char *valstr1, *valstr2; - - VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); - VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); - return (strcmp(valstr1, valstr2) == 0); - } else { - uint64_t intval1, intval2; - - VERIFY(nvpair_value_uint64(p1, &intval1) == 0); - VERIFY(nvpair_value_uint64(p2, 
&intval2) == 0); - return (intval1 == intval2); - } -} - -/* - * Remove properties from props if they are not going to change (as determined - * by comparison with origprops). Remove them from origprops as well, since we - * do not need to clear or restore properties that won't change. - */ -static void -props_reduce(nvlist_t *props, nvlist_t *origprops) -{ - nvpair_t *pair, *next_pair; - - if (origprops == NULL) - return; /* all props need to be received */ - - pair = nvlist_next_nvpair(props, NULL); - while (pair != NULL) { - const char *propname = nvpair_name(pair); - nvpair_t *match; - - next_pair = nvlist_next_nvpair(props, pair); - - if ((nvlist_lookup_nvpair(origprops, propname, - &match) != 0) || !propval_equals(pair, match)) - goto next; /* need to set received value */ - - /* don't clear the existing received value */ - (void) nvlist_remove_nvpair(origprops, match); - /* don't bother receiving the property */ - (void) nvlist_remove_nvpair(props, pair); -next: - pair = next_pair; - } -} - -/* - * Extract properties that cannot be set PRIOR to the receipt of a dataset. - * For example, refquota cannot be set until after the receipt of a dataset, - * because in replication streams, an older/earlier snapshot may exceed the - * refquota. We want to receive the older/earlier snapshot, but setting - * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent - * the older/earlier snapshot from being received (with EDQUOT). - * - * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. - * - * libzfs will need to be judicious handling errors encountered by props - * extracted by this function. 
- */ -static nvlist_t * -extract_delay_props(nvlist_t *props) -{ - nvlist_t *delayprops; - nvpair_t *nvp, *tmp; - static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; - int i; - - VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; - nvp = nvlist_next_nvpair(props, nvp)) { - /* - * strcmp() is safe because zfs_prop_to_name() always returns - * a bounded string. - */ - for (i = 0; delayable[i] != 0; i++) { - if (strcmp(zfs_prop_to_name(delayable[i]), - nvpair_name(nvp)) == 0) { - break; - } - } - if (delayable[i] != 0) { - tmp = nvlist_prev_nvpair(props, nvp); - VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); - VERIFY(nvlist_remove_nvpair(props, nvp) == 0); - nvp = tmp; - } - } - - if (nvlist_empty(delayprops)) { - nvlist_free(delayprops); - delayprops = NULL; - } - return (delayprops); -} - -#ifdef DEBUG -static boolean_t zfs_ioc_recv_inject_err; -#endif - -/* - * inputs: - * zc_name name of containing filesystem - * zc_nvlist_src{_size} nvlist of properties to apply - * zc_value name of snapshot to create - * zc_string name of clone origin (if DRR_FLAG_CLONE) - * zc_cookie file descriptor to recv from - * zc_begin_record the BEGIN record of the stream (not byteswapped) - * zc_guid force flag - * zc_cleanup_fd cleanup-on-exit file descriptor - * zc_action_handle handle for this guid/ds mapping (or zero on first call) - * zc_resumable if data is incomplete assume sender will resume - * - * outputs: - * zc_cookie number of bytes read - * zc_nvlist_dst{_size} error for each unapplied received property - * zc_obj zprop_errflags_t - * zc_action_handle handle for this guid/ds mapping - */ -static int -zfs_ioc_recv(zfs_cmd_t *zc) -{ - file_t *fp; - dmu_recv_cookie_t drc; - boolean_t force = (boolean_t)zc->zc_guid; - int fd; - int error = 0; - int props_error = 0; - nvlist_t *errors; - offset_t off; - nvlist_t *props = NULL; /* sent properties */ - nvlist_t *origprops = NULL; /* existing 
properties */ - nvlist_t *delayprops = NULL; /* sent properties applied post-receive */ - char *origin = NULL; - char *tosnap; - char tofs[ZFS_MAX_DATASET_NAME_LEN]; - boolean_t first_recvd_props = B_FALSE; - - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) - return (SET_ERROR(EINVAL)); - - (void) strcpy(tofs, zc->zc_value); - tosnap = strchr(tofs, '@'); - *tosnap++ = '\0'; - - if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props)) != 0) - return (error); - - fd = zc->zc_cookie; -#ifdef illumos - fp = getf(fd); -#else - fget_read(curthread, fd, &cap_pread_rights, &fp); -#endif - if (fp == NULL) { - nvlist_free(props); - return (SET_ERROR(EBADF)); - } - - errors = fnvlist_alloc(); - - if (zc->zc_string[0]) - origin = zc->zc_string; - - error = dmu_recv_begin(tofs, tosnap, - &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); - if (error != 0) - goto out; - - /* - * Set properties before we receive the stream so that they are applied - * to the new data. Note that we must call dmu_recv_stream() if - * dmu_recv_begin() succeeds. - */ - if (props != NULL && !drc.drc_newfs) { - if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= - SPA_VERSION_RECVD_PROPS && - !dsl_prop_get_hasrecvd(tofs)) - first_recvd_props = B_TRUE; - - /* - * If new received properties are supplied, they are to - * completely replace the existing received properties, so stash - * away the existing ones. - */ - if (dsl_prop_get_received(tofs, &origprops) == 0) { - nvlist_t *errlist = NULL; - /* - * Don't bother writing a property if its value won't - * change (and avoid the unnecessary security checks). - * - * The first receive after SPA_VERSION_RECVD_PROPS is a - * special case where we blow away all local properties - * regardless. 
- */ - if (!first_recvd_props) - props_reduce(props, origprops); - if (zfs_check_clearable(tofs, origprops, &errlist) != 0) - (void) nvlist_merge(errors, errlist, 0); - nvlist_free(errlist); - - if (clear_received_props(tofs, origprops, - first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } else { - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } - } - - if (props != NULL) { - props_error = dsl_prop_set_hasrecvd(tofs); - - if (props_error == 0) { - delayprops = extract_delay_props(props); - (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - props, errors); - } - } - - off = fp->f_offset; - error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, - &zc->zc_action_handle); - - if (error == 0) { - zfsvfs_t *zfsvfs = NULL; - - if (getzfsvfs(tofs, &zfsvfs) == 0) { - /* online recv */ - dsl_dataset_t *ds; - int end_err; - - ds = dmu_objset_ds(zfsvfs->z_os); - error = zfs_suspend_fs(zfsvfs); - /* - * If the suspend fails, then the recv_end will - * likely also fail, and clean up after itself. - */ - end_err = dmu_recv_end(&drc, zfsvfs); - if (error == 0) - error = zfs_resume_fs(zfsvfs, ds); - error = error ? error : end_err; -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - error = dmu_recv_end(&drc, NULL); - } - - /* Set delayed properties now, after we're done receiving. */ - if (delayprops != NULL && error == 0) { - (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - delayprops, errors); - } - } - - if (delayprops != NULL) { - /* - * Merge delayed props back in with initial props, in case - * we're DEBUG and zfs_ioc_recv_inject_err is set (which means - * we have to make sure clear_received_props() includes - * the delayed properties). - * - * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, - * using ASSERT() will be just like a VERIFY. 
- */ - ASSERT(nvlist_merge(props, delayprops, 0) == 0); - nvlist_free(delayprops); - } - - /* - * Now that all props, initial and delayed, are set, report the prop - * errors to the caller. - */ - if (zc->zc_nvlist_dst_size != 0 && - (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || - put_nvlist(zc, errors) != 0)) { - /* - * Caller made zc->zc_nvlist_dst less than the minimum expected - * size or supplied an invalid address. - */ - props_error = SET_ERROR(EINVAL); - } - - zc->zc_cookie = off - fp->f_offset; - if (off >= 0 && off <= MAXOFFSET_T) - fp->f_offset = off; - -#ifdef DEBUG - if (zfs_ioc_recv_inject_err) { - zfs_ioc_recv_inject_err = B_FALSE; - error = 1; - } -#endif - - /* - * On error, restore the original props. - */ - if (error != 0 && props != NULL && !drc.drc_newfs) { - if (clear_received_props(tofs, props, NULL) != 0) { - /* - * We failed to clear the received properties. - * Since we may have left a $recvd value on the - * system, we can't clear the $hasrecvd flag. - */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } else if (first_recvd_props) { - dsl_prop_unset_hasrecvd(tofs); - } - - if (origprops == NULL && !drc.drc_newfs) { - /* We failed to stash the original properties. */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } - - /* - * dsl_props_set() will not convert RECEIVED to LOCAL on or - * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL - * explictly if we're restoring local properties cleared in the - * first new-style receive. - */ - if (origprops != NULL && - zfs_set_prop_nvlist(tofs, (first_recvd_props ? - ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), - origprops, NULL) != 0) { - /* - * We stashed the original properties but failed to - * restore them. 
- */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } - } -out: - nvlist_free(props); - nvlist_free(origprops); - nvlist_free(errors); - releasef(fd); - - if (error == 0) - error = props_error; - - return (error); -} - -/* - * inputs: - * zc_name name of snapshot to send - * zc_cookie file descriptor to send stream to - * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) - * zc_sendobj objsetid of snapshot to send - * zc_fromobj objsetid of incremental fromsnap (may be zero) - * zc_guid if set, estimate size of stream only. zc_cookie is ignored. - * output size in zc_objset_type. - * zc_flags lzc_send_flags - * - * outputs: - * zc_objset_type estimated size, if zc_guid is set - * - * NOTE: This is no longer the preferred interface, any new functionality - * should be added to zfs_ioc_send_new() instead. - */ -static int -zfs_ioc_send(zfs_cmd_t *zc) -{ - int error; - offset_t off; - boolean_t estimate = (zc->zc_guid != 0); - boolean_t embedok = (zc->zc_flags & 0x1); - boolean_t large_block_ok = (zc->zc_flags & 0x2); - boolean_t compressok = (zc->zc_flags & 0x4); - - if (zc->zc_obj != 0) { - dsl_pool_t *dp; - dsl_dataset_t *tosnap; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (dsl_dir_is_clone(tosnap->ds_dir)) - zc->zc_fromobj = - dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - } - - if (estimate) { - dsl_pool_t *dp; - dsl_dataset_t *tosnap; - dsl_dataset_t *fromsnap = NULL; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (zc->zc_fromobj != 0) { - error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, - FTAG, &fromsnap); - if (error != 0) { - 
dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - } - - error = dmu_send_estimate(tosnap, fromsnap, compressok, - &zc->zc_objset_type); - - if (fromsnap != NULL) - dsl_dataset_rele(fromsnap, FTAG); - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - } else { - file_t *fp; - -#ifdef illumos - fp = getf(zc->zc_cookie); -#else - fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp); -#endif - if (fp == NULL) - return (SET_ERROR(EBADF)); - - off = fp->f_offset; - error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, embedok, large_block_ok, compressok, -#ifdef illumos - zc->zc_cookie, fp->f_vnode, &off); -#else - zc->zc_cookie, fp, &off); -#endif - - if (off >= 0 && off <= MAXOFFSET_T) - fp->f_offset = off; - releasef(zc->zc_cookie); - } - return (error); -} - -/* - * inputs: - * zc_name name of snapshot on which to report progress - * zc_cookie file descriptor of send stream - * - * outputs: - * zc_cookie number of bytes written in send stream thus far - */ -static int -zfs_ioc_send_progress(zfs_cmd_t *zc) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - dmu_sendarg_t *dsp = NULL; - int error; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - mutex_enter(&ds->ds_sendstream_lock); - - /* - * Iterate over all the send streams currently active on this dataset. - * If there's one which matches the specified file descriptor _and_ the - * stream was started by the current process, return the progress of - * that stream. 
- */ - for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; - dsp = list_next(&ds->ds_sendstreams, dsp)) { - if (dsp->dsa_outfd == zc->zc_cookie && - dsp->dsa_proc == curproc) - break; - } - - if (dsp != NULL) - zc->zc_cookie = *(dsp->dsa_off); - else - error = SET_ERROR(ENOENT); - - mutex_exit(&ds->ds_sendstream_lock); - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -static int -zfs_ioc_inject_fault(zfs_cmd_t *zc) -{ - int id, error; - - error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, - &zc->zc_inject_record); - - if (error == 0) - zc->zc_guid = (uint64_t)id; - - return (error); -} - -static int -zfs_ioc_clear_fault(zfs_cmd_t *zc) -{ - return (zio_clear_fault((int)zc->zc_guid)); -} - -static int -zfs_ioc_inject_list_next(zfs_cmd_t *zc) -{ - int id = (int)zc->zc_guid; - int error; - - error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), - &zc->zc_inject_record); - - zc->zc_guid = id; - - return (error); -} - -static int -zfs_ioc_error_log(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, - &count); - if (error == 0) - zc->zc_nvlist_dst_size = count; - else - zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_clear(zfs_cmd_t *zc) -{ - spa_t *spa; - vdev_t *vd; - int error; - - /* - * On zpool clear we also fix up missing slogs - */ - mutex_enter(&spa_namespace_lock); - spa = spa_lookup(zc->zc_name); - if (spa == NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EIO)); - } - if (spa_get_log_state(spa) == SPA_LOG_MISSING) { - /* we need to let spa_open/spa_load clear the chains */ - spa_set_log_state(spa, SPA_LOG_CLEAR); - } - spa->spa_last_open_failed = 0; - mutex_exit(&spa_namespace_lock); - - if (zc->zc_cookie & ZPOOL_NO_REWIND) { - 
error = spa_open(zc->zc_name, &spa, FTAG); - } else { - nvlist_t *policy; - nvlist_t *config = NULL; - - if (zc->zc_nvlist_src == 0) - return (SET_ERROR(EINVAL)); - - if ((error = get_nvlist(zc->zc_nvlist_src, - zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { - error = spa_open_rewind(zc->zc_name, &spa, FTAG, - policy, &config); - if (config != NULL) { - int err; - - if ((err = put_nvlist(zc, config)) != 0) - error = err; - nvlist_free(config); - } - nvlist_free(policy); - } - } - - if (error != 0) - return (error); - - /* - * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. - */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); - - spa_vdev_state_enter(spa, SCL_NONE); - - if (zc->zc_guid == 0) { - vd = NULL; - } else { - vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); - if (vd == NULL) { - (void) spa_vdev_state_exit(spa, NULL, ENODEV); - spa_close(spa, FTAG); - return (SET_ERROR(ENODEV)); - } - } - - vdev_clear(spa, vd); - - (void) spa_vdev_state_exit(spa, NULL, 0); - - /* - * Resume any suspended I/Os. - */ - if (zio_resume(spa) != 0) - error = SET_ERROR(EIO); - - spa_close(spa, FTAG); - - return (error); -} - -/* - * Reopen all the vdevs associated with the pool. - * - * innvl: { - * "scrub_restart" -> when true and scrub is running, allow to restart - * scrub as the side effect of the reopen (boolean). 
- * } - * - * outnvl is unused - */ -static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { - {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) -{ - spa_t *spa; - int error; - boolean_t scrub_restart = B_TRUE; - - if (innvl) { - scrub_restart = fnvlist_lookup_boolean_value(innvl, - "scrub_restart"); - } - - error = spa_open(pool, &spa, FTAG); - if (error != 0) - return (error); - - spa_vdev_state_enter(spa, SCL_NONE); - - /* - * If a resilver is already in progress then set the - * spa_scrub_reopen flag to B_TRUE so that we don't restart - * the scan as a side effect of the reopen. Otherwise, let - * vdev_open() decided if a resilver is required. - */ - spa->spa_scrub_reopen = (!scrub_restart && - dsl_scan_resilvering(spa->spa_dsl_pool)); - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - - (void) spa_vdev_state_exit(spa, NULL, 0); - spa_close(spa, FTAG); - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * - * outputs: - * zc_string name of conflicting snapshot, if there is one - */ -static int -zfs_ioc_promote(zfs_cmd_t *zc) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds, *ods; - char origin[ZFS_MAX_DATASET_NAME_LEN]; - char *cp; - int error; - - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || - strchr(zc->zc_name, '%')) - return (SET_ERROR(EINVAL)); - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (!dsl_dir_is_clone(ds->ds_dir)) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - 
dsl_dataset_name(ods, origin); - dsl_dataset_rele(ods, FTAG); - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - - /* - * We don't need to unmount *all* the origin fs's snapshots, but - * it's easier. - */ - cp = strchr(origin, '@'); - if (cp) - *cp = '\0'; - (void) dmu_objset_find(origin, - zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); -} - -/* - * Retrieve a single {user|group}{used|quota}@... property. - * - * inputs: - * zc_name name of filesystem - * zc_objset_type zfs_userquota_prop_t - * zc_value domain name (eg. "S-1-234-567-89") - * zc_guid RID/UID/GID - * - * outputs: - * zc_cookie property value - */ -static int -zfs_ioc_userspace_one(zfs_cmd_t *zc) -{ - zfsvfs_t *zfsvfs; - int error; - - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (SET_ERROR(EINVAL)); - - error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); - if (error != 0) - return (error); - - error = zfs_userspace_one(zfsvfs, - zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); - zfsvfs_rele(zfsvfs, FTAG); - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_objset_type zfs_userquota_prop_t - * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) - * - * outputs: - * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) - * zc_cookie zap cursor - */ -static int -zfs_ioc_userspace_many(zfs_cmd_t *zc) -{ - zfsvfs_t *zfsvfs; - int bufsize = zc->zc_nvlist_dst_size; - - if (bufsize <= 0) - return (SET_ERROR(ENOMEM)); - - int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); - if (error != 0) - return (error); - - void *buf = kmem_alloc(bufsize, KM_SLEEP); - - error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, - buf, &zc->zc_nvlist_dst_size); - - if (error == 0) { - error = ddi_copyout(buf, - (void *)(uintptr_t)zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags); - } - kmem_free(buf, bufsize); - 
zfsvfs_rele(zfsvfs, FTAG); - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * - * outputs: - * none - */ -static int -zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) -{ - objset_t *os; - int error = 0; - zfsvfs_t *zfsvfs; - - if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { - if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { - /* - * If userused is not enabled, it may be because the - * objset needs to be closed & reopened (to grow the - * objset_phys_t). Suspend/resume the fs will do that. - */ - dsl_dataset_t *ds, *newds; - - ds = dmu_objset_ds(zfsvfs->z_os); - error = zfs_suspend_fs(zfsvfs); - if (error == 0) { - dmu_objset_refresh_ownership(ds, &newds, - zfsvfs); - error = zfs_resume_fs(zfsvfs, newds); - } - } - if (error == 0) - error = dmu_objset_userspace_upgrade(zfsvfs->z_os); -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - /* XXX kind of reading contents without owning */ - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error != 0) - return (error); - - error = dmu_objset_userspace_upgrade(os); - dmu_objset_rele(os, FTAG); - } - - return (error); -} - -#ifdef illumos -/* - * We don't want to have a hard dependency - * against some special symbols in sharefs - * nfs, and smbsrv. Determine them if needed when - * the first file system is shared. - * Neither sharefs, nfs or smbsrv are unloadable modules. - */ -int (*znfsexport_fs)(void *arg); -int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); -int (*zsmbexport_fs)(void *arg, boolean_t add_share); - -int zfs_nfsshare_inited; -int zfs_smbshare_inited; - -ddi_modhandle_t nfs_mod; -ddi_modhandle_t sharefs_mod; -ddi_modhandle_t smbsrv_mod; -#endif /* illumos */ -kmutex_t zfs_share_lock; - -#ifdef illumos -static int -zfs_init_sharefs() -{ - int error; - - ASSERT(MUTEX_HELD(&zfs_share_lock)); - /* Both NFS and SMB shares also require sharetab support. 
*/ - if (sharefs_mod == NULL && ((sharefs_mod = - ddi_modopen("fs/sharefs", - KRTLD_MODE_FIRST, &error)) == NULL)) { - return (SET_ERROR(ENOSYS)); - } - if (zshare_fs == NULL && ((zshare_fs = - (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) - ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { - return (SET_ERROR(ENOSYS)); - } - return (0); -} -#endif /* illumos */ - -static int -zfs_ioc_share(zfs_cmd_t *zc) -{ -#ifdef illumos - int error; - int opcode; - - switch (zc->zc_share.z_sharetype) { - case ZFS_SHARE_NFS: - case ZFS_UNSHARE_NFS: - if (zfs_nfsshare_inited == 0) { - mutex_enter(&zfs_share_lock); - if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", - KRTLD_MODE_FIRST, &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - if (znfsexport_fs == NULL && - ((znfsexport_fs = (int (*)(void *)) - ddi_modsym(nfs_mod, - "nfs_export", &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - error = zfs_init_sharefs(); - if (error != 0) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - zfs_nfsshare_inited = 1; - mutex_exit(&zfs_share_lock); - } - break; - case ZFS_SHARE_SMB: - case ZFS_UNSHARE_SMB: - if (zfs_smbshare_inited == 0) { - mutex_enter(&zfs_share_lock); - if (smbsrv_mod == NULL && ((smbsrv_mod = - ddi_modopen("drv/smbsrv", - KRTLD_MODE_FIRST, &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - if (zsmbexport_fs == NULL && ((zsmbexport_fs = - (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, - "smb_server_share", &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - error = zfs_init_sharefs(); - if (error != 0) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - zfs_smbshare_inited = 1; - mutex_exit(&zfs_share_lock); - } - break; - default: - return (SET_ERROR(EINVAL)); - } - - switch (zc->zc_share.z_sharetype) { - case ZFS_SHARE_NFS: - case ZFS_UNSHARE_NFS: - if (error = - 
znfsexport_fs((void *) - (uintptr_t)zc->zc_share.z_exportdata)) - return (error); - break; - case ZFS_SHARE_SMB: - case ZFS_UNSHARE_SMB: - if (error = zsmbexport_fs((void *) - (uintptr_t)zc->zc_share.z_exportdata, - zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? - B_TRUE: B_FALSE)) { - return (error); - } - break; - } - - opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || - zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? - SHAREFS_ADD : SHAREFS_REMOVE; - - /* - * Add or remove share from sharetab - */ - error = zshare_fs(opcode, - (void *)(uintptr_t)zc->zc_share.z_sharedata, - zc->zc_share.z_sharemax); - - return (error); - -#else /* !illumos */ - return (ENOSYS); -#endif /* illumos */ -} - -ace_t full_access[] = { - {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} -}; - -/* - * inputs: - * zc_name name of containing filesystem - * zc_obj object # beyond which we want next in-use object # - * - * outputs: - * zc_obj next in-use object # - */ -static int -zfs_ioc_next_obj(zfs_cmd_t *zc) -{ - objset_t *os = NULL; - int error; - - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error != 0) - return (error); - - error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); - - dmu_objset_rele(os, FTAG); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value prefix name for snapshot - * zc_cleanup_fd cleanup-on-exit file descriptor for calling process - * - * outputs: - * zc_value short name of new snapshot - */ -static int -zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) -{ - char *snap_name; - char *hold_name; - int error; - minor_t minor; - - error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); - if (error != 0) - return (error); - - snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, - (u_longlong_t)ddi_get_lbolt64()); - hold_name = kmem_asprintf("%%%s", zc->zc_value); - - error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, - hold_name); - if (error == 0) - (void) strcpy(zc->zc_value, snap_name); - strfree(snap_name); - strfree(hold_name); - 
zfs_onexit_fd_rele(zc->zc_cleanup_fd); - return (error); -} - -/* - * inputs: - * zc_name name of "to" snapshot - * zc_value name of "from" snapshot - * zc_cookie file descriptor to write diff data on - * - * outputs: - * dmu_diff_record_t's to the file descriptor - */ -static int -zfs_ioc_diff(zfs_cmd_t *zc) -{ - file_t *fp; - offset_t off; - int error; - -#ifdef illumos - fp = getf(zc->zc_cookie); -#else - fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp); -#endif - if (fp == NULL) - return (SET_ERROR(EBADF)); - - off = fp->f_offset; - -#ifdef illumos - error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); -#else - error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); -#endif - - if (off >= 0 && off <= MAXOFFSET_T) - fp->f_offset = off; - releasef(zc->zc_cookie); - - return (error); -} - -#ifdef illumos -/* - * Remove all ACL files in shares dir - */ -static int -zfs_smb_acl_purge(znode_t *dzp) -{ - zap_cursor_t zc; - zap_attribute_t zap; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - int error; - - for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); - (error = zap_cursor_retrieve(&zc, &zap)) == 0; - zap_cursor_advance(&zc)) { - if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, - NULL, 0)) != 0) - break; - } - zap_cursor_fini(&zc); - return (error); -} -#endif /* illumos */ - -static int -zfs_ioc_smb_acl(zfs_cmd_t *zc) -{ -#ifdef illumos - vnode_t *vp; - znode_t *dzp; - vnode_t *resourcevp = NULL; - znode_t *sharedir; - zfsvfs_t *zfsvfs; - nvlist_t *nvlist; - char *src, *target; - vattr_t vattr; - vsecattr_t vsec; - int error = 0; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ - - if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (SET_ERROR(EINVAL)); - } - - dzp = VTOZ(vp); - zfsvfs = dzp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - - 
/* - * Create share dir if its missing. - */ - mutex_enter(&zfsvfs->z_lock); - if (zfsvfs->z_shares_dir == 0) { - dmu_tx_t *tx; - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, - ZFS_SHARES_DIR); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - } else { - error = zfs_create_share_dir(zfsvfs, tx); - dmu_tx_commit(tx); - } - if (error != 0) { - mutex_exit(&zfsvfs->z_lock); - VN_RELE(vp); - ZFS_EXIT(zfsvfs); - return (error); - } - } - mutex_exit(&zfsvfs->z_lock); - - ASSERT(zfsvfs->z_shares_dir); - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { - VN_RELE(vp); - ZFS_EXIT(zfsvfs); - return (error); - } - - switch (zc->zc_cookie) { - case ZFS_SMB_ACL_ADD: - vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; - vattr.va_type = VREG; - vattr.va_mode = S_IFREG|0777; - vattr.va_uid = 0; - vattr.va_gid = 0; - - vsec.vsa_mask = VSA_ACE; - vsec.vsa_aclentp = &full_access; - vsec.vsa_aclentsz = sizeof (full_access); - vsec.vsa_aclcnt = 1; - - error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, - &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); - if (resourcevp) - VN_RELE(resourcevp); - break; - - case ZFS_SMB_ACL_REMOVE: - error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, - NULL, 0); - break; - - case ZFS_SMB_ACL_RENAME: - if ((error = get_nvlist(zc->zc_nvlist_src, - zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { - VN_RELE(vp); - VN_RELE(ZTOV(sharedir)); - ZFS_EXIT(zfsvfs); - return (error); - } - if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || - nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, - &target)) { - VN_RELE(vp); - VN_RELE(ZTOV(sharedir)); - ZFS_EXIT(zfsvfs); - nvlist_free(nvlist); - return (error); - } - error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, - kcred, NULL, 0); - nvlist_free(nvlist); - break; - - case ZFS_SMB_ACL_PURGE: - error = zfs_smb_acl_purge(sharedir); - break; - - 
default: - error = SET_ERROR(EINVAL); - break; - } - - VN_RELE(vp); - VN_RELE(ZTOV(sharedir)); - - ZFS_EXIT(zfsvfs); - - return (error); -#else /* !illumos */ - return (EOPNOTSUPP); -#endif /* illumos */ -} - -/* - * innvl: { - * "holds" -> { snapname -> holdname (string), ... } - * (optional) "cleanup_fd" -> fd (int32) - * } - * - * outnvl: { - * snapname -> error value (int32) - * ... - * } - */ -static const zfs_ioc_key_t zfs_keys_hold[] = { - {"holds", DATA_TYPE_NVLIST, 0}, - {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) -{ - nvpair_t *pair; - nvlist_t *holds; - int cleanup_fd = -1; - int error; - minor_t minor = 0; - - holds = fnvlist_lookup_nvlist(args, "holds"); - - /* make sure the user didn't pass us any invalid (empty) tags */ - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - char *htag; - - error = nvpair_value_string(pair, &htag); - if (error != 0) - return (SET_ERROR(error)); - - if (strlen(htag) == 0) - return (SET_ERROR(EINVAL)); - } - - if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { - error = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (error != 0) - return (error); - } - - error = dsl_dataset_user_hold(holds, minor, errlist); - if (minor != 0) - zfs_onexit_fd_rele(cleanup_fd); - return (error); -} - -/* - * innvl is not used. - * - * outnvl: { - * holdname -> time added (uint64 seconds since epoch) - * ... - * } - */ -static const zfs_ioc_key_t zfs_keys_get_holds[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) -{ - return (dsl_dataset_get_holds(snapname, outnvl)); -} - -/* - * innvl: { - * snapname -> { holdname, ... } - * ... - * } - * - * outnvl: { - * snapname -> error value (int32) - * ... 
- * } - */ -static const zfs_ioc_key_t zfs_keys_release[] = { - {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) -{ - return (dsl_dataset_user_release(holds, errlist)); -} - -/* - * inputs: - * zc_name name of new filesystem or snapshot - * zc_value full name of old snapshot - * - * outputs: - * zc_cookie space in bytes - * zc_objset_type compressed space in bytes - * zc_perm_action uncompressed space in bytes - */ -static int -zfs_ioc_space_written(zfs_cmd_t *zc) -{ - int error; - dsl_pool_t *dp; - dsl_dataset_t *new, *old; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = dsl_dataset_space_written(old, new, &zc->zc_cookie, - &zc->zc_objset_type, &zc->zc_perm_action); - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -/* - * innvl: { - * "firstsnap" -> snapshot name - * } - * - * outnvl: { - * "used" -> space in bytes - * "compressed" -> compressed space in bytes - * "uncompressed" -> uncompressed space in bytes - * } - */ -static const zfs_ioc_key_t zfs_keys_space_snaps[] = { - {"firstsnap", DATA_TYPE_STRING, 0}, -}; - -static int -zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error; - dsl_pool_t *dp; - dsl_dataset_t *new, *old; - char *firstsnap; - uint64_t used, comp, uncomp; - - firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); - - error = dsl_pool_hold(lastsnap, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); - if (error == 0 && !new->ds_is_snapshot) { - 
dsl_dataset_rele(new, FTAG); - error = SET_ERROR(EINVAL); - } - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); - if (error == 0 && !old->ds_is_snapshot) { - dsl_dataset_rele(old, FTAG); - error = SET_ERROR(EINVAL); - } - if (error != 0) { - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - fnvlist_add_uint64(outnvl, "used", used); - fnvlist_add_uint64(outnvl, "compressed", comp); - fnvlist_add_uint64(outnvl, "uncompressed", uncomp); - return (error); -} - -static int -zfs_ioc_jail(zfs_cmd_t *zc) -{ - - return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, - (int)zc->zc_jailid)); -} - -static int -zfs_ioc_unjail(zfs_cmd_t *zc) -{ - - return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, - (int)zc->zc_jailid)); -} - -/* - * innvl: { - * "fd" -> file descriptor to write stream to (int32) - * (optional) "fromsnap" -> full snap name to send an incremental from - * (optional) "largeblockok" -> (value ignored) - * indicates that blocks > 128KB are permitted - * (optional) "embedok" -> (value ignored) - * presence indicates DRR_WRITE_EMBEDDED records are permitted - * (optional) "compressok" -> (value ignored) - * presence indicates compressed DRR_WRITE records are permitted - * (optional) "resume_object" and "resume_offset" -> (uint64) - * if present, resume send stream from specified object and offset. 
- * } - * - * outnvl is unused - */ -static const zfs_ioc_key_t zfs_keys_send_new[] = { - {"fd", DATA_TYPE_INT32, 0}, - {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, - {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, - {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -{ - file_t *fp; - int error; - offset_t off; - char *fromname = NULL; - int fd; - boolean_t largeblockok; - boolean_t embedok; - boolean_t compressok; - uint64_t resumeobj = 0; - uint64_t resumeoff = 0; - - fd = fnvlist_lookup_int32(innvl, "fd"); - - (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); - - largeblockok = nvlist_exists(innvl, "largeblockok"); - embedok = nvlist_exists(innvl, "embedok"); - compressok = nvlist_exists(innvl, "compressok"); - - (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); - (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); - -#ifdef illumos - file_t *fp = getf(fd); -#else - fget_write(curthread, fd, &cap_write_rights, &fp); -#endif - if (fp == NULL) - return (SET_ERROR(EBADF)); - - off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, -#ifdef illumos - fd, resumeobj, resumeoff, fp->f_vnode, &off); -#else - fd, resumeobj, resumeoff, fp, &off); -#endif - -#ifdef illumos - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; -#else - fp->f_offset = off; -#endif - - releasef(fd); - return (error); -} - -/* - * Determine approximately how large a zfs send stream will be -- the number - * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). 
- * - * innvl: { - * (optional) "from" -> full snap or bookmark name to send an incremental - * from - * (optional) "largeblockok" -> (value ignored) - * indicates that blocks > 128KB are permitted - * (optional) "embedok" -> (value ignored) - * presence indicates DRR_WRITE_EMBEDDED records are permitted - * (optional) "compressok" -> (value ignored) - * presence indicates compressed DRR_WRITE records are permitted - * } - * - * outnvl: { - * "space" -> bytes of space (uint64) - * } - */ -static const zfs_ioc_key_t zfs_keys_send_space[] = { - {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, - {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, - {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -{ - dsl_pool_t *dp; - dsl_dataset_t *tosnap; - int error; - char *fromname; - boolean_t compressok; - uint64_t space; - - error = dsl_pool_hold(snapname, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - compressok = nvlist_exists(innvl, "compressok"); - - error = nvlist_lookup_string(innvl, "from", &fromname); - if (error == 0) { - if (strchr(fromname, '@') != NULL) { - /* - * If from is a snapshot, hold it and use the more - * efficient dmu_send_estimate to estimate send space - * size using deadlists. - */ - dsl_dataset_t *fromsnap; - error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); - if (error != 0) - goto out; - error = dmu_send_estimate(tosnap, fromsnap, compressok, - &space); - dsl_dataset_rele(fromsnap, FTAG); - } else if (strchr(fromname, '#') != NULL) { - /* - * If from is a bookmark, fetch the creation TXG of the - * snapshot it was created from and use that to find - * blocks that were born after it. 
- */ - zfs_bookmark_phys_t frombm; - - error = dsl_bookmark_lookup(dp, fromname, tosnap, - &frombm); - if (error != 0) - goto out; - error = dmu_send_estimate_from_txg(tosnap, - frombm.zbm_creation_txg, compressok, &space); - } else { - /* - * from is not properly formatted as a snapshot or - * bookmark - */ - error = SET_ERROR(EINVAL); - goto out; - } - } else { - /* - * If estimating the size of a full send, use dmu_send_estimate. - */ - error = dmu_send_estimate(tosnap, NULL, compressok, &space); - } - - fnvlist_add_uint64(outnvl, "space", space); - -out: - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -/* - * Sync the currently open TXG to disk for the specified pool. - * This is somewhat similar to 'zfs_sync()'. - * For cases that do not result in error this ioctl will wait for - * the currently open TXG to commit before returning back to the caller. - * - * innvl: { - * "force" -> when true, force uberblock update even if there is no dirty data. - * In addition this will cause the vdev configuration to be written - * out including updating the zpool cache file. 
(boolean_t) - * } - * - * onvl is unused - */ -static const zfs_ioc_key_t zfs_keys_pool_sync[] = { - {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) -{ - int err; - boolean_t force; - spa_t *spa; - - if ((err = spa_open(pool, &spa, FTAG)) != 0) - return (err); - - force = fnvlist_lookup_boolean_value(innvl, "force"); - if (force) { - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); - vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - txg_wait_synced(spa_get_dsl(spa), 0); - - spa_close(spa, FTAG); - - return (err); -} - -static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; - -static void -zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, - boolean_t log_history, zfs_ioc_poolcheck_t pool_check) -{ - zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; - - ASSERT3U(ioc, >=, ZFS_IOC_FIRST); - ASSERT3U(ioc, <, ZFS_IOC_LAST); - ASSERT3P(vec->zvec_legacy_func, ==, NULL); - ASSERT3P(vec->zvec_func, ==, NULL); - - vec->zvec_legacy_func = func; - vec->zvec_secpolicy = secpolicy; - vec->zvec_namecheck = namecheck; - vec->zvec_allow_log = log_history; - vec->zvec_pool_check = pool_check; -} - -/* - * See the block comment at the beginning of this file for details on - * each argument to this function. 
- */ -static void -zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, - zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, - zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, - boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) -{ - zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; - - ASSERT3U(ioc, >=, ZFS_IOC_FIRST); - ASSERT3U(ioc, <, ZFS_IOC_LAST); - ASSERT3P(vec->zvec_legacy_func, ==, NULL); - ASSERT3P(vec->zvec_func, ==, NULL); - - /* if we are logging, the name must be valid */ - ASSERT(!allow_log || namecheck != NO_NAME); - - vec->zvec_name = name; - vec->zvec_func = func; - vec->zvec_secpolicy = secpolicy; - vec->zvec_namecheck = namecheck; - vec->zvec_pool_check = pool_check; - vec->zvec_smush_outnvlist = smush_outnvlist; - vec->zvec_allow_log = allow_log; - vec->zvec_nvl_keys = nvl_keys; - vec->zvec_nvl_key_count = num_keys; -} - -static void -zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy, boolean_t log_history, - zfs_ioc_poolcheck_t pool_check) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - POOL_NAME, log_history, pool_check); -} - -static void -zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - DATASET_NAME, B_FALSE, pool_check); -} - -static void -zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -{ - zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, - POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -} - -static void -zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - NO_NAME, B_FALSE, POOL_CHECK_NONE); -} - -static void -zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, - zfs_ioc_legacy_func_t *func, 
zfs_secpolicy_func_t *secpolicy) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); -} - -static void -zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -{ - zfs_ioctl_register_dataset_read_secpolicy(ioc, func, - zfs_secpolicy_read); -} - -static void -zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -} - -static void -zfs_ioctl_init(void) -{ - zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, - zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); - - zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, - zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, - zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); - - zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, - zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); - - zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, - zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); - - zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, - zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); - - zfs_ioctl_register("create", ZFS_IOC_CREATE, - zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); - - zfs_ioctl_register("clone", ZFS_IOC_CLONE, - zfs_ioc_clone, 
zfs_secpolicy_create_clone, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); - - zfs_ioctl_register("remap", ZFS_IOC_REMAP, - zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, - zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); - - zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, - zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); - - zfs_ioctl_register("hold", ZFS_IOC_HOLD, - zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); - zfs_ioctl_register("release", ZFS_IOC_RELEASE, - zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); - - zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, - zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); - - zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, - zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, - zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); - - zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, - zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); - - zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, - zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); - - zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, - 
zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, - POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_destroy_bookmarks, - ARRAY_SIZE(zfs_keys_destroy_bookmarks)); - - zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, - zfs_ioc_channel_program, zfs_secpolicy_config, - POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, - B_TRUE, zfs_keys_channel_program, - ARRAY_SIZE(zfs_keys_channel_program)); - - zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, - zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); - - zfs_ioctl_register("zpool_discard_checkpoint", - ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, - zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_pool_discard_checkpoint, - ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); - - zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, - zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); - - zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, - zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, - zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); - zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, - zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, - B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); - - zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, - zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, - zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); - - zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, - 
zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, - zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); - - /* IOCTLS that use the legacy function signature */ - - zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, - zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); - - zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, - zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, - zfs_ioc_pool_scan); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, - zfs_ioc_pool_upgrade); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, - zfs_ioc_vdev_add); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, - zfs_ioc_vdev_remove); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, - zfs_ioc_vdev_set_state); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, - zfs_ioc_vdev_attach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, - zfs_ioc_vdev_detach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, - zfs_ioc_vdev_setpath); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, - zfs_ioc_vdev_setfru); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, - zfs_ioc_pool_set_props); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, - zfs_ioc_vdev_split); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, - zfs_ioc_pool_reguid); - - zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, - zfs_ioc_pool_configs, zfs_secpolicy_none); - zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, - zfs_ioc_pool_tryimport, zfs_secpolicy_config); - zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, - zfs_ioc_inject_fault, zfs_secpolicy_inject); - zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, - zfs_ioc_clear_fault, zfs_secpolicy_inject); - zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, - zfs_ioc_inject_list_next, zfs_secpolicy_inject); - - /* - * pool destroy, and export don't log the history as part of - * 
zfsdev_ioctl, but rather zfs_ioc_pool_export - * does the logging of those commands. - */ - zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); - - zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, - zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, - zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); - - zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, - zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, - zfs_ioc_dsobj_to_dsname, - zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, - zfs_ioc_pool_get_history, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); - - zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, - zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); - - zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, - zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); - - zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, - zfs_ioc_space_written); - zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, - zfs_ioc_objset_recvd_props); - zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, - zfs_ioc_next_obj); - zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, - zfs_ioc_get_fsacl); - zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, - zfs_ioc_objset_stats); - zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, - zfs_ioc_objset_zplprops); - zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, - zfs_ioc_dataset_list_next); - zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, - zfs_ioc_snapshot_list_next); - zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, - zfs_ioc_send_progress); - - 
zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, - zfs_ioc_diff, zfs_secpolicy_diff); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, - zfs_ioc_obj_to_stats, zfs_secpolicy_diff); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, - zfs_ioc_obj_to_path, zfs_secpolicy_diff); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, - zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, - zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, - zfs_ioc_send, zfs_secpolicy_send); - - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, - zfs_secpolicy_none); - zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, - zfs_secpolicy_destroy); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, - zfs_secpolicy_recv); - zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, - zfs_secpolicy_promote); - zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, - zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, - zfs_secpolicy_set_fsacl); - - /* - * Not using zfs_ioctl_register_dataset_modify as DATASET_NAME check - * won't allow a bookmark name. 
- */ - zfs_ioctl_register_legacy(ZFS_IOC_RENAME, zfs_ioc_rename, - zfs_secpolicy_rename, ENTITY_NAME, B_TRUE, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); - - zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, - zfs_secpolicy_share, POOL_CHECK_NONE); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, - zfs_secpolicy_smb_acl, POOL_CHECK_NONE); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, - zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, - zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); - -#ifdef __FreeBSD__ - zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, - zfs_secpolicy_config, POOL_CHECK_NONE); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, - zfs_secpolicy_config, POOL_CHECK_NONE); - zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, - zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, - POOL_CHECK_NONE, B_FALSE, B_FALSE, - zfs_keys_nextboot, ARRAY_SIZE(zfs_keys_nextboot)); -#endif -} - -/* - * Verify that for non-legacy ioctls the input nvlist - * pairs match against the expected input. 
- * - * Possible errors are: - * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered - * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing - * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair - */ -static int -zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) -{ - const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; - boolean_t required_keys_found = B_FALSE; - - /* - * examine each input pair - */ - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); - data_type_t type = nvpair_type(pair); - boolean_t identified = B_FALSE; - - /* - * check pair against the documented names and type - */ - for (int k = 0; k < vec->zvec_nvl_key_count; k++) { - /* if not a wild card name, check for an exact match */ - if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && - strcmp(nvl_keys[k].zkey_name, name) != 0) - continue; - - identified = B_TRUE; - - if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && - nvl_keys[k].zkey_type != type) { - return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); - } - - if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) - continue; - - required_keys_found = B_TRUE; - break; - } - - /* allow an 'optional' key, everything else is invalid */ - if (!identified && - (strcmp(name, "optional") != 0 || - type != DATA_TYPE_NVLIST)) { - return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); - } - } - - /* verify that all required keys were found */ - for (int k = 0; k < vec->zvec_nvl_key_count; k++) { - if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) - continue; - - if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { - /* at least one non-optionial key is expected here */ - if (!required_keys_found) - return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); - continue; - } - - if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) - return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); - } - - return (0); -} - -int -pool_status_check(const char *name, zfs_ioc_namecheck_t type, - zfs_ioc_poolcheck_t check) 
-{ - spa_t *spa; - int error; - - ASSERT(type == POOL_NAME || type == DATASET_NAME || - type == ENTITY_NAME); - - if (check & POOL_CHECK_NONE) - return (0); - - error = spa_open(name, &spa, FTAG); - if (error == 0) { - if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) - error = SET_ERROR(EAGAIN); - else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) - error = SET_ERROR(EROFS); - spa_close(spa, FTAG); - } - return (error); -} - -/* - * Find a free minor number. - */ -minor_t -zfsdev_minor_alloc(void) -{ - static minor_t last_minor; - minor_t m; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - for (m = last_minor + 1; m != last_minor; m++) { - if (m > ZFSDEV_MAX_MINOR) - m = 1; - if (ddi_get_soft_state(zfsdev_state, m) == NULL) { - last_minor = m; - return (m); - } - } - - return (0); -} - -static int -zfs_ctldev_init(struct cdev *devp) -{ - minor_t minor; - zfs_soft_state_t *zs; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - minor = zfsdev_minor_alloc(); - if (minor == 0) - return (SET_ERROR(ENXIO)); - - if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) - return (SET_ERROR(EAGAIN)); - - devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); - - zs = ddi_get_soft_state(zfsdev_state, minor); - zs->zss_type = ZSST_CTLDEV; - zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); - - return (0); -} - -static void -zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - zfs_onexit_destroy(zo); - ddi_soft_state_free(zfsdev_state, minor); -} - -void * -zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) -{ - zfs_soft_state_t *zp; - - zp = ddi_get_soft_state(zfsdev_state, minor); - if (zp == NULL || zp->zss_type != which) - return (NULL); - - return (zp->zss_data); -} - -static int -zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) -{ - int error = 0; - -#ifdef illumos - if (getminor(*devp) != 0) - return (zvol_open(devp, flag, otyp, cr)); -#endif - - /* This 
is the control device. Allocate a new minor if requested. */ - if (flag & FEXCL) { - mutex_enter(&spa_namespace_lock); - error = zfs_ctldev_init(devp); - mutex_exit(&spa_namespace_lock); - } - - return (error); -} - -static void -zfsdev_close(void *data) -{ - zfs_onexit_t *zo; - minor_t minor = (minor_t)(uintptr_t)data; - - if (minor == 0) - return; - - mutex_enter(&spa_namespace_lock); - zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); - if (zo == NULL) { - mutex_exit(&spa_namespace_lock); - return; - } - zfs_ctldev_destroy(zo, minor); - mutex_exit(&spa_namespace_lock); -} - -static int -zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, - struct thread *td) -{ - zfs_cmd_t *zc; - uint_t vecnum; - int error, rc, len; -#ifdef illumos - minor_t minor = getminor(dev); -#else - zfs_iocparm_t *zc_iocparm; - int cflag, cmd, oldvecnum; - boolean_t newioc, compat; - void *compat_zc = NULL; - cred_t *cr = td->td_ucred; -#endif - const zfs_ioc_vec_t *vec; - char *saved_poolname = NULL; - nvlist_t *innvl = NULL; - - cflag = ZFS_CMD_COMPAT_NONE; - compat = B_FALSE; - newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */ - - len = IOCPARM_LEN(zcmd); - vecnum = cmd = zcmd & 0xff; - - /* - * Check if we are talking to supported older binaries - * and translate zfs_cmd if necessary - */ - if (len != sizeof(zfs_iocparm_t)) { - newioc = B_FALSE; - compat = B_TRUE; - - vecnum = cmd; - - switch (len) { - case sizeof(zfs_cmd_zcmd_t): - cflag = ZFS_CMD_COMPAT_LZC; - break; - case sizeof(zfs_cmd_deadman_t): - cflag = ZFS_CMD_COMPAT_DEADMAN; - break; - case sizeof(zfs_cmd_v28_t): - cflag = ZFS_CMD_COMPAT_V28; - break; - case sizeof(zfs_cmd_v15_t): - if (cmd >= sizeof(zfs_ioctl_v15_to_v28) / - sizeof(zfs_ioctl_v15_to_v28[0])) - return (EINVAL); - - cflag = ZFS_CMD_COMPAT_V15; - vecnum = zfs_ioctl_v15_to_v28[cmd]; - - /* - * Return without further handling - * if the command is blacklisted. 
- */ - if (vecnum == ZFS_IOC_COMPAT_PASS) - return (0); - else if (vecnum == ZFS_IOC_COMPAT_FAIL) - return (ENOTSUP); - break; - default: - return (EINVAL); - } - } - -#ifdef illumos - vecnum = cmd - ZFS_IOC_FIRST; - ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); -#endif - - if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); - vec = &zfs_ioc_vec[vecnum]; - - zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); - -#ifdef illumos - error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); - if (error != 0) { - error = SET_ERROR(EFAULT); - goto out; - } -#else /* !illumos */ - bzero(zc, sizeof(zfs_cmd_t)); - - if (newioc) { - zc_iocparm = (void *)arg; - - switch (zc_iocparm->zfs_ioctl_version) { - case ZFS_IOCVER_CURRENT: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) { - error = SET_ERROR(EINVAL); - goto out; - } - break; - case ZFS_IOCVER_INLANES: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_INLANES; - break; - case ZFS_IOCVER_RESUME: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_RESUME; - break; - case ZFS_IOCVER_EDBP: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_EDBP; - break; - case ZFS_IOCVER_ZCMD: - if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || - zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_ZCMD; - break; - default: - error = SET_ERROR(EINVAL); - goto out; - /* NOTREACHED */ - } - - if (compat) { - ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); - compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); - bzero(compat_zc, sizeof(zfs_cmd_t)); - - error = ddi_copyin((void 
*)(uintptr_t)zc_iocparm->zfs_cmd, - compat_zc, zc_iocparm->zfs_cmd_size, flag); - if (error != 0) { - error = SET_ERROR(EFAULT); - goto out; - } - } else { - error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, - zc, zc_iocparm->zfs_cmd_size, flag); - if (error != 0) { - error = SET_ERROR(EFAULT); - goto out; - } - } - } - - if (compat) { - if (newioc) { - ASSERT(compat_zc != NULL); - zfs_cmd_compat_get(zc, compat_zc, cflag); - } else { - ASSERT(compat_zc == NULL); - zfs_cmd_compat_get(zc, arg, cflag); - } - oldvecnum = vecnum; - error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); - if (error != 0) - goto out; - if (oldvecnum != vecnum) - vec = &zfs_ioc_vec[vecnum]; - } -#endif /* !illumos */ - - zc->zc_iflags = flag & FKIOCTL; - if (zc->zc_nvlist_src_size != 0) { - error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &innvl); - if (error != 0) - goto out; - } - - /* rewrite innvl for backwards compatibility */ - if (compat) - innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); - - /* - * Ensure that all pool/dataset names are valid before we pass down to - * the lower layers. - */ - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - switch (vec->zvec_namecheck) { - case POOL_NAME: - if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) - error = SET_ERROR(EINVAL); - else - error = pool_status_check(zc->zc_name, - vec->zvec_namecheck, vec->zvec_pool_check); - break; - - case DATASET_NAME: - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) - error = SET_ERROR(EINVAL); - else - error = pool_status_check(zc->zc_name, - vec->zvec_namecheck, vec->zvec_pool_check); - break; - - case ENTITY_NAME: - if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { - error = SET_ERROR(EINVAL); - } else { - error = pool_status_check(zc->zc_name, - vec->zvec_namecheck, vec->zvec_pool_check); - } - break; - - case NO_NAME: - break; - } - - /* - * Ensure that all input pairs are valid before we pass them down - * to the lower layers. 
- * - * The vectored functions can use fnvlist_lookup_{type} for any - * required pairs since zfs_check_input_nvpairs() confirmed that - * they exist and are of the correct type. - */ - if (error == 0 && vec->zvec_func != NULL) { - error = zfs_check_input_nvpairs(innvl, vec); - if (error != 0) - goto out; - } - - if (error == 0) - error = vec->zvec_secpolicy(zc, innvl, cr); - - if (error != 0) - goto out; - - /* legacy ioctls can modify zc_name */ - len = strcspn(zc->zc_name, "/@#") + 1; - saved_poolname = kmem_alloc(len, KM_SLEEP); - (void) strlcpy(saved_poolname, zc->zc_name, len); - - if (vec->zvec_func != NULL) { - nvlist_t *outnvl; - int puterror = 0; - spa_t *spa; - nvlist_t *lognv = NULL; - - ASSERT(vec->zvec_legacy_func == NULL); - - /* - * Add the innvl to the lognv before calling the func, - * in case the func changes the innvl. - */ - if (vec->zvec_allow_log) { - lognv = fnvlist_alloc(); - fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, - vec->zvec_name); - if (!nvlist_empty(innvl)) { - fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, - innvl); - } - } - - outnvl = fnvlist_alloc(); - error = vec->zvec_func(zc->zc_name, innvl, outnvl); - - /* - * Some commands can partially execute, modify state, and still - * return an error. In these cases, attempt to record what - * was modified. 
- */ - if ((error == 0 || - (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && - vec->zvec_allow_log && - spa_open(zc->zc_name, &spa, FTAG) == 0) { - if (!nvlist_empty(outnvl)) { - fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, - outnvl); - } - if (error != 0) { - fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, - error); - } - (void) spa_history_log_nvl(spa, lognv); - spa_close(spa, FTAG); - } - fnvlist_free(lognv); - - /* rewrite outnvl for backwards compatibility */ - if (compat) - outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum, - cflag); - - if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { - int smusherror = 0; - if (vec->zvec_smush_outnvlist) { - smusherror = nvlist_smush(outnvl, - zc->zc_nvlist_dst_size); - } - if (smusherror == 0) - puterror = put_nvlist(zc, outnvl); - } - - if (puterror != 0) - error = puterror; - - nvlist_free(outnvl); - } else { - error = vec->zvec_legacy_func(zc); - } - -out: - nvlist_free(innvl); - -#if defined(__FreeBSD__) && defined(_KERNEL) - /* - * Wait for ZVOL changes to get applied. - * NB: taskqueue_drain_all() does less than taskq_wait(), - * but enough for what we want. - * And there is no equivalent illumos API. 
- */ - if (error == 0) { - spa_t *spa; - - if (spa_open(saved_poolname, &spa, FTAG) == 0) { - taskqueue_drain_all( - spa->spa_zvol_taskq->tq_queue); - spa_close(spa, FTAG); - } - } -#endif - -#ifdef illumos - rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); - if (error == 0 && rc != 0) - error = SET_ERROR(EFAULT); -#else - if (compat) { - zfs_ioctl_compat_post(zc, cmd, cflag); - if (newioc) { - ASSERT(compat_zc != NULL); - ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); - - zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag); - rc = ddi_copyout(compat_zc, - (void *)(uintptr_t)zc_iocparm->zfs_cmd, - zc_iocparm->zfs_cmd_size, flag); - if (error == 0 && rc != 0) - error = SET_ERROR(EFAULT); - kmem_free(compat_zc, sizeof (zfs_cmd_t)); - } else { - zfs_cmd_compat_put(zc, arg, vecnum, cflag); - } - } else { - ASSERT(newioc); - - rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, - sizeof (zfs_cmd_t), flag); - if (error == 0 && rc != 0) - error = SET_ERROR(EFAULT); - } -#endif - if (error == 0 && vec->zvec_allow_log) { - char *s = tsd_get(zfs_allow_log_key); - if (s != NULL) - strfree(s); - (void) tsd_set(zfs_allow_log_key, saved_poolname); - } else { - if (saved_poolname != NULL) - strfree(saved_poolname); - } - - kmem_free(zc, sizeof (zfs_cmd_t)); - return (error); -} - -#ifdef illumos -static int -zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) - return (DDI_FAILURE); - - if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, - DDI_PSEUDO, 0) == DDI_FAILURE) - return (DDI_FAILURE); - - zfs_dip = dip; - - ddi_report_dev(dip); - - return (DDI_SUCCESS); -} - -static int -zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - if (spa_busy() || zfs_busy() || zvol_busy()) - return (DDI_FAILURE); - - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - zfs_dip = NULL; - - ddi_prop_remove_all(dip); - ddi_remove_minor_node(dip, NULL); - - return (DDI_SUCCESS); -} - -/*ARGSUSED*/ -static int -zfs_info(dev_info_t *dip, 
ddi_info_cmd_t infocmd, void *arg, void **result) -{ - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = zfs_dip; - return (DDI_SUCCESS); - - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)0; - return (DDI_SUCCESS); - } - - return (DDI_FAILURE); -} -#endif /* illumos */ - -/* - * OK, so this is a little weird. - * - * /dev/zfs is the control node, i.e. minor 0. - * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. - * - * /dev/zfs has basically nothing to do except serve up ioctls, - * so most of the standard driver entry points are in zvol.c. - */ -#ifdef illumos -static struct cb_ops zfs_cb_ops = { - zfsdev_open, /* open */ - zfsdev_close, /* close */ - zvol_strategy, /* strategy */ - nodev, /* print */ - zvol_dump, /* dump */ - zvol_read, /* read */ - zvol_write, /* write */ - zfsdev_ioctl, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* prop_op */ - NULL, /* streamtab */ - D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ - CB_REV, /* version */ - nodev, /* async read */ - nodev, /* async write */ -}; - -static struct dev_ops zfs_dev_ops = { - DEVO_REV, /* version */ - 0, /* refcnt */ - zfs_info, /* info */ - nulldev, /* identify */ - nulldev, /* probe */ - zfs_attach, /* attach */ - zfs_detach, /* detach */ - nodev, /* reset */ - &zfs_cb_ops, /* driver operations */ - NULL, /* no bus operations */ - NULL, /* power */ - ddi_quiesce_not_needed, /* quiesce */ -}; - -static struct modldrv zfs_modldrv = { - &mod_driverops, - "ZFS storage pool", - &zfs_dev_ops -}; - -static struct modlinkage modlinkage = { - MODREV_1, - (void *)&zfs_modlfs, - (void *)&zfs_modldrv, - NULL -}; -#endif /* illumos */ - -static struct cdevsw zfs_cdevsw = { - .d_version = D_VERSION, - .d_open = zfsdev_open, - .d_ioctl = zfsdev_ioctl, - .d_name = ZFS_DEV_NAME -}; - -static void -zfs_allow_log_destroy(void *arg) -{ - char *poolname = arg; - strfree(poolname); -} - -static void -zfsdev_init(void) -{ 
- zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, - ZFS_DEV_NAME); -} - -static void -zfsdev_fini(void) -{ - if (zfsdev != NULL) - destroy_dev(zfsdev); -} - -static struct root_hold_token *zfs_root_token; - -#ifdef illumos -int -_init(void) -{ - int error; - - spa_init(FREAD | FWRITE); - zfs_init(); - zvol_init(); - zfs_ioctl_init(); - - if ((error = mod_install(&modlinkage)) != 0) { - zvol_fini(); - zfs_fini(); - spa_fini(); - return (error); - } - - tsd_create(&zfs_fsyncer_key, NULL); - tsd_create(&rrw_tsd_key, rrw_tsd_destroy); - tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); - - error = ldi_ident_from_mod(&modlinkage, &zfs_li); - ASSERT(error == 0); - mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); - - return (0); -} - -int -_fini(void) -{ - int error; - - if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) - return (SET_ERROR(EBUSY)); - - if ((error = mod_remove(&modlinkage)) != 0) - return (error); - - zvol_fini(); - zfs_fini(); - spa_fini(); - if (zfs_nfsshare_inited) - (void) ddi_modclose(nfs_mod); - if (zfs_smbshare_inited) - (void) ddi_modclose(smbsrv_mod); - if (zfs_nfsshare_inited || zfs_smbshare_inited) - (void) ddi_modclose(sharefs_mod); - - tsd_destroy(&zfs_fsyncer_key); - ldi_ident_release(zfs_li); - zfs_li = NULL; - mutex_destroy(&zfs_share_lock); - - return (error); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} -#endif /* illumos */ - -static int zfs__init(void); -static int zfs__fini(void); -static void zfs_shutdown(void *, int); - -static eventhandler_tag zfs_shutdown_event_tag; - -#ifdef __FreeBSD__ -#define ZFS_MIN_KSTACK_PAGES 4 -#endif - -int -zfs__init(void) -{ - -#ifdef __FreeBSD__ -#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES - printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " - "overflow panic!\nPlease consider adding " - "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, - ZFS_MIN_KSTACK_PAGES); -#endif 
-#endif - zfs_root_token = root_mount_hold("ZFS"); - - mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); - - spa_init(FREAD | FWRITE); - zfs_init(); - zvol_init(); - zfs_ioctl_init(); - - tsd_create(&zfs_fsyncer_key, NULL); - tsd_create(&rrw_tsd_key, rrw_tsd_destroy); - tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); - tsd_create(&zfs_geom_probe_vdev_key, NULL); - - printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); - root_mount_rel(zfs_root_token); - - zfsdev_init(); - - return (0); -} - -int -zfs__fini(void) -{ - if (spa_busy() || zfs_busy() || zvol_busy() || - zio_injection_enabled) { - return (EBUSY); - } - - zfsdev_fini(); - zvol_fini(); - zfs_fini(); - spa_fini(); - - tsd_destroy(&zfs_fsyncer_key); - tsd_destroy(&rrw_tsd_key); - tsd_destroy(&zfs_allow_log_key); - - mutex_destroy(&zfs_share_lock); - - return (0); -} - -static void -zfs_shutdown(void *arg __unused, int howto __unused) -{ - - /* - * ZFS fini routines can not properly work in a panic-ed system. 
- */ - if (!KERNEL_PANICKED()) - (void)zfs__fini(); -} - - -static int -zfs_modevent(module_t mod, int type, void *unused __unused) -{ - int err; - - switch (type) { - case MOD_LOAD: - err = zfs__init(); - if (err == 0) - zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( - shutdown_post_sync, zfs_shutdown, NULL, - SHUTDOWN_PRI_FIRST); - return (err); - case MOD_UNLOAD: - err = zfs__fini(); - if (err == 0 && zfs_shutdown_event_tag != NULL) - EVENTHANDLER_DEREGISTER(shutdown_post_sync, - zfs_shutdown_event_tag); - return (err); - case MOD_SHUTDOWN: - return (0); - default: - break; - } - return (EOPNOTSUPP); -} - -static moduledata_t zfs_mod = { - "zfsctrl", - zfs_modevent, - 0 -}; -DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); -MODULE_VERSION(zfsctrl, 1); -MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1); -MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1); -MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); -MODULE_DEPEND(zfsctrl, zlib, 1, 1, 1); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c deleted file mode 100644 index c00c60a25ebb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * These zfs_log_* functions must be called within a dmu tx, in one - * of 2 contexts depending on zilog->z_replay: - * - * Non replay mode - * --------------- - * We need to record the transaction so that if it is committed to - * the Intent Log then it can be replayed. An intent log transaction - * structure (itx_t) is allocated and all the information necessary to - * possibly replay the transaction is saved in it. The itx is then assigned - * a sequence number and inserted in the in-memory list anchored in the zilog. - * - * Replay mode - * ----------- - * We need to mark the intent log record as replayed in the log header. - * This is done in the same transaction as the replay so that they - * commit atomically. 
- */ - -int -zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) -{ - int isxvattr = (vap->va_mask & AT_XVATTR); - switch (type) { - case Z_FILE: - if (vsecp == NULL && !isxvattr) - return (TX_CREATE); - if (vsecp && isxvattr) -#ifdef TODO - return (TX_CREATE_ACL_ATTR); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - if (vsecp) - return (TX_CREATE_ACL); - else - return (TX_CREATE_ATTR); - /*NOTREACHED*/ - case Z_DIR: - if (vsecp == NULL && !isxvattr) - return (TX_MKDIR); - if (vsecp && isxvattr) -#ifdef TODO - return (TX_MKDIR_ACL_ATTR); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - if (vsecp) - return (TX_MKDIR_ACL); - else - return (TX_MKDIR_ATTR); - case Z_XATTRDIR: - return (TX_MKXATTR); - } - ASSERT(0); - return (TX_MAX_TYPE); -} - -/* - * build up the log data necessary for logging xvattr_t - * First lr_attr_t is initialized. following the lr_attr_t - * is the mapsize and attribute bitmap copied from the xvattr_t. - * Following the bitmap and bitmapsize two 64 bit words are reserved - * for the create time which may be set. Following the create time - * records a single 64 bit integer which has the bits to set on - * replay for the xvattr. - */ -static void -zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) -{ - uint32_t *bitmap; - uint64_t *attrs; - uint64_t *crtime; - xoptattr_t *xoap; - void *scanstamp; - int i; - - xoap = xva_getxoptattr(xvap); - ASSERT(xoap); - - lrattr->lr_attr_masksize = xvap->xva_mapsize; - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { - *bitmap = xvap->xva_reqattrmap[i]; - } - - /* Now pack the attributes up in a single uint64_t */ - attrs = (uint64_t *)bitmap; - crtime = attrs + 1; - scanstamp = (caddr_t)(crtime + 2); - *attrs = 0; - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - *attrs |= (xoap->xoa_readonly == 0) ? 0 : - XAT0_READONLY; - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - *attrs |= (xoap->xoa_hidden == 0) ? 
0 : - XAT0_HIDDEN; - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - *attrs |= (xoap->xoa_system == 0) ? 0 : - XAT0_SYSTEM; - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - *attrs |= (xoap->xoa_archive == 0) ? 0 : - XAT0_ARCHIVE; - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - *attrs |= (xoap->xoa_immutable == 0) ? 0 : - XAT0_IMMUTABLE; - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - *attrs |= (xoap->xoa_nounlink == 0) ? 0 : - XAT0_NOUNLINK; - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - *attrs |= (xoap->xoa_appendonly == 0) ? 0 : - XAT0_APPENDONLY; - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - *attrs |= (xoap->xoa_opaque == 0) ? 0 : - XAT0_APPENDONLY; - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - *attrs |= (xoap->xoa_nodump == 0) ? 0 : - XAT0_NODUMP; - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : - XAT0_AV_QUARANTINED; - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - *attrs |= (xoap->xoa_av_modified == 0) ? 0 : - XAT0_AV_MODIFIED; - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) - *attrs |= (xoap->xoa_reparse == 0) ? 0 : - XAT0_REPARSE; - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) - *attrs |= (xoap->xoa_offline == 0) ? 0 : - XAT0_OFFLINE; - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) - *attrs |= (xoap->xoa_sparse == 0) ? 
0 : - XAT0_SPARSE; -} - -static void * -zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) -{ - zfs_fuid_t *zfuid; - uint64_t *fuidloc = start; - - /* First copy in the ACE FUIDs */ - for (zfuid = list_head(&fuidp->z_fuids); zfuid; - zfuid = list_next(&fuidp->z_fuids, zfuid)) { - *fuidloc++ = zfuid->z_logfuid; - } - return (fuidloc); -} - - -static void * -zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) -{ - zfs_fuid_domain_t *zdomain; - - /* now copy in the domain info, if any */ - if (fuidp->z_domain_str_sz != 0) { - for (zdomain = list_head(&fuidp->z_domains); zdomain; - zdomain = list_next(&fuidp->z_domains, zdomain)) { - bcopy((void *)zdomain->z_domain, start, - strlen(zdomain->z_domain) + 1); - start = (caddr_t)start + - strlen(zdomain->z_domain) + 1; - } - } - return (start); -} - -/* - * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and - * TK_MKXATTR transactions. - * - * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID - * domain information appended prior to the name. In this case the - * uid/gid in the log record will be a log centric FUID. - * - * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that - * may contain attributes, ACL and optional fuid information. - * - * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify - * and ACL and normal users/groups in the ACEs. - * - * There may be an optional xvattr attribute information similar - * to zfs_log_setattr. - * - * Also, after the file name "domain" strings may be appended. - */ -void -zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, - zfs_fuid_info_t *fuidp, vattr_t *vap) -{ - itx_t *itx; - lr_create_t *lr; - lr_acl_create_t *lracl; - size_t aclsize = (vsecp != NULL) ? 
vsecp->vsa_aclentsz : 0; - size_t xvatsize = 0; - size_t txsize; - xvattr_t *xvap = (xvattr_t *)vap; - void *end; - size_t lrsize; - size_t namesize = strlen(name) + 1; - size_t fuidsz = 0; - - if (zil_replaying(zilog, tx)) - return; - - /* - * If we have FUIDs present then add in space for - * domains and ACE fuid's if any. - */ - if (fuidp) { - fuidsz += fuidp->z_domain_str_sz; - fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); - } - - if (vap->va_mask & AT_XVATTR) - xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); - - if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || - (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || - (int)txtype == TX_MKXATTR) { - txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; - lrsize = sizeof (*lr); - } else { - txsize = - sizeof (lr_acl_create_t) + namesize + fuidsz + - ZIL_ACE_LENGTH(aclsize) + xvatsize; - lrsize = sizeof (lr_acl_create_t); - } - - itx = zil_itx_create(txtype, txsize); - - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - /* Store dnode slot count in 8 bits above object id. 
*/ - LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); - lr->lr_mode = zp->z_mode; - if (!IS_EPHEMERAL(zp->z_uid)) { - lr->lr_uid = (uint64_t)zp->z_uid; - } else { - lr->lr_uid = fuidp->z_fuid_owner; - } - if (!IS_EPHEMERAL(zp->z_gid)) { - lr->lr_gid = (uint64_t)zp->z_gid; - } else { - lr->lr_gid = fuidp->z_fuid_group; - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, - sizeof (uint64_t)); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), - lr->lr_crtime, sizeof (uint64_t) * 2); - - if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev, - sizeof (lr->lr_rdev)) != 0) - lr->lr_rdev = 0; - - /* - * Fill in xvattr info if any - */ - if (vap->va_mask & AT_XVATTR) { - zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); - end = (caddr_t)lr + lrsize + xvatsize; - } else { - end = (caddr_t)lr + lrsize; - } - - /* Now fill in any ACL info */ - - if (vsecp) { - lracl = (lr_acl_create_t *)&itx->itx_lr; - lracl->lr_aclcnt = vsecp->vsa_aclcnt; - lracl->lr_acl_bytes = aclsize; - lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; - lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; - if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) - lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; - else - lracl->lr_acl_flags = 0; - - bcopy(vsecp->vsa_aclentp, end, aclsize); - end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); - } - - /* drop in FUID info */ - if (fuidp) { - end = zfs_log_fuid_ids(fuidp, end); - end = zfs_log_fuid_domains(fuidp, end); - } - /* - * Now place file name in log record - */ - bcopy(name, end, namesize); - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles both TX_REMOVE and TX_RMDIR transactions. 
- */ -void -zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid) -{ - itx_t *itx; - lr_remove_t *lr; - size_t namesize = strlen(name) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_remove_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - itx->itx_oid = foid; - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_LINK transactions. - */ -void -zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name) -{ - itx_t *itx; - lr_link_t *lr; - size_t namesize = strlen(name) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_link_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_link_obj = zp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_SYMLINK transactions. - */ -void -zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, char *link) -{ - itx_t *itx; - lr_create_t *lr; - size_t namesize = strlen(name) + 1; - size_t linksize = strlen(link) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - lr->lr_uid = zp->z_uid; - lr->lr_gid = zp->z_gid; - lr->lr_mode = zp->z_mode; - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, - sizeof (uint64_t)); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), - lr->lr_crtime, sizeof (uint64_t) * 2); - bcopy(name, (char *)(lr + 1), namesize); - bcopy(link, (char *)(lr + 1) + namesize, linksize); - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_RENAME transactions. 
- */ -void -zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) -{ - itx_t *itx; - lr_rename_t *lr; - size_t snamesize = strlen(sname) + 1; - size_t dnamesize = strlen(dname) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; - bcopy(sname, (char *)(lr + 1), snamesize); - bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); - itx->itx_oid = szp->z_id; - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_WRITE transactions. - */ -ssize_t zfs_immediate_write_sz = 32768; -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs); -SYSCTL_LONG(_vfs_zfs, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, - &zfs_immediate_write_sz, 0, "Minimal size for indirect log write"); -#endif - -void -zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag) -{ - uint32_t blocksize = zp->z_blksz; - itx_wr_state_t write_state; - uintptr_t fsync_cnt; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - resid >= zfs_immediate_write_sz) - write_state = WR_INDIRECT; - else if (ioflag & (FSYNC | FDSYNC)) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; - - if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { - (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); - } - - while (resid) { - itx_t *itx; - lr_write_t *lr; - itx_wr_state_t wr_state = write_state; - ssize_t len = resid; - - /* - * A WR_COPIED record must fit entirely in one log block. - * Large writes can use WR_NEED_COPY, which the ZIL will - * split into multiple records across several log blocks - * if necessary. 
- */ - if (wr_state == WR_COPIED && - resid > zil_max_copied_data(zilog)) - wr_state = WR_NEED_COPY; - else if (wr_state == WR_INDIRECT) - len = MIN(blocksize - P2PHASE(off, blocksize), resid); - - itx = zil_itx_create(txtype, sizeof (*lr) + - (wr_state == WR_COPIED ? len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - wr_state = WR_NEED_COPY; - } - - itx->itx_wr_state = wr_state; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - - itx->itx_private = zp->z_zfsvfs; - - if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && - (fsync_cnt == 0)) - itx->itx_sync = B_FALSE; - - zil_itx_assign(zilog, itx, tx); - - off += len; - resid -= len; - } -} - -/* - * Handles TX_TRUNCATE transactions. - */ -void -zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len) -{ - itx_t *itx; - lr_truncate_t *lr; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_truncate_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - - itx->itx_sync = (zp->z_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_SETATTR transactions. 
- */ -void -zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) -{ - itx_t *itx; - lr_setattr_t *lr; - xvattr_t *xvap = (xvattr_t *)vap; - size_t recsize = sizeof (lr_setattr_t); - void *start; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - /* - * If XVATTR set, then log record size needs to allow - * for lr_attr_t + xvattr mask, mapsize and create time - * plus actual attribute values - */ - if (vap->va_mask & AT_XVATTR) - recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); - - if (fuidp) - recsize += fuidp->z_domain_str_sz; - - itx = zil_itx_create(txtype, recsize); - lr = (lr_setattr_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_mask = (uint64_t)mask_applied; - lr->lr_mode = (uint64_t)vap->va_mode; - if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) - lr->lr_uid = fuidp->z_fuid_owner; - else - lr->lr_uid = (uint64_t)vap->va_uid; - - if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) - lr->lr_gid = fuidp->z_fuid_group; - else - lr->lr_gid = (uint64_t)vap->va_gid; - - lr->lr_size = (uint64_t)vap->va_size; - ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); - ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); - start = (lr_setattr_t *)(lr + 1); - if (vap->va_mask & AT_XVATTR) { - zfs_log_xvattr((lr_attr_t *)start, xvap); - start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); - } - - /* - * Now stick on domain information if any on end - */ - - if (fuidp) - (void) zfs_log_fuid_domains(fuidp, start); - - itx->itx_sync = (zp->z_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_ACL transactions. 
- */ -void -zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, - vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) -{ - itx_t *itx; - lr_acl_v0_t *lrv0; - lr_acl_t *lr; - int txtype; - int lrsize; - size_t txsize; - size_t aclbytes = vsecp->vsa_aclentsz; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? - TX_ACL_V0 : TX_ACL; - - if (txtype == TX_ACL) - lrsize = sizeof (*lr); - else - lrsize = sizeof (*lrv0); - - txsize = lrsize + - ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + - (fuidp ? fuidp->z_domain_str_sz : 0) + - sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); - - itx = zil_itx_create(txtype, txsize); - - lr = (lr_acl_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - if (txtype == TX_ACL) { - lr->lr_acl_bytes = aclbytes; - lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; - lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; - if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) - lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; - else - lr->lr_acl_flags = 0; - } - lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; - - if (txtype == TX_ACL_V0) { - lrv0 = (lr_acl_v0_t *)lr; - bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); - } else { - void *start = (ace_t *)(lr + 1); - - bcopy(vsecp->vsa_aclentp, start, aclbytes); - - start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); - - if (fuidp) { - start = zfs_log_fuid_ids(fuidp, start); - (void) zfs_log_fuid_domains(fuidp, start); - } - } - - itx->itx_sync = (zp->z_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c deleted file mode 100644 index edb9ca86caa8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c +++ /dev/null @@ -1,254 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the 
"License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS kernel routines may add/delete callback routines to be invoked - * upon process exit (triggered via the close operation from the /dev/zfs - * driver). - * - * These cleanup callbacks are intended to allow for the accumulation - * of kernel state across multiple ioctls. User processes participate - * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a - * clone-open, generating a unique minor number. The process then passes - * along that file descriptor to each ioctl that might have a cleanup operation. - * - * Consumers of the onexit routines should call zfs_onexit_fd_hold() early - * on to validate the given fd and add a reference to its file table entry. - * This allows the consumer to do its work and then add a callback, knowing - * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers - * should call zfs_onexit_fd_rele(). - * - * A simple example is zfs_ioc_recv(), where we might create an AVL tree - * with dataset/GUID mappings and then reuse that tree on subsequent - * zfs_ioc_recv() calls. 
- * - * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() - * the AVL tree and pass it along with a callback function to - * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the - * callback and return an action handle. - * - * The action handle is then passed from user space to subsequent - * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree - * by calling zfs_onexit_cb_data() with the device minor number and - * action handle. - * - * If the user process exits abnormally, the callback is invoked implicitly - * as part of the driver close operation. Once the user space process is - * finished with the accumulated kernel state, it can also just call close(2) - * on the cleanup fd to trigger the cleanup callback. - */ - -void -zfs_onexit_init(zfs_onexit_t **zop) -{ - zfs_onexit_t *zo; - - zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); - mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), - offsetof(zfs_onexit_action_node_t, za_link)); -} - -void -zfs_onexit_destroy(zfs_onexit_t *zo) -{ - zfs_onexit_action_node_t *ap; - - mutex_enter(&zo->zo_lock); - while ((ap = list_head(&zo->zo_actions)) != NULL) { - list_remove(&zo->zo_actions, ap); - mutex_exit(&zo->zo_lock); - ap->za_func(ap->za_data); - kmem_free(ap, sizeof (zfs_onexit_action_node_t)); - mutex_enter(&zo->zo_lock); - } - mutex_exit(&zo->zo_lock); - - list_destroy(&zo->zo_actions); - mutex_destroy(&zo->zo_lock); - kmem_free(zo, sizeof (zfs_onexit_t)); -} - -static int -zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) -{ - *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); - if (*zo == NULL) - return (SET_ERROR(EBADF)); - - return (0); -} - -/* - * Consumers might need to operate by minor number instead of fd, since - * they might be running in another thread (e.g. txg_sync_thread). 
Callers - * of this function must call zfs_onexit_fd_rele() when they're finished - * using the minor number. - */ -int -zfs_onexit_fd_hold(int fd, minor_t *minorp) -{ - file_t *fp, *tmpfp; - zfs_onexit_t *zo; - cap_rights_t rights; - void *data; - int error; - - fp = getf(fd, &cap_no_rights); - if (fp == NULL) - return (SET_ERROR(EBADF)); - - tmpfp = curthread->td_fpop; - curthread->td_fpop = fp; - error = devfs_get_cdevpriv(&data); - if (error == 0) - *minorp = (minor_t)(uintptr_t)data; - curthread->td_fpop = tmpfp; - if (error != 0) - return (SET_ERROR(EBADF)); - return (zfs_onexit_minor_to_state(*minorp, &zo)); -} - -void -zfs_onexit_fd_rele(int fd) -{ - releasef(fd); -} - -/* - * Add a callback to be invoked when the calling process exits. - */ -int -zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); - list_link_init(&ap->za_link); - ap->za_func = func; - ap->za_data = data; - - mutex_enter(&zo->zo_lock); - list_insert_tail(&zo->zo_actions, ap); - mutex_exit(&zo->zo_lock); - if (action_handle) - *action_handle = (uint64_t)(uintptr_t)ap; - - return (0); -} - -static zfs_onexit_action_node_t * -zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) -{ - zfs_onexit_action_node_t *match; - zfs_onexit_action_node_t *ap; - list_t *l; - - ASSERT(MUTEX_HELD(&zo->zo_lock)); - - match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; - l = &zo->zo_actions; - for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { - if (match == ap) - break; - } - return (ap); -} - -/* - * Delete the callback, triggering it first if 'fire' is set. 
- */ -int -zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - mutex_enter(&zo->zo_lock); - ap = zfs_onexit_find_cb(zo, action_handle); - if (ap != NULL) { - list_remove(&zo->zo_actions, ap); - mutex_exit(&zo->zo_lock); - if (fire) - ap->za_func(ap->za_data); - kmem_free(ap, sizeof (zfs_onexit_action_node_t)); - } else { - mutex_exit(&zo->zo_lock); - error = SET_ERROR(ENOENT); - } - - return (error); -} - -/* - * Return the data associated with this callback. This allows consumers - * of the cleanup-on-exit interfaces to stash kernel data across system - * calls, knowing that it will be cleaned up if the calling process exits. - */ -int -zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - *data = NULL; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - mutex_enter(&zo->zo_lock); - ap = zfs_onexit_find_cb(zo, action_handle); - if (ap != NULL) - *data = ap->za_data; - else - error = SET_ERROR(ENOENT); - mutex_exit(&zo->zo_lock); - - return (error); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c deleted file mode 100644 index c913e287e2ad..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ /dev/null @@ -1,1069 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Functions to replay ZFS intent log (ZIL) records - * The functions are called through a function vector (zfs_replay_vector) - * which is indexed by the transaction type. - */ - -static void -zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, - uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) -{ - VATTR_NULL(vap); - vap->va_mask = (uint_t)mask; - if (mask & AT_TYPE) - vap->va_type = IFTOVT(mode); - if (mask & AT_MODE) - vap->va_mode = mode & MODEMASK; - if (mask & AT_UID) - vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; - if (mask & AT_GID) - vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; - vap->va_rdev = zfs_cmpldev(rdev); - vap->va_nodeid = nodeid; -} - -/* ARGSUSED */ -static int -zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) -{ - return (SET_ERROR(ENOTSUP)); -} - -static void -zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) -{ - xoptattr_t *xoap = NULL; - uint64_t *attrs; - uint64_t *crtime; - uint32_t *bitmap; - void *scanstamp; - int i; - - xvap->xva_vattr.va_mask |= AT_XVATTR; - if ((xoap = xva_getxoptattr(xvap)) == NULL) { - xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ - return; - } - - ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); - - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) - xvap->xva_reqattrmap[i] = *bitmap; - - attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); - crtime = attrs + 1; - scanstamp = (caddr_t)(crtime + 2); - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - xoap->xoa_av_quarantined = - ((*attrs & XAT0_AV_QUARANTINED) != 0); - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 
- ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) - xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) - xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) - xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); -} - -static int -zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) -{ - uint64_t uid_idx; - uint64_t gid_idx; - int domcnt = 0; - - uid_idx = FUID_INDEX(uid); - gid_idx = FUID_INDEX(gid); - if (uid_idx) - domcnt++; - if (gid_idx > 0 && gid_idx != uid_idx) - domcnt++; - - return (domcnt); -} - -static void * -zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, - int domcnt) -{ - int i; - - for (i = 0; i != domcnt; i++) { - fuid_infop->z_domain_table[i] = start; - start = (caddr_t)start + strlen(start) + 1; - } - - return (start); -} - -/* - * Set the uid/gid in the fuid_info structure. 
- */ -static void -zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) -{ - /* - * If owner or group are log specific FUIDs then slurp up - * domain information and build zfs_fuid_info_t - */ - if (IS_EPHEMERAL(uid)) - fuid_infop->z_fuid_owner = uid; - - if (IS_EPHEMERAL(gid)) - fuid_infop->z_fuid_group = gid; -} - -/* - * Load fuid domains into fuid_info_t - */ -static zfs_fuid_info_t * -zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) -{ - int domcnt; - - zfs_fuid_info_t *fuid_infop; - - fuid_infop = zfs_fuid_info_alloc(); - - domcnt = zfs_replay_domain_cnt(uid, gid); - - if (domcnt == 0) - return (fuid_infop); - - fuid_infop->z_domain_table = - kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); - - zfs_replay_fuid_ugid(fuid_infop, uid, gid); - - fuid_infop->z_domain_cnt = domcnt; - *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); - return (fuid_infop); -} - -/* - * load zfs_fuid_t's and fuid_domains into fuid_info_t - */ -static zfs_fuid_info_t * -zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, - uint64_t gid) -{ - uint64_t *log_fuid = (uint64_t *)start; - zfs_fuid_info_t *fuid_infop; - int i; - - fuid_infop = zfs_fuid_info_alloc(); - fuid_infop->z_domain_cnt = domcnt; - - fuid_infop->z_domain_table = - kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); - - for (i = 0; i != idcnt; i++) { - zfs_fuid_t *zfuid; - - zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); - zfuid->z_logfuid = *log_fuid; - zfuid->z_id = -1; - zfuid->z_domidx = 0; - list_insert_tail(&fuid_infop->z_fuids, zfuid); - log_fuid++; - } - - zfs_replay_fuid_ugid(fuid_infop, uid, gid); - - *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); - return (fuid_infop); -} - -static void -zfs_replay_swap_attrs(lr_attr_t *lrattr) -{ - /* swap the lr_attr structure */ - byteswap_uint32_array(lrattr, sizeof (*lrattr)); - /* swap the bitmap */ - byteswap_uint32_array(lrattr + 1, 
(lrattr->lr_attr_masksize - 1) * - sizeof (uint32_t)); - /* swap the attributes, create time + 64 bit word for attributes */ - byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * - (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); -} - -/* - * Replay file create with optional ACL, xvattr information as well - * as option FUID information. - */ -static int -zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_acl_create_t *lracl = arg2; - char *name = NULL; /* location determined later */ - lr_create_t *lr = (lr_create_t *)lracl; - znode_t *dzp; - vnode_t *vp = NULL; - xvattr_t xva; - int vflg = 0; - vsecattr_t vsec = { 0 }; - lr_attr_t *lrattr; - void *aclstart; - void *fuidstart; - size_t xvatlen = 0; - uint64_t txtype; - uint64_t objid; - uint64_t dnodesize; - int error; - - txtype = (lr->lr_common.lrc_txtype & ~TX_CI); - if (byteswap) { - byteswap_uint64_array(lracl, sizeof (*lracl)); - if (txtype == TX_CREATE_ACL_ATTR || - txtype == TX_MKDIR_ACL_ATTR) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - zfs_replay_swap_attrs(lrattr); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - } - - aclstart = (caddr_t)(lracl + 1) + xvatlen; - zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); - /* swap fuids */ - if (lracl->lr_fuidcnt) { - byteswap_uint64_array((caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes), - lracl->lr_fuidcnt * sizeof (uint64_t)); - } - } - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - objid = LR_FOID_GET_OBJ(lr->lr_foid); - dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; - - xva_init(&xva); - zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); - - /* - * All forms of zfs create (create, mkdir, mkxattrdir, symlink) - * eventually end up in zfs_mknode(), which assigns the object's - * creation time, generation number, and dnode size. 
The generic - * zfs_create() has no concept of these attributes, so we smuggle - * the values inside the vattr's otherwise unused va_ctime, - * va_nblocks, and va_fsid fields. - */ - ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); - xva.xva_vattr.va_nblocks = lr->lr_gen; - xva.xva_vattr.va_fsid = dnodesize; - - error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); - if (error != ENOENT) - goto bail; - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - switch (txtype) { - case TX_CREATE_ACL: - aclstart = (caddr_t)(lracl + 1); - fuidstart = (caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ - case TX_CREATE_ACL_ATTR: - if (name == NULL) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - xva.xva_vattr.va_mask |= AT_XVATTR; - zfs_replay_xvattr(lrattr, &xva); - } - vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; - vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; - vsec.vsa_aclcnt = lracl->lr_aclcnt; - vsec.vsa_aclentsz = lracl->lr_acl_bytes; - vsec.vsa_aclflags = lracl->lr_acl_flags; - if (zfsvfs->z_fuid_replay == NULL) { - fuidstart = (caddr_t)(lracl + 1) + xvatlen + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - } - -#ifdef TODO - error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, - 0, 0, &vp, kcred, vflg, NULL, &vsec); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - break; - case TX_MKDIR_ACL: - aclstart = (caddr_t)(lracl + 1); - fuidstart = (caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ - case TX_MKDIR_ACL_ATTR: 
- if (name == NULL) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr(lrattr, &xva); - } - vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; - vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; - vsec.vsa_aclcnt = lracl->lr_aclcnt; - vsec.vsa_aclentsz = lracl->lr_acl_bytes; - vsec.vsa_aclflags = lracl->lr_acl_flags; - if (zfsvfs->z_fuid_replay == NULL) { - fuidstart = (caddr_t)(lracl + 1) + xvatlen + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - } -#ifdef TODO - error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, - &vp, kcred, NULL, vflg, &vsec); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - break; - default: - error = SET_ERROR(ENOTSUP); - } - -bail: - if (error == 0 && vp != NULL) - VN_RELE(vp); - - VN_RELE(ZTOV(dzp)); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - - return (error); -} - -static int -zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_create_t *lr = arg2; - char *name = NULL; /* location determined later */ - char *link; /* symlink content follows name */ - znode_t *dzp; - vnode_t *vp = NULL; - xvattr_t xva; - int vflg = 0; - size_t lrsize = sizeof (lr_create_t); - lr_attr_t *lrattr; - void *start; - size_t xvatlen; - uint64_t txtype; - struct componentname cn; - int error; - - txtype = (lr->lr_common.lrc_txtype & ~TX_CI); - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) - zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); - } - - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid); - int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; - - xva_init(&xva); - 
zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); - - /* - * All forms of zfs create (create, mkdir, mkxattrdir, symlink) - * eventually end up in zfs_mknode(), which assigns the object's - * creation time, generation number, and dnode slot count. The - * generic zfs_create() has no concept of these attributes, so - * we smuggle the values inside the vattr's otherwise unused - * va_ctime, va_nblocks and va_fsid fields. - */ - ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); - xva.xva_vattr.va_nblocks = lr->lr_gen; - xva.xva_vattr.va_fsid = dnodesize; - - error = dmu_object_info(zfsvfs->z_os, objid, NULL); - if (error != ENOENT) - goto out; - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - /* - * Symlinks don't have fuid info, and CIFS never creates - * symlinks. - * - * The _ATTR versions will grab the fuid info in their subcases. - */ - if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && - (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && - (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { - start = (lr + 1); - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - } - - cn.cn_cred = kcred; - cn.cn_thread = curthread; - cn.cn_flags = SAVENAME; - - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - switch (txtype) { - case TX_CREATE_ATTR: - lrattr = (lr_attr_t *)(caddr_t)(lr + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); - start = (caddr_t)(lr + 1) + xvatlen; - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - name = (char *)start; - - /*FALLTHROUGH*/ - case TX_CREATE: - if (name == NULL) - name = (char *)start; - - cn.cn_nameptr = name; - error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); - break; - case TX_MKDIR_ATTR: - lrattr = (lr_attr_t *)(caddr_t)(lr + 1); - xvatlen = 
ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); - start = (caddr_t)(lr + 1) + xvatlen; - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - name = (char *)start; - - /*FALLTHROUGH*/ - case TX_MKDIR: - if (name == NULL) - name = (char *)(lr + 1); - - cn.cn_nameptr = name; - error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); - break; - case TX_MKXATTR: - error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); - break; - case TX_SYMLINK: - name = (char *)(lr + 1); - link = name + strlen(name) + 1; - cn.cn_nameptr = name; - error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/); - break; - default: - error = SET_ERROR(ENOTSUP); - } - VOP_UNLOCK(ZTOV(dzp)); - -out: - if (error == 0 && vp != NULL) - VN_URELE(vp); - - VN_RELE(ZTOV(dzp)); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - return (error); -} - -static int -zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_remove_t *lr = arg2; - char *name = (char *)(lr + 1); /* name follows lr_remove_t */ - znode_t *dzp; - struct componentname cn; - vnode_t *vp; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - cn.cn_nameptr = name; - cn.cn_namelen = strlen(name); - cn.cn_nameiop = DELETE; - cn.cn_flags = ISLASTCN | SAVENAME; - cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - cn.cn_cred = kcred; - cn.cn_thread = curthread; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); - if (error != 0) { - VOP_UNLOCK(ZTOV(dzp)); - goto fail; - } - - switch ((int)lr->lr_common.lrc_txtype) { - case TX_REMOVE: - error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/); - break; - case TX_RMDIR: 
- error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/); - break; - default: - error = SET_ERROR(ENOTSUP); - } - vput(vp); - VOP_UNLOCK(ZTOV(dzp)); - -fail: - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_link_t *lr = arg2; - char *name = (char *)(lr + 1); /* name follows lr_link_t */ - znode_t *dzp, *zp; - struct componentname cn; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { - VN_RELE(ZTOV(dzp)); - return (error); - } - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - cn.cn_nameptr = name; - cn.cn_cred = kcred; - cn.cn_thread = curthread; - cn.cn_flags = SAVENAME; - - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/); - VOP_UNLOCK(ZTOV(zp)); - VOP_UNLOCK(ZTOV(dzp)); - - VN_RELE(ZTOV(zp)); - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; - znode_t *sdzp, *tdzp; - struct componentname scn, tcn; - vnode_t *svp, *tvp; - kthread_t *td = curthread; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { - VN_RELE(ZTOV(sdzp)); - return (error); - } - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - svp = tvp = NULL; - - scn.cn_nameptr = sname; - scn.cn_namelen = strlen(sname); - scn.cn_nameiop = DELETE; - scn.cn_flags = ISLASTCN | SAVENAME; - 
scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - scn.cn_cred = kcred; - scn.cn_thread = td; - vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); - VOP_UNLOCK(ZTOV(sdzp)); - if (error != 0) - goto fail; - VOP_UNLOCK(svp); - - tcn.cn_nameptr = tname; - tcn.cn_namelen = strlen(tname); - tcn.cn_nameiop = RENAME; - tcn.cn_flags = ISLASTCN | SAVENAME; - tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - tcn.cn_cred = kcred; - tcn.cn_thread = td; - vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); - if (error == EJUSTRETURN) - tvp = NULL; - else if (error != 0) { - VOP_UNLOCK(ZTOV(tdzp)); - goto fail; - } - - error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/); - return (error); -fail: - if (svp != NULL) - vrele(svp); - if (tvp != NULL) - vrele(tvp); - VN_RELE(ZTOV(tdzp)); - VN_RELE(ZTOV(sdzp)); - - return (error); -} - -static int -zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_write_t *lr = arg2; - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - znode_t *zp; - int error; - ssize_t resid; - uint64_t eod, offset, length; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log writes out of order, it's possible the - * file has been removed. In this case just drop the write - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - offset = lr->lr_offset; - length = lr->lr_length; - eod = offset + length; /* end of data for this write */ - - /* - * This may be a write from a dmu_sync() for a whole block, - * and may extend beyond the current end of the file. - * We can't just replay what was written for this TX_WRITE as - * a future TX_WRITE2 may extend the eof and the data for that - * write needs to be there. So we write the whole block and - * reduce the eof. 
This needs to be done within the single dmu - * transaction created within vn_rdwr -> zfs_write. So a possible - * new end of file is passed through in zfsvfs->z_replay_eof - */ - - zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ - - /* If it's a dmu_sync() block, write the whole block */ - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); - if (length < blocksize) { - offset -= offset % blocksize; - length = blocksize; - } - if (zp->z_size < eod) - zfsvfs->z_replay_eof = eod; - } - - error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, - UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - VN_RELE(ZTOV(zp)); - zfsvfs->z_replay_eof = 0; /* safety */ - - return (error); -} - -/* - * TX_WRITE2 are only generated when dmu_sync() returns EALREADY - * meaning the pool block is already being synced. So now that we always write - * out full blocks, all we have to do is expand the eof if - * the file is grown. - */ -static int -zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_write_t *lr = arg2; - znode_t *zp; - int error; - uint64_t end; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - -top: - end = lr->lr_offset + lr->lr_length; - if (end > zp->z_size) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - zp->z_size = end; - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - VN_RELE(ZTOV(zp)); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - return (error); - } - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - - /* Ensure the replayed seq is updated */ - (void) zil_replaying(zfsvfs->z_log, tx); - - dmu_tx_commit(tx); - } - - VN_RELE(ZTOV(zp)); - - return (error); -} - -static int 
-zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) -{ -#ifdef illumos - zfsvfs_t *zfsvfs = arg1; - lr_truncate_t *lr = arg2; - znode_t *zp; - flock64_t fl; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - bzero(&fl, sizeof (fl)); - fl.l_type = F_WRLCK; - fl.l_whence = 0; - fl.l_start = lr->lr_offset; - fl.l_len = lr->lr_length; - - error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, - lr->lr_offset, kcred, NULL); - - VN_RELE(ZTOV(zp)); - - return (error); -#else - ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); - return (EOPNOTSUPP); -#endif -} - -static int -zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_setattr_t *lr = arg2; - znode_t *zp; - xvattr_t xva; - vattr_t *vap = &xva.xva_vattr; - vnode_t *vp; - int error; - void *start; - - xva_init(&xva); - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((lr->lr_mask & AT_XVATTR) && - zfsvfs->z_version >= ZPL_VERSION_INITIAL) - zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, - lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); - - vap->va_size = lr->lr_size; - ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); - ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); - - /* - * Fill in xvattr_t portions if necessary. 
- */ - - start = (lr_setattr_t *)(lr + 1); - if (vap->va_mask & AT_XVATTR) { - zfs_replay_xvattr((lr_attr_t *)start, &xva); - start = (caddr_t)start + - ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); - } else - xva.xva_vattr.va_mask &= ~AT_XVATTR; - - zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_SETATTR(vp, vap, kcred); - VOP_UNLOCK(vp); - - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - VN_RELE(vp); - - return (error); -} - -extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct); - -static int -zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_acl_v0_t *lr = arg2; - ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ - vsecattr_t vsa; - vnode_t *vp; - znode_t *zp; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - zfs_oldace_byteswap(ace, lr->lr_aclcnt); - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT; - vsa.vsa_aclcnt = lr->lr_aclcnt; - vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; - vsa.vsa_aclflags = 0; - vsa.vsa_aclentp = ace; - - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); - VOP_UNLOCK(vp); - - VN_RELE(vp); - - return (error); -} - -/* - * Replaying ACLs is complicated by FUID support. - * The log record may contain some optional data - * to be used for replaying FUID's. These pieces - * are the actual FUIDs that were created initially. - * The FUID table index may no longer be valid and - * during zfs_create() a new index may be assigned. - * Because of this the log will contain the original - * doman+rid in order to create a new FUID. 
- * - * The individual ACEs may contain an ephemeral uid/gid which is no - * longer valid and will need to be replaced with an actual FUID. - * - */ -static int -zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_acl_t *lr = arg2; - ace_t *ace = (ace_t *)(lr + 1); - vsecattr_t vsa; - znode_t *zp; - vnode_t *vp; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); - if (lr->lr_fuidcnt) { - byteswap_uint64_array((caddr_t)ace + - ZIL_ACE_LENGTH(lr->lr_acl_bytes), - lr->lr_fuidcnt * sizeof (uint64_t)); - } - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; - vsa.vsa_aclcnt = lr->lr_aclcnt; - vsa.vsa_aclentp = ace; - vsa.vsa_aclentsz = lr->lr_acl_bytes; - vsa.vsa_aclflags = lr->lr_acl_flags; - - if (lr->lr_fuidcnt) { - void *fuidstart = (caddr_t)ace + - ZIL_ACE_LENGTH(lr->lr_acl_bytes); - - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, &fuidstart, - lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); - } - - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); - VOP_UNLOCK(vp); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - - zfsvfs->z_fuid_replay = NULL; - VN_RELE(vp); - - return (error); -} - -/* - * Callback vectors for replaying records - */ -zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { - zfs_replay_error, /* 0 no such transaction type */ - zfs_replay_create, /* TX_CREATE */ - zfs_replay_create, /* TX_MKDIR */ - zfs_replay_create, /* TX_MKXATTR */ - zfs_replay_create, /* TX_SYMLINK */ - zfs_replay_remove, /* TX_REMOVE */ - zfs_replay_remove, /* TX_RMDIR */ - zfs_replay_link, /* TX_LINK */ - zfs_replay_rename, /* TX_RENAME */ - zfs_replay_write, /* TX_WRITE */ - zfs_replay_truncate, /* TX_TRUNCATE */ - zfs_replay_setattr, /* TX_SETATTR */ - 
zfs_replay_acl_v0, /* TX_ACL_V0 */ - zfs_replay_acl, /* TX_ACL */ - zfs_replay_create_acl, /* TX_CREATE_ACL */ - zfs_replay_create, /* TX_CREATE_ATTR */ - zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ - zfs_replay_create_acl, /* TX_MKDIR_ACL */ - zfs_replay_create, /* TX_MKDIR_ATTR */ - zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ - zfs_replay_write2, /* TX_WRITE2 */ -}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c deleted file mode 100644 index 434be78ffce2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ /dev/null @@ -1,641 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - */ - -/* - * This file contains the code to implement file range locking in - * ZFS, although there isn't much specific to ZFS (all that comes to mind is - * support for growing the blocksize). 
- * - * Interface - * --------- - * Defined in zfs_rlock.h but essentially: - * lr = rangelock_enter(zp, off, len, lock_type); - * rangelock_reduce(lr, off, len); // optional - * rangelock_exit(lr); - * - * AVL tree - * -------- - * An AVL tree is used to maintain the state of the existing ranges - * that are locked for exclusive (writer) or shared (reader) use. - * The starting range offset is used for searching and sorting the tree. - * - * Common case - * ----------- - * The (hopefully) usual case is of no overlaps or contention for locks. On - * entry to rangelock_enter(), a locked_range_t is allocated; the tree - * searched that finds no overlap, and *this* locked_range_t is placed in the - * tree. - * - * Overlaps/Reference counting/Proxy locks - * --------------------------------------- - * The avl code only allows one node at a particular offset. Also it's very - * inefficient to search through all previous entries looking for overlaps - * (because the very 1st in the ordered list might be at offset 0 but - * cover the whole file). - * So this implementation uses reference counts and proxy range locks. - * Firstly, only reader locks use reference counts and proxy locks, - * because writer locks are exclusive. - * When a reader lock overlaps with another then a proxy lock is created - * for that range and replaces the original lock. If the overlap - * is exact then the reference count of the proxy is simply incremented. - * Otherwise, the proxy lock is split into smaller lock ranges and - * new proxy locks created for non overlapping ranges. - * The reference counts are adjusted accordingly. - * Meanwhile, the orginal lock is kept around (this is the callers handle) - * and its offset and length are used when releasing the lock. 
- * - * Thread coordination - * ------------------- - * In order to make wakeups efficient and to ensure multiple continuous - * readers on a range don't starve a writer for the same range lock, - * two condition variables are allocated in each rl_t. - * If a writer (or reader) can't get a range it initialises the writer - * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; - * and waits on that cv. When a thread unlocks that range it wakes up all - * writers then all readers before destroying the lock. - * - * Append mode writes - * ------------------ - * Append mode writes need to lock a range at the end of a file. - * The offset of the end of the file is determined under the - * range locking mutex, and the lock type converted from RL_APPEND to - * RL_WRITER and the range locked. - * - * Grow block handling - * ------------------- - * ZFS supports multiple block sizes, up to 16MB. The smallest - * block size is used for the file which is grown as needed. During this - * growth all other writers and readers must be excluded. - * So if the block size needs to be grown then the whole file is - * exclusively locked, then later the caller will reduce the lock - * range to just the range to be written using rangelock_reduce(). - */ - -#include -#include -#include - -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. - */ -static int -rangelock_compare(const void *arg1, const void *arg2) -{ - const locked_range_t *rl1 = (const locked_range_t *)arg1; - const locked_range_t *rl2 = (const locked_range_t *)arg2; - - return (AVL_CMP(rl1->lr_offset, rl2->lr_offset)); -} - -/* - * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock. - * It must convert RL_APPEND to RL_WRITER (starting at the end of the file), - * and may increase the range that's locked for RL_WRITER. 
- */ -void -rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg) -{ - mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&rl->rl_tree, rangelock_compare, - sizeof (locked_range_t), offsetof(locked_range_t, lr_node)); - rl->rl_cb = cb; - rl->rl_arg = arg; -} - -void -rangelock_fini(rangelock_t *rl) -{ - mutex_destroy(&rl->rl_lock); - avl_destroy(&rl->rl_tree); -} - -/* - * Check if a write lock can be grabbed. If not, fail immediately or sleep and - * recheck until available, depending on the value of the "nonblock" parameter. - */ -static boolean_t -rangelock_enter_writer(rangelock_t *rl, locked_range_t *new, boolean_t nonblock) -{ - avl_tree_t *tree = &rl->rl_tree; - locked_range_t *lr; - avl_index_t where; - uint64_t orig_off = new->lr_offset; - uint64_t orig_len = new->lr_length; - rangelock_type_t orig_type = new->lr_type; - - for (;;) { - /* - * Call callback which can modify new->r_off,len,type. - * Note, the callback is used by the ZPL to handle appending - * and changing blocksizes. It isn't needed for zvols. - */ - if (rl->rl_cb != NULL) { - rl->rl_cb(new, rl->rl_arg); - } - - /* - * If the type was APPEND, the callback must convert it to - * WRITER. - */ - ASSERT3U(new->lr_type, ==, RL_WRITER); - - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(tree) == 0) { - avl_add(tree, new); - return (B_TRUE); - } - - /* - * Look for any locks in the range. 
- */ - lr = avl_find(tree, new, &where); - if (lr != NULL) - goto wait; /* already locked at same offset */ - - lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); - if (lr != NULL && - lr->lr_offset < new->lr_offset + new->lr_length) - goto wait; - - lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); - if (lr != NULL && - lr->lr_offset + lr->lr_length > new->lr_offset) - goto wait; - - avl_insert(tree, new, where); - return (B_TRUE); -wait: - if (nonblock) - return (B_FALSE); - if (!lr->lr_write_wanted) { - cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL); - lr->lr_write_wanted = B_TRUE; - } - cv_wait(&lr->lr_write_cv, &rl->rl_lock); - - /* reset to original */ - new->lr_offset = orig_off; - new->lr_length = orig_len; - new->lr_type = orig_type; - } -} - -/* - * If this is an original (non-proxy) lock then replace it by - * a proxy and return the proxy. - */ -static locked_range_t * -rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) -{ - locked_range_t *proxy; - - if (lr->lr_proxy) - return (lr); /* already a proxy */ - - ASSERT3U(lr->lr_count, ==, 1); - ASSERT(lr->lr_write_wanted == B_FALSE); - ASSERT(lr->lr_read_wanted == B_FALSE); - avl_remove(tree, lr); - lr->lr_count = 0; - - /* create a proxy range lock */ - proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); - proxy->lr_offset = lr->lr_offset; - proxy->lr_length = lr->lr_length; - proxy->lr_count = 1; - proxy->lr_type = RL_READER; - proxy->lr_proxy = B_TRUE; - proxy->lr_write_wanted = B_FALSE; - proxy->lr_read_wanted = B_FALSE; - avl_add(tree, proxy); - - return (proxy); -} - -/* - * Split the range lock at the supplied offset - * returning the *front* proxy. 
- */ -static locked_range_t * -rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) -{ - ASSERT3U(lr->lr_length, >, 1); - ASSERT3U(off, >, lr->lr_offset); - ASSERT3U(off, <, lr->lr_offset + lr->lr_length); - ASSERT(lr->lr_write_wanted == B_FALSE); - ASSERT(lr->lr_read_wanted == B_FALSE); - - /* create the rear proxy range lock */ - locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); - rear->lr_offset = off; - rear->lr_length = lr->lr_offset + lr->lr_length - off; - rear->lr_count = lr->lr_count; - rear->lr_type = RL_READER; - rear->lr_proxy = B_TRUE; - rear->lr_write_wanted = B_FALSE; - rear->lr_read_wanted = B_FALSE; - - locked_range_t *front = rangelock_proxify(tree, lr); - front->lr_length = off - lr->lr_offset; - - avl_insert_here(tree, rear, front, AVL_AFTER); - return (front); -} - -/* - * Create and add a new proxy range lock for the supplied range. - */ -static void -rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) -{ - ASSERT(len != 0); - locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_count = 1; - lr->lr_type = RL_READER; - lr->lr_proxy = B_TRUE; - lr->lr_write_wanted = B_FALSE; - lr->lr_read_wanted = B_FALSE; - avl_add(tree, lr); -} - -static void -rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, - locked_range_t *prev, avl_index_t where) -{ - locked_range_t *next; - uint64_t off = new->lr_offset; - uint64_t len = new->lr_length; - - /* - * prev arrives either: - * - pointing to an entry at the same offset - * - pointing to the entry with the closest previous offset whose - * range may overlap with the new range - * - null, if there were no ranges starting before the new one - */ - if (prev != NULL) { - if (prev->lr_offset + prev->lr_length <= off) { - prev = NULL; - } else if (prev->lr_offset != off) { - /* - * convert to proxy if needed then - * split this entry and bump ref count - */ - prev = rangelock_split(tree, 
prev, off); - prev = AVL_NEXT(tree, prev); /* move to rear range */ - } - } - ASSERT((prev == NULL) || (prev->lr_offset == off)); - - if (prev != NULL) - next = prev; - else - next = avl_nearest(tree, where, AVL_AFTER); - - if (next == NULL || off + len <= next->lr_offset) { - /* no overlaps, use the original new rl_t in the tree */ - avl_insert(tree, new, where); - return; - } - - if (off < next->lr_offset) { - /* Add a proxy for initial range before the overlap */ - rangelock_new_proxy(tree, off, next->lr_offset - off); - } - - new->lr_count = 0; /* will use proxies in tree */ - /* - * We now search forward through the ranges, until we go past the end - * of the new range. For each entry we make it a proxy if it - * isn't already, then bump its reference count. If there's any - * gaps between the ranges then we create a new proxy range. - */ - for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { - if (off + len <= next->lr_offset) - break; - if (prev != NULL && prev->lr_offset + prev->lr_length < - next->lr_offset) { - /* there's a gap */ - ASSERT3U(next->lr_offset, >, - prev->lr_offset + prev->lr_length); - rangelock_new_proxy(tree, - prev->lr_offset + prev->lr_length, - next->lr_offset - - (prev->lr_offset + prev->lr_length)); - } - if (off + len == next->lr_offset + next->lr_length) { - /* exact overlap with end */ - next = rangelock_proxify(tree, next); - next->lr_count++; - return; - } - if (off + len < next->lr_offset + next->lr_length) { - /* new range ends in the middle of this block */ - next = rangelock_split(tree, next, off + len); - next->lr_count++; - return; - } - ASSERT3U(off + len, >, next->lr_offset + next->lr_length); - next = rangelock_proxify(tree, next); - next->lr_count++; - } - - /* Add the remaining end range. */ - rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, - (off + len) - (prev->lr_offset + prev->lr_length)); -} - -/* - * Check if a reader lock can be grabbed. 
If not, fail immediately or sleep and - * recheck until available, depending on the value of the "nonblock" parameter. - */ -static boolean_t -rangelock_enter_reader(rangelock_t *rl, locked_range_t *new, boolean_t nonblock) -{ - avl_tree_t *tree = &rl->rl_tree; - locked_range_t *prev, *next; - avl_index_t where; - uint64_t off = new->lr_offset; - uint64_t len = new->lr_length; - - /* - * Look for any writer locks in the range. - */ -retry: - prev = avl_find(tree, new, &where); - if (prev == NULL) - prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); - - /* - * Check the previous range for a writer lock overlap. - */ - if (prev && (off < prev->lr_offset + prev->lr_length)) { - if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) { - if (nonblock) - return (B_FALSE); - if (!prev->lr_read_wanted) { - cv_init(&prev->lr_read_cv, - NULL, CV_DEFAULT, NULL); - prev->lr_read_wanted = B_TRUE; - } - cv_wait(&prev->lr_read_cv, &rl->rl_lock); - goto retry; - } - if (off + len < prev->lr_offset + prev->lr_length) - goto got_lock; - } - - /* - * Search through the following ranges to see if there's - * write lock any overlap. - */ - if (prev != NULL) - next = AVL_NEXT(tree, prev); - else - next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); - for (; next != NULL; next = AVL_NEXT(tree, next)) { - if (off + len <= next->lr_offset) - goto got_lock; - if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) { - if (nonblock) - return (B_FALSE); - if (!next->lr_read_wanted) { - cv_init(&next->lr_read_cv, - NULL, CV_DEFAULT, NULL); - next->lr_read_wanted = B_TRUE; - } - cv_wait(&next->lr_read_cv, &rl->rl_lock); - goto retry; - } - if (off + len <= next->lr_offset + next->lr_length) - goto got_lock; - } - -got_lock: - /* - * Add the read lock, which may involve splitting existing - * locks and bumping ref counts (r_count). 
- */ - rangelock_add_reader(tree, new, prev, where); - return (B_TRUE); -} - -/* - * Lock a range (offset, length) as either shared (RL_READER) or exclusive - * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert - * it to a RL_WRITER lock (with the offset at the end of the file). Returns - * the range lock structure for later unlocking (or reduce range if the - * entire file is locked as RL_WRITER). - */ -static locked_range_t * -_rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, - rangelock_type_t type, boolean_t nonblock) -{ - ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - - locked_range_t *new = kmem_alloc(sizeof (*new), KM_SLEEP); - new->lr_rangelock = rl; - new->lr_offset = off; - if (len + off < off) /* overflow */ - len = UINT64_MAX - off; - new->lr_length = len; - new->lr_count = 1; /* assume it's going to be in the tree */ - new->lr_type = type; - new->lr_proxy = B_FALSE; - new->lr_write_wanted = B_FALSE; - new->lr_read_wanted = B_FALSE; - - mutex_enter(&rl->rl_lock); - if (type == RL_READER) { - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(&rl->rl_tree) == 0) { - avl_add(&rl->rl_tree, new); - } else if (!rangelock_enter_reader(rl, new, nonblock)) { - kmem_free(new, sizeof (*new)); - new = NULL; - } - } else if (!rangelock_enter_writer(rl, new, nonblock)) { - kmem_free(new, sizeof (*new)); - new = NULL; - } - mutex_exit(&rl->rl_lock); - return (new); -} - -locked_range_t * -rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, - rangelock_type_t type) -{ - return (_rangelock_enter(rl, off, len, type, B_FALSE)); -} - -locked_range_t * -rangelock_tryenter(rangelock_t *rl, uint64_t off, uint64_t len, - rangelock_type_t type) -{ - return (_rangelock_enter(rl, off, len, type, B_TRUE)); -} - -/* - * Unlock a reader lock - */ -static void -rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove) -{ - avl_tree_t *tree = &rl->rl_tree; - uint64_t len; - - /* - * The 
common case is when the remove entry is in the tree - * (cnt == 1) meaning there's been no other reader locks overlapping - * with this one. Otherwise the remove entry will have been - * removed from the tree and replaced by proxies (one or - * more ranges mapping to the entire range). - */ - if (remove->lr_count == 1) { - avl_remove(tree, remove); - if (remove->lr_write_wanted) { - cv_broadcast(&remove->lr_write_cv); - cv_destroy(&remove->lr_write_cv); - } - if (remove->lr_read_wanted) { - cv_broadcast(&remove->lr_read_cv); - cv_destroy(&remove->lr_read_cv); - } - } else { - ASSERT0(remove->lr_count); - ASSERT0(remove->lr_write_wanted); - ASSERT0(remove->lr_read_wanted); - /* - * Find start proxy representing this reader lock, - * then decrement ref count on all proxies - * that make up this range, freeing them as needed. - */ - locked_range_t *lr = avl_find(tree, remove, NULL); - ASSERT3P(lr, !=, NULL); - ASSERT3U(lr->lr_count, !=, 0); - ASSERT3U(lr->lr_type, ==, RL_READER); - locked_range_t *next = NULL; - for (len = remove->lr_length; len != 0; lr = next) { - len -= lr->lr_length; - if (len != 0) { - next = AVL_NEXT(tree, lr); - ASSERT3P(next, !=, NULL); - ASSERT3U(lr->lr_offset + lr->lr_length, ==, - next->lr_offset); - ASSERT3U(next->lr_count, !=, 0); - ASSERT3U(next->lr_type, ==, RL_READER); - } - lr->lr_count--; - if (lr->lr_count == 0) { - avl_remove(tree, lr); - if (lr->lr_write_wanted) { - cv_broadcast(&lr->lr_write_cv); - cv_destroy(&lr->lr_write_cv); - } - if (lr->lr_read_wanted) { - cv_broadcast(&lr->lr_read_cv); - cv_destroy(&lr->lr_read_cv); - } - kmem_free(lr, sizeof (locked_range_t)); - } - } - } - kmem_free(remove, sizeof (locked_range_t)); -} - -/* - * Unlock range and destroy range lock structure. 
- */ -void -rangelock_exit(locked_range_t *lr) -{ - rangelock_t *rl = lr->lr_rangelock; - - ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER); - ASSERT(lr->lr_count == 1 || lr->lr_count == 0); - ASSERT(!lr->lr_proxy); - - mutex_enter(&rl->rl_lock); - if (lr->lr_type == RL_WRITER) { - /* writer locks can't be shared or split */ - avl_remove(&rl->rl_tree, lr); - mutex_exit(&rl->rl_lock); - if (lr->lr_write_wanted) { - cv_broadcast(&lr->lr_write_cv); - cv_destroy(&lr->lr_write_cv); - } - if (lr->lr_read_wanted) { - cv_broadcast(&lr->lr_read_cv); - cv_destroy(&lr->lr_read_cv); - } - kmem_free(lr, sizeof (locked_range_t)); - } else { - /* - * lock may be shared, let rangelock_exit_reader() - * release the lock and free the rl_t - */ - rangelock_exit_reader(rl, lr); - mutex_exit(&rl->rl_lock); - } -} - -/* - * Reduce range locked as RL_WRITER from whole file to specified range. - * Asserts the whole file is exclusively locked and so there's only one - * entry in the tree. - */ -void -rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len) -{ - rangelock_t *rl = lr->lr_rangelock; - - /* Ensure there are no other locks */ - ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); - ASSERT3U(lr->lr_offset, ==, 0); - ASSERT3U(lr->lr_type, ==, RL_WRITER); - ASSERT(!lr->lr_proxy); - ASSERT3U(lr->lr_length, ==, UINT64_MAX); - ASSERT3U(lr->lr_count, ==, 1); - - mutex_enter(&rl->rl_lock); - lr->lr_offset = off; - lr->lr_length = len; - mutex_exit(&rl->rl_lock); - if (lr->lr_write_wanted) - cv_broadcast(&lr->lr_write_cv); - if (lr->lr_read_wanted) - cv_broadcast(&lr->lr_read_cv); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c deleted file mode 100644 index d12a70d74338..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c +++ /dev/null @@ -1,326 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and 
Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include -#include -#include -#include -#include - -/* - * ZPL attribute registration table. - * Order of attributes doesn't matter - * a unique value will be assigned for each - * attribute that is file system specific - * - * This is just the set of ZPL attributes that this - * version of ZFS deals with natively. The file system - * could have other attributes stored in files, but they will be - * ignored. The SA framework will preserve them, just that - * this version of ZFS won't change or delete them. 
- */ - -sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { - {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, - {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, - {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, - {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, - {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, - {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, - {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, - {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, - {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, - {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, - {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, - {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, - {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, - {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, - {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, - {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, - {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, - {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, - {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, - {"ZPL_DACL_ACES", 0, SA_ACL, 0}, - {NULL, 0, 0, 0} -}; - -#ifdef _KERNEL - -int -zfs_sa_readlink(znode_t *zp, uio_t *uio) -{ - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - size_t bufsz; - int error; - - bufsz = zp->z_size; - if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { - error = uiomove((caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - } else { - dmu_buf_t *dbp; - if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, - 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp, FTAG); - } - } - return (error); -} - -void -zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) -{ - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - - if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { - VERIFY0(dmu_set_bonus(db, len + 
ZFS_OLD_ZNODE_PHYS_SIZE, tx)); - if (len) { - bcopy(link, (caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, len); - } - } else { - dmu_buf_t *dbp; - - zfs_grow_blocksize(zp, len, tx); - VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os, - zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); - - dmu_buf_will_dirty(dbp, tx); - - ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp, FTAG); - } -} - -void -zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - xoptattr_t *xoap; - - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); - if (zp->z_is_sa) { - if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), - &xoap->xoa_av_scanstamp, - sizeof (xoap->xoa_av_scanstamp)) != 0) - return; - } else { - dmu_object_info_t doi; - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - int len; - - if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) - return; - - sa_object_info(zp->z_sa_hdl, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - ZFS_OLD_ZNODE_PHYS_SIZE; - - if (len <= doi.doi_bonus_size) { - (void) memcpy(xoap->xoa_av_scanstamp, - (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - sizeof (xoap->xoa_av_scanstamp)); - } - } - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); -} - -void -zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - xoptattr_t *xoap; - - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); - if (zp->z_is_sa) - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), - &xoap->xoa_av_scanstamp, - sizeof (xoap->xoa_av_scanstamp), tx)); - else { - dmu_object_info_t doi; - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - int len; - - sa_object_info(zp->z_sa_hdl, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - ZFS_OLD_ZNODE_PHYS_SIZE; - if (len > doi.doi_bonus_size) - VERIFY(dmu_set_bonus(db, len, tx) == 0); - (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - xoap->xoa_av_scanstamp, sizeof 
(xoap->xoa_av_scanstamp)); - - zp->z_pflags |= ZFS_BONUS_SCANSTAMP; - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - &zp->z_pflags, sizeof (uint64_t), tx)); - } -} - -/* - * I'm not convinced we should do any of this upgrade. - * since the SA code can read both old/new znode formats - * with probably little to no performance difference. - * - * All new files will be created with the new format. - */ - -void -zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) -{ - dmu_buf_t *db = sa_get_db(hdl); - znode_t *zp = sa_get_userdata(hdl); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - sa_bulk_attr_t bulk[20]; - int count = 0; - sa_bulk_attr_t sa_attrs[20] = { 0 }; - zfs_acl_locator_cb_t locate = { 0 }; - uint64_t uid, gid, mode, rdev, xattr, parent; - uint64_t crtime[2], mtime[2], ctime[2]; - zfs_acl_phys_t znode_acl; - char scanstamp[AV_SCANSTAMP_SZ]; - - /* - * No upgrade if ACL isn't cached - * since we won't know which locks are held - * and ready the ACL would require special "locked" - * interfaces that would be messy - */ - if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK) - return; - - /* - * If the vnode lock is held and we aren't the owner - * then just return since we don't want to deadlock - * trying to update the status of z_is_sa. This - * file can then be upgraded at a later time. - * - * Otherwise, we know we are doing the - * sa_update() that caused us to enter this function. 
- */ - if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0) - return; - - /* First do a bulk query of the attributes that aren't cached */ - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &znode_acl, 88); - - if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) - goto done; - - - /* - * While the order here doesn't matter its best to try and organize - * it is such a way to pick up an already existing layout number - */ - count = 0; - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), - NULL, &zp->z_gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, - zp->z_atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, - &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, - &crtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, 
SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, 8); - if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR) - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, - &rdev, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, - &zp->z_acl_cached->z_acl_count, 8); - - if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) - zfs_acl_xform(zp, zp->z_acl_cached, CRED()); - - locate.cb_aclp = zp->z_acl_cached; - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); - - if (xattr) - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), - NULL, &xattr, 8); - - /* if scanstamp then add scanstamp */ - - if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), - NULL, scanstamp, AV_SCANSTAMP_SZ); - zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; - } - - VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); - VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, - count, tx) == 0); - if (znode_acl.z_acl_extern_obj) - VERIFY(0 == dmu_object_free(zfsvfs->z_os, - znode_acl.z_acl_extern_obj, tx)); - - zp->z_is_sa = B_TRUE; -done: - VOP_UNLOCK(ZTOV(zp)); -} - -void -zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) -{ - if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) - return; - - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - - if (zfs_external_acl(zp)) { - dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, - DMU_OBJECT_END); - } -} - -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c deleted file mode 100644 index 1ce186d0862e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ /dev/null @@ -1,2813 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution 
License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek . - * All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
- */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_comutil.h" - -struct mtx zfs_debug_mtx; -MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); - -SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS file system"); - -int zfs_super_owner; -SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, - "File system owner can perform privileged operation on his file systems"); - -int zfs_debug_level; -SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, - "Debug level"); - -SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "ZFS versions"); -static int zfs_version_acl = ZFS_ACL_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, - "ZFS_ACL_VERSION"); -static int zfs_version_spa = SPA_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, - "SPA_VERSION"); -static int zfs_version_zpl = ZPL_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, - "ZPL_VERSION"); - -static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); -static int zfs_mount(vfs_t *vfsp); -static int zfs_umount(vfs_t *vfsp, int fflag); -static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); -static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); -static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); -static int zfs_sync(vfs_t *vfsp, int waitfor); -static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, - struct ucred **credanonp, int *numsecflavors, int *secflavors); -static 
int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); -static void zfs_objset_close(zfsvfs_t *zfsvfs); -static void zfs_freevfs(vfs_t *vfsp); - -struct vfsops zfs_vfsops = { - .vfs_mount = zfs_mount, - .vfs_unmount = zfs_umount, - .vfs_root = vfs_cache_root, - .vfs_cachedroot = zfs_root, - .vfs_statfs = zfs_statfs, - .vfs_vget = zfs_vget, - .vfs_sync = zfs_sync, - .vfs_checkexp = zfs_checkexp, - .vfs_fhtovp = zfs_fhtovp, - .vfs_quotactl = zfs_quotactl, -}; - -VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); - -/* - * We need to keep a count of active fs's. - * This is necessary to prevent our module - * from being unloaded after a umount -f - */ -static uint32_t zfs_active_fs_count = 0; - -static int -zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) -{ - int error = 0; - char buf[32]; - int err; - uint64_t usedobj, quotaobj; - uint64_t quota, used = 0; - timespec_t now; - - usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - - if (quotaobj == 0 || zfsvfs->z_replay) { - error = EINVAL; - goto done; - } - (void)sprintf(buf, "%llx", (longlong_t)id); - if ((error = zap_lookup(zfsvfs->z_os, quotaobj, - buf, sizeof(quota), 1, "a)) != 0) { - dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); - goto done; - } - /* - * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". - * So we set them to be the same. - */ - dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); - error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof(used), 1, &used); - if (error && error != ENOENT) { - dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); - goto done; - } - dqp->dqb_curblocks = btodb(used); - dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; - vfs_timestamp(&now); - /* - * Setting this to 0 causes FreeBSD quota(8) to print - * the number of days since the epoch, which isn't - * particularly useful. 
- */ - dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; -done: - return (error); -} - -static int -zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - struct thread *td; - int cmd, type, error = 0; - int bitsize; - uint64_t fuid; - zfs_userquota_prop_t quota_type; - struct dqblk64 dqblk = { 0 }; - - td = curthread; - cmd = cmds >> SUBCMDSHIFT; - type = cmds & SUBCMDMASK; - - ZFS_ENTER(zfsvfs); - if (id == -1) { - switch (type) { - case USRQUOTA: - id = td->td_ucred->cr_ruid; - break; - case GRPQUOTA: - id = td->td_ucred->cr_rgid; - break; - default: - error = EINVAL; - if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) - vfs_unbusy(vfsp); - goto done; - } - } - /* - * Map BSD type to: - * ZFS_PROP_USERUSED, - * ZFS_PROP_USERQUOTA, - * ZFS_PROP_GROUPUSED, - * ZFS_PROP_GROUPQUOTA - */ - switch (cmd) { - case Q_SETQUOTA: - case Q_SETQUOTA32: - if (type == USRQUOTA) - quota_type = ZFS_PROP_USERQUOTA; - else if (type == GRPQUOTA) - quota_type = ZFS_PROP_GROUPQUOTA; - else - error = EINVAL; - break; - case Q_GETQUOTA: - case Q_GETQUOTA32: - if (type == USRQUOTA) - quota_type = ZFS_PROP_USERUSED; - else if (type == GRPQUOTA) - quota_type = ZFS_PROP_GROUPUSED; - else - error = EINVAL; - break; - } - - /* - * Depending on the cmd, we may need to get - * the ruid and domain (see fuidstr_to_sid?), - * the fuid (how?), or other information. - * Create fuid using zfs_fuid_create(zfsvfs, id, - * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? - * I think I can use just the id? - * - * Look at zfs_fuid_overquota() to look up a quota. - * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, "a) - * - * See zfs_set_userquota() to set a quota. 
- */ - if ((u_int)type >= MAXQUOTAS) { - error = EINVAL; - goto done; - } - - switch (cmd) { - case Q_GETQUOTASIZE: - bitsize = 64; - error = copyout(&bitsize, arg, sizeof(int)); - break; - case Q_QUOTAON: - // As far as I can tell, you can't turn quotas on or off on zfs - error = 0; - vfs_unbusy(vfsp); - break; - case Q_QUOTAOFF: - error = ENOTSUP; - vfs_unbusy(vfsp); - break; - case Q_SETQUOTA: - error = copyin(arg, &dqblk, sizeof(dqblk)); - if (error == 0) - error = zfs_set_userquota(zfsvfs, quota_type, - "", id, dbtob(dqblk.dqb_bhardlimit)); - break; - case Q_GETQUOTA: - error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); - if (error == 0) - error = copyout(&dqblk, arg, sizeof(dqblk)); - break; - default: - error = EINVAL; - break; - } -done: - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -static int -zfs_sync(vfs_t *vfsp, int waitfor) -{ - - /* - * Data integrity is job one. We don't want a compromised kernel - * writing to the storage pool, so we never sync during panic. - */ - if (KERNEL_PANICKED()) - return (0); - - /* - * Ignore the system syncher. ZFS already commits async data - * at zfs_txg_timeout intervals. - */ - if (waitfor == MNT_LAZY) - return (0); - - if (vfsp != NULL) { - /* - * Sync a specific filesystem. - */ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - dsl_pool_t *dp; - int error; - - error = vfs_stdsync(vfsp, waitfor); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - dp = dmu_objset_pool(zfsvfs->z_os); - - /* - * If the system is shutting down, then skip any - * filesystems which may exist on a suspended pool. - */ - if (sys_shutdown && spa_suspended(dp->dp_spa)) { - ZFS_EXIT(zfsvfs); - return (0); - } - - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, 0); - - ZFS_EXIT(zfsvfs); - } else { - /* - * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the - * request by waiting for all pools to commit all dirty data. 
- */ - spa_sync_allpools(); - } - - return (0); -} - -#ifndef __FreeBSD_kernel__ -static int -zfs_create_unique_device(dev_t *dev) -{ - major_t new_major; - - do { - ASSERT3U(zfs_minor, <=, MAXMIN32); - minor_t start = zfs_minor; - do { - mutex_enter(&zfs_dev_mtx); - if (zfs_minor >= MAXMIN32) { - /* - * If we're still using the real major - * keep out of /dev/zfs and /dev/zvol minor - * number space. If we're using a getudev()'ed - * major number, we can use all of its minors. - */ - if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) - zfs_minor = ZFS_MIN_MINOR; - else - zfs_minor = 0; - } else { - zfs_minor++; - } - *dev = makedevice(zfs_major, zfs_minor); - mutex_exit(&zfs_dev_mtx); - } while (vfs_devismounted(*dev) && zfs_minor != start); - if (zfs_minor == start) { - /* - * We are using all ~262,000 minor numbers for the - * current major number. Create a new major number. - */ - if ((new_major = getudev()) == (major_t)-1) { - cmn_err(CE_WARN, - "zfs_mount: Can't get unique major " - "device number."); - return (-1); - } - mutex_enter(&zfs_dev_mtx); - zfs_major = new_major; - zfs_minor = 0; - - mutex_exit(&zfs_dev_mtx); - } else { - break; - } - /* CONSTANTCONDITION */ - } while (1); - - return (0); -} -#endif /* !__FreeBSD_kernel__ */ - -static void -atime_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - zfsvfs->z_atime = TRUE; - zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); - } else { - zfsvfs->z_atime = FALSE; - zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); - } -} - -static void -xattr_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - /* XXX locking on vfs_flag? 
*/ -#ifdef TODO - zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; -#endif - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); - } else { - /* XXX locking on vfs_flag? */ -#ifdef TODO - zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; -#endif - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); - } -} - -static void -blksz_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); - ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); - ASSERT(ISP2(newval)); - - zfsvfs->z_max_blksz = newval; - zfsvfs->z_vfs->mnt_stat.f_iosize = newval; -} - -static void -readonly_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval) { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); - } else { - /* XXX locking on vfs_flag? 
*/ - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); - } -} - -static void -setuid_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); - } -} - -static void -exec_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); - } -} - -/* - * The nbmand mount option can be changed at mount time. 
- * We can't allow it to be toggled on live file systems or incorrect - * behavior may be seen from cifs clients - * - * This property isn't registered via dsl_prop_register(), but this callback - * will be called when a file system is first mounted - */ -static void -nbmand_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - if (newval == FALSE) { - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); - } else { - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); - } -} - -static void -snapdir_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_show_ctldir = newval; -} - -static void -vscan_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_vscan = newval; -} - -static void -acl_mode_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_mode = newval; -} - -static void -acl_inherit_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_inherit = newval; -} - -static int -zfs_register_callbacks(vfs_t *vfsp) -{ - struct dsl_dataset *ds = NULL; - objset_t *os = NULL; - zfsvfs_t *zfsvfs = NULL; - uint64_t nbmand; - boolean_t readonly = B_FALSE; - boolean_t do_readonly = B_FALSE; - boolean_t setuid = B_FALSE; - boolean_t do_setuid = B_FALSE; - boolean_t exec = B_FALSE; - boolean_t do_exec = B_FALSE; -#ifdef illumos - boolean_t devices = B_FALSE; - boolean_t do_devices = B_FALSE; -#endif - boolean_t xattr = B_FALSE; - boolean_t do_xattr = B_FALSE; - boolean_t atime = B_FALSE; - boolean_t do_atime = B_FALSE; - int error = 0; - - ASSERT(vfsp); - zfsvfs = vfsp->vfs_data; - ASSERT(zfsvfs); - os = zfsvfs->z_os; - - /* - * This function can be called for a snapshot when we update snapshot's - * mount point, which isn't really supported. 
- */ - if (dmu_objset_is_snapshot(os)) - return (EOPNOTSUPP); - - /* - * The act of registering our callbacks will destroy any mount - * options we may have. In order to enable temporary overrides - * of mount options, we stash away the current values and - * restore them after we register the callbacks. - */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || - !spa_writeable(dmu_objset_spa(os))) { - readonly = B_TRUE; - do_readonly = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - readonly = B_FALSE; - do_readonly = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else { - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { - setuid = B_TRUE; - do_setuid = B_TRUE; - } - } - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { - exec = B_FALSE; - do_exec = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { - exec = B_TRUE; - do_exec = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { - xattr = B_FALSE; - do_xattr = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { - xattr = B_TRUE; - do_xattr = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { - atime = B_FALSE; - do_atime = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { - atime = B_TRUE; - do_atime = B_TRUE; - } - - /* - * We need to enter pool configuration here, so that we can use - * dsl_prop_get_int_ds() to handle the special nbmand property below. - * dsl_prop_get_integer() can not be used, because it has to acquire - * spa_namespace_lock and we can not do that because we already hold - * z_teardown_lock. The problem is that spa_write_cachefile() is called - * with spa_namespace_lock held and the function calls ZFS vnode - * operations to write the cache file and thus z_teardown_lock is - * acquired after spa_namespace_lock. 
- */ - ds = dmu_objset_ds(os); - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - - /* - * nbmand is a special property. It can only be changed at - * mount time. - * - * This is weird, but it is documented to only be changeable - * at mount time. - */ - if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { - nbmand = B_FALSE; - } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { - nbmand = B_TRUE; - } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) { - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - return (error); - } - - /* - * Register property callbacks. - * - * It would probably be fine to just check for i/o error from - * the first prop_register(), but I guess I like to go - * overboard... - */ - error = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); -#ifdef illumos - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); -#endif - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, - zfsvfs); - error = error ? 
error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error) - goto unregister; - - /* - * Invoke our callbacks to restore temporary mount options. - */ - if (do_readonly) - readonly_changed_cb(zfsvfs, readonly); - if (do_setuid) - setuid_changed_cb(zfsvfs, setuid); - if (do_exec) - exec_changed_cb(zfsvfs, exec); - if (do_xattr) - xattr_changed_cb(zfsvfs, xattr); - if (do_atime) - atime_changed_cb(zfsvfs, atime); - - nbmand_changed_cb(zfsvfs, nbmand); - - return (0); - -unregister: - dsl_prop_unregister_all(ds, zfsvfs); - return (error); -} - -static int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp) -{ - /* - * Is it a valid type of object to track? - */ - if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) - return (SET_ERROR(ENOENT)); - - /* - * If we have a NULL data pointer - * then assume the id's aren't changing and - * return EEXIST to the dmu to let it know to - * use the same ids - */ - if (data == NULL) - return (SET_ERROR(EEXIST)); - - if (bonustype == DMU_OT_ZNODE) { - znode_phys_t *znp = data; - *userp = znp->zp_uid; - *groupp = znp->zp_gid; - } else { - int hdrsize; - sa_hdr_phys_t *sap = data; - sa_hdr_phys_t sa = *sap; - boolean_t swap = B_FALSE; - - ASSERT(bonustype == DMU_OT_SA); - - if (sa.sa_magic == 0) { - /* - * This should only happen for newly created - * files that haven't had the znode data filled - * in yet. 
- */ - *userp = 0; - *groupp = 0; - return (0); - } - if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { - sa.sa_magic = SA_MAGIC; - sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); - swap = B_TRUE; - } else { - VERIFY3U(sa.sa_magic, ==, SA_MAGIC); - } - - hdrsize = sa_hdrsize(&sa); - VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_GID_OFFSET)); - if (swap) { - *userp = BSWAP_64(*userp); - *groupp = BSWAP_64(*groupp); - } - } - return (0); -} - -static void -fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, - char *domainbuf, int buflen, uid_t *ridp) -{ - uint64_t fuid; - const char *domain; - - fuid = zfs_strtonum(fuidstr, NULL); - - domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); - if (domain) - (void) strlcpy(domainbuf, domain, buflen); - else - domainbuf[0] = '\0'; - *ridp = FUID_RID(fuid); -} - -static uint64_t -zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) -{ - switch (type) { - case ZFS_PROP_USERUSED: - return (DMU_USERUSED_OBJECT); - case ZFS_PROP_GROUPUSED: - return (DMU_GROUPUSED_OBJECT); - case ZFS_PROP_USERQUOTA: - return (zfsvfs->z_userquota_obj); - case ZFS_PROP_GROUPQUOTA: - return (zfsvfs->z_groupquota_obj); - } - return (0); -} - -int -zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) -{ - int error; - zap_cursor_t zc; - zap_attribute_t za; - zfs_useracct_t *buf = vbuf; - uint64_t obj; - - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == 0) { - *bufsizep = 0; - return (0); - } - - for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > - *bufsizep) - break; - - 
fuidstr_to_sid(zfsvfs, za.za_name, - buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); - - buf->zu_space = za.za_first_integer; - buf++; - } - if (error == ENOENT) - error = 0; - - ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); - *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; - *cookiep = zap_cursor_serialize(&zc); - zap_cursor_fini(&zc); - return (error); -} - -/* - * buf must be big enough (eg, 32 bytes) - */ -static int -id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, - char *buf, boolean_t addok) -{ - uint64_t fuid; - int domainid = 0; - - if (domain && domain[0]) { - domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); - if (domainid == -1) - return (SET_ERROR(ENOENT)); - } - fuid = FUID_ENCODE(domainid, rid); - (void) sprintf(buf, "%llx", (longlong_t)fuid); - return (0); -} - -int -zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valp) -{ - char buf[32]; - int err; - uint64_t obj; - - *valp = 0; - - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == 0) - return (0); - - err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); - if (err) - return (err); - - err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); - if (err == ENOENT) - err = 0; - return (err); -} - -int -zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota) -{ - char buf[32]; - int err; - dmu_tx_t *tx; - uint64_t *objp; - boolean_t fuid_dirtied; - - if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) - return (SET_ERROR(EINVAL)); - - if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) - return (SET_ERROR(ENOTSUP)); - - objp = (type == ZFS_PROP_USERQUOTA) ? 
&zfsvfs->z_userquota_obj : - &zfsvfs->z_groupquota_obj; - - err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); - if (err) - return (err); - fuid_dirtied = zfsvfs->z_fuid_dirty; - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); - if (*objp == 0) { - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, - zfs_userquota_prop_prefixes[type]); - } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - mutex_enter(&zfsvfs->z_lock); - if (*objp == 0) { - *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, - DMU_OT_NONE, 0, tx); - VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); - } - mutex_exit(&zfsvfs->z_lock); - - if (quota == 0) { - err = zap_remove(zfsvfs->z_os, *objp, buf, tx); - if (err == ENOENT) - err = 0; - } else { - err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); - } - ASSERT(err == 0); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - dmu_tx_commit(tx); - return (err); -} - -boolean_t -zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) -{ - char buf[32]; - uint64_t used, quota, usedobj, quotaobj; - int err; - - usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - - if (quotaobj == 0 || zfsvfs->z_replay) - return (B_FALSE); - - (void) sprintf(buf, "%llx", (longlong_t)fuid); - err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); - if (err != 0) - return (B_FALSE); - - err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); - if (err != 0) - return (B_FALSE); - return (used >= quota); -} - -boolean_t -zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) -{ - uint64_t fuid; - uint64_t quotaobj; - - quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - - fuid = isgroup ? 
zp->z_gid : zp->z_uid; - - if (quotaobj == 0 || zfsvfs->z_replay) - return (B_FALSE); - - return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); -} - -/* - * Associate this zfsvfs with the given objset, which must be owned. - * This will cache a bunch of on-disk state from the objset in the - * zfsvfs. - */ -static int -zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) -{ - int error; - uint64_t val; - - zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - zfsvfs->z_os = os; - - error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error != 0) - return (error); - if (zfsvfs->z_version > - zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { - (void) printf("Can't mount a version %lld file system " - "on a version %lld pool\n. Pool must be upgraded to mount " - "this file system.", (u_longlong_t)zfsvfs->z_version, - (u_longlong_t)spa_version(dmu_objset_spa(os))); - return (SET_ERROR(ENOTSUP)); - } - error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); - if (error != 0) - return (error); - zfsvfs->z_norm = (int)val; - - error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); - if (error != 0) - return (error); - zfsvfs->z_utf8 = (val != 0); - - error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); - if (error != 0) - return (error); - zfsvfs->z_case = (uint_t)val; - - /* - * Fold case on file systems that are always or sometimes case - * insensitive. 
- */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - zfsvfs->z_case == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; - - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); - - uint64_t sa_obj = 0; - if (zfsvfs->z_use_sa) { - /* should either have both of these objects or none */ - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, - &sa_obj); - if (error != 0) - return (error); - } - - error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, - &zfsvfs->z_attr_table); - if (error != 0) - return (error); - - if (zfsvfs->z_version >= ZPL_VERSION_SA) - sa_register_update_callback(os, zfs_sa_upgrade); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error != 0) - return (error); - ASSERT(zfsvfs->z_root != 0); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], - 8, 1, &zfsvfs->z_userquota_obj); - if (error == ENOENT) - zfsvfs->z_userquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], - 8, 1, &zfsvfs->z_groupquota_obj); - if (error == ENOENT) - zfsvfs->z_groupquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, - &zfsvfs->z_fuid_obj); - if (error == ENOENT) - zfsvfs->z_fuid_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, - &zfsvfs->z_shares_dir); - if (error == ENOENT) - zfsvfs->z_shares_dir = 0; - else if (error != 0) - return (error); - - /* - * Only use the name cache if we are looking for a - * name on a file system that does not require normalization - * or case folding. 
We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name (which is always the case on - * FreeBSD). - */ - zfsvfs->z_use_namecache = !zfsvfs->z_norm || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && - !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); - - return (0); -} - -#if defined(__FreeBSD__) -taskq_t *zfsvfs_taskq; - -static void -zfsvfs_task_unlinked_drain(void *context, int pending __unused) -{ - - zfs_unlinked_drain((zfsvfs_t *)context); -} -#endif - -int -zfsvfs_create(const char *osname, zfsvfs_t **zfvp) -{ - objset_t *os; - zfsvfs_t *zfsvfs; - int error; - - /* - * XXX: Fix struct statfs so this isn't necessary! - * - * The 'osname' is used as the filesystem's special node, which means - * it must fit in statfs.f_mntfromname, or else it can't be - * enumerated, so libzfs_mnttab_find() returns NULL, which causes - * 'zfs unmount' to think it's not mounted when it is. - */ - if (strlen(osname) >= MNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - - /* - * We claim to always be readonly so we can open snapshots; - * other ZPL code will prevent us from writing to snapshots. 
- */ - - error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); - if (error != 0) { - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - return (error); - } - - error = zfsvfs_create_impl(zfvp, zfsvfs, os); - if (error != 0) { - dmu_objset_disown(os, zfsvfs); - } - return (error); -} - - -int -zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) -{ - int error; - - zfsvfs->z_vfs = NULL; - zfsvfs->z_parent = zfsvfs; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); -#if defined(__FreeBSD__) - TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, - zfsvfs_task_unlinked_drain, zfsvfs); -#endif -#ifdef DIAGNOSTIC - rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); -#else - rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); -#endif - rms_init(&zfsvfs->z_teardown_inactive_lock, "zfs teardown inactive"); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); - for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - - error = zfsvfs_init(zfsvfs, os); - if (error != 0) { - *zfvp = NULL; - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - return (error); - } - - *zfvp = zfsvfs; - return (0); -} - -static int -zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) -{ - int error; - - error = zfs_register_callbacks(zfsvfs->z_vfs); - if (error) - return (error); - - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - - /* - * If we are not mounting (ie: online recv), then we don't - * have to worry about replaying the log as we blocked all - * operations out since we closed the ZIL. - */ - if (mounting) { - boolean_t readonly; - - /* - * During replay we remove the read only flag to - * allow replays to succeed. 
- */ - readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - if (readonly != 0) - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - else - zfs_unlinked_drain(zfsvfs); - - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest - * doesn't use readonly mounts, where - * zfs_unlinked_drain() isn't called.) This is because - * ziltest causes spa_sync() to think it's committed, - * but actually it is not, so the intent log contains - * many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated - * in a yet later txg. This would write a "create - * object N" record to the intent log. Normally, this - * would be fine because the spa_sync() would have - * written out the fact that object N is free, before - * we could write the "create object N" intent log - * record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. - */ - if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { - if (zil_replay_disable) { - zil_destroy(zfsvfs->z_log, B_FALSE); - } else { - zfsvfs->z_replay = B_TRUE; - zil_replay(zfsvfs->z_os, zfsvfs, - zfs_replay_vector); - zfsvfs->z_replay = B_FALSE; - } - } - zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ - } - - /* - * Set the objset user_ptr to track its zfsvfs. 
- */ - mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); - dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); - - return (0); -} - -extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ - -void -zfsvfs_free(zfsvfs_t *zfsvfs) -{ - int i; - - /* - * This is a barrier to prevent the filesystem from going away in - * zfs_znode_move() until we can safely ensure that the filesystem is - * not unmounted. We consider the filesystem valid before the barrier - * and invalid after the barrier. - */ - rw_enter(&zfsvfs_lock, RW_READER); - rw_exit(&zfsvfs_lock); - - zfs_fuid_destroy(zfsvfs); - - mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_lock); - list_destroy(&zfsvfs->z_all_znodes); - rrm_destroy(&zfsvfs->z_teardown_lock); - rms_destroy(&zfsvfs->z_teardown_inactive_lock); - rw_destroy(&zfsvfs->z_fuid_lock); - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); -} - -static void -zfs_set_fuid_feature(zfsvfs_t *zfsvfs) -{ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_vfs) { - if (zfsvfs->z_use_fuids) { - vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); - } else { - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); - } - } - zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); -} - -static int -zfs_domount(vfs_t *vfsp, char *osname) -{ - uint64_t recordsize, fsid_guid; - int error = 0; 
- zfsvfs_t *zfsvfs; - vnode_t *vp; - - ASSERT(vfsp); - ASSERT(osname); - - error = zfsvfs_create(osname, &zfsvfs); - if (error) - return (error); - zfsvfs->z_vfs = vfsp; - -#ifdef illumos - /* Initialize the generic filesystem structure. */ - vfsp->vfs_bcount = 0; - vfsp->vfs_data = NULL; - - if (zfs_create_unique_device(&mount_dev) == -1) { - error = SET_ERROR(ENODEV); - goto out; - } - ASSERT(vfs_devismounted(mount_dev) == 0); -#endif - - if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, - NULL)) - goto out; - zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; - zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; - - vfsp->vfs_data = zfsvfs; - vfsp->mnt_flag |= MNT_LOCAL; -#if defined(_KERNEL) && !defined(KMEM_DEBUG) - vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; -#endif - vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; - vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; - vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; - vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ - vfsp->mnt_kern_flag |= MNTK_NOMSYNC; - vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; - - /* - * The fsid is 64 bits, composed of an 8-bit fs type, which - * separates our fsid from any other filesystem types, and a - * 56-bit objset unique ID. The objset unique ID is unique to - * all objsets open on this system, provided by unique_create(). - * The 8-bit fs type must be put in the low bits of fsid[1] - * because that's where other Solaris filesystems put it. - */ - fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); - ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); - vfsp->vfs_fsid.val[0] = fsid_guid; - vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | - vfsp->mnt_vfc->vfc_typenum & 0xFF; - - /* - * Set features for file system. 
- */ - zfs_set_fuid_feature(zfsvfs); - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); - } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - } - vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); - - if (dmu_objset_is_snapshot(zfsvfs->z_os)) { - uint64_t pval; - - atime_changed_cb(zfsvfs, B_FALSE); - readonly_changed_cb(zfsvfs, B_TRUE); - if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) - goto out; - xattr_changed_cb(zfsvfs, pval); - zfsvfs->z_issnap = B_TRUE; - zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; - - mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); - dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); - } else { - error = zfsvfs_setup(zfsvfs, B_TRUE); - } - - vfs_mountedfrom(vfsp, osname); - - if (!zfsvfs->z_issnap) - zfsctl_create(zfsvfs); -out: - if (error) { - dmu_objset_disown(zfsvfs->z_os, zfsvfs); - zfsvfs_free(zfsvfs); - } else { - atomic_inc_32(&zfs_active_fs_count); - } - - return (error); -} - -void -zfs_unregister_callbacks(zfsvfs_t *zfsvfs) -{ - objset_t *os = zfsvfs->z_os; - - if (!dmu_objset_is_snapshot(os)) - dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); -} - -#ifdef SECLABEL -/* - * Convert a decimal digit string to a uint64_t integer. - */ -static int -str_to_uint64(char *str, uint64_t *objnum) -{ - uint64_t num = 0; - - while (*str) { - if (*str < '0' || *str > '9') - return (SET_ERROR(EINVAL)); - - num = num*10 + *str++ - '0'; - } - - *objnum = num; - return (0); -} - -/* - * The boot path passed from the boot loader is in the form of - * "rootpool-name/root-filesystem-object-number'. Convert this - * string to a dataset name: "rootpool-name/root-filesystem-name". 
- */ -static int -zfs_parse_bootfs(char *bpath, char *outpath) -{ - char *slashp; - uint64_t objnum; - int error; - - if (*bpath == 0 || *bpath == '/') - return (SET_ERROR(EINVAL)); - - (void) strcpy(outpath, bpath); - - slashp = strchr(bpath, '/'); - - /* if no '/', just return the pool name */ - if (slashp == NULL) { - return (0); - } - - /* if not a number, just return the root dataset name */ - if (str_to_uint64(slashp+1, &objnum)) { - return (0); - } - - *slashp = '\0'; - error = dsl_dsobj_to_dsname(bpath, objnum, outpath); - *slashp = '/'; - - return (error); -} - -/* - * Check that the hex label string is appropriate for the dataset being - * mounted into the global_zone proper. - * - * Return an error if the hex label string is not default or - * admin_low/admin_high. For admin_low labels, the corresponding - * dataset must be readonly. - */ -int -zfs_check_global_label(const char *dsname, const char *hexsl) -{ - if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) - return (0); - if (strcasecmp(hexsl, ADMIN_HIGH) == 0) - return (0); - if (strcasecmp(hexsl, ADMIN_LOW) == 0) { - /* must be readonly */ - uint64_t rdonly; - - if (dsl_prop_get_integer(dsname, - zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) - return (SET_ERROR(EACCES)); - return (rdonly ? 0 : EACCES); - } - return (SET_ERROR(EACCES)); -} - -/* - * Determine whether the mount is allowed according to MAC check. - * by comparing (where appropriate) label of the dataset against - * the label of the zone being mounted into. If the dataset has - * no label, create one. - * - * Returns 0 if access allowed, error otherwise (e.g. EACCES) - */ -static int -zfs_mount_label_policy(vfs_t *vfsp, char *osname) -{ - int error, retv; - zone_t *mntzone = NULL; - ts_label_t *mnt_tsl; - bslabel_t *mnt_sl; - bslabel_t ds_sl; - char ds_hexsl[MAXNAMELEN]; - - retv = EACCES; /* assume the worst */ - - /* - * Start by getting the dataset label if it exists. 
- */ - error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), - 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error) - return (SET_ERROR(EACCES)); - - /* - * If labeling is NOT enabled, then disallow the mount of datasets - * which have a non-default label already. No other label checks - * are needed. - */ - if (!is_system_labeled()) { - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) - return (0); - return (SET_ERROR(EACCES)); - } - - /* - * Get the label of the mountpoint. If mounting into the global - * zone (i.e. mountpoint is not within an active zone and the - * zoned property is off), the label must be default or - * admin_low/admin_high only; no other checks are needed. - */ - mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); - if (mntzone->zone_id == GLOBAL_ZONEID) { - uint64_t zoned; - - zone_rele(mntzone); - - if (dsl_prop_get_integer(osname, - zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) - return (SET_ERROR(EACCES)); - if (!zoned) - return (zfs_check_global_label(osname, ds_hexsl)); - else - /* - * This is the case of a zone dataset being mounted - * initially, before the zone has been fully created; - * allow this mount into global zone. - */ - return (0); - } - - mnt_tsl = mntzone->zone_slabel; - ASSERT(mnt_tsl != NULL); - label_hold(mnt_tsl); - mnt_sl = label2bslabel(mnt_tsl); - - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { - /* - * The dataset doesn't have a real label, so fabricate one. - */ - char *str = NULL; - - if (l_to_str_internal(mnt_sl, &str) == 0 && - dsl_prop_set_string(osname, - zfs_prop_to_name(ZFS_PROP_MLSLABEL), - ZPROP_SRC_LOCAL, str) == 0) - retv = 0; - if (str != NULL) - kmem_free(str, strlen(str) + 1); - } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { - /* - * Now compare labels to complete the MAC check. If the - * labels are equal then allow access. If the mountpoint - * label dominates the dataset label, allow readonly access. - * Otherwise, access is denied. 
- */ - if (blequal(mnt_sl, &ds_sl)) - retv = 0; - else if (bldominates(mnt_sl, &ds_sl)) { - vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); - retv = 0; - } - } - - label_rele(mnt_tsl); - zone_rele(mntzone); - return (retv); -} -#endif /* SECLABEL */ - -#ifdef OPENSOLARIS_MOUNTROOT -static int -zfs_mountroot(vfs_t *vfsp, enum whymountroot why) -{ - int error = 0; - static int zfsrootdone = 0; - zfsvfs_t *zfsvfs = NULL; - znode_t *zp = NULL; - vnode_t *vp = NULL; - char *zfs_bootfs; - char *zfs_devid; - - ASSERT(vfsp); - - /* - * The filesystem that we mount as root is defined in the - * boot property "zfs-bootfs" with a format of - * "poolname/root-dataset-objnum". - */ - if (why == ROOT_INIT) { - if (zfsrootdone++) - return (SET_ERROR(EBUSY)); - /* - * the process of doing a spa_load will require the - * clock to be set before we could (for example) do - * something better by looking at the timestamp on - * an uberblock, so just set it to -1. - */ - clkset(-1); - - if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { - cmn_err(CE_NOTE, "spa_get_bootfs: can not get " - "bootfs name"); - return (SET_ERROR(EINVAL)); - } - zfs_devid = spa_get_bootprop("diskdevid"); - error = spa_import_rootpool(rootfs.bo_name, zfs_devid); - if (zfs_devid) - spa_free_bootprop(zfs_devid); - if (error) { - spa_free_bootprop(zfs_bootfs); - cmn_err(CE_NOTE, "spa_import_rootpool: error %d", - error); - return (error); - } - if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { - spa_free_bootprop(zfs_bootfs); - cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", - error); - return (error); - } - - spa_free_bootprop(zfs_bootfs); - - if (error = vfs_lock(vfsp)) - return (error); - - if (error = zfs_domount(vfsp, rootfs.bo_name)) { - cmn_err(CE_NOTE, "zfs_domount: error %d", error); - goto out; - } - - zfsvfs = (zfsvfs_t *)vfsp->vfs_data; - ASSERT(zfsvfs); - if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { - cmn_err(CE_NOTE, "zfs_zget: error %d", error); - goto out; - } - - vp = ZTOV(zp); 
- mutex_enter(&vp->v_lock); - vp->v_flag |= VROOT; - mutex_exit(&vp->v_lock); - rootvp = vp; - - /* - * Leave rootvp held. The root file system is never unmounted. - */ - - vfs_add((struct vnode *)0, vfsp, - (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); -out: - vfs_unlock(vfsp); - return (error); - } else if (why == ROOT_REMOUNT) { - readonly_changed_cb(vfsp->vfs_data, B_FALSE); - vfsp->vfs_flag |= VFS_REMOUNT; - - /* refresh mount options */ - zfs_unregister_callbacks(vfsp->vfs_data); - return (zfs_register_callbacks(vfsp)); - - } else if (why == ROOT_UNMOUNT) { - zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); - (void) zfs_sync(vfsp, 0, 0); - return (0); - } - - /* - * if "why" is equal to anything else other than ROOT_INIT, - * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. - */ - return (SET_ERROR(ENOTSUP)); -} -#endif /* OPENSOLARIS_MOUNTROOT */ - -static int -getpoolname(const char *osname, char *poolname) -{ - char *p; - - p = strchr(osname, '/'); - if (p == NULL) { - if (strlen(osname) >= MAXNAMELEN) - return (ENAMETOOLONG); - (void) strcpy(poolname, osname); - } else { - if (p - osname >= MAXNAMELEN) - return (ENAMETOOLONG); - (void) strncpy(poolname, osname, p - osname); - poolname[p - osname] = '\0'; - } - return (0); -} - -static void -fetch_osname_options(char *name, bool *checkpointrewind) -{ - - if (name[0] == '!') { - *checkpointrewind = true; - memmove(name, name + 1, strlen(name)); - } else { - *checkpointrewind = false; - } -} - -/*ARGSUSED*/ -static int -zfs_mount(vfs_t *vfsp) -{ - kthread_t *td = curthread; - vnode_t *mvp = vfsp->mnt_vnodecovered; - cred_t *cr = td->td_ucred; - char *osname; - int error = 0; - int canwrite; - bool checkpointrewind; - -#ifdef illumos - if (mvp->v_type != VDIR) - return (SET_ERROR(ENOTDIR)); - - mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_REMOUNT) == 0 && - (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { - mutex_exit(&mvp->v_lock); - return 
(SET_ERROR(EBUSY)); - } - mutex_exit(&mvp->v_lock); - - /* - * ZFS does not support passing unparsed data in via MS_DATA. - * Users should use the MS_OPTIONSTR interface; this means - * that all option parsing is already done and the options struct - * can be interrogated. - */ - if ((uap->flags & MS_DATA) && uap->datalen > 0) - return (SET_ERROR(EINVAL)); - - /* - * Get the objset name (the "special" mount argument). - */ - if (error = pn_get(uap->spec, fromspace, &spn)) - return (error); - - osname = spn.pn_path; -#else /* !illumos */ - if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) - return (SET_ERROR(EINVAL)); - - /* - * If full-owner-access is enabled and delegated administration is - * turned on, we must set nosuid. - */ - if (zfs_super_owner && - dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { - secpolicy_fs_mount_clearopts(cr, vfsp); - } -#endif /* illumos */ - fetch_osname_options(osname, &checkpointrewind); - - /* - * Check for mount privilege? - * - * If we don't have privilege then see if - * we have local permission to allow it - */ - error = secpolicy_fs_mount(cr, mvp, vfsp); - if (error) { - if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) - goto out; - - if (!(vfsp->vfs_flag & MS_REMOUNT)) { - vattr_t vattr; - - /* - * Make sure user is the owner of the mount point - * or has sufficient privileges. - */ - - vattr.va_mask = AT_UID; - - vn_lock(mvp, LK_SHARED | LK_RETRY); - if (VOP_GETATTR(mvp, &vattr, cr)) { - VOP_UNLOCK(mvp); - goto out; - } - - if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && - VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { - VOP_UNLOCK(mvp); - goto out; - } - VOP_UNLOCK(mvp); - } - - secpolicy_fs_mount_clearopts(cr, vfsp); - } - - /* - * Refuse to mount a filesystem if we are in a local zone and the - * dataset is not visible. 
- */ - if (!INGLOBALZONE(curthread) && - (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = SET_ERROR(EPERM); - goto out; - } - -#ifdef SECLABEL - error = zfs_mount_label_policy(vfsp, osname); - if (error) - goto out; -#endif - - vfsp->vfs_flag |= MNT_NFS4ACLS; - - /* - * When doing a remount, we simply refresh our temporary properties - * according to those options set in the current VFS options. - */ - if (vfsp->vfs_flag & MS_REMOUNT) { - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - /* - * Refresh mount options with z_teardown_lock blocking I/O while - * the filesystem is in an inconsistent state. - * The lock also serializes this code with filesystem - * manipulations between entry to zfs_suspend_fs() and return - * from zfs_resume_fs(). - */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); - zfs_unregister_callbacks(zfsvfs); - error = zfs_register_callbacks(vfsp); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - goto out; - } - - /* Initial root mount: try hard to import the requested root pool. */ - if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && - (vfsp->vfs_flag & MNT_UPDATE) == 0) { - char pname[MAXNAMELEN]; - - error = getpoolname(osname, pname); - if (error == 0) - error = spa_import_rootpool(pname, checkpointrewind); - if (error) - goto out; - } - DROP_GIANT(); - error = zfs_domount(vfsp, osname); - PICKUP_GIANT(); - -#ifdef illumos - /* - * Add an extra VFS_HOLD on our parent vfs so that it can't - * disappear due to a forced unmount. - */ - if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) - VFS_HOLD(mvp->v_vfsp); -#endif - -out: - return (error); -} - -static int -zfs_statfs(vfs_t *vfsp, struct statfs *statp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - uint64_t refdbytes, availbytes, usedobjs, availobjs; - - statp->f_version = STATFS_VERSION; - - ZFS_ENTER(zfsvfs); - - dmu_objset_space(zfsvfs->z_os, - &refdbytes, &availbytes, &usedobjs, &availobjs); - - /* - * The underlying storage pool actually uses multiple block sizes. 
- * We report the fragsize as the smallest block size we support, - * and we report our blocksize as the filesystem's maximum blocksize. - */ - statp->f_bsize = SPA_MINBLOCKSIZE; - statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; - - /* - * The following report "total" blocks of various kinds in the - * file system, but reported in terms of f_frsize - the - * "fragment" size. - */ - - statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; - statp->f_bfree = availbytes / statp->f_bsize; - statp->f_bavail = statp->f_bfree; /* no root reservation */ - - /* - * statvfs() should really be called statufs(), because it assumes - * static metadata. ZFS doesn't preallocate files, so the best - * we can do is report the max that could possibly fit in f_files, - * and that minus the number actually used in f_ffree. - * For f_ffree, report the smaller of the number of object available - * and the number of blocks (each object will take at least a block). - */ - statp->f_ffree = MIN(availobjs, statp->f_bfree); - statp->f_files = statp->f_ffree + usedobjs; - - /* - * We're a zfs filesystem. - */ - (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); - - strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, - sizeof(statp->f_mntfromname)); - strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, - sizeof(statp->f_mntonname)); - - statp->f_namemax = MAXNAMELEN - 1; - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *rootzp; - int error; - - ZFS_ENTER(zfsvfs); - - error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); - if (error == 0) - *vpp = ZTOV(rootzp); - - ZFS_EXIT(zfsvfs); - - if (error == 0) { - error = vn_lock(*vpp, flags); - if (error != 0) { - VN_RELE(*vpp); - *vpp = NULL; - } - } - return (error); -} - -/* - * Teardown the zfsvfs::z_os. 
- * - * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' - * and 'z_teardown_inactive_lock' held. - */ -static int -zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) -{ - znode_t *zp; - - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); - - if (!unmounting) { - /* - * We purge the parent filesystem's vfsp as the parent - * filesystem and all of its snapshots have their vnode's - * v_vfsp set to the parent's filesystem's vfsp. Note, - * 'z_parent' is self referential for non-snapshots. - */ - (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); -#ifdef FREEBSD_NAMECACHE - cache_purgevfs(zfsvfs->z_parent->z_vfs, true); -#endif - } - - /* - * Close the zil. NB: Can't close the zil while zfs_inactive - * threads are blocked as zil_close can call zfs_inactive. - */ - if (zfsvfs->z_log) { - zil_close(zfsvfs->z_log); - zfsvfs->z_log = NULL; - } - - ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs); - - /* - * If we are not unmounting (ie: online recv) and someone already - * unmounted this file system while we were doing the switcheroo, - * or a reopen of z_os failed then just bail out now. - */ - if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { - ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - return (SET_ERROR(EIO)); - } - - /* - * At this point there are no vops active, and any new vops will - * fail with EIO since we have z_teardown_lock for writer (only - * relavent for forced unmount). - * - * Release all holds on dbufs. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; - zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_sa_hdl) { - ASSERT(ZTOV(zp)->v_count >= 0); - zfs_znode_dmu_fini(zp); - } - mutex_exit(&zfsvfs->z_znodes_lock); - - /* - * If we are unmounting, set the unmounted flag and let new vops - * unblock. zfs_inactive will have the unmounted behavior, and all - * other vops will fail with EIO. 
- */ - if (unmounting) { - zfsvfs->z_unmounted = B_TRUE; - ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - } - - /* - * z_os will be NULL if there was an error in attempting to reopen - * zfsvfs, so just return as the properties had already been - * unregistered and cached data had been evicted before. - */ - if (zfsvfs->z_os == NULL) - return (0); - - /* - * Unregister properties. - */ - zfs_unregister_callbacks(zfsvfs); - - /* - * Evict cached data - */ - if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && - !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - dmu_objset_evict_dbufs(zfsvfs->z_os); - - return (0); -} - -/*ARGSUSED*/ -static int -zfs_umount(vfs_t *vfsp, int fflag) -{ - kthread_t *td = curthread; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - objset_t *os; - cred_t *cr = td->td_ucred; - int ret; - - ret = secpolicy_fs_unmount(cr, vfsp); - if (ret) { - if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), - ZFS_DELEG_PERM_MOUNT, cr)) - return (ret); - } - - /* - * We purge the parent filesystem's vfsp as the parent filesystem - * and all of its snapshots have their vnode's v_vfsp set to the - * parent's filesystem's vfsp. Note, 'z_parent' is self - * referential for non-snapshots. - */ - (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); - - /* - * Unmount any snapshots mounted under .zfs before unmounting the - * dataset itself. - */ - if (zfsvfs->z_ctldir != NULL) { - if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) - return (ret); - } - - if (fflag & MS_FORCE) { - /* - * Mark file system as unmounted before calling - * vflush(FORCECLOSE). This way we ensure no future vnops - * will be called and risk operating on DOOMED vnodes. - */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); - zfsvfs->z_unmounted = B_TRUE; - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - } - - /* - * Flush all the files. - */ - ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? 
FORCECLOSE : 0, td); - if (ret != 0) - return (ret); - -#ifdef illumos - if (!(fflag & MS_FORCE)) { - /* - * Check the number of active vnodes in the file system. - * Our count is maintained in the vfs structure, but the - * number is off by 1 to indicate a hold on the vfs - * structure itself. - * - * The '.zfs' directory maintains a reference of its - * own, and any active references underneath are - * reflected in the vnode count. - */ - if (zfsvfs->z_ctldir == NULL) { - if (vfsp->vfs_count > 1) - return (SET_ERROR(EBUSY)); - } else { - if (vfsp->vfs_count > 2 || - zfsvfs->z_ctldir->v_count > 1) - return (SET_ERROR(EBUSY)); - } - } -#endif - - while (taskqueue_cancel(zfsvfs_taskq->tq_queue, - &zfsvfs->z_unlinked_drain_task, NULL) != 0) - taskqueue_drain(zfsvfs_taskq->tq_queue, - &zfsvfs->z_unlinked_drain_task); - - VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); - os = zfsvfs->z_os; - - /* - * z_os will be NULL if there was an error in - * attempting to reopen zfsvfs. - */ - if (os != NULL) { - /* - * Unset the objset user_ptr. - */ - mutex_enter(&os->os_user_ptr_lock); - dmu_objset_set_user(os, NULL); - mutex_exit(&os->os_user_ptr_lock); - - /* - * Finally release the objset - */ - dmu_objset_disown(os, zfsvfs); - } - - /* - * We can now safely destroy the '.zfs' directory node. - */ - if (zfsvfs->z_ctldir != NULL) - zfsctl_destroy(zfsvfs); - zfs_freevfs(vfsp); - - return (0); -} - -static int -zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - int err; - - /* - * zfs_zget() can't operate on virtual entries like .zfs/ or - * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. - * This will make NFS to switch to LOOKUP instead of using VGET. 
- */ - if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || - (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) - return (EOPNOTSUPP); - - ZFS_ENTER(zfsvfs); - err = zfs_zget(zfsvfs, ino, &zp); - if (err == 0 && zp->z_unlinked) { - vrele(ZTOV(zp)); - err = EINVAL; - } - if (err == 0) - *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); - if (err == 0) { - err = vn_lock(*vpp, flags); - if (err != 0) - vrele(*vpp); - } - if (err != 0) - *vpp = NULL; - return (err); -} - -static int -zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, - struct ucred **credanonp, int *numsecflavors, int *secflavors) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - /* - * If this is regular file system vfsp is the same as - * zfsvfs->z_parent->z_vfs, but if it is snapshot, - * zfsvfs->z_parent->z_vfs represents parent file system - * which we have to use here, because only this file system - * has mnt_export configured. - */ - return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, - credanonp, numsecflavors, secflavors)); -} - -CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); -CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); - -static int -zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) -{ - struct componentname cn; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - vnode_t *dvp; - uint64_t object = 0; - uint64_t fid_gen = 0; - uint64_t gen_mask; - uint64_t zp_gen; - int i, err; - - *vpp = NULL; - - ZFS_ENTER(zfsvfs); - - /* - * On FreeBSD we can get snapshot's mount point or its parent file - * system mount point depending if snapshot is already mounted or not. 
- */ - if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { - zfid_long_t *zlfid = (zfid_long_t *)fidp; - uint64_t objsetid = 0; - uint64_t setgen = 0; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); - - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); - - ZFS_EXIT(zfsvfs); - - err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); - if (err) - return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - } - - if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { - zfid_short_t *zfid = (zfid_short_t *)fidp; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); - - for (i = 0; i < sizeof (zfid->zf_gen); i++) - fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); - } else { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * A zero fid_gen means we are in .zfs or the .zfs/snapshot - * directory tree. If the object == zfsvfs->z_shares_dir, then - * we are in the .zfs/shares directory tree. - */ - if ((fid_gen == 0 && - (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || - (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { - ZFS_EXIT(zfsvfs); - VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); - if (object == ZFSCTL_INO_SNAPDIR) { - cn.cn_nameptr = "snapshot"; - cn.cn_namelen = strlen(cn.cn_nameptr); - cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN | LOCKLEAF; - cn.cn_lkflags = flags; - VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); - vput(dvp); - } else if (object == zfsvfs->z_shares_dir) { - /* - * XXX This branch must not be taken, - * if it is, then the lookup below will - * explode. 
- */ - cn.cn_nameptr = "shares"; - cn.cn_namelen = strlen(cn.cn_nameptr); - cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN; - cn.cn_lkflags = flags; - VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); - vput(dvp); - } else { - *vpp = dvp; - } - return (err); - } - - gen_mask = -1ULL >> (64 - 8 * i); - - dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); - if (err = zfs_zget(zfsvfs, object, &zp)) { - ZFS_EXIT(zfsvfs); - return (err); - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, - sizeof (uint64_t)); - zp_gen = zp_gen & gen_mask; - if (zp_gen == 0) - zp_gen = 1; - if (zp->z_unlinked || zp_gen != fid_gen) { - dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); - vrele(ZTOV(zp)); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); - err = vn_lock(*vpp, flags); - if (err == 0) - vnode_create_vobject(*vpp, zp->z_size, curthread); - else - *vpp = NULL; - return (err); -} - -/* - * Block out VOPs and close zfsvfs_t::z_os - * - * Note, if successful, then we return with the 'z_teardown_lock' and - * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying - * dataset and objset intact so that they can be atomically handed off during - * a subsequent rollback or recv operation and the resume thereafter. - */ -int -zfs_suspend_fs(zfsvfs_t *zfsvfs) -{ - int error; - - if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) - return (error); - - return (0); -} - -/* - * Rebuild SA and release VOPs. Note that ownership of the underlying dataset - * is an invariant across any of the operations that can be performed while the - * filesystem was suspended. Whether it succeeded or failed, the preconditions - * are the same: the relevant objset and associated dataset are owned by - * zfsvfs, held, and long held on entry. 
- */ -int -zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) -{ - int err; - znode_t *zp; - - ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); - ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs)); - - /* - * We already own this, so just update the objset_t, as the one we - * had before may have been evicted. - */ - objset_t *os; - VERIFY3P(ds->ds_owner, ==, zfsvfs); - VERIFY(dsl_dataset_long_held(ds)); - VERIFY0(dmu_objset_from_ds(ds, &os)); - - err = zfsvfs_init(zfsvfs, os); - if (err != 0) - goto bail; - - VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); - - zfs_set_fuid_feature(zfsvfs); - - /* - * Attempt to re-establish all the active znodes with - * their dbufs. If a zfs_rezget() fails, then we'll let - * any potential callers discover that via ZFS_ENTER_VERIFY_VP - * when they try to use their znode. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; - zp = list_next(&zfsvfs->z_all_znodes, zp)) { - (void) zfs_rezget(zp); - } - mutex_exit(&zfsvfs->z_znodes_lock); - -bail: - /* release the VOPs */ - ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - - if (err) { - /* - * Since we couldn't setup the sa framework, try to force - * unmount this file system. - */ - if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { - vfs_ref(zfsvfs->z_vfs); - (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); - } - } - return (err); -} - -static void -zfs_freevfs(vfs_t *vfsp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - -#ifdef illumos - /* - * If this is a snapshot, we have an extra VFS_HOLD on our parent - * from zfs_mount(). Release it here. If we came through - * zfs_mountroot() instead, we didn't grab an extra hold, so - * skip the VFS_RELE for rootvfs. 
- */ - if (zfsvfs->z_issnap && (vfsp != rootvfs)) - VFS_RELE(zfsvfs->z_parent->z_vfs); -#endif - - zfsvfs_free(zfsvfs); - - atomic_dec_32(&zfs_active_fs_count); -} - -#ifdef __i386__ -static int desiredvnodes_backup; -#endif - -static void -zfs_vnodes_adjust(void) -{ -#ifdef __i386__ - int newdesiredvnodes; - - desiredvnodes_backup = desiredvnodes; - - /* - * We calculate newdesiredvnodes the same way it is done in - * vntblinit(). If it is equal to desiredvnodes, it means that - * it wasn't tuned by the administrator and we can tune it down. - */ - newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * - vm_kmem_size / (5 * (sizeof(struct vm_object) + - sizeof(struct vnode)))); - if (newdesiredvnodes == desiredvnodes) - desiredvnodes = (3 * newdesiredvnodes) / 4; -#endif -} - -static void -zfs_vnodes_adjust_back(void) -{ - -#ifdef __i386__ - desiredvnodes = desiredvnodes_backup; -#endif -} - -void -zfs_init(void) -{ - - printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); - - /* - * Initialize .zfs directory structures - */ - zfsctl_init(); - - /* - * Initialize znode cache, vnode ops, etc... - */ - zfs_znode_init(); - - /* - * Reduce number of vnodes. Originally number of vnodes is calculated - * with UFS inode in mind. We reduce it here, because it's too big for - * ZFS/i386. 
- */ - zfs_vnodes_adjust(); - - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); -#if defined(__FreeBSD__) - zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); -#endif -} - -void -zfs_fini(void) -{ -#if defined(__FreeBSD__) - taskq_destroy(zfsvfs_taskq); -#endif - zfsctl_fini(); - zfs_znode_fini(); - zfs_vnodes_adjust_back(); -} - -int -zfs_busy(void) -{ - return (zfs_active_fs_count != 0); -} - -int -zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) -{ - int error; - objset_t *os = zfsvfs->z_os; - dmu_tx_t *tx; - - if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) - return (SET_ERROR(EINVAL)); - - if (newvers < zfsvfs->z_version) - return (SET_ERROR(EINVAL)); - - if (zfs_spa_version_map(newvers) > - spa_version(dmu_objset_spa(zfsvfs->z_os))) - return (SET_ERROR(ENOTSUP)); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); - if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, - ZFS_SA_ATTRS); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - } - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &newvers, tx); - - if (error) { - dmu_tx_commit(tx); - return (error); - } - - if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { - uint64_t sa_obj; - - ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, - SPA_VERSION_SA); - sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, - DMU_OT_NONE, 0, tx); - - error = zap_add(os, MASTER_NODE_OBJ, - ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT0(error); - - VERIFY(0 == sa_set_sa_object(os, sa_obj)); - sa_register_update_callback(os, zfs_sa_upgrade); - } - - spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, - "from %llu to %llu", zfsvfs->z_version, newvers); - - dmu_tx_commit(tx); - - zfsvfs->z_version = newvers; - os->os_version = newvers; - - zfs_set_fuid_feature(zfsvfs); - - 
return (0); -} - -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) { - pname = ZPL_VERSION_STR; - } else { - pname = zfs_prop_to_name(prop); - } - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - -/* - * Return true if the coresponding vfs's unmounted flag is set. - * Otherwise return false. - * If this function returns true we know VFS unmount has been initiated. 
- */ -boolean_t -zfs_get_vfs_flag_unmounted(objset_t *os) -{ - zfsvfs_t *zfvp; - boolean_t unmounted = B_FALSE; - - ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); - - mutex_enter(&os->os_user_ptr_lock); - zfvp = dmu_objset_get_user(os); - if (zfvp != NULL && zfvp->z_vfs != NULL && - (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) - unmounted = B_TRUE; - mutex_exit(&os->os_user_ptr_lock); - - return (unmounted); -} - -#ifdef _KERNEL -void -zfsvfs_update_fromname(const char *oldname, const char *newname) -{ - char tmpbuf[MAXPATHLEN]; - struct mount *mp; - char *fromname; - size_t oldlen; - - oldlen = strlen(oldname); - - mtx_lock(&mountlist_mtx); - TAILQ_FOREACH(mp, &mountlist, mnt_list) { - fromname = mp->mnt_stat.f_mntfromname; - if (strcmp(fromname, oldname) == 0) { - (void)strlcpy(fromname, newname, - sizeof(mp->mnt_stat.f_mntfromname)); - continue; - } - if (strncmp(fromname, oldname, oldlen) == 0 && - (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { - (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", - newname, fromname + oldlen); - (void)strlcpy(fromname, tmpbuf, - sizeof(mp->mnt_stat.f_mntfromname)); - continue; - } - } - mtx_unlock(&mountlist_mtx); -} -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c deleted file mode 100644 index 9ac9503d2f77..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ /dev/null @@ -1,6124 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - */ - -/* Portions Copyright 2007 Jeremy Teo */ -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -VFS_SMR_DECLARE; - -/* - * Programming rules. - * - * Each vnode op performs some logical unit of work. To do this, the ZPL must - * properly lock its in-core state, create a DMU transaction, do the work, - * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait for the intent log to commit if it is a synchronous operation. - * Moreover, the vnode ops must work in both normal and log replay context. - * The ordering of events is important to avoid deadlocks and references - * to freed memory. The example below illustrates the following Big Rules: - * - * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. 
Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros - * can return EIO from the calling function. - * - * (2) VN_RELE() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). - * - * (3) All range locks must be grabbed before calling dmu_tx_assign(), - * as they can span dmu_tx_assign() calls. - * - * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to - * dmu_tx_assign(). This is critical because we don't want to block - * while holding locks. - * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This - * reduces lock contention and CPU usage when we must wait (note that if - * throughput is constrained by the storage, nearly every transaction - * must wait). - * - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing - * to use a non-blocking assign can deadlock the system. The scenario: - * - * Thread A has grabbed a lock before calling dmu_tx_assign(). - * Thread B is in an already-assigned tx, and blocks for this lock. - * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() - * forever, because the previous txg can't quiesce until B's tx commits. - * - * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. 
On subsequent - * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, - * to indicate that this operation has already called dmu_tx_wait(). - * This will ensure that we don't retry forever, waiting a short bit - * each time. - * - * (5) If the operation succeeded, generate the intent log entry for it - * before dropping locks. This ensures that the ordering of events - * in the intent log matches the order in which they actually occurred. - * During ZIL replay the zfs_log_* functions will update the sequence - * number to indicate the zil transaction has replayed. - * - * (6) At the end of each vnode op, the DMU tx must always commit, - * regardless of whether there were any errors. - * - * (7) After dropping all locks, invoke zil_commit(zilog, foid) - * to ensure that synchronous semantics are provided when necessary. - * - * In general, this is how things should be ordered in each vnode op: - * - * ZFS_ENTER(zfsvfs); // exit if unmounted - * top: - * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) - * rw_enter(...); // grab any other locks you need - * tx = dmu_tx_create(...); // get DMU tx - * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - * if (error) { - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * if (error == ERESTART) { - * waited = B_TRUE; - * dmu_tx_wait(tx); - * dmu_tx_abort(tx); - * goto top; - * } - * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // really out of space - * } - * error = do_real_work(); // do whatever this VOP does - * if (error == 0) - * zfs_log_*(...); // on success, make ZIL entry - * dmu_tx_commit(tx); // commit DMU tx -- error or not - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * zil_commit(zilog, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // done, report error - */ - -/* ARGSUSED */ -static int -zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(*vpp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && - ((flag & FAPPEND) == 0)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && - ZTOV(zp)->v_type == VREG && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { - if (fs_vscan(*vpp, cr, 0) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - } - - /* Keep a count of the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) - atomic_inc_32(&zp->z_sync_cnt); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* ARGSUSED */ -static int -zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - /* - * Clean up any locks held by this process on the vp. 
- */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* Decrement the synchronous opens in the znode */ - if ((flag & (FSYNC | FDSYNC)) && (count == 1)) - atomic_dec_32(&zp->z_sync_cnt); - - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && - ZTOV(zp)->v_type == VREG && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) - VERIFY(fs_vscan(vp, cr, 1) == 0); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and - * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. - */ -static int -zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) -{ - znode_t *zp = VTOZ(vp); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_size; - if (noff >= file_sz) { - return (SET_ERROR(ENXIO)); - } - - if (cmd == _FIO_SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); - - if (error == ESRCH) - return (SET_ERROR(ENXIO)); - - /* - * We could find a hole that begins after the logical end-of-file, - * because dmu_offset_next() only works on whole blocks. If the - * EOF falls mid-block, then indicate that the "virtual hole" - * at the end of the file begins at the logical EOF, rather than - * at the end of the last block. - */ - if (noff > file_sz) { - ASSERT(hole); - noff = file_sz; - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - -/* ARGSUSED */ -static int -zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, - int *rvalp, caller_context_t *ct) -{ - offset_t off; - offset_t ndata; - dmu_object_info_t doi; - int error; - zfsvfs_t *zfsvfs; - znode_t *zp; - - switch (com) { - case _FIOFFS: - { - return (0); - - /* - * The following two ioctls are used by bfu. Faking out, - * necessary to avoid bfu errors. 
- */ - } - case _FIOGDIO: - case _FIOSDIO: - { - return (0); - } - - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: - { -#ifdef illumos - if (ddi_copyin((void *)data, &off, sizeof (off), flag)) - return (SET_ERROR(EFAULT)); -#else - off = *(offset_t *)data; -#endif - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* offset parameter is in/out */ - error = zfs_holey(vp, com, &off); - ZFS_EXIT(zfsvfs); - if (error) - return (error); -#ifdef illumos - if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) - return (SET_ERROR(EFAULT)); -#else - *(offset_t *)data = off; -#endif - return (0); - } -#ifdef illumos - case _FIO_COUNT_FILLED: - { - /* - * _FIO_COUNT_FILLED adds a new ioctl command which - * exposes the number of filled blocks in a - * ZFS object. - */ - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * Wait for all dirty blocks for this object - * to get synced out to disk, and the DMU info - * updated. - */ - error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Retrieve fill count from DMU object. - */ - error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - ndata = doi.doi_fill_count; - - ZFS_EXIT(zfsvfs); - if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) - return (SET_ERROR(EFAULT)); - return (0); - } -#endif - } - return (SET_ERROR(ENOTTY)); -} - -static vm_page_t -page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) -{ - vm_object_t obj; - vm_page_t pp; - int64_t end; - - /* - * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE - * aligned boundaries, if the range is not aligned. As a result a - * DEV_BSIZE subrange with partially dirty data may get marked as clean. 
- * It may happen that all DEV_BSIZE subranges are marked clean and thus - * the whole page would be considred clean despite have some dirty data. - * For this reason we should shrink the range to DEV_BSIZE aligned - * boundaries before calling vm_page_clear_dirty. - */ - end = rounddown2(off + nbytes, DEV_BSIZE); - off = roundup2(off, DEV_BSIZE); - nbytes = end - off; - - obj = vp->v_object; - - vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), - VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | - VM_ALLOC_IGN_SBUSY); - if (pp != NULL) { - ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); - vm_object_pip_add(obj, 1); - pmap_remove_write(pp); - if (nbytes != 0) - vm_page_clear_dirty(pp, off, nbytes); - } - return (pp); -} - -static void -page_unbusy(vm_page_t pp) -{ - - vm_page_sunbusy(pp); - vm_object_pip_wakeup(pp->object); -} - -static vm_page_t -page_wire(vnode_t *vp, int64_t start) -{ - vm_object_t obj; - vm_page_t m; - - obj = vp->v_object; - vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), - VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | - VM_ALLOC_NOBUSY); - return (m); -} - -static void -page_unwire(vm_page_t pp) -{ - - vm_page_unwire(pp, PQ_ACTIVE); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. 
- */ -static void -update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, - int segflg, dmu_tx_t *tx) -{ - vm_object_t obj; - struct sf_buf *sf; - caddr_t va; - int off; - - ASSERT(segflg != UIO_NOCOPY); - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - - off = start & PAGEOFFSET; - vm_object_pip_add(obj, 1); - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - vm_page_t pp; - int nbytes = imin(PAGESIZE - off, len); - - if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { - va = zfs_map_page(pp, &sf); - (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH);; - zfs_unmap_page(sf); - page_unbusy(pp); - } - len -= nbytes; - off = 0; - } - vm_object_pip_wakeup(obj); -} - -/* - * Read with UIO_NOCOPY flag means that sendfile(2) requests - * ZFS to populate a range of page cache pages with data. - * - * NOTE: this function could be optimized to pre-allocate - * all pages in advance, drain exclusive busy on all of them, - * map them into contiguous KVA region and populate them - * in one single dmu_read() call. 
- */ -static int -mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) -{ - znode_t *zp = VTOZ(vp); - objset_t *os = zp->z_zfsvfs->z_os; - struct sf_buf *sf; - vm_object_t obj; - vm_page_t pp; - int64_t start; - caddr_t va; - int len = nbytes; - int off; - int error = 0; - - ASSERT(uio->uio_segflg == UIO_NOCOPY); - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); - - for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { - int bytes = MIN(PAGESIZE, len); - - pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), - VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); - if (vm_page_none_valid(pp)) { - va = zfs_map_page(pp, &sf); - error = dmu_read(os, zp->z_id, start, bytes, va, - DMU_READ_PREFETCH); - if (bytes != PAGESIZE && error == 0) - bzero(va + bytes, PAGESIZE - bytes); - zfs_unmap_page(sf); - if (error == 0) { - vm_page_valid(pp); - vm_page_activate(pp); - vm_page_sunbusy(pp); - } else { - zfs_vmobject_wlock(obj); - if (!vm_page_wired(pp) && pp->valid == 0 && - vm_page_busy_tryupgrade(pp)) - vm_page_free(pp); - else - vm_page_sunbusy(pp); - zfs_vmobject_wunlock(obj); - } - } else { - ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); - vm_page_sunbusy(pp); - } - if (error) - break; - uio->uio_resid -= bytes; - uio->uio_offset += bytes; - len -= bytes; - } - return (error); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. 
- */ -static int -mappedread(vnode_t *vp, int nbytes, uio_t *uio) -{ - znode_t *zp = VTOZ(vp); - vm_object_t obj; - int64_t start; - caddr_t va; - int len = nbytes; - int off; - int error = 0; - - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - - start = uio->uio_loffset; - off = start & PAGEOFFSET; - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - vm_page_t pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - - if (pp = page_wire(vp, start)) { - struct sf_buf *sf; - caddr_t va; - - va = zfs_map_page(pp, &sf); -#ifdef illumos - error = uiomove(va + off, bytes, UIO_READ, uio); -#else - error = vn_io_fault_uiomove(va + off, bytes, uio); -#endif - zfs_unmap_page(sf); - page_unwire(pp); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, bytes); - } - len -= bytes; - off = 0; - if (error) - break; - } - return (error); -} - -offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - -/* - * Read bytes from specified file into supplied buffer. - * - * IN: vp - vnode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - SYNC flags; used to provide FRSYNC semantics. - * cr - credentials of caller. - * ct - caller context - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 on success, error code on failure. - * - * Side Effects: - * vp - atime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ssize_t n, nbytes; - int error = 0; - xuio_t *xuio = NULL; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* We don't copy out anything useful for directories. 
*/ - if (vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * Check for mandatory locks - */ - if (MANDMODE(zp->z_mode)) { - if (error = chklock(vp, FREAD, - uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - */ - if (zfsvfs->z_log && - (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); - - /* - * Lock the range against changes. - */ - locked_range_t *lr = rangelock_enter(&zp->z_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. - */ - if (uio->uio_loffset >= zp->z_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_size); - n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); - -#ifdef illumos - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { - int nblk; - int blksz = zp->z_blksz; - uint64_t offset = uio->uio_loffset; - - xuio = (xuio_t *)uio; - if ((ISP2(blksz))) { - nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, - blksz)) / blksz; - } else { - ASSERT(offset + n <= blksz); - nblk = 1; - } - (void) dmu_xuio_init(xuio, nblk); - - if (vn_has_cached_data(vp)) { - /* - * For simplicity, we always allocate a full buffer - * even if we only expect to read a portion of a block. 
- */ - while (--nblk >= 0) { - (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz), 0, blksz); - } - } - } -#endif /* illumos */ - - while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - -#ifdef __FreeBSD__ - if (uio->uio_segflg == UIO_NOCOPY) - error = mappedread_sf(vp, nbytes, uio); - else -#endif /* __FreeBSD__ */ - if (vn_has_cached_data(vp)) { - error = mappedread(vp, nbytes, uio); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes); - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - - n -= nbytes; - } -out: - rangelock_exit(lr); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Write the bytes to a file. - * - * IN: vp - vnode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is - * set if in append mode. - * cr - credentials of caller. - * ct - caller context (NFS/CIFS fem monitor only) - * - * OUT: uio - updated offset and range. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * vp - ctime|mtime updated if byte count > 0 - */ - -/* ARGSUSED */ -static int -zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - rlim64_t limit = MAXOFFSET_T; - ssize_t start_resid = uio->uio_resid; - ssize_t tx_bytes; - uint64_t end_size; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - offset_t woff; - ssize_t n, nbytes; - int max_blksz = zfsvfs->z_max_blksz; - int error = 0; - arc_buf_t *abuf; - iovec_t *aiov = NULL; - xuio_t *xuio = NULL; - int i_iov = 0; - int iovcnt = uio->uio_iovcnt; - iovec_t *iovp = uio->uio_iov; - int write_eof; - int count = 0; - sa_bulk_attr_t bulk[4]; - uint64_t mtime[2], ctime[2]; - - /* - * Fasttrack empty write - */ - n = start_resid; - if (n == 0) - return (0); - - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* - * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our - * callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * If immutable or not appending then return EPERM. - * Intentionally allow ZFS_READONLY through here. - * See zfs_zaccess_common() - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_size))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - zilog = zfsvfs->z_log; - - /* - * Validate file offset - */ - woff = ioflag & FAPPEND ? 
zp->z_size : uio->uio_loffset; - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Check for mandatory locks before calling rangelock_enter() - * in order to prevent a deadlock with locks set via fcntl(). - */ - if (MANDMODE((mode_t)zp->z_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - -#ifdef illumos - /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); -#endif - - /* - * If in append mode, set the io offset pointer to eof. - */ - locked_range_t *lr; - if (ioflag & FAPPEND) { - /* - * Obtain an appending range lock to guarantee file append - * semantics. We reset the write offset once we have the lock. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); - woff = lr->lr_offset; - if (lr->lr_length == UINT64_MAX) { - /* - * We overlocked the file because this write will cause - * the file block size to increase. - * Note that zp_size cannot change with this lock held. - */ - woff = zp->z_size; - } - uio->uio_loffset = woff; - } else { - /* - * Note that if the file block size will change as a result of - * this write, then this range lock will lock the entire file - * so that we can re-write the block safely. - */ - lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); - } - - if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (EFBIG); - } - - if (woff >= limit) { - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFBIG)); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* Will this write extend the file length? 
*/ - write_eof = (woff + n > zp->z_size); - - end_size = MAX(zp->z_size, woff + n); - - /* - * Write the file in reasonable size chunks. Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. - */ - while (n > 0) { - abuf = NULL; - woff = uio->uio_loffset; - if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || - zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { - if (abuf != NULL) - dmu_return_arcbuf(abuf); - error = SET_ERROR(EDQUOT); - break; - } - - if (xuio && abuf == NULL) { - ASSERT(i_iov < iovcnt); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - DTRACE_PROBE3(zfs_cp_write, int, i_iov, - iovec_t *, aiov, arc_buf_t *, abuf); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; - } else if (abuf == NULL && n >= max_blksz && - woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { - /* - * This write covers a full block. "Borrow" a buffer - * from the dmu so that we can fill it before we enter - * a transaction. This avoids the possibility of - * holding up the transaction if the data copy hangs - * up on a pagefault (e.g., from an NFS server mapping). - */ - size_t cbytes; - - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); - ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if (error = uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes)) { - dmu_return_arcbuf(abuf); - break; - } - ASSERT(cbytes == max_blksz); - } - - /* - * Start a transaction. 
- */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (abuf != NULL) - dmu_return_arcbuf(abuf); - break; - } - - /* - * If rangelock_enter() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since rangelock_reduce() will - * shrink down lr_length to the appropriate size. - */ - if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - rangelock_reduce(lr, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - - if (woff + nbytes > zp->z_size) - vnode_pager_setsize(vp, woff + nbytes); - - if (abuf == NULL) { - tx_bytes = uio->uio_resid; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); - tx_bytes -= uio->uio_resid; - } else { - tx_bytes = nbytes; - ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); - /* - * If this is not a full block write, but we are - * extending the file past EOF and this data starts - * block-aligned, use assign_arcbuf(). Otherwise, - * write via dmu_write(). 
- */ - if (tx_bytes < max_blksz && (!write_eof || - aiov->iov_base != abuf->b_data)) { - ASSERT(xuio); - dmu_write(zfsvfs->z_os, zp->z_id, woff, - aiov->iov_len, aiov->iov_base, tx); - dmu_return_arcbuf(abuf); - xuio_stat_wbuf_copied(); - } else { - ASSERT(xuio || tx_bytes == max_blksz); - dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), - woff, abuf, tx); - } - ASSERT(tx_bytes <= uio->uio_resid); - uioskip(uio, tx_bytes); - } - if (tx_bytes && vn_has_cached_data(vp)) { - update_pages(vp, woff, tx_bytes, zfsvfs->z_os, - zp->z_id, uio->uio_segflg, tx); - } - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. - */ - if (tx_bytes == 0) { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the excute bits is set. - * - * It would be nice to to this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. - */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(vp, cr, - (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); - - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, - B_TRUE); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. 
- */ - while ((end_size = zp->z_size) < uio->uio_loffset) { - (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); -#ifdef illumos - ASSERT(error == 0); -#else - ASSERT(error == 0 || error == EFAULT); -#endif - } - /* - * If we are replaying and eof is non zero then force - * the file size to the specified eof. Note, there's no - * concurrency during replay. - */ - if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) - zp->z_size = zfsvfs->z_replay_eof; - - if (error == 0) - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - else - (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - -#ifdef illumos - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); -#endif - } - - rangelock_exit(lr); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. - */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - -#ifdef __FreeBSD__ - /* - * EFAULT means that at least one page of the source buffer was not - * available. VFS will re-try remaining I/O upon this error. - */ - if (error == EFAULT) { - ZFS_EXIT(zfsvfs); - return (error); - } -#endif - - if (ioflag & (FSYNC | FDSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_id); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* ARGSUSED */ -void -zfs_get_done(zgd_t *zgd, int error) -{ - znode_t *zp = zgd->zgd_private; - objset_t *os = zp->z_zfsvfs->z_os; - - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - rangelock_exit(zgd->zgd_lr); - - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. 
- */ - VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); - - kmem_free(zgd, sizeof (zgd_t)); -} - -#ifdef DEBUG -static int zil_fault_io = 0; -#endif - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - dmu_buf_t *db; - zgd_t *zgd; - int error = 0; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_vnrele_taskq(dmu_objset_pool(os))); - return (SET_ERROR(ENOENT)); - } - - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zp; - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_size) { - error = SET_ERROR(ENOENT); - } else { - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - } - ASSERT(error == 0 || error == ENOENT); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's - * written out and its checksum is being calculated - * that no one can change the data. 
We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - rangelock_exit(zgd->zgd_lr); - } - /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_size) - error = SET_ERROR(ENOENT); -#ifdef DEBUG - if (zil_fault_io) { - error = SET_ERROR(EIO); - zil_fault_io = 0; - } -#endif - if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP. 
- */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } - } - } - - zfs_get_done(zgd, error); - - return (error); -} - -/*ARGSUSED*/ -static int -zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -static int -zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) -{ - int error; - - *vpp = arg; - error = vn_lock(*vpp, lkflags); - if (error != 0) - vrele(*vpp); - return (error); -} - -static int -zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) -{ - znode_t *zdp = VTOZ(dvp); - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; - int ltype; - - ASSERT_VOP_LOCKED(dvp, __func__); -#ifdef DIAGNOSTIC - if ((zdp->z_pflags & ZFS_XATTR) == 0) - VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); -#endif - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - ASSERT3P(dvp, ==, vp); - vref(dvp); - ltype = lkflags & LK_TYPE_MASK; - if (ltype != VOP_ISLOCKED(dvp)) { - if (ltype == LK_EXCLUSIVE) - vn_lock(dvp, LK_UPGRADE | LK_RETRY); - else /* if (ltype == LK_SHARED) */ - vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); - - /* - * Relock for the "." case could leave us with - * reclaimed vnode. - */ - if (VN_IS_DOOMED(dvp)) { - vrele(dvp); - return (SET_ERROR(ENOENT)); - } - } - return (0); - } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - /* - * Note that in this case, dvp is the child vnode, and we - * are looking up the parent vnode - exactly reverse from - * normal operation. Unlocking dvp requires some rather - * tricky unlock/relock dance to prevent mp from being freed; - * use vn_vget_ino_gen() which takes care of all that. 
- * - * XXX Note that there is a time window when both vnodes are - * unlocked. It is possible, although highly unlikely, that - * during that window the parent-child relationship between - * the vnodes may change, for example, get reversed. - * In that case we would have a wrong lock order for the vnodes. - * All other filesystems seem to ignore this problem, so we - * do the same here. - * A potential solution could be implemented as follows: - * - using LK_NOWAIT when locking the second vnode and retrying - * if necessary - * - checking that the parent-child relationship still holds - * after locking both vnodes and retrying if it doesn't - */ - error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); - return (error); - } else { - error = vn_lock(vp, lkflags); - if (error != 0) - vrele(vp); - return (error); - } -} - -/* - * Lookup an entry in a directory, or an extended attribute directory. - * If it exists, return a held vnode reference for it. - * - * IN: dvp - vnode of directory to search. - * nm - name of entry to lookup. - * pnp - full pathname to lookup [UNUSED]. - * flags - LOOKUP_XATTR set if looking for an attribute. - * rdir - root directory vnode [UNUSED]. - * cr - credentials of caller. - * ct - caller context - * - * OUT: vpp - vnode of located entry, NULL if not found. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * NA - */ -/* ARGSUSED */ -static int -zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, - int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached) -{ - znode_t *zdp = VTOZ(dvp); - znode_t *zp; - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error = 0; - - /* - * Fast path lookup, however we must skip DNLC lookup - * for case folding or normalizing lookups because the - * DNLC code only stores the passed in name. 
This means - * creating 'a' and removing 'A' on a case insensitive - * file system would work, but DNLC still thinks 'a' - * exists and won't let you create it again on the next - * pass through fast path. - */ - if (!(flags & LOOKUP_XATTR)) { - if (dvp->v_type != VDIR) { - return (SET_ERROR(ENOTDIR)); - } else if (zdp->z_sa_hdl == NULL) { - return (SET_ERROR(EIO)); - } - } - - DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); - - *vpp = NULL; - - if (flags & LOOKUP_XATTR) { -#ifdef TODO - /* - * If the xattr property is off, refuse the lookup request. - */ - if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } -#endif - - /* - * We don't allow recursive attributes.. - * Maybe someday we will. - */ - if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Do we have permission to get into attribute directory? - */ - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, - B_FALSE, cr)) { - vrele(*vpp); - *vpp = NULL; - } - - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Check accessibility of directory. - */ - if (!cached) { - if ((cnp->cn_flags & NOEXECCHECK) != 0) { - cnp->cn_flags &= ~NOEXECCHECK; - } else { - error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - } - - if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - - /* - * First handle the special cases. - */ - if ((cnp->cn_flags & ISDOTDOT) != 0) { - /* - * If we are a snapshot mounted under .zfs, return - * the vp for the snapshot directory. 
- */ - if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { - struct componentname cn; - vnode_t *zfsctl_vp; - int ltype; - - ZFS_EXIT(zfsvfs); - ltype = VOP_ISLOCKED(dvp); - VOP_UNLOCK(dvp); - error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, - &zfsctl_vp); - if (error == 0) { - cn.cn_nameptr = "snapshot"; - cn.cn_namelen = strlen(cn.cn_nameptr); - cn.cn_nameiop = cnp->cn_nameiop; - cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; - cn.cn_lkflags = cnp->cn_lkflags; - error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); - vput(zfsctl_vp); - } - vn_lock(dvp, ltype | LK_RETRY); - return (error); - } - } - if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { - ZFS_EXIT(zfsvfs); - if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) - return (SET_ERROR(ENOTSUP)); - error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); - return (error); - } - - /* - * The loop is retry the lookup if the parent-child relationship - * changes during the dot-dot locking complexities. - */ - for (;;) { - uint64_t parent; - - error = zfs_dirlook(zdp, nm, &zp); - if (error == 0) - *vpp = ZTOV(zp); - - ZFS_EXIT(zfsvfs); - if (error != 0) - break; - - error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); - if (error != 0) { - /* - * If we've got a locking error, then the vnode - * got reclaimed because of a force unmount. - * We never enter doomed vnodes into the name cache. - */ - *vpp = NULL; - return (error); - } - - if ((cnp->cn_flags & ISDOTDOT) == 0) - break; - - ZFS_ENTER(zfsvfs); - if (zdp->z_sa_hdl == NULL) { - error = SET_ERROR(EIO); - } else { - error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent)); - } - if (error != 0) { - ZFS_EXIT(zfsvfs); - vput(ZTOV(zp)); - break; - } - if (zp->z_id == parent) { - ZFS_EXIT(zfsvfs); - break; - } - vput(ZTOV(zp)); - } - -out: - if (error != 0) - *vpp = NULL; - - /* Translate errors and add SAVENAME when needed. 
*/ - if (cnp->cn_flags & ISLASTCN) { - switch (nameiop) { - case CREATE: - case RENAME: - if (error == ENOENT) { - error = EJUSTRETURN; - cnp->cn_flags |= SAVENAME; - break; - } - /* FALLTHROUGH */ - case DELETE: - if (error == 0) - cnp->cn_flags |= SAVENAME; - break; - } - } - - /* Insert name into cache (as non-existent) if appropriate. */ - if (zfsvfs->z_use_namecache && - error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) - cache_enter(dvp, NULL, cnp); - - /* Insert name into cache if appropriate. */ - if (zfsvfs->z_use_namecache && - error == 0 && (cnp->cn_flags & MAKEENTRY)) { - if (!(cnp->cn_flags & ISLASTCN) || - (nameiop != DELETE && nameiop != RENAME)) { - cache_enter(dvp, *vpp, cnp); - } - } - - return (error); -} - -/* - * Attempt to create a new entry in a directory. If the entry - * already exists, truncate the file if permissible, else return - * an error. Return the vp of the created or trunc'd file. - * - * IN: dvp - vnode of directory to put new file entry in. - * name - name of new file entry. - * vap - attributes of new file. - * excl - flag indicating exclusive or non-exclusive mode. - * mode - mode to open file with. - * cr - credentials of caller. - * flag - large file flag [UNUSED]. - * ct - caller context - * vsecp - ACL to be set - * - * OUT: vpp - vnode of created or trunc'd entry. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dvp - ctime|mtime updated if new entry created - * vp - ctime|mtime always, atime if new - */ - -/* ARGSUSED */ -static int -zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, - vnode_t **vpp, cred_t *cr, kthread_t *td) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - objset_t *os; - dmu_tx_t *tx; - int error; - ksid_t *ksid; - uid_t uid; - gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - void *vsecp = NULL; - int flag = 0; - uint64_t txtype; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - ksid = crgetsid(cr, KSID_OWNER); - if (ksid) - uid = ksid_getid(ksid); - else - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || - IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (vap->va_mask & AT_XVATTR) { - if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, - crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - *vpp = NULL; - - if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) - vap->va_mode &= ~S_ISVTX; - - error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - ASSERT3P(zp, ==, NULL); - - /* - * Create a new file object and update the directory - * to reference it. - */ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. 
- */ - - if ((dzp->z_pflags & ZFS_XATTR) && - (vap->va_type != VREG)) { - error = SET_ERROR(EINVAL); - goto out; - } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - getnewvnode_reserve(); - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - -out: - if (error == 0) { - *vpp = ZTOV(zp); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove an entry from a directory. - * - * IN: dvp - vnode of directory to remove entry from. - * name - name of entry to remove. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dvp - ctime|mtime - * vp - ctime (if nlink > 0) - */ - -/*ARGSUSED*/ -static int -zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) -{ - znode_t *dzp = VTOZ(dvp); - znode_t *zp = VTOZ(vp); - znode_t *xzp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - uint64_t acl_obj, xattr_obj; - uint64_t obj = 0; - dmu_tx_t *tx; - boolean_t unlinked, toobig = FALSE; - uint64_t txtype; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - ZFS_VERIFY_ZP(zp); - zilog = zfsvfs->z_log; - zp = VTOZ(vp); - - xattr_obj = 0; - xzp = NULL; - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - /* - * Need to use rmdir for removing directories. - */ - if (vp->v_type == VDIR) { - error = SET_ERROR(EPERM); - goto out; - } - - vnevent_remove(vp, dvp, name, ct); - - obj = zp->z_id; - - /* are there any extended attributes? */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT0(error); - } - - /* - * We may delete the znode now, or we may put it in the unlinked set; - * it depends on whether we're the last link, and on whether there are - * other holds on the vnode. So we dmu_tx_hold() the right things to - * allow for either case. 
- */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - - if (xzp) { - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - } - - /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - /* - * Mark this transaction as typically resulting in a net free of space - */ - dmu_tx_mark_netfree(tx); - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Remove the directory entry. - */ - error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); - - if (error) { - dmu_tx_commit(tx); - goto out; - } - - if (unlinked) { - zfs_unlinked_add(zp, tx); - vp->v_vflag |= VV_NOSYNC; - } - - txtype = TX_REMOVE; - zfs_log_remove(zilog, tx, txtype, dzp, name, obj); - - dmu_tx_commit(tx); -out: - - if (xzp) - vrele(ZTOV(xzp)); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Create a new directory and insert it into dvp using the name - * provided. Return a pointer to the inserted directory. - * - * IN: dvp - vnode of directory to add subdir to. - * dirname - name of new directory. - * vap - attributes of new directory. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * vsecp - ACL to be set - * - * OUT: vpp - vnode of created directory. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dvp - ctime|mtime updated - * vp - ctime|mtime|atime updated - */ -/*ARGSUSED*/ -static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - uint64_t txtype; - dmu_tx_t *tx; - int error; - ksid_t *ksid; - uid_t uid; - gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - - ASSERT(vap->va_type == VDIR); - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - ksid = crgetsid(cr, KSID_OWNER); - if (ksid) - uid = ksid_getid(ksid); - else - uid = crgetuid(cr); - if (zfsvfs->z_use_fuids == B_FALSE && - ((vap->va_mask & AT_XVATTR) || - IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (zfsvfs->z_utf8 && u8_validate(dirname, - strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (vap->va_mask & AT_XVATTR) { - if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, - crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * First make sure the new directory doesn't exist. - * - * Existence is checked first to make sure we don't return - * EACCES instead of EEXIST which can cause some applications - * to fail. 
- */ - *vpp = NULL; - - if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - ASSERT3P(zp, ==, NULL); - - if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - - /* - * Add a new entry to the directory. - */ - getnewvnode_reserve(); - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create new node. - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - /* - * Now put new name in parent dir. - */ - (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); - - *vpp = ZTOV(zp); - - txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, - acl_ids.z_fuidp, vap); - - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Remove a directory subdir entry. If the current working - * directory is the same as the subdir to be removed, the - * remove will fail. - * - * IN: dvp - vnode of directory to remove from. 
- * name - name of directory to be removed. - * cwd - vnode of current working directory. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) -{ - znode_t *dzp = VTOZ(dvp); - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - dmu_tx_t *tx; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - ZFS_VERIFY_ZP(zp); - zilog = zfsvfs->z_log; - - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - if (vp->v_type != VDIR) { - error = SET_ERROR(ENOTDIR); - goto out; - } - - vnevent_rmdir(vp, dvp, name, ct); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - cache_purge(dvp); - - error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); - - if (error == 0) { - uint64_t txtype = TX_RMDIR; - zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); - } - - dmu_tx_commit(tx); - - cache_purge(vp); -out: - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Read as many directory entries as will fit into the provided - * buffer from the given directory cursor position (specified in - * the uio structure). - * - * IN: vp - vnode of directory to read. - * uio - structure supplying read location, range info, - * and return buffer. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * OUT: uio - updated offset and range, buffer filled. 
- * eofp - set to true if end-of-file detected. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - atime updated - * - * Note that the low 4 bits of the cookie returned by zap is always zero. - * This allows us to use the low range for "special" directory entries: - * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, - * we use the offset 2 for the '.zfs' directory. - */ -/* ARGSUSED */ -static int -zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) -{ - znode_t *zp = VTOZ(vp); - iovec_t *iovp; - edirent_t *eodp; - dirent64_t *odp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os; - caddr_t outbuf; - size_t bufsize; - zap_cursor_t zc; - zap_attribute_t zap; - uint_t bytes_wanted; - uint64_t offset; /* must be unsigned; checks for < 1 */ - uint64_t parent; - int local_eof; - int outcount; - int error; - uint8_t prefetch; - boolean_t check_sysattrs; - uint8_t type; - int ncooks; - u_long *cooks = NULL; - int flags = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * If we are not given an eof variable, - * use a local one. - */ - if (eofp == NULL) - eofp = &local_eof; - - /* - * Check for valid iov_len. - */ - if (uio->uio_iov->iov_len <= 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Quit if directory has been removed (posix) - */ - if ((*eofp = zp->z_unlinked) != 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - error = 0; - os = zfsvfs->z_os; - offset = uio->uio_loffset; - prefetch = zp->z_zn_prefetch; - - /* - * Initialize the iterator cursor. - */ - if (offset <= 3) { - /* - * Start iteration from the beginning of the directory. - */ - zap_cursor_init(&zc, os, zp->z_id); - } else { - /* - * The offset is a serialized cursor. 
- */ - zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } - - /* - * Get space to change directory entries into fs independent format. - */ - iovp = uio->uio_iov; - bytes_wanted = iovp->iov_len; - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { - bufsize = bytes_wanted; - outbuf = kmem_alloc(bufsize, KM_SLEEP); - odp = (struct dirent64 *)outbuf; - } else { - bufsize = bytes_wanted; - outbuf = NULL; - odp = (struct dirent64 *)iovp->iov_base; - } - eodp = (struct edirent *)odp; - - if (ncookies != NULL) { - /* - * Minimum entry size is dirent size and 1 byte for a file name. - */ - ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); - cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); - *cookies = cooks; - *ncookies = ncooks; - } - /* - * If this VFS supports the system attribute view interface; and - * we're looking at an extended attribute directory; and we care - * about normalization conflicts on this vfs; then we must check - * for normalization conflicts with the sysattr name space. - */ -#ifdef TODO - check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && - (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && - (flags & V_RDDIR_ENTFLAGS); -#else - check_sysattrs = 0; -#endif - - /* - * Transform to file-system independent format - */ - outcount = 0; - while (outcount < bytes_wanted) { - ino64_t objnum; - ushort_t reclen; - off64_t *next = NULL; - - /* - * Special case `.', `..', and `.zfs'. - */ - if (offset == 0) { - (void) strcpy(zap.za_name, "."); - zap.za_normalization_conflict = 0; - objnum = zp->z_id; - type = DT_DIR; - } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); - zap.za_normalization_conflict = 0; - objnum = parent; - type = DT_DIR; - } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); - zap.za_normalization_conflict = 0; - objnum = ZFSCTL_INO_ROOT; - type = DT_DIR; - } else { - /* - * Grab next entry. 
- */ - if (error = zap_cursor_retrieve(&zc, &zap)) { - if ((*eofp = (error == ENOENT)) != 0) - break; - else - goto update; - } - - if (zap.za_integer_length != 8 || - zap.za_num_integers != 1) { - cmn_err(CE_WARN, "zap_readdir: bad directory " - "entry, obj = %lld, offset = %lld\n", - (u_longlong_t)zp->z_id, - (u_longlong_t)offset); - error = SET_ERROR(ENXIO); - goto update; - } - - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); - /* - * MacOS X can extract the object type here such as: - * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); - */ - type = ZFS_DIRENT_TYPE(zap.za_first_integer); - - if (check_sysattrs && !zap.za_normalization_conflict) { -#ifdef TODO - zap.za_normalization_conflict = - xattr_sysattr_casechk(zap.za_name); -#else - panic("%s:%u: TODO", __func__, __LINE__); -#endif - } - } - - if (flags & V_RDDIR_ACCFILTER) { - /* - * If we have no access at all, don't include - * this entry in the returned information - */ - znode_t *ezp; - if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) - goto skip_entry; - if (!zfs_has_access(ezp, cr)) { - vrele(ZTOV(ezp)); - goto skip_entry; - } - vrele(ZTOV(ezp)); - } - - if (flags & V_RDDIR_ENTFLAGS) - reclen = EDIRENT_RECLEN(strlen(zap.za_name)); - else - reclen = DIRENT64_RECLEN(strlen(zap.za_name)); - - /* - * Will this entry fit in the buffer? - */ - if (outcount + reclen > bufsize) { - /* - * Did we manage to fit anything in the buffer? - */ - if (!outcount) { - error = SET_ERROR(EINVAL); - goto update; - } - break; - } - if (flags & V_RDDIR_ENTFLAGS) { - /* - * Add extended flag entry: - */ - eodp->ed_ino = objnum; - eodp->ed_reclen = reclen; - /* NOTE: ed_off is the offset for the *next* entry. */ - next = &eodp->ed_off; - eodp->ed_eflags = zap.za_normalization_conflict ? 
- ED_CASE_CONFLICT : 0; - (void) strncpy(eodp->ed_name, zap.za_name, - EDIRENT_NAMELEN(reclen)); - eodp = (edirent_t *)((intptr_t)eodp + reclen); - } else { - /* - * Add normal entry: - */ - odp->d_ino = objnum; - odp->d_reclen = reclen; - odp->d_namlen = strlen(zap.za_name); - /* NOTE: d_off is the offset for the *next* entry. */ - next = &odp->d_off; - (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); - odp->d_type = type; - dirent_terminate(odp); - odp = (dirent64_t *)((intptr_t)odp + reclen); - } - outcount += reclen; - - ASSERT(outcount <= bufsize); - - /* Prefetch znode */ - if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - - skip_entry: - /* - * Move to the next entry, fill in the previous offset. - */ - if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); - offset = zap_cursor_serialize(&zc); - } else { - offset += 1; - } - - /* Fill the offset right after advancing the cursor. */ - if (next != NULL) - *next = offset; - if (cooks != NULL) { - *cooks++ = offset; - ncooks--; - KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); - } - } - zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ - - /* Subtract unused cookies */ - if (ncookies != NULL) - *ncookies -= ncooks; - - if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { - iovp->iov_base += outcount; - iovp->iov_len -= outcount; - uio->uio_resid -= outcount; - } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { - /* - * Reset the pointer. 
- */ - offset = uio->uio_loffset; - } - -update: - zap_cursor_fini(&zc); - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) - kmem_free(outbuf, bufsize); - - if (error == ENOENT) - error = 0; - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - - uio->uio_loffset = offset; - ZFS_EXIT(zfsvfs); - if (error != 0 && cookies != NULL) { - free(*cookies, M_TEMP); - *cookies = NULL; - *ncookies = 0; - } - return (error); -} - -ulong_t zfs_fsync_sync_cnt = 4; - -static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - return (0); -} - - -/* - * Get the requested file attributes and place them in the provided - * vattr structure. - * - * IN: vp - vnode of file. - * vap - va_mask identifies requested attributes. - * If AT_XVATTR set, then optional attrs are requested - * flags - ATTR_NOACLCHECK (CIFS server context) - * cr - credentials of caller. - * ct - caller context - * - * OUT: vap - attribute values. - * - * RETURN: 0 (always succeeds). - */ -/* ARGSUSED */ -static int -zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error = 0; - uint32_t blksize; - u_longlong_t nblocks; - uint64_t mtime[2], ctime[2], crtime[2], rdev; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap = NULL; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - sa_bulk_attr_t bulk[4]; - int count = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); - if (vp->v_type == VBLK || vp->v_type == VCHR) - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, - &rdev, 8); - - if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. - */ - if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && - (vap->va_uid != crgetuid(cr))) { - if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, - skipaclchk, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * Return all attributes. It's cheaper to provide the answer - * than to determine whether we were asked the question. - */ - - vap->va_type = IFTOVT(zp->z_mode); - vap->va_mode = zp->z_mode & ~S_IFMT; -#ifdef illumos - vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; -#else - vn_fsid(vp, vap); -#endif - vap->va_nodeid = zp->z_id; - vap->va_nlink = zp->z_links; - if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && - zp->z_links < ZFS_LINK_MAX) - vap->va_nlink++; - vap->va_size = zp->z_size; -#ifdef illumos - vap->va_rdev = vp->v_rdev; -#else - if (vp->v_type == VBLK || vp->v_type == VCHR) - vap->va_rdev = zfs_cmpldev(rdev); -#endif - vap->va_seq = zp->z_seq; - vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ - vap->va_filerev = zp->z_seq; - - /* - * Add in any requested optional attributes and the create time. - * Also set the corresponding bits in the returned attribute bitmap. 
- */ - if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - xoap->xoa_archive = - ((zp->z_pflags & ZFS_ARCHIVE) != 0); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - xoap->xoa_readonly = - ((zp->z_pflags & ZFS_READONLY) != 0); - XVA_SET_RTN(xvap, XAT_READONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - xoap->xoa_system = - ((zp->z_pflags & ZFS_SYSTEM) != 0); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - xoap->xoa_hidden = - ((zp->z_pflags & ZFS_HIDDEN) != 0); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - xoap->xoa_nounlink = - ((zp->z_pflags & ZFS_NOUNLINK) != 0); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - xoap->xoa_immutable = - ((zp->z_pflags & ZFS_IMMUTABLE) != 0); - XVA_SET_RTN(xvap, XAT_IMMUTABLE); - } - - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - xoap->xoa_appendonly = - ((zp->z_pflags & ZFS_APPENDONLY) != 0); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - xoap->xoa_nodump = - ((zp->z_pflags & ZFS_NODUMP) != 0); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - xoap->xoa_opaque = - ((zp->z_pflags & ZFS_OPAQUE) != 0); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - xoap->xoa_av_quarantined = - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - xoap->xoa_av_modified = - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - vp->v_type == VREG) { - zfs_sa_get_scanstamp(zp, xvap); - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); - XVA_SET_RTN(xvap, XAT_REPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_GEN)) { - 
xoap->xoa_generation = zp->z_gen; - XVA_SET_RTN(xvap, XAT_GEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { - xoap->xoa_offline = - ((zp->z_pflags & ZFS_OFFLINE) != 0); - XVA_SET_RTN(xvap, XAT_OFFLINE); - } - - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { - xoap->xoa_sparse = - ((zp->z_pflags & ZFS_SPARSE) != 0); - XVA_SET_RTN(xvap, XAT_SPARSE); - } - } - - ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); - ZFS_TIME_DECODE(&vap->va_mtime, mtime); - ZFS_TIME_DECODE(&vap->va_ctime, ctime); - ZFS_TIME_DECODE(&vap->va_birthtime, crtime); - - - sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); - vap->va_blksize = blksize; - vap->va_bytes = nblocks << 9; /* nblocks * 512 */ - - if (zp->z_blksz == 0) { - /* - * Block size hasn't been set; suggest maximal I/O transfers. - */ - vap->va_blksize = zfsvfs->z_max_blksz; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Set the file attributes to the values contained in the - * vattr structure. - * - * IN: vp - vnode of file to be modified. - * vap - new attribute values. - * If AT_XVATTR set, then optional attrs are being set - * flags - ATTR_UTIME set if non-default time values provided. - * - ATTR_NOACLCHECK (CIFS context only). - * cr - credentials of caller. - * ct - caller context - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - ctime updated, mtime updated if size changed. 
- */ -/* ARGSUSED */ -static int -zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - dmu_tx_t *tx; - vattr_t oldva; - xvattr_t tmpxvattr; - uint_t mask = vap->va_mask; - uint_t saved_mask = 0; - uint64_t saved_mode; - int trim_mask = 0; - uint64_t new_mode; - uint64_t new_uid, new_gid; - uint64_t xattr_obj; - uint64_t mtime[2], ctime[2]; - znode_t *attrzp; - int need_policy = FALSE; - int err, err2; - zfs_fuid_info_t *fuidp = NULL; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap; - zfs_acl_t *aclp; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; - sa_bulk_attr_t bulk[7], xattr_bulk[7]; - int count = 0, xattr_count = 0; - - if (mask == 0) - return (0); - - if (mask & AT_NOSET) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - zilog = zfsvfs->z_log; - - /* - * Make sure that if we have ephemeral uid/gid or xvattr specified - * that file system is at proper version level - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || - (mask & AT_XVATTR))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (mask & AT_SIZE && vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * If this is an xvattr_t, then get a pointer to the structure of - * optional attributes. If this is NULL, then we have a vattr_t. 
- */ - xoap = xva_getxoptattr(xvap); - - xva_init(&tmpxvattr); - - /* - * Immutable files can only alter immutable bit and atime - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) && - ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || - ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* - * Note: ZFS_READONLY is handled in zfs_zaccess_common. - */ - - /* - * Verify timestamps doesn't overflow 32 bits. - * ZFS can handle large timestamps, but 32bit syscalls can't - * handle times greater than 2039. This check should be removed - * once large timestamps are fully supported. - */ - if (mask & (AT_ATIME | AT_MTIME)) { - if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || - ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EOVERFLOW)); - } - } - if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && - TIMESPEC_OVERFLOW(&vap->va_birthtime)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EOVERFLOW)); - } - - attrzp = NULL; - aclp = NULL; - - /* Can this be moved to before the top label? */ - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * First validate permissions - */ - - if (mask & AT_SIZE) { - /* - * XXX - Note, we are not providing any open - * mode flags here (like FNDELAY), so we may - * block if there are locks present... this - * should be addressed in openat(). - */ - /* XXX - would it be OK to generate a log record here? 
*/ - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - if (mask & (AT_ATIME|AT_MTIME) || - ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || - XVA_ISSET_REQ(xvap, XAT_READONLY) || - XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || - XVA_ISSET_REQ(xvap, XAT_OFFLINE) || - XVA_ISSET_REQ(xvap, XAT_SPARSE) || - XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { - need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); - } - - if (mask & (AT_UID|AT_GID)) { - int idmask = (mask & (AT_UID|AT_GID)); - int take_owner; - int take_group; - - /* - * NOTE: even if a new mode is being set, - * we may clear S_ISUID/S_ISGID bits. - */ - - if (!(mask & AT_MODE)) - vap->va_mode = zp->z_mode; - - /* - * Take ownership or chgrp to group we are a member of - */ - - take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & AT_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); - - /* - * If both AT_UID and AT_GID are set then take_owner and - * take_group must both be set in order to allow taking - * ownership. - * - * Otherwise, send the check through secpolicy_vnode_setattr() - * - */ - - if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || - ((idmask == AT_UID) && take_owner) || - ((idmask == AT_GID) && take_group)) { - if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { - /* - * Remove setuid/setgid for non-privileged users - */ - secpolicy_setid_clear(vap, vp, cr); - trim_mask = (mask & (AT_UID|AT_GID)); - } else { - need_policy = TRUE; - } - } else { - need_policy = TRUE; - } - } - - oldva.va_mode = zp->z_mode; - zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); - if (mask & AT_XVATTR) { - /* - * Update xvattr mask to include only those attributes - * that are actually changing. - * - * the bits will be restored prior to actually setting - * the attributes so the caller thinks they were set. 
- */ - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - if (xoap->xoa_appendonly != - ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_APPENDONLY); - XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - if (xoap->xoa_nounlink != - ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NOUNLINK); - XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - if (xoap->xoa_immutable != - ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_IMMUTABLE); - XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - if (xoap->xoa_nodump != - ((zp->z_pflags & ZFS_NODUMP) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NODUMP); - XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - if (xoap->xoa_av_modified != - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); - XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - if ((vp->v_type != VREG && - xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); - XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (need_policy == FALSE && - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || - XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; - } - } - - if (mask & AT_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(vp, vap, - &oldva, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - trim_mask |= AT_MODE; - } else { - need_policy = 
TRUE; - } - } - - if (need_policy) { - /* - * If trim_mask is set then take ownership - * has been granted or write_acl is present and user - * has the ability to modify mode. In that case remove - * UID|GID and or MODE from mask so that - * secpolicy_vnode_setattr() doesn't revoke it. - */ - - if (trim_mask) { - saved_mask = vap->va_mask; - vap->va_mask &= ~trim_mask; - if (trim_mask & AT_MODE) { - /* - * Save the mode, as secpolicy_vnode_setattr() - * will overwrite it with ova.va_mode. - */ - saved_mode = vap->va_mode; - } - } - err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - - if (trim_mask) { - vap->va_mask |= saved_mask; - if (trim_mask & AT_MODE) { - /* - * Recover the mode after - * secpolicy_vnode_setattr(). - */ - vap->va_mode = saved_mode; - } - } - } - - /* - * secpolicy_vnode_setattr, or take ownership may have - * changed va_mask - */ - mask = vap->va_mask; - - if ((mask & (AT_UID | AT_GID))) { - err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - - if (err == 0 && xattr_obj) { - err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); - if (err == 0) { - err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); - if (err != 0) - vrele(ZTOV(attrzp)); - } - if (err) - goto out2; - } - if (mask & AT_UID) { - new_uid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_uid != zp->z_uid && - zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { - if (attrzp) - vput(ZTOV(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if (mask & AT_GID) { - new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, - cr, ZFS_GROUP, &fuidp); - if (new_gid != zp->z_gid && - zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { - if (attrzp) - vput(ZTOV(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - } - tx = dmu_tx_create(zfsvfs->z_os); - - if (mask & AT_MODE) { - uint64_t pmode = zp->z_mode; - 
uint64_t acl_obj; - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && - !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { - err = SET_ERROR(EPERM); - goto out; - } - - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) - goto out; - - if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { - /* - * Are we upgrading ACL from old V0 format - * to V1 format? - */ - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, - aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - } else { - if ((mask & AT_XVATTR) && - XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - } - - if (attrzp) { - dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); - } - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - goto out; - - count = 0; - /* - * Set each attribute requested. - * We group settings according to the locks they need to acquire. - * - * Note: you cannot set ctime directly, although it will be - * updated as a side-effect of calling this function. 
- */ - - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_enter(&zp->z_acl_lock); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (attrzp) { - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_enter(&attrzp->z_acl_lock); - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, - sizeof (attrzp->z_pflags)); - } - - if (mask & (AT_UID|AT_GID)) { - - if (mask & AT_UID) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &new_uid, sizeof (new_uid)); - zp->z_uid = new_uid; - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_UID(zfsvfs), NULL, &new_uid, - sizeof (new_uid)); - attrzp->z_uid = new_uid; - } - } - - if (mask & AT_GID) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), - NULL, &new_gid, sizeof (new_gid)); - zp->z_gid = new_gid; - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_GID(zfsvfs), NULL, &new_gid, - sizeof (new_gid)); - attrzp->z_gid = new_gid; - } - } - if (!(mask & AT_MODE)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), - NULL, &new_mode, sizeof (new_mode)); - new_mode = zp->z_mode; - } - err = zfs_acl_chown_setattr(zp); - ASSERT(err == 0); - if (attrzp) { - err = zfs_acl_chown_setattr(attrzp); - ASSERT(err == 0); - } - } - - if (mask & AT_MODE) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &new_mode, sizeof (new_mode)); - zp->z_mode = new_mode; - ASSERT3U((uintptr_t)aclp, !=, 0); - err = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT0(err); - if (zp->z_acl_cached) - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = aclp; - aclp = NULL; - } - - - if (mask & AT_ATIME) { - ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &zp->z_atime, sizeof (zp->z_atime)); - } - - if (mask & AT_MTIME) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - } - - /* XXX - 
shouldn't this be done *before* the ATIME/MTIME checks? */ - if (mask & AT_SIZE && !(mask & AT_MTIME)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), - NULL, mtime, sizeof (mtime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, - B_TRUE); - } else if (mask != 0) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, - B_TRUE); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - zfs_tstamp_update_setup(attrzp, STATE_CHANGED, - mtime, ctime, B_TRUE); - } - } - /* - * Do this after setting timestamps to prevent timestamp - * update from toggling bit - */ - - if (xoap && (mask & AT_XVATTR)) { - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - xoap->xoa_createtime = vap->va_birthtime; - /* - * restore trimmed off masks - * so that return masks can be set for caller. 
- */ - - if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { - XVA_SET_REQ(xvap, XAT_APPENDONLY); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { - XVA_SET_REQ(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { - XVA_SET_REQ(xvap, XAT_IMMUTABLE); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { - XVA_SET_REQ(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { - XVA_SET_REQ(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { - XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - ASSERT(vp->v_type == VREG); - - zfs_xvattr_set(zp, xvap, tx); - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_exit(&zp->z_acl_lock); - - if (attrzp) { - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_exit(&attrzp->z_acl_lock); - } -out: - if (err == 0 && attrzp) { - err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, - xattr_count, tx); - ASSERT(err2 == 0); - } - - if (attrzp) - vput(ZTOV(attrzp)); - - if (aclp) - zfs_acl_free(aclp); - - if (fuidp) { - zfs_fuid_info_free(fuidp); - fuidp = NULL; - } - - if (err) { - dmu_tx_abort(tx); - } else { - err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - dmu_tx_commit(tx); - } - -out2: - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * We acquire all but fdvp locks using non-blocking acquisitions. If we - * fail to acquire any lock in the path we will drop all held locks, - * acquire the new lock in a blocking fashion, and then release it and - * restart the rename. This acquire/release step ensures that we do not - * spin on a lock waiting for release. On error release all vnode locks - * and decrement references the way tmpfs_rename() would do. 
- */ -static int -zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, - struct vnode *tdvp, struct vnode **tvpp, - const struct componentname *scnp, const struct componentname *tcnp) -{ - zfsvfs_t *zfsvfs; - struct vnode *nvp, *svp, *tvp; - znode_t *sdzp, *tdzp, *szp, *tzp; - const char *snm = scnp->cn_nameptr; - const char *tnm = tcnp->cn_nameptr; - int error; - - VOP_UNLOCK(tdvp); - if (*tvpp != NULL && *tvpp != tdvp) - VOP_UNLOCK(*tvpp); - -relock: - error = vn_lock(sdvp, LK_EXCLUSIVE); - if (error) - goto out; - sdzp = VTOZ(sdvp); - - error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); - if (error != 0) { - VOP_UNLOCK(sdvp); - if (error != EBUSY) - goto out; - error = vn_lock(tdvp, LK_EXCLUSIVE); - if (error) - goto out; - VOP_UNLOCK(tdvp); - goto relock; - } - tdzp = VTOZ(tdvp); - - /* - * Before using sdzp and tdzp we must ensure that they are live. - * As a porting legacy from illumos we have two things to worry - * about. One is typical for FreeBSD and it is that the vnode is - * not reclaimed (doomed). The other is that the znode is live. - * The current code can invalidate the znode without acquiring the - * corresponding vnode lock if the object represented by the znode - * and vnode is no longer valid after a rollback or receive operation. - * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock - * that protects the znodes from the invalidation. - */ - zfsvfs = sdzp->z_zfsvfs; - ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); - ZFS_ENTER(zfsvfs); - - /* - * We can not use ZFS_VERIFY_ZP() here because it could directly return - * bypassing the cleanup code in the case of an error. - */ - if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { - ZFS_EXIT(zfsvfs); - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - error = SET_ERROR(EIO); - goto out; - } - - /* - * Re-resolve svp to be certain it still exists and fetch the - * correct vnode. - */ - error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); - if (error != 0) { - /* Source entry invalid or not there. 
*/ - ZFS_EXIT(zfsvfs); - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - if ((scnp->cn_flags & ISDOTDOT) != 0 || - (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) - error = SET_ERROR(EINVAL); - goto out; - } - svp = ZTOV(szp); - - /* - * Re-resolve tvp, if it disappeared we just carry on. - */ - error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); - if (error != 0) { - ZFS_EXIT(zfsvfs); - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - vrele(svp); - if ((tcnp->cn_flags & ISDOTDOT) != 0) - error = SET_ERROR(EINVAL); - goto out; - } - if (tzp != NULL) - tvp = ZTOV(tzp); - else - tvp = NULL; - - /* - * At present the vnode locks must be acquired before z_teardown_lock, - * although it would be more logical to use the opposite order. - */ - ZFS_EXIT(zfsvfs); - - /* - * Now try acquire locks on svp and tvp. - */ - nvp = svp; - error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); - if (error != 0) { - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - if (tvp != NULL) - vrele(tvp); - if (error != EBUSY) { - vrele(nvp); - goto out; - } - error = vn_lock(nvp, LK_EXCLUSIVE); - if (error != 0) { - vrele(nvp); - goto out; - } - VOP_UNLOCK(nvp); - /* - * Concurrent rename race. - * XXX ? - */ - if (nvp == tdvp) { - vrele(nvp); - error = SET_ERROR(EINVAL); - goto out; - } - vrele(*svpp); - *svpp = nvp; - goto relock; - } - vrele(*svpp); - *svpp = nvp; - - if (*tvpp != NULL) - vrele(*tvpp); - *tvpp = NULL; - if (tvp != NULL) { - nvp = tvp; - error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); - if (error != 0) { - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - VOP_UNLOCK(*svpp); - if (error != EBUSY) { - vrele(nvp); - goto out; - } - error = vn_lock(nvp, LK_EXCLUSIVE); - if (error != 0) { - vrele(nvp); - goto out; - } - vput(nvp); - goto relock; - } - *tvpp = nvp; - } - - return (0); - -out: - return (error); -} - -/* - * Note that we must use VRELE_ASYNC in this function as it walks - * up the directory tree and vrele may need to acquire an exclusive - * lock if a last reference to a vnode is dropped. 
- */ -static int -zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) -{ - zfsvfs_t *zfsvfs; - znode_t *zp, *zp1; - uint64_t parent; - int error; - - zfsvfs = tdzp->z_zfsvfs; - if (tdzp == szp) - return (SET_ERROR(EINVAL)); - if (tdzp == sdzp) - return (0); - if (tdzp->z_id == zfsvfs->z_root) - return (0); - zp = tdzp; - for (;;) { - ASSERT(!zp->z_unlinked); - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) - break; - - if (parent == szp->z_id) { - error = SET_ERROR(EINVAL); - break; - } - if (parent == zfsvfs->z_root) - break; - if (parent == sdzp->z_id) - break; - - error = zfs_zget(zfsvfs, parent, &zp1); - if (error != 0) - break; - - if (zp != tdzp) - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); - zp = zp1; - } - - if (error == ENOTDIR) - panic("checkpath: .. not a directory\n"); - if (zp != tdzp) - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); - return (error); -} - -/* - * Move an entry from the provided source directory to the target - * directory. Change the entry name as indicated. - * - * IN: sdvp - Source directory containing the "old entry". - * snm - Old entry name. - * tdvp - Target directory to contain the "new entry". - * tnm - New entry name. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * sdvp,tdvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, - vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, - cred_t *cr) -{ - zfsvfs_t *zfsvfs; - znode_t *sdzp, *tdzp, *szp, *tzp; - zilog_t *zilog = NULL; - dmu_tx_t *tx; - char *snm = scnp->cn_nameptr; - char *tnm = tcnp->cn_nameptr; - int error = 0; - bool want_seqc_end = false; - - /* Reject renames across filesystems. 
*/ - if ((*svpp)->v_mount != tdvp->v_mount || - ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { - error = SET_ERROR(EXDEV); - goto out; - } - - if (zfsctl_is_node(tdvp)) { - error = SET_ERROR(EXDEV); - goto out; - } - - /* - * Lock all four vnodes to ensure safety and semantics of renaming. - */ - error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); - if (error != 0) { - /* no vnodes are locked in the case of error here */ - return (error); - } - - tdzp = VTOZ(tdvp); - sdzp = VTOZ(sdvp); - zfsvfs = tdzp->z_zfsvfs; - zilog = zfsvfs->z_log; - - /* - * After we re-enter ZFS_ENTER() we will have to revalidate all - * znodes involved. - */ - ZFS_ENTER(zfsvfs); - - if (zfsvfs->z_utf8 && u8_validate(tnm, - strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - error = SET_ERROR(EILSEQ); - goto unlockout; - } - - /* If source and target are the same file, there is nothing to do. */ - if ((*svpp) == (*tvpp)) { - error = 0; - goto unlockout; - } - - if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || - ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && - (*tvpp)->v_mountedhere != NULL)) { - error = SET_ERROR(EXDEV); - goto unlockout; - } - - /* - * We can not use ZFS_VERIFY_ZP() here because it could directly return - * bypassing the cleanup code in the case of an error. - */ - if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { - error = SET_ERROR(EIO); - goto unlockout; - } - - szp = VTOZ(*svpp); - tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); - if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { - error = SET_ERROR(EIO); - goto unlockout; - } - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. 
- */ - if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - error = SET_ERROR(EINVAL); - goto unlockout; - } - - /* - * Must have write access at the source to remove the old entry - * and write access at the target to create the new entry. - * Note that if target and source are the same, this can be - * done in a single check. - */ - if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) - goto unlockout; - - if ((*svpp)->v_type == VDIR) { - /* - * Avoid ".", "..", and aliases of "." for obvious reasons. - */ - if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || - sdzp == szp || - (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { - error = EINVAL; - goto unlockout; - } - - /* - * Check to make sure rename is valid. - * Can't do a move like this: /usr/a/b to /usr/a/b/c/d - */ - if (error = zfs_rename_check(szp, sdzp, tdzp)) - goto unlockout; - } - - /* - * Does target exist? - */ - if (tzp) { - /* - * Source and target must be the same type. - */ - if ((*svpp)->v_type == VDIR) { - if ((*tvpp)->v_type != VDIR) { - error = SET_ERROR(ENOTDIR); - goto unlockout; - } else { - cache_purge(tdvp); - if (sdvp != tdvp) - cache_purge(sdvp); - } - } else { - if ((*tvpp)->v_type == VDIR) { - error = SET_ERROR(EISDIR); - goto unlockout; - } - } - } - - vn_seqc_write_begin(*svpp); - vn_seqc_write_begin(sdvp); - if (*tvpp != NULL) - vn_seqc_write_begin(*tvpp); - if (tdvp != *tvpp) - vn_seqc_write_begin(tdvp); - want_seqc_end = true; - - vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); - if (tzp) - vnevent_rename_dest(*tvpp, tdvp, tnm, ct); - - /* - * notify the target directory if it is not the same - * as source directory. 
- */ - if (tdvp != sdvp) { - vnevent_rename_dest_dir(tdvp, ct); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) { - dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tdzp); - } - if (tzp) { - dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tzp); - } - - zfs_sa_upgrade_txholds(tx, szp); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto unlockout; - } - - - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); - - if (error == 0) { - error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); - ASSERT0(error); - - error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, - NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME, sdzp, - snm, tdzp, tnm, szp); - - /* - * Update path information for the target vnode - */ - vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. 
- */ - VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } - if (error == 0) { - cache_purge(*svpp); - if (*tvpp != NULL) - cache_purge(*tvpp); - cache_purge_negative(tdvp); - } - } - - dmu_tx_commit(tx); - -unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ - ZFS_EXIT(zfsvfs); - if (want_seqc_end) { - vn_seqc_write_end(*svpp); - vn_seqc_write_end(sdvp); - if (*tvpp != NULL) - vn_seqc_write_end(*tvpp); - if (tdvp != *tvpp) - vn_seqc_write_end(tdvp); - want_seqc_end = false; - } - VOP_UNLOCK(*svpp); - VOP_UNLOCK(sdvp); - -out: /* original two vnodes are locked */ - MPASS(!want_seqc_end); - if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (*tvpp != NULL) - VOP_UNLOCK(*tvpp); - if (tdvp != *tvpp) - VOP_UNLOCK(tdvp); - return (error); -} - -/* - * Insert the indicated symbolic reference entry into the directory. - * - * IN: dvp - Directory to contain new symbolic link. - * link - Name for new symlink entry. - * vap - Attributes of new entry. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, - cred_t *cr, kthread_t *td) -{ - znode_t *zp, *dzp = VTOZ(dvp); - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - uint64_t len = strlen(link); - int error; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - uint64_t txtype = TX_SYMLINK; - int flags = 0; - - ASSERT(vap->va_type == VLNK); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENAMETOOLONG)); - } - - if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); - if (error) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - - getnewvnode_reserve(); - tx = dmu_tx_create(zfsvfs->z_os); - fuid_dirtied = zfsvfs->z_fuid_dirty; - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE + len); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - 
zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create a new object for the symlink. - * for version 4 ZPL datsets the symlink will be an SA attribute - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - if (zp->z_is_sa) - error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), - link, len, tx); - else - zfs_sa_symlink(zp, link, len, tx); - - zp->z_size = len; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx); - /* - * Insert the new object into the directory. - */ - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - *vpp = ZTOV(zp); - - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Return, in the buffer contained in the provided uio structure, - * the symbolic path referred to by vp. - * - * IN: vp - vnode of symbolic link. - * uio - structure to contain the link path. - * cr - credentials of caller. - * ct - caller context - * - * OUT: uio - structure containing the link path. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (zp->z_is_sa) - error = sa_lookup_uio(zp->z_sa_hdl, - SA_ZPL_SYMLINK(zfsvfs), uio); - else - error = zfs_sa_readlink(zp, uio); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert a new entry into directory tdvp referencing svp. - * - * IN: tdvp - Directory to contain new entry. - * svp - vnode of new entry. 
- * name - name of new entry. - * cr - credentials of caller. - * ct - caller context - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * tdvp - ctime|mtime updated - * svp - ctime updated - */ -/* ARGSUSED */ -static int -zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, - caller_context_t *ct, int flags) -{ - znode_t *dzp = VTOZ(tdvp); - znode_t *tzp, *szp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - dmu_tx_t *tx; - int error; - uint64_t parent; - uid_t owner; - - ASSERT(tdvp->v_type == VDIR); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. - */ - if (svp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - szp = VTOZ(svp); - ZFS_VERIFY_ZP(szp); - - if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* Prevent links to .zfs/shares files */ - - if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (zfsvfs->z_utf8 && u8_validate(name, - strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. 
- */ - if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - - owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); - if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - zfs_sa_upgrade_txholds(tx, szp); - zfs_sa_upgrade_txholds(tx, dzp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_create(dzp, name, szp, tx, 0); - - if (error == 0) { - uint64_t txtype = TX_LINK; - zfs_log_link(zilog, tx, txtype, dzp, szp, name); - } - - dmu_tx_commit(tx); - - if (error == 0) { - vnevent_link(svp, ct); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - - -/*ARGSUSED*/ -void -zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs); - if (zp->z_sa_hdl == NULL) { - /* - * The fs has been unmounted, or we did a - * suspend/resume and this file no longer exists. - */ - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - vrecycle(vp); - return; - } - - if (zp->z_unlinked) { - /* - * Fast path to recycle a vnode of a removed file. 
- */ - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - vrecycle(vp); - return; - } - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&zp->z_atime, sizeof (zp->z_atime), tx); - zp->z_atime_dirty = 0; - dmu_tx_commit(tx); - } - } - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); -} - - -CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); -CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); - -/*ARGSUSED*/ -static int -zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint32_t gen; - uint64_t gen64; - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int size, i, error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), - &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - gen = (uint32_t)gen64; - - size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; - -#ifdef illumos - if (fidp->fid_len < size) { - fidp->fid_len = size; - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOSPC)); - } -#else - fidp->fid_len = size; -#endif - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = size; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* Must have a non-zero generation number to distinguish from .zfs */ - if (gen == 0) - gen = 1; - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - if (size == LONG_FID_LEN) { - uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); - zfid_long_t *zlfid; - - zlfid = (zfid_long_t *)fidp; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); - - /* XXX - this should be the generation number for the objset */ - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - zlfid->zf_setgen[i] = 0; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp, *xzp; - zfsvfs_t *zfsvfs; - int error; - - switch (cmd) { - case _PC_LINK_MAX: - *valp = MIN(LONG_MAX, ZFS_LINK_MAX); - return (0); - - case _PC_FILESIZEBITS: - *valp = 64; - return (0); -#ifdef illumos - case _PC_XATTR_EXISTS: - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - *valp = 0; - error = zfs_dirent_lookup(zp, "", &xzp, - ZXATTR | ZEXISTS | ZSHARED); - if (error == 0) { - if (!zfs_dirempty(xzp)) - *valp = 1; - vrele(ZTOV(xzp)); - } else if (error == ENOENT) { - /* - * If there aren't extended attributes, it's the - * same as having zero of them. 
- */ - error = 0; - } - ZFS_EXIT(zfsvfs); - return (error); - - case _PC_SATTR_ENABLED: - case _PC_SATTR_EXISTS: - *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && - (vp->v_type == VREG || vp->v_type == VDIR); - return (0); - - case _PC_ACCESS_FILTERING: - *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && - vp->v_type == VDIR; - return (0); - - case _PC_ACL_ENABLED: - *valp = _ACL_ACE_ENABLED; - return (0); -#endif /* illumos */ - case _PC_MIN_HOLE_SIZE: - *valp = (int)SPA_MINBLOCKSIZE; - return (0); -#ifdef illumos - case _PC_TIMESTAMP_RESOLUTION: - /* nanosecond timestamp resolution */ - *valp = 1L; - return (0); -#endif - case _PC_ACL_EXTENDED: - *valp = 0; - return (0); - - case _PC_ACL_NFS4: - *valp = 1; - return (0); - - case _PC_ACL_PATH_MAX: - *valp = ACL_MAX_ENTRIES; - return (0); - - default: - return (EOPNOTSUPP); - } -} - -/*ARGSUSED*/ -static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -static int -zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, - int *rahead) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zp->z_zfsvfs->z_os; - locked_range_t *lr; - vm_object_t object; - off_t start, end, obj_size; - uint_t blksz; - int pgsin_b, pgsin_a; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - start = IDX_TO_OFF(ma[0]->pindex); - end = IDX_TO_OFF(ma[count - 1]->pindex + 1); - - /* - * Try to lock a range covering all required and optional pages, to - * handle the case of the block size growing. It is not safe to block - * on the range lock since the owner may be waiting for the fault page - * to be unbusied. - */ - for (;;) { - blksz = zp->z_blksz; - lr = rangelock_tryenter(&zp->z_rangelock, - rounddown(start, blksz), - roundup(end, blksz) - rounddown(start, blksz), RL_READER); - if (lr == NULL) { - if (rahead != NULL) { - *rahead = 0; - rahead = NULL; - } - if (rbehind != NULL) { - *rbehind = 0; - rbehind = NULL; - } - break; - } - if (blksz == zp->z_blksz) - break; - rangelock_exit(lr); - } - - object = ma[0]->object; - zfs_vmobject_wlock(object); - obj_size = object->un_pager.vnp.vnp_size; - zfs_vmobject_wunlock(object); - if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { - if (lr != NULL) - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (zfs_vm_pagerret_bad); - } - - pgsin_b = 0; - if (rbehind != NULL) { - pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); - pgsin_b = MIN(*rbehind, pgsin_b); - } - - pgsin_a = 0; - if (rahead != NULL) { - pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); - if (end + IDX_TO_OFF(pgsin_a) >= obj_size) - pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); - pgsin_a = MIN(*rahead, 
pgsin_a); - } - - /* - * NB: we need to pass the exact byte size of the data that we expect - * to read after accounting for the file size. This is required because - * ZFS will panic if we request DMU to read beyond the end of the last - * allocated block. - */ - error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, - MIN(end, obj_size) - (end - PAGE_SIZE)); - - if (lr != NULL) - rangelock_exit(lr); - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - - if (error != 0) - return (zfs_vm_pagerret_error); - - VM_CNT_INC(v_vnodein); - VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); - if (rbehind != NULL) - *rbehind = pgsin_b; - if (rahead != NULL) - *rahead = pgsin_a; - return (zfs_vm_pagerret_ok); -} - -static int -zfs_freebsd_getpages(ap) - struct vop_getpages_args /* { - struct vnode *a_vp; - vm_page_t *a_m; - int a_count; - int *a_rbehind; - int *a_rahead; - } */ *ap; -{ - - return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, - ap->a_rahead)); -} - -static int -zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, - int *rtvals) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - locked_range_t *lr; - dmu_tx_t *tx; - struct sf_buf *sf; - vm_object_t object; - vm_page_t m; - caddr_t va; - size_t tocopy; - size_t lo_len; - vm_ooffset_t lo_off; - vm_ooffset_t off; - uint_t blksz; - int ncount; - int pcount; - int err; - int i; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - object = vp->v_object; - pcount = btoc(len); - ncount = pcount; - - KASSERT(ma[0]->object == object, ("mismatching object")); - KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); - - for (i = 0; i < pcount; i++) - rtvals[i] = zfs_vm_pagerret_error; - - off = IDX_TO_OFF(ma[0]->pindex); - blksz = zp->z_blksz; - lo_off = rounddown(off, blksz); - lo_len = roundup(len + (off - lo_off), blksz); - lr = rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); - - zfs_vmobject_wlock(object); - if (len + off > 
object->un_pager.vnp.vnp_size) { - if (object->un_pager.vnp.vnp_size > off) { - int pgoff; - - len = object->un_pager.vnp.vnp_size - off; - ncount = btoc(len); - if ((pgoff = (int)len & PAGE_MASK) != 0) { - /* - * If the object is locked and the following - * conditions hold, then the page's dirty - * field cannot be concurrently changed by a - * pmap operation. - */ - m = ma[ncount - 1]; - vm_page_assert_sbusied(m); - KASSERT(!pmap_page_is_write_mapped(m), - ("zfs_putpages: page %p is not read-only", m)); - vm_page_clear_dirty(m, pgoff, PAGE_SIZE - - pgoff); - } - } else { - len = 0; - ncount = 0; - } - if (ncount < pcount) { - for (i = ncount; i < pcount; i++) { - rtvals[i] = zfs_vm_pagerret_bad; - } - } - } - zfs_vmobject_wunlock(object); - - if (ncount == 0) - goto out; - - if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || - zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { - goto out; - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, off, len); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - goto out; - } - - if (zp->z_blksz < PAGE_SIZE) { - for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { - tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; - va = zfs_map_page(ma[i], &sf); - dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); - zfs_unmap_page(sf); - } - } else { - err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); - } - - if (err == 0) { - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int count = 0; - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, - B_TRUE); - err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT0(err); - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); - - zfs_vmobject_wlock(object); - for (i = 0; i < ncount; i++) { - rtvals[i] = zfs_vm_pagerret_ok; - vm_page_undirty(ma[i]); - } - zfs_vmobject_wunlock(object); - VM_CNT_INC(v_vnodeout); - VM_CNT_ADD(v_vnodepgsout, ncount); - } - dmu_tx_commit(tx); - -out: - rangelock_exit(lr); - if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - return (rtvals[0]); -} - -int -zfs_freebsd_putpages(ap) - struct vop_putpages_args /* { - struct vnode *a_vp; - vm_page_t *a_m; - int a_count; - int a_sync; - int *a_rtvals; - } */ *ap; -{ - - return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, - ap->a_rtvals)); -} - -static int -zfs_freebsd_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct bufobj **a_bop; - daddr_t *a_bnp; - int *a_runp; - int *a_runb; - } */ *ap; -{ - - if (ap->a_bop != NULL) - *ap->a_bop = &ap->a_vp->v_bufobj; - if (ap->a_bnp != NULL) - *ap->a_bnp = ap->a_bn; - if (ap->a_runp != NULL) - *ap->a_runp = 0; - if (ap->a_runb != NULL) - *ap->a_runb = 0; - - return (0); -} - -static int -zfs_freebsd_open(ap) - struct vop_open_args /* { - struct vnode *a_vp; - int a_mode; - 
struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - int error; - - error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); - if (error == 0) - vnode_create_vobject(vp, zp->z_size, ap->a_td); - return (error); -} - -static int -zfs_freebsd_close(ap) - struct vop_close_args /* { - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - - return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); -} - -static int -zfs_freebsd_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; - u_long a_command; - caddr_t a_data; - int a_fflag; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - - return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, - ap->a_fflag, ap->a_cred, NULL, NULL)); -} - -static int -ioflags(int ioflags) -{ - int flags = 0; - - if (ioflags & IO_APPEND) - flags |= FAPPEND; - if (ioflags & IO_NDELAY) - flags |= FNONBLOCK; - if (ioflags & IO_SYNC) - flags |= (FSYNC | FDSYNC | FRSYNC); - - return (flags); -} - -static int -zfs_freebsd_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), - ap->a_cred, NULL)); -} - -static int -zfs_freebsd_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), - ap->a_cred, NULL)); -} - -/* - * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see - * the comment above cache_fplookup for details. 
- */ -static int -zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v) -{ - vnode_t *vp; - znode_t *zp; - uint64_t pflags; - - vp = v->a_vp; - zp = VTOZ_SMR(vp); - if (__predict_false(zp == NULL)) - return (EAGAIN); - pflags = atomic_load_64(&zp->z_pflags); - if (pflags & ZFS_AV_QUARANTINED) - return (EAGAIN); - if (pflags & ZFS_XATTR) - return (EAGAIN); - if ((pflags & ZFS_NO_EXECS_DENIED) == 0) - return (EAGAIN); - return (0); -} - -static int -zfs_freebsd_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - accmode_t a_accmode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - accmode_t accmode; - int error = 0; - - if (ap->a_accmode == VEXEC) { - if (zfs_freebsd_fastaccesschk_execute(ap->a_vp, ap->a_cred) == 0) - return (0); - } - - /* - * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, - */ - accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); - if (accmode != 0) - error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); - - /* - * VADMIN has to be handled by vaccess(). - */ - if (error == 0) { - accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); - if (accmode != 0) { - error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, - zp->z_gid, accmode, ap->a_cred); - } - } - - /* - * For VEXEC, ensure that at least one execute bit is set for - * non-directories. 
- */ - if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && - (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { - error = EACCES; - } - - return (error); -} - -static int -zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) -{ - struct componentname *cnp = ap->a_cnp; - char nm[NAME_MAX + 1]; - - ASSERT(cnp->cn_namelen < sizeof(nm)); - strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); - - return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, - cnp->cn_cred, cnp->cn_thread, 0, cached)); -} - -static int -zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) -{ - - return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); -} - -static int -zfs_cache_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - zfsvfs_t *zfsvfs; - - zfsvfs = ap->a_dvp->v_mount->mnt_data; - if (zfsvfs->z_use_namecache) - return (vfs_cache_lookup(ap)); - else - return (zfs_freebsd_lookup(ap, B_FALSE)); -} - -static int -zfs_freebsd_create(ap) - struct vop_create_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; -{ - zfsvfs_t *zfsvfs; - struct componentname *cnp = ap->a_cnp; - vattr_t *vap = ap->a_vap; - int error, mode; - - ASSERT(cnp->cn_flags & SAVENAME); - - vattr_init_mask(vap); - mode = vap->va_mode & ALLPERMS; - zfsvfs = ap->a_dvp->v_mount->mnt_data; - - error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, - ap->a_vpp, cnp->cn_cred, cnp->cn_thread); - if (zfsvfs->z_use_namecache && - error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) - cache_enter(ap->a_dvp, *ap->a_vpp, cnp); - return (error); -} - -static int -zfs_freebsd_remove(ap) - struct vop_remove_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - - ASSERT(ap->a_cnp->cn_flags & SAVENAME); - - return (zfs_remove(ap->a_dvp, ap->a_vp, 
ap->a_cnp->cn_nameptr, - ap->a_cnp->cn_cred)); -} - -static int -zfs_freebsd_mkdir(ap) - struct vop_mkdir_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; -{ - vattr_t *vap = ap->a_vap; - - ASSERT(ap->a_cnp->cn_flags & SAVENAME); - - vattr_init_mask(vap); - - return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, - ap->a_cnp->cn_cred)); -} - -static int -zfs_freebsd_rmdir(ap) - struct vop_rmdir_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - - ASSERT(cnp->cn_flags & SAVENAME); - - return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); -} - -static int -zfs_freebsd_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *a_ncookies; - u_long **a_cookies; - } */ *ap; -{ - - return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, - ap->a_ncookies, ap->a_cookies)); -} - -static int -zfs_freebsd_fsync(ap) - struct vop_fsync_args /* { - struct vnode *a_vp; - int a_waitfor; - struct thread *a_td; - } */ *ap; -{ - - vop_stdfsync(ap); - return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); -} - -static int -zfs_freebsd_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - vattr_t *vap = ap->a_vap; - xvattr_t xvap; - u_long fflags = 0; - int error; - - xva_init(&xvap); - xvap.xva_vattr = *vap; - xvap.xva_vattr.va_mask |= AT_XVATTR; - - /* Convert chflags into ZFS-type flags. */ - /* XXX: what about SF_SETTABLE?. 
*/ - XVA_SET_REQ(&xvap, XAT_IMMUTABLE); - XVA_SET_REQ(&xvap, XAT_APPENDONLY); - XVA_SET_REQ(&xvap, XAT_NOUNLINK); - XVA_SET_REQ(&xvap, XAT_NODUMP); - XVA_SET_REQ(&xvap, XAT_READONLY); - XVA_SET_REQ(&xvap, XAT_ARCHIVE); - XVA_SET_REQ(&xvap, XAT_SYSTEM); - XVA_SET_REQ(&xvap, XAT_HIDDEN); - XVA_SET_REQ(&xvap, XAT_REPARSE); - XVA_SET_REQ(&xvap, XAT_OFFLINE); - XVA_SET_REQ(&xvap, XAT_SPARSE); - - error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); - if (error != 0) - return (error); - - /* Convert ZFS xattr into chflags. */ -#define FLAG_CHECK(fflag, xflag, xfield) do { \ - if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ - fflags |= (fflag); \ -} while (0) - FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, - xvap.xva_xoptattrs.xoa_immutable); - FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, - xvap.xva_xoptattrs.xoa_appendonly); - FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, - xvap.xva_xoptattrs.xoa_nounlink); - FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, - xvap.xva_xoptattrs.xoa_archive); - FLAG_CHECK(UF_NODUMP, XAT_NODUMP, - xvap.xva_xoptattrs.xoa_nodump); - FLAG_CHECK(UF_READONLY, XAT_READONLY, - xvap.xva_xoptattrs.xoa_readonly); - FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, - xvap.xva_xoptattrs.xoa_system); - FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, - xvap.xva_xoptattrs.xoa_hidden); - FLAG_CHECK(UF_REPARSE, XAT_REPARSE, - xvap.xva_xoptattrs.xoa_reparse); - FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, - xvap.xva_xoptattrs.xoa_offline); - FLAG_CHECK(UF_SPARSE, XAT_SPARSE, - xvap.xva_xoptattrs.xoa_sparse); - -#undef FLAG_CHECK - *vap = xvap.xva_vattr; - vap->va_flags = fflags; - return (0); -} - -static int -zfs_freebsd_setattr(ap) - struct vop_setattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - vattr_t *vap = ap->a_vap; - cred_t *cred = ap->a_cred; - xvattr_t xvap; - u_long fflags; - uint64_t zflags; - - vattr_init_mask(vap); - vap->va_mask &= ~AT_NOSET; - - xva_init(&xvap); - xvap.xva_vattr = *vap; - - zflags = 
VTOZ(vp)->z_pflags; - - if (vap->va_flags != VNOVAL) { - zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; - int error; - - if (zfsvfs->z_use_fuids == B_FALSE) - return (EOPNOTSUPP); - - fflags = vap->va_flags; - /* - * XXX KDM - * We need to figure out whether it makes sense to allow - * UF_REPARSE through, since we don't really have other - * facilities to handle reparse points and zfs_setattr() - * doesn't currently allow setting that attribute anyway. - */ - if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| - UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| - UF_OFFLINE|UF_SPARSE)) != 0) - return (EOPNOTSUPP); - /* - * Unprivileged processes are not permitted to unset system - * flags, or modify flags if any system flags are set. - * Privileged non-jail processes may not modify system flags - * if securelevel > 0 and any existing system flags are set. - * Privileged jail processes behave like privileged non-jail - * processes if the PR_ALLOW_CHFLAGS permission bit is set; - * otherwise, they behave like unprivileged processes. - */ - if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || - priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { - if (zflags & - (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { - error = securelevel_gt(cred, 0); - if (error != 0) - return (error); - } - } else { - /* - * Callers may only modify the file flags on objects they - * have VADMIN rights for. - */ - if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) - return (error); - if (zflags & - (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { - return (EPERM); - } - if (fflags & - (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { - return (EPERM); - } - } - -#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ - if (((fflags & (fflag)) && !(zflags & (zflag))) || \ - ((zflags & (zflag)) && !(fflags & (fflag)))) { \ - XVA_SET_REQ(&xvap, (xflag)); \ - (xfield) = ((fflags & (fflag)) != 0); \ - } \ -} while (0) - /* Convert chflags into ZFS-type flags. 
*/ - /* XXX: what about SF_SETTABLE?. */ - FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, - xvap.xva_xoptattrs.xoa_immutable); - FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, - xvap.xva_xoptattrs.xoa_appendonly); - FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, - xvap.xva_xoptattrs.xoa_nounlink); - FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, - xvap.xva_xoptattrs.xoa_archive); - FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, - xvap.xva_xoptattrs.xoa_nodump); - FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, - xvap.xva_xoptattrs.xoa_readonly); - FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, - xvap.xva_xoptattrs.xoa_system); - FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, - xvap.xva_xoptattrs.xoa_hidden); - FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, - xvap.xva_xoptattrs.xoa_reparse); - FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, - xvap.xva_xoptattrs.xoa_offline); - FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, - xvap.xva_xoptattrs.xoa_sparse); -#undef FLAG_CHANGE - } - if (vap->va_birthtime.tv_sec != VNOVAL) { - xvap.xva_vattr.va_mask |= AT_XVATTR; - XVA_SET_REQ(&xvap, XAT_CREATETIME); - } - return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); -} - -static int -zfs_freebsd_rename(ap) - struct vop_rename_args /* { - struct vnode *a_fdvp; - struct vnode *a_fvp; - struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; - struct componentname *a_tcnp; - } */ *ap; -{ - vnode_t *fdvp = ap->a_fdvp; - vnode_t *fvp = ap->a_fvp; - vnode_t *tdvp = ap->a_tdvp; - vnode_t *tvp = ap->a_tvp; - int error; - - ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); - ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); - - error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, - ap->a_tcnp, ap->a_fcnp->cn_cred); - - vrele(fdvp); - vrele(fvp); - vrele(tdvp); - if (tvp != NULL) - vrele(tvp); - - return (error); -} - -static int -zfs_freebsd_symlink(ap) - struct vop_symlink_args /* { - struct vnode *a_dvp; - struct 
vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - char *a_target; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vattr_t *vap = ap->a_vap; - - ASSERT(cnp->cn_flags & SAVENAME); - - vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ - vattr_init_mask(vap); - - return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, - __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread)); -} - -static int -zfs_freebsd_readlink(ap) - struct vop_readlink_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); -} - -static int -zfs_freebsd_link(ap) - struct vop_link_args /* { - struct vnode *a_tdvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vnode_t *vp = ap->a_vp; - vnode_t *tdvp = ap->a_tdvp; - - if (tdvp->v_mount != vp->v_mount) - return (EXDEV); - - ASSERT(cnp->cn_flags & SAVENAME); - - return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); -} - -static int -zfs_freebsd_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - zfs_inactive(vp, ap->a_td->td_ucred, NULL); - return (0); -} - -static int -zfs_freebsd_need_inactive(ap) - struct vop_need_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int need; - - if (vn_need_pageq_flush(vp)) - return (1); - - if (!ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs)) - return (1); - need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - - return (need); -} - -static int -zfs_freebsd_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = 
zp->z_zfsvfs; - - ASSERT(zp != NULL); - - /* - * z_teardown_inactive_lock protects from a race with - * zfs_znode_dmu_fini in zfsvfs_teardown during - * force unmount. - */ - ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs); - if (zp->z_sa_hdl == NULL) - zfs_znode_free(zp); - else - zfs_zinactive(zp); - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - - vp->v_data = NULL; - return (0); -} - -static int -zfs_freebsd_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - - return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); -} - -static int -zfs_freebsd_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - register_t *a_retval; - } */ *ap; -{ - ulong_t val; - int error; - - error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); - if (error == 0) { - *ap->a_retval = val; - return (error); - } - if (error != EOPNOTSUPP) - return (error); - - switch (ap->a_name) { - case _PC_NAME_MAX: - *ap->a_retval = NAME_MAX; - return (0); - case _PC_PIPE_BUF: - if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { - *ap->a_retval = PIPE_BUF; - return (0); - } - return (EINVAL); - default: - return (vop_stdpathconf(ap)); - } -} - -/* - * FreeBSD's extended attributes namespace defines file name prefix for ZFS' - * extended attribute name: - * - * NAMESPACE PREFIX - * system freebsd:system: - * user (none, can be used to access ZFS fsattr(5) attributes - * created on Solaris) - */ -static int -zfs_create_attrname(int attrnamespace, const char *name, char *attrname, - size_t size) -{ - const char *namespace, *prefix, *suffix; - - /* We don't allow '/' character in attribute name. */ - if (strchr(name, '/') != NULL) - return (EINVAL); - /* We don't allow attribute names that start with "freebsd:" string. 
*/ - if (strncmp(name, "freebsd:", 8) == 0) - return (EINVAL); - - bzero(attrname, size); - - switch (attrnamespace) { - case EXTATTR_NAMESPACE_USER: -#if 0 - prefix = "freebsd:"; - namespace = EXTATTR_NAMESPACE_USER_STRING; - suffix = ":"; -#else - /* - * This is the default namespace by which we can access all - * attributes created on Solaris. - */ - prefix = namespace = suffix = ""; -#endif - break; - case EXTATTR_NAMESPACE_SYSTEM: - prefix = "freebsd:"; - namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; - suffix = ":"; - break; - case EXTATTR_NAMESPACE_EMPTY: - default: - return (EINVAL); - } - if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, - name) >= size) { - return (ENAMETOOLONG); - } - return (0); -} - -/* - * Vnode operating to retrieve a named extended attribute. - */ -static int -zfs_getextattr(struct vop_getextattr_args *ap) -/* -vop_getextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - IN const char *a_name; - INOUT struct uio *a_uio; - OUT size_t *a_size; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrname[255]; - struct vattr va; - vnode_t *xvp = NULL, *vp; - int error, flags; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VREAD); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, - sizeof(attrname)); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - flags = FREAD; - NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, - xvp, td); - error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error != 0) { - ZFS_EXIT(zfsvfs); - if (error == ENOENT) - error = 
ENOATTR; - return (error); - } - - if (ap->a_size != NULL) { - error = VOP_GETATTR(vp, &va, ap->a_cred); - if (error == 0) - *ap->a_size = (size_t)va.va_size; - } else if (ap->a_uio != NULL) - error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); - - VOP_UNLOCK(vp); - vn_close(vp, flags, ap->a_cred, td); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Vnode operation to remove a named attribute. - */ -int -zfs_deleteextattr(struct vop_deleteextattr_args *ap) -/* -vop_deleteextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - IN const char *a_name; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrname[255]; - struct vattr va; - vnode_t *xvp = NULL, *vp; - int error, flags; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VWRITE); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, - sizeof(attrname)); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, - UIO_SYSSPACE, attrname, xvp, td); - error = namei(&nd); - vp = nd.ni_vp; - if (error != 0) { - ZFS_EXIT(zfsvfs); - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error == ENOENT) - error = ENOATTR; - return (error); - } - - error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); - NDFREE(&nd, NDF_ONLY_PNBUF); - - vput(nd.ni_dvp); - if (vp == nd.ni_dvp) - vrele(vp); - else - vput(vp); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Vnode operation to set a named attribute. 
- */ -static int -zfs_setextattr(struct vop_setextattr_args *ap) -/* -vop_setextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - IN const char *a_name; - INOUT struct uio *a_uio; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrname[255]; - struct vattr va; - vnode_t *xvp = NULL, *vp; - int error, flags; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VWRITE); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, - sizeof(attrname)); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - flags = FFLAGS(O_WRONLY | O_CREAT); - NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, - xvp, td); - error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, - NULL); - vp = nd.ni_vp; - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - VATTR_NULL(&va); - va.va_size = 0; - error = VOP_SETATTR(vp, &va, ap->a_cred); - if (error == 0) - VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); - - VOP_UNLOCK(vp); - vn_close(vp, flags, ap->a_cred, td); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Vnode operation to retrieve extended attributes on a vnode. 
- */ -static int -zfs_listextattr(struct vop_listextattr_args *ap) -/* -vop_listextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - INOUT struct uio *a_uio; - OUT size_t *a_size; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrprefix[16]; - u_char dirbuf[sizeof(struct dirent)]; - struct dirent *dp; - struct iovec aiov; - struct uio auio, *uio = ap->a_uio; - size_t *sizep = ap->a_size; - size_t plen; - vnode_t *xvp = NULL, *vp; - int done, error, eof, pos; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VREAD); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, - sizeof(attrprefix)); - if (error != 0) - return (error); - plen = strlen(attrprefix); - - ZFS_ENTER(zfsvfs); - - if (sizep != NULL) - *sizep = 0; - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - /* - * ENOATTR means that the EA directory does not yet exist, - * i.e. there are no extended attributes there. 
- */ - if (error == ENOATTR) - error = 0; - return (error); - } - - NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, - UIO_SYSSPACE, ".", xvp, td); - error = namei(&nd); - vp = nd.ni_vp; - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_td = td; - auio.uio_rw = UIO_READ; - auio.uio_offset = 0; - - do { - u_char nlen; - - aiov.iov_base = (void *)dirbuf; - aiov.iov_len = sizeof(dirbuf); - auio.uio_resid = sizeof(dirbuf); - error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); - done = sizeof(dirbuf) - auio.uio_resid; - if (error != 0) - break; - for (pos = 0; pos < done;) { - dp = (struct dirent *)(dirbuf + pos); - pos += dp->d_reclen; - /* - * XXX: Temporarily we also accept DT_UNKNOWN, as this - * is what we get when attribute was created on Solaris. - */ - if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) - continue; - if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) - continue; - else if (strncmp(dp->d_name, attrprefix, plen) != 0) - continue; - nlen = dp->d_namlen - plen; - if (sizep != NULL) - *sizep += 1 + nlen; - else if (uio != NULL) { - /* - * Format of extattr name entry is one byte for - * length and the rest for name. 
- */ - error = uiomove(&nlen, 1, uio->uio_rw, uio); - if (error == 0) { - error = uiomove(dp->d_name + plen, nlen, - uio->uio_rw, uio); - } - if (error != 0) - break; - } - } - } while (!eof && error == 0); - - vput(vp); - ZFS_EXIT(zfsvfs); - - return (error); -} - -int -zfs_freebsd_getacl(ap) - struct vop_getacl_args /* { - struct vnode *vp; - acl_type_t type; - struct acl *aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - int error; - vsecattr_t vsecattr; - - if (ap->a_type != ACL_TYPE_NFS4) - return (EINVAL); - - vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; - if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) - return (error); - - error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); - if (vsecattr.vsa_aclentp != NULL) - kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); - - return (error); -} - -int -zfs_freebsd_setacl(ap) - struct vop_setacl_args /* { - struct vnode *vp; - acl_type_t type; - struct acl *aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - int error; - vsecattr_t vsecattr; - int aclbsize; /* size of acl list in bytes */ - aclent_t *aaclp; - - if (ap->a_type != ACL_TYPE_NFS4) - return (EINVAL); - - if (ap->a_aclp == NULL) - return (EINVAL); - - if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) - return (EINVAL); - - /* - * With NFSv4 ACLs, chmod(2) may need to add additional entries, - * splitting every entry into two and appending "canonical six" - * entries at the end. Don't allow for setting an ACL that would - * cause chmod(2) to run out of ACL entries. 
- */ - if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) - return (ENOSPC); - - error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); - if (error != 0) - return (error); - - vsecattr.vsa_mask = VSA_ACE; - aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); - vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); - aaclp = vsecattr.vsa_aclentp; - vsecattr.vsa_aclentsz = aclbsize; - - aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); - error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); - kmem_free(aaclp, aclbsize); - - return (error); -} - -int -zfs_freebsd_aclcheck(ap) - struct vop_aclcheck_args /* { - struct vnode *vp; - acl_type_t type; - struct acl *aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - - return (EOPNOTSUPP); -} - -static int -zfs_vptocnp(struct vop_vptocnp_args *ap) -{ - vnode_t *covered_vp; - vnode_t *vp = ap->a_vp;; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - znode_t *zp = VTOZ(vp); - enum vgetstate vs; - int ltype; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * If we are a snapshot mounted under .zfs, run the operation - * on the covered vnode. 
- */ - if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { - char name[MAXNAMLEN + 1]; - znode_t *dzp; - size_t len; - - error = zfs_znode_parent_and_name(zp, &dzp, name); - if (error == 0) { - len = strlen(name); - if (*ap->a_buflen < len) - error = SET_ERROR(ENOMEM); - } - if (error == 0) { - *ap->a_buflen -= len; - bcopy(name, ap->a_buf + *ap->a_buflen, len); - *ap->a_vpp = ZTOV(dzp); - } - ZFS_EXIT(zfsvfs); - return (error); - } - ZFS_EXIT(zfsvfs); - - covered_vp = vp->v_mount->mnt_vnodecovered; - vs = vget_prep(covered_vp); - ltype = VOP_ISLOCKED(vp); - VOP_UNLOCK(vp); - error = vget_finish(covered_vp, LK_SHARED, vs); - if (error == 0) { - error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, - ap->a_buf, ap->a_buflen); - vput(covered_vp); - } - vn_lock(vp, ltype | LK_RETRY); - if (VN_IS_DOOMED(vp)) - error = SET_ERROR(ENOENT); - return (error); -} - -#ifdef DIAGNOSTIC -static int -zfs_lock(ap) - struct vop_lock1_args /* { - struct vnode *a_vp; - int a_flags; - char *file; - int line; - } */ *ap; -{ - vnode_t *vp; - znode_t *zp; - int err; - - err = vop_lock(ap); - if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { - vp = ap->a_vp; - zp = vp->v_data; - if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) && - zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) - VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); - } - return (err); -} -#endif - -struct vop_vector zfs_vnodeops; -struct vop_vector zfs_fifoops; -struct vop_vector zfs_shareops; - -struct vop_vector zfs_vnodeops = { - .vop_default = &default_vnodeops, - .vop_inactive = zfs_freebsd_inactive, - .vop_need_inactive = zfs_freebsd_need_inactive, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, - .vop_access = zfs_freebsd_access, - .vop_allocate = VOP_EINVAL, - .vop_lookup = zfs_cache_lookup, - .vop_cachedlookup = zfs_freebsd_cachedlookup, - .vop_getattr = zfs_freebsd_getattr, - .vop_setattr = zfs_freebsd_setattr, - .vop_create = zfs_freebsd_create, - 
.vop_mknod = zfs_freebsd_create, - .vop_mkdir = zfs_freebsd_mkdir, - .vop_readdir = zfs_freebsd_readdir, - .vop_fsync = zfs_freebsd_fsync, - .vop_open = zfs_freebsd_open, - .vop_close = zfs_freebsd_close, - .vop_rmdir = zfs_freebsd_rmdir, - .vop_ioctl = zfs_freebsd_ioctl, - .vop_link = zfs_freebsd_link, - .vop_symlink = zfs_freebsd_symlink, - .vop_readlink = zfs_freebsd_readlink, - .vop_read = zfs_freebsd_read, - .vop_write = zfs_freebsd_write, - .vop_remove = zfs_freebsd_remove, - .vop_rename = zfs_freebsd_rename, - .vop_pathconf = zfs_freebsd_pathconf, - .vop_bmap = zfs_freebsd_bmap, - .vop_fid = zfs_freebsd_fid, - .vop_getextattr = zfs_getextattr, - .vop_deleteextattr = zfs_deleteextattr, - .vop_setextattr = zfs_setextattr, - .vop_listextattr = zfs_listextattr, - .vop_getacl = zfs_freebsd_getacl, - .vop_setacl = zfs_freebsd_setacl, - .vop_aclcheck = zfs_freebsd_aclcheck, - .vop_getpages = zfs_freebsd_getpages, - .vop_putpages = zfs_freebsd_putpages, - .vop_vptocnp = zfs_vptocnp, -#ifdef DIAGNOSTIC - .vop_lock1 = zfs_lock, -#else - .vop_lock1 = vop_lock, -#endif - .vop_unlock = vop_unlock, - .vop_islocked = vop_islocked, -}; -VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); - -struct vop_vector zfs_fifoops = { - .vop_default = &fifo_specops, - .vop_fsync = zfs_freebsd_fsync, - .vop_access = zfs_freebsd_access, - .vop_getattr = zfs_freebsd_getattr, - .vop_inactive = zfs_freebsd_inactive, - .vop_read = VOP_PANIC, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_setattr = zfs_freebsd_setattr, - .vop_write = VOP_PANIC, - .vop_pathconf = zfs_freebsd_pathconf, - .vop_fid = zfs_freebsd_fid, - .vop_getacl = zfs_freebsd_getacl, - .vop_setacl = zfs_freebsd_setacl, - .vop_aclcheck = zfs_freebsd_aclcheck, -}; -VFS_VOP_VECTOR_REGISTER(zfs_fifoops); - -/* - * special share hidden files vnode operations template - */ -struct vop_vector zfs_shareops = { - .vop_default = &default_vnodeops, - .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, - .vop_access = zfs_freebsd_access, - .vop_inactive = 
zfs_freebsd_inactive, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_fid = zfs_freebsd_fid, - .vop_pathconf = zfs_freebsd_pathconf, -}; -VFS_VOP_VECTOR_REGISTER(zfs_shareops); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c deleted file mode 100644 index ecc11d16f42a..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ /dev/null @@ -1,2388 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2007 Jeremy Teo */ -/* Portions Copyright 2011 Martin Matuska */ - -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif /* _KERNEL */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" -#include "zfs_comutil.h" - -/* Used by fstat(1). */ -SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)"); - -/* - * Define ZNODE_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. - */ -#ifdef DEBUG -#define ZNODE_STATS -#endif /* DEBUG */ - -#ifdef ZNODE_STATS -#define ZNODE_STAT_ADD(stat) ((stat)++) -#else -#define ZNODE_STAT_ADD(stat) /* nothing */ -#endif /* ZNODE_STATS */ - -/* - * Functions needed for userland (ie: libzpool) are not put under - * #ifdef_KERNEL; the rest of the functions have dependencies - * (such as VFS logic) that will not compile easily in userland. - */ -#ifdef _KERNEL -/* - * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to - * be freed before it can be safely accessed. - */ -krwlock_t zfsvfs_lock; - -#if defined(_KERNEL) && !defined(KMEM_DEBUG) -#define _ZFS_USE_SMR -static uma_zone_t znode_uma_zone; -#else -static kmem_cache_t *znode_cache = NULL; -#endif - -/*ARGSUSED*/ -static void -znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) -{ - /* - * We should never drop all dbuf refs without first clearing - * the eviction callback. 
- */ - panic("evicting znode %p\n", user_ptr); -} - -extern struct vop_vector zfs_vnodeops; -extern struct vop_vector zfs_fifoops; -extern struct vop_vector zfs_shareops; - -/* - * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on - * z_rangelock. It will modify the offset and length of the lock to reflect - * znode-specific information, and convert RL_APPEND to RL_WRITER. This is - * called with the rangelock_t's rl_lock held, which avoids races. - */ -static void -zfs_rangelock_cb(locked_range_t *new, void *arg) -{ - znode_t *zp = arg; - - /* - * If in append mode, convert to writer and lock starting at the - * current end of file. - */ - if (new->lr_type == RL_APPEND) { - new->lr_offset = zp->z_size; - new->lr_type = RL_WRITER; - } - - /* - * If we need to grow the block size then lock the whole file range. - */ - uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); - if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || - zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { - new->lr_offset = 0; - new->lr_length = UINT64_MAX; - } -} - -/*ARGSUSED*/ -static int -zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) -{ - znode_t *zp = buf; - - POINTER_INVALIDATE(&zp->z_zfsvfs); - - list_link_init(&zp->z_link_node); - - mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); - - rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); - - zp->z_acl_cached = NULL; - zp->z_vnode = NULL; - zp->z_moved = 0; - return (0); -} - -/*ARGSUSED*/ -static void -zfs_znode_cache_destructor(void *buf, void *arg) -{ - znode_t *zp = buf; - - ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - ASSERT3P(zp->z_vnode, ==, NULL); - ASSERT(!list_link_active(&zp->z_link_node)); - mutex_destroy(&zp->z_acl_lock); - rangelock_fini(&zp->z_rangelock); - - ASSERT(zp->z_acl_cached == NULL); -} - -#ifdef ZNODE_STATS -static struct { - uint64_t zms_zfsvfs_invalid; - uint64_t zms_zfsvfs_recheck1; - uint64_t zms_zfsvfs_unmounted; - uint64_t zms_zfsvfs_recheck2; - 
uint64_t zms_obj_held; - uint64_t zms_vnode_locked; - uint64_t zms_not_only_dnlc; -} znode_move_stats; -#endif /* ZNODE_STATS */ - -#ifdef illumos -static void -zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) -{ - vnode_t *vp; - - /* Copy fields. */ - nzp->z_zfsvfs = ozp->z_zfsvfs; - - /* Swap vnodes. */ - vp = nzp->z_vnode; - nzp->z_vnode = ozp->z_vnode; - ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ - ZTOV(ozp)->v_data = ozp; - ZTOV(nzp)->v_data = nzp; - - nzp->z_id = ozp->z_id; - ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ - nzp->z_unlinked = ozp->z_unlinked; - nzp->z_atime_dirty = ozp->z_atime_dirty; - nzp->z_zn_prefetch = ozp->z_zn_prefetch; - nzp->z_blksz = ozp->z_blksz; - nzp->z_seq = ozp->z_seq; - nzp->z_mapcnt = ozp->z_mapcnt; - nzp->z_gen = ozp->z_gen; - nzp->z_sync_cnt = ozp->z_sync_cnt; - nzp->z_is_sa = ozp->z_is_sa; - nzp->z_sa_hdl = ozp->z_sa_hdl; - bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2); - nzp->z_links = ozp->z_links; - nzp->z_size = ozp->z_size; - nzp->z_pflags = ozp->z_pflags; - nzp->z_uid = ozp->z_uid; - nzp->z_gid = ozp->z_gid; - nzp->z_mode = ozp->z_mode; - - /* - * Since this is just an idle znode and kmem is already dealing with - * memory pressure, release any cached ACL. - */ - if (ozp->z_acl_cached) { - zfs_acl_free(ozp->z_acl_cached); - ozp->z_acl_cached = NULL; - } - - sa_set_userp(nzp->z_sa_hdl, nzp); - - /* - * Invalidate the original znode by clearing fields that provide a - * pointer back to the znode. Set the low bit of the vfs pointer to - * ensure that zfs_znode_move() recognizes the znode as invalid in any - * subsequent callback. - */ - ozp->z_sa_hdl = NULL; - POINTER_INVALIDATE(&ozp->z_zfsvfs); - - /* - * Mark the znode. 
- */ - nzp->z_moved = 1; - ozp->z_moved = (uint8_t)-1; -} - -/*ARGSUSED*/ -static kmem_cbrc_t -zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) -{ - znode_t *ozp = buf, *nzp = newbuf; - zfsvfs_t *zfsvfs; - vnode_t *vp; - - /* - * The znode is on the file system's list of known znodes if the vfs - * pointer is valid. We set the low bit of the vfs pointer when freeing - * the znode to invalidate it, and the memory patterns written by kmem - * (baddcafe and deadbeef) set at least one of the two low bits. A newly - * created znode sets the vfs pointer last of all to indicate that the - * znode is known and in a valid state to be moved by this function. - */ - zfsvfs = ozp->z_zfsvfs; - if (!POINTER_IS_VALID(zfsvfs)) { - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * Close a small window in which it's possible that the filesystem could - * be unmounted and freed, and zfsvfs, though valid in the previous - * statement, could point to unrelated memory by the time we try to - * prevent the filesystem from being unmounted. - */ - rw_enter(&zfsvfs_lock, RW_WRITER); - if (zfsvfs != ozp->z_zfsvfs) { - rw_exit(&zfsvfs_lock); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * If the znode is still valid, then so is the file system. We know that - * no valid file system can be freed while we hold zfsvfs_lock, so we - * can safely ensure that the filesystem is not and will not be - * unmounted. The next statement is equivalent to ZFS_ENTER(). - */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); - if (zfsvfs->z_unmounted) { - ZFS_EXIT(zfsvfs); - rw_exit(&zfsvfs_lock); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); - return (KMEM_CBRC_DONT_KNOW); - } - rw_exit(&zfsvfs_lock); - - mutex_enter(&zfsvfs->z_znodes_lock); - /* - * Recheck the vfs pointer in case the znode was removed just before - * acquiring the lock. 
- */ - if (zfsvfs != ozp->z_zfsvfs) { - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * At this point we know that as long as we hold z_znodes_lock, the - * znode cannot be freed and fields within the znode can be safely - * accessed. Now, prevent a race with zfs_zget(). - */ - if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); - return (KMEM_CBRC_LATER); - } - - vp = ZTOV(ozp); - if (mutex_tryenter(&vp->v_lock) == 0) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); - return (KMEM_CBRC_LATER); - } - - /* Only move znodes that are referenced _only_ by the DNLC. */ - if (vp->v_count != 1 || !vn_in_dnlc(vp)) { - mutex_exit(&vp->v_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); - return (KMEM_CBRC_LATER); - } - - /* - * The znode is known and in a valid state to move. We're holding the - * locks needed to execute the critical section. 
- */ - zfs_znode_move_impl(ozp, nzp); - mutex_exit(&vp->v_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); - - list_link_replace(&ozp->z_link_node, &nzp->z_link_node); - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - - return (KMEM_CBRC_YES); -} -#endif /* illumos */ - -#ifdef _ZFS_USE_SMR -VFS_SMR_DECLARE; - -static int -zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags) -{ - - return (zfs_znode_cache_constructor(mem, private, flags)); -} - -static void -zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private) -{ - - zfs_znode_cache_destructor(mem, private); -} - -void -zfs_znode_init(void) -{ - /* - * Initialize zcache - */ - rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); - ASSERT(znode_uma_zone == NULL); - znode_uma_zone = uma_zcreate("zfs_znode_cache", - sizeof (znode_t), zfs_znode_cache_constructor_smr, - zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); - VFS_SMR_ZONE_SET(znode_uma_zone); -} - -static znode_t * -zfs_znode_alloc_kmem(int flags) -{ - - return (uma_zalloc_smr(znode_uma_zone, flags)); -} - -static void -zfs_znode_free_kmem(znode_t *zp) -{ - - uma_zfree_smr(znode_uma_zone, zp); -} -#else -void -zfs_znode_init(void) -{ - /* - * Initialize zcache - */ - rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); - ASSERT(znode_cache == NULL); - znode_cache = kmem_cache_create("zfs_znode_cache", - sizeof (znode_t), 0, zfs_znode_cache_constructor, - zfs_znode_cache_destructor, NULL, NULL, NULL, 0); - kmem_cache_set_move(znode_cache, zfs_znode_move); -} - -static znode_t * -zfs_znode_alloc_kmem(int flags) -{ - - return (kmem_cache_alloc(znode_cache, flags)); -} - -static void -zfs_znode_free_kmem(znode_t *zp) -{ - - kmem_cache_free(znode_cache, zp); -} -#endif - -void -zfs_znode_fini(void) -{ -#ifdef illumos - /* - * Cleanup vfs & vnode ops - */ - zfs_remove_op_tables(); -#endif - - /* - * Cleanup zcache - */ -#ifdef _ZFS_USE_SMR - if (znode_uma_zone) { - uma_zdestroy(znode_uma_zone); - 
znode_uma_zone = NULL; - } -#else - if (znode_cache) { - kmem_cache_destroy(znode_cache); - znode_cache = NULL; - } -#endif - rw_destroy(&zfsvfs_lock); -} - -#ifdef illumos -struct vnodeops *zfs_dvnodeops; -struct vnodeops *zfs_fvnodeops; -struct vnodeops *zfs_symvnodeops; -struct vnodeops *zfs_xdvnodeops; -struct vnodeops *zfs_evnodeops; -struct vnodeops *zfs_sharevnodeops; - -void -zfs_remove_op_tables() -{ - /* - * Remove vfs ops - */ - ASSERT(zfsfstype); - (void) vfs_freevfsops_by_type(zfsfstype); - zfsfstype = 0; - - /* - * Remove vnode ops - */ - if (zfs_dvnodeops) - vn_freevnodeops(zfs_dvnodeops); - if (zfs_fvnodeops) - vn_freevnodeops(zfs_fvnodeops); - if (zfs_symvnodeops) - vn_freevnodeops(zfs_symvnodeops); - if (zfs_xdvnodeops) - vn_freevnodeops(zfs_xdvnodeops); - if (zfs_evnodeops) - vn_freevnodeops(zfs_evnodeops); - if (zfs_sharevnodeops) - vn_freevnodeops(zfs_sharevnodeops); - - zfs_dvnodeops = NULL; - zfs_fvnodeops = NULL; - zfs_symvnodeops = NULL; - zfs_xdvnodeops = NULL; - zfs_evnodeops = NULL; - zfs_sharevnodeops = NULL; -} - -extern const fs_operation_def_t zfs_dvnodeops_template[]; -extern const fs_operation_def_t zfs_fvnodeops_template[]; -extern const fs_operation_def_t zfs_xdvnodeops_template[]; -extern const fs_operation_def_t zfs_symvnodeops_template[]; -extern const fs_operation_def_t zfs_evnodeops_template[]; -extern const fs_operation_def_t zfs_sharevnodeops_template[]; - -int -zfs_create_op_tables() -{ - int error; - - /* - * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() - * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). - * In this case we just return as the ops vectors are already set up. 
- */ - if (zfs_dvnodeops) - return (0); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, - &zfs_dvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, - &zfs_fvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, - &zfs_symvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, - &zfs_xdvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, - &zfs_evnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, - &zfs_sharevnodeops); - - return (error); -} -#endif /* illumos */ - -int -zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) -{ - zfs_acl_ids_t acl_ids; - vattr_t vattr; - znode_t *sharezp; - znode_t *zp; - int error; - - vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; - vattr.va_type = VDIR; - vattr.va_mode = S_IFDIR|0555; - vattr.va_uid = crgetuid(kcred); - vattr.va_gid = crgetgid(kcred); - - sharezp = zfs_znode_alloc_kmem(KM_SLEEP); - ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); - sharezp->z_moved = 0; - sharezp->z_unlinked = 0; - sharezp->z_atime_dirty = 0; - sharezp->z_zfsvfs = zfsvfs; - sharezp->z_is_sa = zfsvfs->z_use_sa; - - VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, - kcred, NULL, &acl_ids)); - zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); - ASSERT3P(zp, ==, sharezp); - POINTER_INVALIDATE(&sharezp->z_zfsvfs); - error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); - zfsvfs->z_shares_dir = sharezp->z_id; - - zfs_acl_ids_free(&acl_ids); - sa_handle_destroy(sharezp->z_sa_hdl); - zfs_znode_free_kmem(sharezp); - - return (error); -} - -/* - * define a couple of values we need available - * for both 64 and 32 bit environments. 
- */ -#ifndef NBITSMINOR64 -#define NBITSMINOR64 32 -#endif -#ifndef MAXMAJ64 -#define MAXMAJ64 0xffffffffUL -#endif -#ifndef MAXMIN64 -#define MAXMIN64 0xffffffffUL -#endif - -/* - * Create special expldev for ZFS private use. - * Can't use standard expldev since it doesn't do - * what we want. The standard expldev() takes a - * dev32_t in LP64 and expands it to a long dev_t. - * We need an interface that takes a dev32_t in ILP32 - * and expands it to a long dev_t. - */ -static uint64_t -zfs_expldev(dev_t dev) -{ - return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); -} -/* - * Special cmpldev for ZFS private use. - * Can't use standard cmpldev since it takes - * a long dev_t and compresses it to dev32_t in - * LP64. We need to do a compaction of a long dev_t - * to a dev32_t in ILP32. - */ -dev_t -zfs_cmpldev(uint64_t dev) -{ - return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); -} - -static void -zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, - dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) -{ - ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); - ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); - - ASSERT(zp->z_sa_hdl == NULL); - ASSERT(zp->z_acl_cached == NULL); - if (sa_hdl == NULL) { - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, - SA_HDL_SHARED, &zp->z_sa_hdl)); - } else { - zp->z_sa_hdl = sa_hdl; - sa_set_userp(sa_hdl, zp); - } - - zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; - - /* - * Slap on VROOT if we are the root znode unless we are the root - * node of a snapshot mounted under .zfs. 
- */ - if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) - ZTOV(zp)->v_flag |= VROOT; - - vn_exists(ZTOV(zp)); -} - -void -zfs_znode_dmu_fini(znode_t *zp) -{ - ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || - zp->z_unlinked || - ZFS_TEARDOWN_INACTIVE_WLOCKED(zp->z_zfsvfs)); - - sa_handle_destroy(zp->z_sa_hdl); - zp->z_sa_hdl = NULL; -} - -static void -zfs_vnode_forget(vnode_t *vp) -{ - - /* copied from insmntque_stddtr */ - vp->v_data = NULL; - vp->v_op = &dead_vnodeops; - vgone(vp); - vput(vp); -} - -/* - * Construct a new znode/vnode and intialize. - * - * This does not do a call to dmu_set_user() that is - * up to the caller to do, in case you don't want to - * return the znode - */ -static znode_t * -zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, - dmu_object_type_t obj_type, sa_handle_t *hdl) -{ - znode_t *zp; - vnode_t *vp; - uint64_t mode; - uint64_t parent; - sa_bulk_attr_t bulk[9]; - int count = 0; - int error; - - zp = zfs_znode_alloc_kmem(KM_SLEEP); - -#ifndef _ZFS_USE_SMR - KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0, - ("%s: fast path lookup enabled without smr", __func__)); -#endif - - KASSERT(curthread->td_vp_reserved != NULL, - ("zfs_znode_alloc: getnewvnode without preallocated vnode")); - error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); - if (error != 0) { - zfs_znode_free_kmem(zp); - return (NULL); - } - zp->z_vnode = vp; - vp->v_data = zp; - - ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - zp->z_moved = 0; - - /* - * Defer setting z_zfsvfs until the znode is ready to be a candidate for - * the zfs_znode_move() callback. 
- */ - zp->z_sa_hdl = NULL; - zp->z_unlinked = 0; - zp->z_atime_dirty = 0; - zp->z_mapcnt = 0; - zp->z_id = db->db_object; - zp->z_blksz = blksz; - zp->z_seq = 0x7A4653; - zp->z_sync_cnt = 0; - - vp = ZTOV(zp); - - zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &zp->z_atime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &zp->z_uid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &zp->z_gid, 8); - - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { - if (hdl == NULL) - sa_handle_destroy(zp->z_sa_hdl); - zfs_vnode_forget(vp); - zp->z_vnode = NULL; - zfs_znode_free_kmem(zp); - return (NULL); - } - - zp->z_mode = mode; - - vp->v_type = IFTOVT((mode_t)mode); - - switch (vp->v_type) { - case VDIR: - zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ - break; -#ifdef illumos - case VBLK: - case VCHR: - { - uint64_t rdev; - VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), - &rdev, sizeof (rdev)) == 0); - - vp->v_rdev = zfs_cmpldev(rdev); - } - break; -#endif - case VFIFO: -#ifdef illumos - case VSOCK: - case VDOOR: -#endif - vp->v_op = &zfs_fifoops; - break; - case VREG: - if (parent == zfsvfs->z_shares_dir) { - ASSERT(zp->z_uid == 0 && zp->z_gid == 0); - vp->v_op = &zfs_shareops; - } - break; -#ifdef illumos - case VLNK: - vn_setops(vp, zfs_symvnodeops); - break; - default: - vn_setops(vp, zfs_evnodeops); - break; -#endif - } - - mutex_enter(&zfsvfs->z_znodes_lock); - 
list_insert_tail(&zfsvfs->z_all_znodes, zp); - membar_producer(); - /* - * Everything else must be valid before assigning z_zfsvfs makes the - * znode eligible for zfs_znode_move(). - */ - zp->z_zfsvfs = zfsvfs; - mutex_exit(&zfsvfs->z_znodes_lock); - - /* - * Acquire vnode lock before making it available to the world. - */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - VN_LOCK_AREC(vp); - if (vp->v_type != VFIFO) - VN_LOCK_ASHARE(vp); - -#ifdef illumos - VFS_HOLD(zfsvfs->z_vfs); -#endif - return (zp); -} - -static uint64_t empty_xattr; -static uint64_t pad[4]; -static zfs_acl_phys_t acl_phys; -/* - * Create a new DMU object to hold a zfs znode. - * - * IN: dzp - parent directory for new znode - * vap - file attributes for new znode - * tx - dmu transaction id for zap operations - * cr - credentials of caller - * flag - flags: - * IS_ROOT_NODE - new object will be root - * IS_XATTR - new object is an attribute - * bonuslen - length of bonus buffer - * setaclp - File/Dir initial ACL - * fuidp - Tracks fuid allocation. 
- * - * OUT: zpp - allocated znode - * - */ -void -zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) -{ - uint64_t crtime[2], atime[2], mtime[2], ctime[2]; - uint64_t mode, size, links, parent, pflags; - uint64_t dzp_pflags = 0; - uint64_t rdev = 0; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - dmu_buf_t *db; - timestruc_t now; - uint64_t gen, obj; - int err; - int bonuslen; - int dnodesize; - sa_handle_t *sa_hdl; - dmu_object_type_t obj_type; - sa_bulk_attr_t *sa_attrs; - int cnt = 0; - zfs_acl_locator_cb_t locate = { 0 }; - - ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); - - if (zfsvfs->z_replay) { - obj = vap->va_nodeid; - now = vap->va_ctime; /* see zfs_replay_create() */ - gen = vap->va_nblocks; /* ditto */ - dnodesize = vap->va_fsid; /* ditto */ - } else { - obj = 0; - vfs_timestamp(&now); - gen = dmu_tx_get_txg(tx); - dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); - } - - if (dnodesize == 0) - dnodesize = DNODE_MIN_SIZE; - - obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; - bonuslen = (obj_type == DMU_OT_SA) ? - DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; - - /* - * Create a new DMU object. - */ - /* - * There's currently no mechanism for pre-reading the blocks that will - * be needed to allocate a new object, so we accept the small chance - * that there will be an i/o error and we will fail one of the - * assertions below. 
- */ - if (vap->va_type == VDIR) { - if (zfsvfs->z_replay) { - VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, - zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, dnodesize, tx)); - } else { - obj = zap_create_norm_dnsize(zfsvfs->z_os, - zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, dnodesize, tx); - } - } else { - if (zfsvfs->z_replay) { - VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, dnodesize, tx)); - } else { - obj = dmu_object_alloc_dnsize(zfsvfs->z_os, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, dnodesize, tx); - } - } - - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); - - /* - * If this is the root, fix up the half-initialized parent pointer - * to reference the just-allocated physical data area. - */ - if (flag & IS_ROOT_NODE) { - dzp->z_id = obj; - } else { - dzp_pflags = dzp->z_pflags; - } - - /* - * If parent is an xattr, so am I. - */ - if (dzp_pflags & ZFS_XATTR) { - flag |= IS_XATTR; - } - - if (zfsvfs->z_use_fuids) - pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; - else - pflags = 0; - - if (vap->va_type == VDIR) { - size = 2; /* contents ("." and "..") */ - links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; - } else { - size = links = 0; - } - - if (vap->va_type == VBLK || vap->va_type == VCHR) { - rdev = zfs_expldev(vap->va_rdev); - } - - parent = dzp->z_id; - mode = acl_ids->z_mode; - if (flag & IS_XATTR) - pflags |= ZFS_XATTR; - - /* - * No execs denied will be deterimed when zfs_mode_compute() is called. 
- */ - pflags |= acl_ids->z_aclp->z_hints & - (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| - ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); - - ZFS_TIME_ENCODE(&now, crtime); - ZFS_TIME_ENCODE(&now, ctime); - - if (vap->va_mask & AT_ATIME) { - ZFS_TIME_ENCODE(&vap->va_atime, atime); - } else { - ZFS_TIME_ENCODE(&now, atime); - } - - if (vap->va_mask & AT_MTIME) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - } else { - ZFS_TIME_ENCODE(&now, mtime); - } - - /* Now add in all of the "SA" attributes */ - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, - &sa_hdl)); - - /* - * Setup the array of attributes to be replaced/set on the new file - * - * order for DMU_OT_ZNODE is critical since it needs to be constructed - * in the old znode_phys_t format. Don't change this ordering - */ - sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); - - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), - NULL, &atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), - NULL, &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), - NULL, &crtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), - NULL, &gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), - NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), - NULL, &size, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 8); - } else { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), - NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), - NULL, &size, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), - NULL, &gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), - NULL, &acl_ids->z_fuid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), - NULL, &acl_ids->z_fgid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 
8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), - NULL, &pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), - NULL, &atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), - NULL, &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), - NULL, &crtime, 16); - } - - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); - - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, - &empty_xattr, 8); - } - if (obj_type == DMU_OT_ZNODE || - (vap->va_type == VBLK || vap->va_type == VCHR)) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), - NULL, &rdev, 8); - - } - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), - NULL, &pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, - &acl_ids->z_fuid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, - &acl_ids->z_fgid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, - sizeof (uint64_t) * 4); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &acl_phys, sizeof (zfs_acl_phys_t)); - } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, - &acl_ids->z_aclp->z_acl_count, 8); - locate.cb_aclp = acl_ids->z_aclp; - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, - acl_ids->z_aclp->z_acl_bytes); - mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, - acl_ids->z_fuid, acl_ids->z_fgid); - } - - VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); - - if (!(flag & IS_ROOT_NODE)) { - *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); - ASSERT(*zpp != NULL); - } else { - /* - * If we are creating the root node, the "parent" we - * passed in is the znode for the root. 
- */ - *zpp = dzp; - - (*zpp)->z_sa_hdl = sa_hdl; - } - - (*zpp)->z_pflags = pflags; - (*zpp)->z_mode = mode; - (*zpp)->z_dnodesize = dnodesize; - - if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); - - if (obj_type == DMU_OT_ZNODE || - acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { - VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); - } - if (!(flag & IS_ROOT_NODE)) { - vnode_t *vp; - - vp = ZTOV(*zpp); - vp->v_vflag |= VV_FORCEINSMQ; - err = insmntque(vp, zfsvfs->z_vfs); - vp->v_vflag &= ~VV_FORCEINSMQ; - KASSERT(err == 0, ("insmntque() failed: error %d", err)); - } - kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); -} - -/* - * Update in-core attributes. It is assumed the caller will be doing an - * sa_bulk_update to push the changes out. - */ -void -zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) -{ - xoptattr_t *xoap; - - xoap = xva_getxoptattr(xvap); - ASSERT(xoap); - - ASSERT_VOP_IN_SEQC(ZTOV(zp)); - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - uint64_t times[2]; - ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), - ×, sizeof (times), tx); - XVA_SET_RTN(xvap, XAT_CREATETIME); - } - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_READONLY); - } - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, 
XAT_IMMUTABLE); - } - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - } - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, - xoap->xoa_av_quarantined, zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - zfs_sa_set_scanstamp(zp, xvap, tx); - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); - } - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_REPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { - ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_OFFLINE); - } - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { - ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_SPARSE); - } -} - -int -zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) -{ - dmu_object_info_t doi; - dmu_buf_t *db; - znode_t *zp; - vnode_t *vp; - sa_handle_t *hdl; - struct thread *td; - int locked; - int err; - - td = curthread; - getnewvnode_reserve(); -again: - *zpp = NULL; - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - - err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); - if (err) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - 
getnewvnode_drop_reserve(); - return (err); - } - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_SA && - (doi.doi_bonus_type != DMU_OT_ZNODE || - (doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)))) { - sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); -#ifdef __FreeBSD__ - getnewvnode_drop_reserve(); -#endif - return (SET_ERROR(EINVAL)); - } - - hdl = dmu_buf_get_user(db); - if (hdl != NULL) { - zp = sa_get_userdata(hdl); - - /* - * Since "SA" does immediate eviction we - * should never find a sa handle that doesn't - * know about the znode. - */ - ASSERT3P(zp, !=, NULL); - ASSERT3U(zp->z_id, ==, obj_num); - if (zp->z_unlinked) { - err = SET_ERROR(ENOENT); - } else { - vp = ZTOV(zp); - /* - * Don't let the vnode disappear after - * ZFS_OBJ_HOLD_EXIT. - */ - VN_HOLD(vp); - *zpp = zp; - err = 0; - } - - sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - - if (err) { - getnewvnode_drop_reserve(); - return (err); - } - - locked = VOP_ISLOCKED(vp); - VI_LOCK(vp); - if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { - /* - * The vnode is doomed and this thread doesn't - * hold the exclusive lock on it, so the vnode - * must be being reclaimed by another thread. - * Otherwise the doomed vnode is being reclaimed - * by this thread and zfs_zget is called from - * ZIL internals. - */ - VI_UNLOCK(vp); - - /* - * XXX vrele() locks the vnode when the last reference - * is dropped. Although in this case the vnode is - * doomed / dead and so no inactivation is required, - * the vnode lock is still acquired. That could result - * in a LOR with z_teardown_lock if another thread holds - * the vnode's lock and tries to take z_teardown_lock. - * But that is only possible if the other thread peforms - * a ZFS vnode operation on the vnode. That either - * should not happen if the vnode is dead or the thread - * should also have a refrence to the vnode and thus - * our reference is not last. 
- */ - VN_RELE(vp); - goto again; - } - VI_UNLOCK(vp); - getnewvnode_drop_reserve(); - return (err); - } - - /* - * Not found create new znode/vnode - * but only if file exists. - * - * There is a small window where zfs_vget() could - * find this object while a file create is still in - * progress. This is checked for in zfs_znode_alloc() - * - * if zfs_znode_alloc() fails it will drop the hold on the - * bonus buffer. - */ - zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, - doi.doi_bonus_type, NULL); - if (zp == NULL) { - err = SET_ERROR(ENOENT); - } else { - *zpp = zp; - } - if (err == 0) { - vnode_t *vp = ZTOV(zp); - - err = insmntque(vp, zfsvfs->z_vfs); - if (err == 0) { - vp->v_hash = obj_num; - VOP_UNLOCK(vp); - } else { - zp->z_vnode = NULL; - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - *zpp = NULL; - } - } - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - getnewvnode_drop_reserve(); - return (err); -} - -int -zfs_rezget(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_object_info_t doi; - dmu_buf_t *db; - vnode_t *vp; - uint64_t obj_num = zp->z_id; - uint64_t mode, size; - sa_bulk_attr_t bulk[8]; - int err; - int count = 0; - uint64_t gen; - - /* - * Remove cached pages before reloading the znode, so that they are not - * lingering after we run into any error. Ideally, we should vgone() - * the vnode in case of error, but currently we cannot do that - * because of the LOR between the vnode lock and z_teardown_lock. - * So, instead, we have to "doom" the znode in the illumos style. 
- */ - vp = ZTOV(zp); - vn_pages_remove(vp, 0, 0); - - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - - mutex_enter(&zp->z_acl_lock); - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - mutex_exit(&zp->z_acl_lock); - ASSERT(zp->z_sa_hdl == NULL); - err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); - if (err) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (err); - } - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_SA && - (doi.doi_bonus_type != DMU_OT_ZNODE || - (doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)))) { - sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EINVAL)); - } - - zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); - size = zp->z_size; - - /* reload cached values */ - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, - &gen, sizeof (gen)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, sizeof (zp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, sizeof (zp->z_links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &zp->z_atime, sizeof (zp->z_atime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &zp->z_uid, sizeof (zp->z_uid)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &zp->z_gid, sizeof (zp->z_gid)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &mode, sizeof (mode)); - - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EIO)); - } - - zp->z_mode = mode; - - if (gen != zp->z_gen) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EIO)); - } - - /* - * It is highly improbable but still quite possible that two - * objects in different datasets are created with the 
same - * object numbers and in transaction groups with the same - * numbers. znodes corresponding to those objects would - * have the same z_id and z_gen, but their other attributes - * may be different. - * zfs recv -F may replace one of such objects with the other. - * As a result file properties recorded in the replaced - * object's vnode may no longer match the received object's - * properties. At present the only cached property is the - * files type recorded in v_type. - * So, handle this case by leaving the old vnode and znode - * disassociated from the actual object. A new vnode and a - * znode will be created if the object is accessed - * (e.g. via a look-up). The old vnode and znode will be - * recycled when the last vnode reference is dropped. - */ - if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EIO)); - } - - /* - * If the file has zero links, then it has been unlinked on the send - * side and it must be in the received unlinked set. - * We call zfs_znode_dmu_fini() now to prevent any accesses to the - * stale data and to prevent automatical removal of the file in - * zfs_zinactive(). The file will be removed either when it is removed - * on the send side and the next incremental stream is received or - * when the unlinked set gets processed. 
- */ - zp->z_unlinked = (zp->z_links == 0); - if (zp->z_unlinked) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (0); - } - - zp->z_blksz = doi.doi_data_block_size; - if (zp->z_size != size) - vnode_pager_setsize(vp, zp->z_size); - - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - - return (0); -} - -void -zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; - uint64_t obj = zp->z_id; - uint64_t acl_obj = zfs_external_acl(zp); - - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - if (acl_obj) { - VERIFY(!zp->z_is_sa); - VERIFY(0 == dmu_object_free(os, acl_obj, tx)); - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); - zfs_znode_free(zp); -} - -void -zfs_zinactive(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t z_id = zp->z_id; - - ASSERT(zp->z_sa_hdl); - - /* - * Don't allow a zfs_zget() while were trying to release this znode - */ - ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); - - /* - * If this was the last reference to a file with no links, remove - * the file from the file system unless the file system is mounted - * read-only. That can happen, for example, if the file system was - * originally read-write, the file was opened, then unlinked and - * the file system was made read-only before the file was finally - * closed. The file will remain in the unlinked set. 
- */ - if (zp->z_unlinked) { - ASSERT(!zfsvfs->z_issnap); - if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - zfs_rmnode(zp); - return; - } - } - - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - zfs_znode_free(zp); -} - -void -zfs_znode_free(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ASSERT(zp->z_sa_hdl == NULL); - zp->z_vnode = NULL; - mutex_enter(&zfsvfs->z_znodes_lock); - POINTER_INVALIDATE(&zp->z_zfsvfs); - list_remove(&zfsvfs->z_all_znodes, zp); - mutex_exit(&zfsvfs->z_znodes_lock); - - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - zfs_znode_free_kmem(zp); - -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#endif -} - -void -zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], - uint64_t ctime[2], boolean_t have_tx) -{ - timestruc_t now; - - vfs_timestamp(&now); - - if (have_tx) { /* will sa_bulk_update happen really soon? */ - zp->z_atime_dirty = 0; - zp->z_seq++; - } else { - zp->z_atime_dirty = 1; - } - - if (flag & AT_ATIME) { - ZFS_TIME_ENCODE(&now, zp->z_atime); - } - - if (flag & AT_MTIME) { - ZFS_TIME_ENCODE(&now, mtime); - if (zp->z_zfsvfs->z_use_fuids) { - zp->z_pflags |= (ZFS_ARCHIVE | - ZFS_AV_MODIFIED); - } - } - - if (flag & AT_CTIME) { - ZFS_TIME_ENCODE(&now, ctime); - if (zp->z_zfsvfs->z_use_fuids) - zp->z_pflags |= ZFS_ARCHIVE; - } -} - -/* - * Grow the block size for a file. - * - * IN: zp - znode of file to free data in. - * size - requested block size - * tx - open transaction. - * - * NOTE: this function assumes that the znode is write locked. - */ -void -zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) -{ - int error; - u_longlong_t dummy; - - if (size <= zp->z_blksz) - return; - /* - * If the file size is already greater than the current blocksize, - * we will not grow. If there is more than one block in a file, - * the blocksize cannot change. 
- */ - if (zp->z_blksz && zp->z_size > zp->z_blksz) - return; - - error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, - size, 0, tx); - - if (error == ENOTSUP) - return; - ASSERT0(error); - - /* What blocksize did we actually get? */ - dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); -} - -#ifdef illumos -/* - * This is a dummy interface used when pvn_vplist_dirty() should *not* - * be calling back into the fs for a putpage(). E.g.: when truncating - * a file, the pages being "thrown away* don't need to be written out. - */ -/* ARGSUSED */ -static int -zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, - int flags, cred_t *cr) -{ - ASSERT(0); - return (0); -} -#endif - -/* - * Increase the file length - * - * IN: zp - znode of file to free data in. - * end - new end-of-file - * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_extend(znode_t *zp, uint64_t end) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_tx_t *tx; - locked_range_t *lr; - uint64_t newblksz; - int error; - - /* - * We will change zp_size, lock the whole file. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (end <= zp->z_size) { - rangelock_exit(lr); - return (0); - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - if (end > zp->z_blksz && - (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { - /* - * We are growing the file past the current block size. - */ - if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. 
- */ - ASSERT(!ISP2(zp->z_blksz)); - newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); - } else { - newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); - } - dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); - } else { - newblksz = 0; - } - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - rangelock_exit(lr); - return (error); - } - - if (newblksz) - zfs_grow_blocksize(zp, newblksz, tx); - - zp->z_size = end; - - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx)); - - vnode_pager_setsize(ZTOV(zp), end); - - rangelock_exit(lr); - - dmu_tx_commit(tx); - - return (0); -} - -/* - * Free space in a file. - * - * IN: zp - znode of file to free data in. - * off - start of section to free. - * len - length of section to free. - * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - locked_range_t *lr; - int error; - - /* - * Lock the range being freed. - */ - lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (off >= zp->z_size) { - rangelock_exit(lr); - return (0); - } - - if (off + len > zp->z_size) - len = zp->z_size - off; - - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); - - if (error == 0) { - /* - * In FreeBSD we cannot free block in the middle of a file, - * but only at the end of a file, so this code path should - * never happen. - */ - vnode_pager_setsize(ZTOV(zp), off); - } - - rangelock_exit(lr); - - return (error); -} - -/* - * Truncate a file - * - * IN: zp - znode of file to free data in. - * end - new end-of-file. 
- * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_trunc(znode_t *zp, uint64_t end) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; - locked_range_t *lr; - int error; - sa_bulk_attr_t bulk[2]; - int count = 0; - - /* - * We will change zp_size, lock the whole file. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (end >= zp->z_size) { - rangelock_exit(lr); - return (0); - } - - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, - DMU_OBJECT_END); - if (error) { - rangelock_exit(lr); - return (error); - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - rangelock_exit(lr); - return (error); - } - - zp->z_size = end; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), - NULL, &zp->z_size, sizeof (zp->z_size)); - - if (end == 0) { - zp->z_pflags &= ~ZFS_SPARSE; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, 8); - } - VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); - - dmu_tx_commit(tx); - - /* - * Clear any mapped pages in the truncated region. This has to - * happen outside of the transaction to avoid the possibility of - * a deadlock with someone trying to push a page that we are - * about to invalidate. - */ - vnode_pager_setsize(vp, end); - - rangelock_exit(lr); - - return (0); -} - -/* - * Free space in a file - * - * IN: zp - znode of file to free data in. - * off - start of range - * len - end of range (0 => EOF) - * flag - current file open mode flags. 
- * log - TRUE if this action should be logged - * - * RETURN: 0 on success, error code on failure - */ -int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) -{ - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - uint64_t mode; - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int count = 0; - int error; - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, - sizeof (mode))) != 0) - return (error); - - if (off > zp->z_size) { - error = zfs_extend(zp, off+len); - if (error == 0 && log) - goto log; - else - return (error); - } - - /* - * Check for any locks in the region to be freed. - */ - - if (MANDLOCK(vp, (mode_t)mode)) { - uint64_t length = (len ? len : zp->z_size - off); - if (error = chklock(vp, FWRITE, off, length, flag, NULL)) - return (error); - } - - if (len == 0) { - error = zfs_trunc(zp, off); - } else { - if ((error = zfs_free_range(zp, off, len)) == 0 && - off + len > zp->z_size) - error = zfs_extend(zp, off+len); - } - if (error || !log) - return (error); -log: - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, 8); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - - zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); - - dmu_tx_commit(tx); - return (0); -} - -void -zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) -{ - uint64_t moid, obj, sa_obj, version; - uint64_t sense = ZFS_CASE_SENSITIVE; - uint64_t norm = 0; - nvpair_t *elem; - 
int error; - int i; - znode_t *rootzp = NULL; - zfsvfs_t *zfsvfs; - vattr_t vattr; - znode_t *zp; - zfs_acl_ids_t acl_ids; - - /* - * First attempt to create master node. - */ - /* - * In an empty objset, there are no blocks to read and thus - * there can be no i/o errors (which we assert below). - */ - moid = MASTER_NODE_OBJ; - error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - /* - * Set starting attributes. - */ - version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); - elem = NULL; - while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { - /* For the moment we expect all zpl props to be uint64_ts */ - uint64_t val; - char *name; - - ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); - VERIFY(nvpair_value_uint64(elem, &val) == 0); - name = nvpair_name(elem); - if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { - if (val < version) - version = val; - } else { - error = zap_update(os, moid, name, 8, 1, &val, tx); - } - ASSERT(error == 0); - if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) - norm = val; - else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) - sense = val; - } - ASSERT(version != 0); - error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); - - /* - * Create zap object used for SA attribute registration - */ - - if (version >= ZPL_VERSION_SA) { - sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, - DMU_OT_NONE, 0, tx); - error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT(error == 0); - } else { - sa_obj = 0; - } - /* - * Create a delete queue. - */ - obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); - ASSERT(error == 0); - - /* - * Create root znode. Create minimal znode/vnode/zfsvfs - * to allow zfs_mknode to work. 
- */ - VATTR_NULL(&vattr); - vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; - vattr.va_type = VDIR; - vattr.va_mode = S_IFDIR|0755; - vattr.va_uid = crgetuid(cr); - vattr.va_gid = crgetgid(cr); - - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - - rootzp = zfs_znode_alloc_kmem(KM_SLEEP); - ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); - rootzp->z_moved = 0; - rootzp->z_unlinked = 0; - rootzp->z_atime_dirty = 0; - rootzp->z_is_sa = USE_SA(version, os); - - zfsvfs->z_os = os; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_version = version; - zfsvfs->z_use_fuids = USE_FUIDS(version, os); - zfsvfs->z_use_sa = USE_SA(version, os); - zfsvfs->z_norm = norm; - - error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, - &zfsvfs->z_attr_table); - - ASSERT(error == 0); - - /* - * Fold case on file systems that are always or sometimes case - * insensitive. - */ - if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - - rootzp->z_zfsvfs = zfsvfs; - VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, - cr, NULL, &acl_ids)); - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); - ASSERT3P(zp, ==, rootzp); - error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); - ASSERT(error == 0); - zfs_acl_ids_free(&acl_ids); - POINTER_INVALIDATE(&rootzp->z_zfsvfs); - - sa_handle_destroy(rootzp->z_sa_hdl); - zfs_znode_free_kmem(rootzp); - - /* - * Create shares directory - */ - - error = zfs_create_share_dir(zfsvfs, tx); - - ASSERT(error == 0); - - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); -} -#endif /* _KERNEL */ - -static int -zfs_sa_setup(objset_t *osp, sa_attr_type_t 
**sa_table) -{ - uint64_t sa_obj = 0; - int error; - - error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); - if (error != 0 && error != ENOENT) - return (error); - - error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); - return (error); -} - -static int -zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, - dmu_buf_t **db, void *tag) -{ - dmu_object_info_t doi; - int error; - - if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) - return (error); - - dmu_object_info_from_db(*db, &doi); - if ((doi.doi_bonus_type != DMU_OT_SA && - doi.doi_bonus_type != DMU_OT_ZNODE) || - doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)) { - sa_buf_rele(*db, tag); - return (SET_ERROR(ENOTSUP)); - } - - error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); - if (error != 0) { - sa_buf_rele(*db, tag); - return (error); - } - - return (0); -} - -void -zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) -{ - sa_handle_destroy(hdl); - sa_buf_rele(db, tag); -} - -/* - * Given an object number, return its parent object number and whether - * or not the object is an extended attribute directory. - */ -static int -zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, - uint64_t *pobjp, int *is_xattrdir) -{ - uint64_t parent; - uint64_t pflags; - uint64_t mode; - uint64_t parent_mode; - sa_bulk_attr_t bulk[3]; - sa_handle_t *sa_hdl; - dmu_buf_t *sa_db; - int count = 0; - int error; - - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, - &parent, sizeof (parent)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, - &pflags, sizeof (pflags)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, - &mode, sizeof (mode)); - - if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) - return (error); - - /* - * When a link is removed its parent pointer is not changed and will - * be invalid. 
There are two cases where a link is removed but the - * file stays around, when it goes to the delete queue and when there - * are additional links. - */ - error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); - if (error != 0) - return (error); - - error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); - zfs_release_sa_handle(sa_hdl, sa_db, FTAG); - if (error != 0) - return (error); - - *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); - - /* - * Extended attributes can be applied to files, directories, etc. - * Otherwise the parent must be a directory. - */ - if (!*is_xattrdir && !S_ISDIR(parent_mode)) - return (SET_ERROR(EINVAL)); - - *pobjp = parent; - - return (0); -} - -/* - * Given an object number, return some zpl level statistics - */ -static int -zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, - zfs_stat_t *sb) -{ - sa_bulk_attr_t bulk[4]; - int count = 0; - - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, - &sb->zs_mode, sizeof (sb->zs_mode)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, - &sb->zs_gen, sizeof (sb->zs_gen)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, - &sb->zs_links, sizeof (sb->zs_links)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, - &sb->zs_ctime, sizeof (sb->zs_ctime)); - - return (sa_bulk_lookup(hdl, bulk, count)); -} - -static int -zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, - sa_attr_type_t *sa_table, char *buf, int len) -{ - sa_handle_t *sa_hdl; - sa_handle_t *prevhdl = NULL; - dmu_buf_t *prevdb = NULL; - dmu_buf_t *sa_db = NULL; - char *path = buf + len - 1; - int error; - - *path = '\0'; - sa_hdl = hdl; - - uint64_t deleteq_obj; - VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, - ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); - error = zap_lookup_int(osp, deleteq_obj, obj); - if (error == 0) { - return (ESTALE); - } else if (error != ENOENT) { - return (error); - } - error = 0; - - for (;;) { - 
uint64_t pobj; - char component[MAXNAMELEN + 2]; - size_t complen; - int is_xattrdir; - - if (prevdb) - zfs_release_sa_handle(prevhdl, prevdb, FTAG); - - if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, - &is_xattrdir)) != 0) - break; - - if (pobj == obj) { - if (path[0] != '/') - *--path = '/'; - break; - } - - component[0] = '/'; - if (is_xattrdir) { - (void) sprintf(component + 1, ""); - } else { - error = zap_value_search(osp, pobj, obj, - ZFS_DIRENT_OBJ(-1ULL), component + 1); - if (error != 0) - break; - } - - complen = strlen(component); - path -= complen; - ASSERT(path >= buf); - bcopy(component, path, complen); - obj = pobj; - - if (sa_hdl != hdl) { - prevhdl = sa_hdl; - prevdb = sa_db; - } - error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); - if (error != 0) { - sa_hdl = prevhdl; - sa_db = prevdb; - break; - } - } - - if (sa_hdl != NULL && sa_hdl != hdl) { - ASSERT(sa_db != NULL); - zfs_release_sa_handle(sa_hdl, sa_db, FTAG); - } - - if (error == 0) - (void) memmove(buf, path, buf + len - path); - - return (error); -} - -int -zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) -{ - sa_attr_type_t *sa_table; - sa_handle_t *hdl; - dmu_buf_t *db; - int error; - - error = zfs_sa_setup(osp, &sa_table); - if (error != 0) - return (error); - - error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); - if (error != 0) - return (error); - - error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - - zfs_release_sa_handle(hdl, db, FTAG); - return (error); -} - -int -zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, - char *buf, int len) -{ - char *path = buf + len - 1; - sa_attr_type_t *sa_table; - sa_handle_t *hdl; - dmu_buf_t *db; - int error; - - *path = '\0'; - - error = zfs_sa_setup(osp, &sa_table); - if (error != 0) - return (error); - - error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); - if (error != 0) - return (error); - - error = zfs_obj_to_stats_impl(hdl, sa_table, sb); - if (error != 0) { - 
zfs_release_sa_handle(hdl, db, FTAG); - return (error); - } - - error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - - zfs_release_sa_handle(hdl, db, FTAG); - return (error); -} - -#ifdef _KERNEL -int -zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t parent; - int is_xattrdir; - int err; - - /* Extended attributes should not be visible as regular files. */ - if ((zp->z_pflags & ZFS_XATTR) != 0) - return (SET_ERROR(EINVAL)); - - err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, - &parent, &is_xattrdir); - if (err != 0) - return (err); - ASSERT0(is_xattrdir); - - /* No name as this is a root object. */ - if (parent == zp->z_id) - return (SET_ERROR(EINVAL)); - - err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, - ZFS_DIRENT_OBJ(-1ULL), buf); - if (err != 0) - return (err); - err = zfs_zget(zfsvfs, parent, dzpp); - return (err); -} -#endif /* _KERNEL */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c deleted file mode 100644 index a2b9f9bbeaa0..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ /dev/null @@ -1,3499 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system - * calls that change the file system. Each itx has enough information to - * be able to replay them after a system crash, power loss, or - * equivalent failure mode. These are stored in memory until either: - * - * 1. they are committed to the pool by the DMU transaction group - * (txg), at which point they can be discarded; or - * 2. they are committed to the on-disk ZIL for the dataset being - * modified (e.g. due to an fsync, O_DSYNC, or other synchronous - * requirement). - * - * In the event of a crash or power loss, the itxs contained by each - * dataset's on-disk ZIL will be replayed when that dataset is first - * instantianted (e.g. if the dataset is a normal fileystem, when it is - * first mounted). - * - * As hinted at above, there is one ZIL per dataset (both the in-memory - * representation, and the on-disk representation). The on-disk format - * consists of 3 parts: - * - * - a single, per-dataset, ZIL header; which points to a chain of - * - zero or more ZIL blocks; each of which contains - * - zero or more ZIL records - * - * A ZIL record holds the information necessary to replay a single - * system call transaction. A ZIL block can hold many ZIL records, and - * the blocks are chained together, similarly to a singly linked list. 
- * - * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL - * block in the chain, and the ZIL header points to the first block in - * the chain. - * - * Note, there is not a fixed place in the pool to hold these ZIL - * blocks; they are dynamically allocated and freed as needed from the - * blocks available on the pool, though they can be preferentially - * allocated from a dedicated "log" vdev. - */ - -/* - * This controls the amount of time that a ZIL block (lwb) will remain - * "open" when it isn't "full", and it has a thread waiting for it to be - * committed to stable storage. Please refer to the zil_commit_waiter() - * function (and the comments within it) for more details. - */ -int zfs_commit_timeout_pct = 5; - -/* - * Disable intent logging replay. This global ZIL switch affects all pools. - */ -int zil_replay_disable = 0; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN, - &zil_replay_disable, 0, "Disable intent logging replay"); - -/* - * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to - * the disk(s) by the ZIL after an LWB write has completed. Setting this - * will cause ZIL corruption on power loss if a volatile out-of-order - * write cache is enabled. - */ -boolean_t zil_nocacheflush = B_FALSE; -SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_nocacheflush, CTLFLAG_RWTUN, - &zil_nocacheflush, 0, "Disable ZIL cache flush"); - -boolean_t zfs_trim_enabled = B_TRUE; -SYSCTL_DECL(_vfs_zfs_trim); -SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, - "Enable ZFS TRIM"); - -/* - * Limit SLOG write size per commit executed with synchronous priority. - * Any writes above that will be executed with lower (asynchronous) priority - * to limit potential SLOG device abuse by single active ZIL writer. 
- */ -uint64_t zil_slog_bulk = 768 * 1024; -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN, - &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority"); - -static kmem_cache_t *zil_lwb_cache; -static kmem_cache_t *zil_zcw_cache; - -#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ - sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) - -static int -zil_bp_compare(const void *x1, const void *x2) -{ - const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; - const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - - int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); -} - -static void -zil_bp_tree_init(zilog_t *zilog) -{ - avl_create(&zilog->zl_bp_tree, zil_bp_compare, - sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); -} - -static void -zil_bp_tree_fini(zilog_t *zilog) -{ - avl_tree_t *t = &zilog->zl_bp_tree; - zil_bp_node_t *zn; - void *cookie = NULL; - - while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zn, sizeof (zil_bp_node_t)); - - avl_destroy(t); -} - -int -zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) -{ - avl_tree_t *t = &zilog->zl_bp_tree; - const dva_t *dva; - zil_bp_node_t *zn; - avl_index_t where; - - if (BP_IS_EMBEDDED(bp)) - return (0); - - dva = BP_IDENTITY(bp); - - if (avl_find(t, dva, &where) != NULL) - return (SET_ERROR(EEXIST)); - - zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); - zn->zn_dva = *dva; - avl_insert(t, zn, where); - - return (0); -} - -static zil_header_t * -zil_header_in_syncing_context(zilog_t *zilog) -{ - return ((zil_header_t *)zilog->zl_header); -} - -static void -zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) -{ - zio_cksum_t *zc = &bp->blk_cksum; - - zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); - zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); - zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); - zc->zc_word[ZIL_ZC_SEQ] = 1ULL; -} - -/* - 
* Read a log block and make sure it's valid. - */ -static int -zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - char **end) -{ - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; - zbookmark_phys_t zb; - int error; - - if (zilog->zl_header->zh_claim_txg == 0) - zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; - - if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - - if (error == 0) { - zio_cksum_t cksum = bp->blk_cksum; - - /* - * Validate the checksummed log block. - * - * Sequence numbers should be... sequential. The checksum - * verifier for the next block should be bp's checksum plus 1. - * - * Also check the log chain linkage and size used. 
- */ - cksum.zc_word[ZIL_ZC_SEQ]++; - - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; - char *lr = (char *)(zilc + 1); - uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); - - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { - error = SET_ERROR(ECKSUM); - } else { - ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, len); - *end = (char *)dst + len; - *nbp = zilc->zc_next_blk; - } - } else { - char *lr = abuf->b_data; - uint64_t size = BP_GET_LSIZE(bp); - zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; - - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || - (zilc->zc_nused > (size - sizeof (*zilc)))) { - error = SET_ERROR(ECKSUM); - } else { - ASSERT3U(zilc->zc_nused, <=, - SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, zilc->zc_nused); - *end = (char *)dst + zilc->zc_nused; - *nbp = zilc->zc_next_blk; - } - } - - arc_buf_destroy(abuf, &abuf); - } - - return (error); -} - -/* - * Read a TX_WRITE log data block. 
- */ -static int -zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) -{ - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - const blkptr_t *bp = &lr->lr_blkptr; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; - zbookmark_phys_t zb; - int error; - - if (BP_IS_HOLE(bp)) { - if (wbuf != NULL) - bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); - return (0); - } - - if (zilog->zl_header->zh_claim_txg == 0) - zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; - - SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, - ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - - error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - - if (error == 0) { - if (wbuf != NULL) - bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - arc_buf_destroy(abuf, &abuf); - } - - return (error); -} - -/* - * Parse the intent log, and call parse_func for each valid record within. - */ -int -zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) -{ - const zil_header_t *zh = zilog->zl_header; - boolean_t claimed = !!zh->zh_claim_txg; - uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; - uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; - uint64_t max_blk_seq = 0; - uint64_t max_lr_seq = 0; - uint64_t blk_count = 0; - uint64_t lr_count = 0; - blkptr_t blk, next_blk; - char *lrbuf, *lrp; - int error = 0; - - /* - * Old logs didn't record the maximum zh_claim_lr_seq. - */ - if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) - claim_lr_seq = UINT64_MAX; - - /* - * Starting at the block pointed to by zh_log we read the log chain. - * For each block in the chain we strongly check that block to - * ensure its validity. We stop when an invalid block is found. - * For each block pointer in the chain we call parse_blk_func(). - * For each record in each valid block we call parse_lr_func(). 
- * If the log has been claimed, stop if we encounter a sequence - * number greater than the highest claimed sequence number. - */ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); - zil_bp_tree_init(zilog); - - for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { - uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; - int reclen; - char *end; - - if (blk_seq > claim_blk_seq) - break; - if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) - break; - ASSERT3U(max_blk_seq, <, blk_seq); - max_blk_seq = blk_seq; - blk_count++; - - if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) - break; - - error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); - if (error != 0) - break; - - for (lrp = lrbuf; lrp < end; lrp += reclen) { - lr_t *lr = (lr_t *)lrp; - reclen = lr->lrc_reclen; - ASSERT3U(reclen, >=, sizeof (lr_t)); - if (lr->lrc_seq > claim_lr_seq) - goto done; - if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) - goto done; - ASSERT3U(max_lr_seq, <, lr->lrc_seq); - max_lr_seq = lr->lrc_seq; - lr_count++; - } - } -done: - zilog->zl_parse_error = error; - zilog->zl_parse_blk_seq = max_blk_seq; - zilog->zl_parse_lr_seq = max_lr_seq; - zilog->zl_parse_blk_count = blk_count; - zilog->zl_parse_lr_count = lr_count; - - ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || - (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); - - zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); - - return (error); -} - -/* ARGSUSED */ -static int -zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) -{ - ASSERT(!BP_IS_HOLE(bp)); - - /* - * As we call this function from the context of a rewind to a - * checkpoint, each ZIL block whose txg is later than the txg - * that we rewind to is invalid. Thus, we return -1 so - * zil_parse() doesn't attempt to read it. 
- */ - if (bp->blk_birth >= first_txg) - return (-1); - - if (zil_bp_tree_add(zilog, bp) != 0) - return (0); - - zio_free(zilog->zl_spa, first_txg, bp); - return (0); -} - -/* ARGSUSED */ -static int -zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) -{ - return (0); -} - -static int -zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) -{ - /* - * Claim log block if not already committed and not already claimed. - * If tx == NULL, just verify that the block is claimable. - */ - if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || - zil_bp_tree_add(zilog, bp) != 0) - return (0); - - return (zio_wait(zio_claim(NULL, zilog->zl_spa, - tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); -} - -static int -zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) -{ - lr_write_t *lr = (lr_write_t *)lrc; - int error; - - if (lrc->lrc_txtype != TX_WRITE) - return (0); - - /* - * If the block is not readable, don't claim it. This can happen - * in normal operation when a log block is written to disk before - * some of the dmu_sync() blocks it points to. In this case, the - * transaction cannot have been committed to anyone (we would have - * waited for all writes to be stable first), so it is semantically - * correct to declare this the end of the log. 
- */ - if (lr->lr_blkptr.blk_birth >= first_txg && - (error = zil_read_log_data(zilog, lr, NULL)) != 0) - return (error); - return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); -} - -/* ARGSUSED */ -static int -zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) -{ - zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); - - return (0); -} - -static int -zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) -{ - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - - /* - * If we previously claimed it, we need to free it. - */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && - bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && - !BP_IS_HOLE(bp)) - zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); - - return (0); -} - -static int -zil_lwb_vdev_compare(const void *x1, const void *x2) -{ - const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; - const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - - return (AVL_CMP(v1, v2)); -} - -static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) -{ - lwb_t *lwb; - - lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); - lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; - lwb->lwb_slog = slog; - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; - lwb->lwb_write_zio = NULL; - lwb->lwb_root_zio = NULL; - lwb->lwb_tx = NULL; - lwb->lwb_issued_timestamp = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } - - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_lwb_list, lwb); - mutex_exit(&zilog->zl_lock); - - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - - return (lwb); -} - -static void 
-zil_free_lwb(zilog_t *zilog, lwb_t *lwb) -{ - ASSERT(MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - ASSERT3P(lwb->lwb_write_zio, ==, NULL); - ASSERT3P(lwb->lwb_root_zio, ==, NULL); - ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); - - /* - * Clear the zilog's field to indicate this lwb is no longer - * valid, and prevent use-after-free errors. - */ - if (zilog->zl_last_lwb_opened == lwb) - zilog->zl_last_lwb_opened = NULL; - - kmem_cache_free(zil_lwb_cache, lwb); -} - -/* - * Called when we create in-memory log transactions so that we know - * to cleanup the itxs at the end of spa_sync(). - */ -void -zilog_dirty(zilog_t *zilog, uint64_t txg) -{ - dsl_pool_t *dp = zilog->zl_dmu_pool; - dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - - ASSERT(spa_writeable(zilog->zl_spa)); - - if (ds->ds_is_snapshot) - panic("dirtying snapshot!"); - - if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(ds->ds_dbuf, zilog); - - zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); - } -} - -/* - * Determine if the zil is dirty in the specified txg. Callers wanting to - * ensure that the dirty state does not change must hold the itxg_lock for - * the specified txg. Holding the lock will ensure that the zil cannot be - * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current - * state. - */ -boolean_t -zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) -{ - dsl_pool_t *dp = zilog->zl_dmu_pool; - - if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) - return (B_TRUE); - return (B_FALSE); -} - -/* - * Determine if the zil is dirty. The zil is considered dirty if it has - * any pending itx records that have not been cleaned by zil_clean(). 
- */ -boolean_t -zilog_is_dirty(zilog_t *zilog) -{ - dsl_pool_t *dp = zilog->zl_dmu_pool; - - for (int t = 0; t < TXG_SIZE; t++) { - if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Create an on-disk intent log. - */ -static lwb_t * -zil_create(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb = NULL; - uint64_t txg = 0; - dmu_tx_t *tx = NULL; - blkptr_t blk; - int error = 0; - boolean_t slog = FALSE; - - /* - * Wait for any previous destroy to complete. - */ - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - - ASSERT(zh->zh_claim_txg == 0); - ASSERT(zh->zh_replay_seq == 0); - - blk = zh->zh_log; - - /* - * Allocate an initial log block if: - * - there isn't one already - * - the existing block is the wrong endianess - */ - if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { - tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - if (!BP_IS_HOLE(&blk)) { - zio_free(zilog->zl_spa, txg, &blk); - BP_ZERO(&blk); - } - - error = zio_alloc_zil(zilog->zl_spa, - zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, - ZIL_MIN_BLKSZ, &slog); - - if (error == 0) - zil_init_log_chain(zilog, &blk); - } - - /* - * Allocate a log write block (lwb) for the first log block. - */ - if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg); - - /* - * If we just allocated the first log block, commit our transaction - * and wait for zil_sync() to stuff the block poiner into zh_log. - * (zh is part of the MOS, so we cannot modify it in open context.) - */ - if (tx != NULL) { - dmu_tx_commit(tx); - txg_wait_synced(zilog->zl_dmu_pool, txg); - } - - ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); - - return (lwb); -} - -/* - * In one tx, free all log blocks and clear the log header. If keep_first - * is set, then we're replaying a log with no content. 
We want to keep the - * first block, however, so that the first synchronous transaction doesn't - * require a txg_wait_synced() in zil_create(). We don't need to - * txg_wait_synced() here either when keep_first is set, because both - * zil_create() and zil_destroy() will wait for any in-progress destroys - * to complete. - */ -void -zil_destroy(zilog_t *zilog, boolean_t keep_first) -{ - const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb; - dmu_tx_t *tx; - uint64_t txg; - - /* - * Wait for any previous destroy to complete. - */ - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - - zilog->zl_old_header = *zh; /* debugging aid */ - - if (BP_IS_HOLE(&zh->zh_log)) - return; - - tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - mutex_enter(&zilog->zl_lock); - - ASSERT3U(zilog->zl_destroy_txg, <, txg); - zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = keep_first; - - if (!list_is_empty(&zilog->zl_lwb_list)) { - ASSERT(zh->zh_claim_txg == 0); - VERIFY(!keep_first); - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - list_remove(&zilog->zl_lwb_list, lwb); - if (lwb->lwb_buf != NULL) - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); - zil_free_lwb(zilog, lwb); - } - } else if (!keep_first) { - zil_destroy_sync(zilog, tx); - } - mutex_exit(&zilog->zl_lock); - - dmu_tx_commit(tx); -} - -void -zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) -{ - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - (void) zil_parse(zilog, zil_free_log_block, - zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); -} - -int -zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) -{ - dmu_tx_t *tx = txarg; - zilog_t *zilog; - uint64_t first_txg; - zil_header_t *zh; - objset_t *os; - int error; - - error = dmu_objset_own_obj(dp, ds->ds_object, - DMU_OST_ANY, B_FALSE, FTAG, &os); - if (error != 0) { - /* - * EBUSY 
indicates that the objset is inconsistent, in which - * case it can not have a ZIL. - */ - if (error != EBUSY) { - cmn_err(CE_WARN, "can't open objset for %llu, error %u", - (unsigned long long)ds->ds_object, error); - } - return (0); - } - - zilog = dmu_objset_zil(os); - zh = zil_header_in_syncing_context(zilog); - ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); - first_txg = spa_min_claim_txg(zilog->zl_spa); - - /* - * If the spa_log_state is not set to be cleared, check whether - * the current uberblock is a checkpoint one and if the current - * header has been claimed before moving on. - * - * If the current uberblock is a checkpointed uberblock then - * one of the following scenarios took place: - * - * 1] We are currently rewinding to the checkpoint of the pool. - * 2] We crashed in the middle of a checkpoint rewind but we - * did manage to write the checkpointed uberblock to the - * vdev labels, so when we tried to import the pool again - * the checkpointed uberblock was selected from the import - * procedure. - * - * In both cases we want to zero out all the ZIL blocks, except - * the ones that have been claimed at the time of the checkpoint - * (their zh_claim_txg != 0). The reason is that these blocks - * may be corrupted since we may have reused their locations on - * disk after we took the checkpoint. - * - * We could try to set spa_log_state to SPA_LOG_CLEAR earlier - * when we first figure out whether the current uberblock is - * checkpointed or not. Unfortunately, that would discard all - * the logs, including the ones that are claimed, and we would - * leak space. 
- */ - if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || - (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && - zh->zh_claim_txg == 0)) { - if (!BP_IS_HOLE(&zh->zh_log)) { - (void) zil_parse(zilog, zil_clear_log_block, - zil_noop_log_record, tx, first_txg); - } - BP_ZERO(&zh->zh_log); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_objset_disown(os, FTAG); - return (0); - } - - /* - * If we are not rewinding and opening the pool normally, then - * the min_claim_txg should be equal to the first txg of the pool. - */ - ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); - - /* - * Claim all log blocks if we haven't already done so, and remember - * the highest claimed sequence number. This ensures that if we can - * read only part of the log now (e.g. due to a missing device), - * but we can read the entire log later, we will not try to replay - * or destroy beyond the last block we successfully claimed. - */ - ASSERT3U(zh->zh_claim_txg, <=, first_txg); - if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { - (void) zil_parse(zilog, zil_claim_log_block, - zil_claim_log_record, tx, first_txg); - zh->zh_claim_txg = first_txg; - zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; - zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; - if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) - zh->zh_flags |= ZIL_REPLAY_NEEDED; - zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; - dsl_dataset_dirty(dmu_objset_ds(os), tx); - } - - ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_disown(os, FTAG); - return (0); -} - -/* - * Check the log by walking the log chain. - * Checksum errors are ok as they indicate the end of the chain. - * Any other error (no device or read failure) returns an error. 
- */ -/* ARGSUSED */ -int -zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) -{ - zilog_t *zilog; - objset_t *os; - blkptr_t *bp; - int error; - - ASSERT(tx == NULL); - - error = dmu_objset_from_ds(ds, &os); - if (error != 0) { - cmn_err(CE_WARN, "can't open objset %llu, error %d", - (unsigned long long)ds->ds_object, error); - return (0); - } - - zilog = dmu_objset_zil(os); - bp = (blkptr_t *)&zilog->zl_header->zh_log; - - if (!BP_IS_HOLE(bp)) { - vdev_t *vd; - boolean_t valid = B_TRUE; - - /* - * Check the first block and determine if it's on a log device - * which may have been removed or faulted prior to loading this - * pool. If so, there's no point in checking the rest of the - * log as its content should have already been synced to the - * pool. - */ - spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); - vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); - if (vd->vdev_islog && vdev_is_dead(vd)) - valid = vdev_log_state_valid(vd); - spa_config_exit(os->os_spa, SCL_STATE, FTAG); - - if (!valid) - return (0); - - /* - * Check whether the current uberblock is checkpointed (e.g. - * we are rewinding) and whether the current header has been - * claimed or not. If it hasn't then skip verifying it. We - * do this because its ZIL blocks may be part of the pool's - * state before the rewind, which is no longer valid. - */ - zil_header_t *zh = zil_header_in_syncing_context(zilog); - if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && - zh->zh_claim_txg == 0) - return (0); - } - - /* - * Because tx == NULL, zil_claim_log_block() will not actually claim - * any blocks, but just determine whether it is possible to do so. - * In addition to checking the log chain, zil_claim_log_block() - * will invoke zio_claim() with a done func of spa_claim_notify(), - * which will update spa_max_claim_txg. See spa_load() for details. 
- */ - error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, - zilog->zl_header->zh_claim_txg ? -1ULL : - spa_min_claim_txg(os->os_spa)); - - return ((error == ECKSUM || error == ENOENT) ? 0 : error); -} - -/* - * When an itx is "skipped", this function is used to properly mark the - * waiter as "done, and signal any thread(s) waiting on it. An itx can - * be skipped (and not committed to an lwb) for a variety of reasons, - * one of them being that the itx was committed via spa_sync(), prior to - * it being committed to an lwb; this can happen if a thread calling - * zil_commit() is racing with spa_sync(). - */ -static void -zil_commit_waiter_skip(zil_commit_waiter_t *zcw) -{ - mutex_enter(&zcw->zcw_lock); - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - zcw->zcw_done = B_TRUE; - cv_broadcast(&zcw->zcw_cv); - mutex_exit(&zcw->zcw_lock); -} - -/* - * This function is used when the given waiter is to be linked into an - * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. - * At this point, the waiter will no longer be referenced by the itx, - * and instead, will be referenced by the lwb. - */ -static void -zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) -{ - /* - * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. - */ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); - - mutex_enter(&zcw->zcw_lock); - ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - - list_insert_tail(&lwb->lwb_waiters, zcw); - zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); -} - -/* - * This function is used when zio_alloc_zil() fails to allocate a ZIL - * block, and the given waiter must be linked to the "nolwb waiters" - * list inside of zil_process_commit_list(). 
- */ -static void -zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) -{ - mutex_enter(&zcw->zcw_lock); - ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); -} - -void -zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) -{ - avl_tree_t *t = &lwb->lwb_vdev_tree; - avl_index_t where; - zil_vdev_node_t *zv, zvsearch; - int ndvas = BP_GET_NDVAS(bp); - int i; - - if (zil_nocacheflush) - return; - - mutex_enter(&lwb->lwb_vdev_lock); - for (i = 0; i < ndvas; i++) { - zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); - if (avl_find(t, &zvsearch, &where) == NULL) { - zv = kmem_alloc(sizeof (*zv), KM_SLEEP); - zv->zv_vdev = zvsearch.zv_vdev; - avl_insert(t, zv, where); - } - } - mutex_exit(&lwb->lwb_vdev_lock); -} - -static void -zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) -{ - avl_tree_t *src = &lwb->lwb_vdev_tree; - avl_tree_t *dst = &nlwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - - /* - * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified - * while holding zilog->zl_lock) as its writes and those of its - * children have all completed. The younger 'nlwb' may be waiting on - * future writes to additional vdevs. - */ - mutex_enter(&nlwb->lwb_vdev_lock); - /* - * Tear down the 'lwb' vdev tree, ensuring that entries which do not - * exist in 'nlwb' are moved to it, freeing any would-be duplicates. 
- */ - while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { - avl_index_t where; - - if (avl_find(dst, zv, &where) == NULL) { - avl_insert(dst, zv, where); - } else { - kmem_free(zv, sizeof (*zv)); - } - } - mutex_exit(&nlwb->lwb_vdev_lock); -} - -void -zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) -{ - lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); -} - -/* - * This function is a called after all vdevs associated with a given lwb - * write have completed their DKIOCFLUSHWRITECACHE command; or as soon - * as the lwb write completes, if "zil_nocacheflush" is set. Further, - * all "previous" lwb's will have completed before this function is - * called; i.e. this function is called for all previous lwbs before - * it's called for "this" lwb (enforced via zio the dependencies - * configured in zil_lwb_set_zio_dependency()). - * - * The intention is for this function to be called as soon as the - * contents of an lwb are considered "stable" on disk, and will survive - * any sudden loss of power. At this point, any threads waiting for the - * lwb to reach this state are signalled, and the "waiter" structures - * are marked "done". - */ -static void -zil_lwb_flush_vdevs_done(zio_t *zio) -{ - lwb_t *lwb = zio->io_private; - zilog_t *zilog = lwb->lwb_zilog; - dmu_tx_t *tx = lwb->lwb_tx; - zil_commit_waiter_t *zcw; - - spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); - - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - - mutex_enter(&zilog->zl_lock); - - /* - * Ensure the lwb buffer pointer is cleared before releasing the - * txg. If we have had an allocation failure and the txg is - * waiting to sync then we want zil_sync() to remove the lwb so - * that it's not picked up as the next new one in - * zil_process_commit_list(). zil_sync() will only remove the - * lwb if lwb_buf is null. 
- */ - lwb->lwb_buf = NULL; - lwb->lwb_tx = NULL; - - ASSERT3U(lwb->lwb_issued_timestamp, >, 0); - zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; - - lwb->lwb_root_zio = NULL; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - lwb->lwb_state = LWB_STATE_FLUSH_DONE; - - if (zilog->zl_last_lwb_opened == lwb) { - /* - * Remember the highest committed log sequence number - * for ztest. We only update this value when all the log - * writes succeeded, because ztest wants to ASSERT that - * it got the whole log chain. - */ - zilog->zl_commit_lr_seq = zilog->zl_lr_seq; - } - - while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { - mutex_enter(&zcw->zcw_lock); - - ASSERT(list_link_active(&zcw->zcw_node)); - list_remove(&lwb->lwb_waiters, zcw); - - ASSERT3P(zcw->zcw_lwb, ==, lwb); - zcw->zcw_lwb = NULL; - - zcw->zcw_zio_error = zio->io_error; - - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - zcw->zcw_done = B_TRUE; - cv_broadcast(&zcw->zcw_cv); - - mutex_exit(&zcw->zcw_lock); - } - - mutex_exit(&zilog->zl_lock); - - /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. - */ - dmu_tx_commit(tx); -} - -/* - * This is called when an lwb's write zio completes. The callback's - * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs - * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved - * in writing out this specific lwb's data, and in the case that cache - * flushes have been deferred, vdevs involved in writing the data for - * previous lwbs. The writes corresponding to all the vdevs in the - * lwb_vdev_tree will have completed by the time this is called, due to - * the zio dependencies configured in zil_lwb_set_zio_dependency(), - * which takes deferred flushes into account. 
The lwb will be "done" - * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio - * completion callback for the lwb's root zio. - */ -static void -zil_lwb_write_done(zio_t *zio) -{ - lwb_t *lwb = zio->io_private; - spa_t *spa = zio->io_spa; - zilog_t *zilog = lwb->lwb_zilog; - avl_tree_t *t = &lwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - lwb_t *nlwb; - - ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - - abd_put(zio->io_abd); - - mutex_enter(&zilog->zl_lock); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); - lwb->lwb_state = LWB_STATE_WRITE_DONE; - lwb->lwb_write_zio = NULL; - nlwb = list_next(&zilog->zl_lwb_list, lwb); - mutex_exit(&zilog->zl_lock); - - if (avl_numnodes(t) == 0) - return; - - /* - * If there was an IO error, we're not going to call zio_flush() - * on these vdevs, so we simply empty the tree and free the - * nodes. We avoid calling zio_flush() since there isn't any - * good reason for doing so, after the lwb block failed to be - * written out. - */ - if (zio->io_error != 0) { - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zv, sizeof (*zv)); - return; - } - - /* - * If this lwb does not have any threads waiting for it to - * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE - * command to the vdevs written to by "this" lwb, and instead - * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE - * command for those vdevs. Thus, we merge the vdev tree of - * "this" lwb with the vdev tree of the "next" lwb in the list, - * and assume the "next" lwb will handle flushing the vdevs (or - * deferring the flush(s) again). 
- * - * This is a useful performance optimization, especially for - * workloads with lots of async write activity and few sync - * write and/or fsync activity, as it has the potential to - * coalesce multiple flush commands to a vdev into one. - */ - if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) { - zil_lwb_flush_defer(lwb, nlwb); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - return; - } - - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { - vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) - zio_flush(lwb->lwb_root_zio, vd); - kmem_free(zv, sizeof (*zv)); - } -} - -static void -zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) -{ - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT(MUTEX_HELD(&zilog->zl_lock)); - - /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. - * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). 
- */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. - * - * When the DKIOCFLUSHWRITECACHE commands are defered, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). - */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } - } -} - - -/* - * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. 
This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. - */ -static void -zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) -{ - zbookmark_phys_t zb; - zio_priority_t prio; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - - lwb->lwb_state = LWB_STATE_OPENED; - - mutex_enter(&zilog->zl_lock); - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; - mutex_exit(&zilog->zl_lock); - } - - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); -} - -/* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. 
- */ -struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { 131072, 131072 }, /* < 128KB writes */ - { 131072 + 4096, 65536 + 4096 }, /* 128KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; - -/* - * Maximum block size used by the ZIL. This is picked up when the ZIL is - * initialized. Otherwise this should not be used directly; see - * zl_max_block_size instead. - */ -int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; -SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_maxblocksize, CTLFLAG_RWTUN, - &zil_maxblocksize, 0, "Limit in bytes of ZIL log block size"); - -/* - * Start a log block write and advance to the next log block. - * Calls are serialized. - */ -static lwb_t * -zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) -{ - lwb_t *nlwb = NULL; - zil_chain_t *zilc; - spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz, wsz; - int i, error; - boolean_t slog; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - } - - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); - - /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - * Note that if the allocation of nlwb synced before we wrote - * the block that points at it (lwb), we'd leak it if we crashed. - * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 
- * We dirty the dataset to ensure that zil_sync() will be called - * to clean up in the event of allocation failure or I/O failure. - */ - - tx = dmu_tx_create(zilog->zl_os); - - /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. - */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - lwb->lwb_tx = tx; - - /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. 
- */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - - BP_ZERO(bp); - - /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, - txg, bp, &lwb->lwb_blk, zil_blksz, &slog); - if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg); - } - - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - - } else { - wsz = lwb->lwb_sz; - } - - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; - - /* - * clear unused data for security - */ - bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); - - spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); - - zil_lwb_add_block(lwb, &lwb->lwb_blk); - lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - - zio_nowait(lwb->lwb_root_zio); - zio_nowait(lwb->lwb_write_zio); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); -} - -/* - * Maximum amount of write data that can be put into single log block. 
- */ -uint64_t -zil_max_log_data(zilog_t *zilog) -{ - return (zilog->zl_max_block_size - - sizeof (zil_chain_t) - sizeof (lr_write_t)); -} - -/* - * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). - */ -static inline uint64_t -zil_max_waste_space(zilog_t *zilog) -{ - return (zil_max_log_data(zilog) / 8); -} - -/* - * Maximum amount of write data for WR_COPIED. For correctness, consumers - * must fall back to WR_NEED_COPY if we can't fit the entire record into one - * maximum sized log block, because each WR_COPIED record must fit in a - * single log block. For space efficiency, we want to fit two records into a - * max-sized log block. - */ -uint64_t -zil_max_copied_data(zilog_t *zilog) -{ - return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - - sizeof (lr_write_t)); -} - -static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) -{ - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - ASSERT3P(lwb->lwb_buf, !=, NULL); - - zil_lwb_write_open(zilog, lwb); - - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; - - /* - * A commit itx doesn't represent any on-disk state; instead - * it's simply used as a place holder on the commit list, and - * provides a mechanism for attaching a "commit waiter" onto the - * correct lwb (such that the waiter can be signalled upon - * completion of that lwb). Thus, we don't process this itx's - * log record if it's a commit itx (these itx's don't have log - * records), and instead link the itx's waiter onto the lwb's - * list of waiters. - * - * For more details, see the comment above zil_commit(). 
- */ - if (lrc->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); - zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); - return (lwb); - } - - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } - reclen = lrc->lrc_reclen; - zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; - - ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); - -cont: - /* - * If this record won't fit in the current log block, start a new one. - * For WR_NEED_COPY optimize layout for minimal number of chunks. - */ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - max_log_data = zil_max_log_data(zilog); - if (reclen > lwb_sp || (reclen + dlen > lwb_sp && - lwb_sp < zil_max_waste_space(zilog) && - (dlen % max_log_data == 0 || - lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); - if (lwb == NULL) - return (NULL); - zil_lwb_write_open(zilog, lwb); - ASSERT(LWB_EMPTY(lwb)); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); - } - - dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - bcopy(lrc, lr_buf, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ - - /* - * If it's a write, fetch the data or get its blkptr as appropriate. 
- */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); - if (itx->itx_wr_state != WR_COPIED) { - char *dbuf; - int error; - - if (itx->itx_wr_state == WR_NEED_COPY) { - dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; - } else { - ASSERT(itx->itx_wr_state == WR_INDIRECT); - dbuf = NULL; - } - - /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. - */ - error = zilog->zl_get_data(itx->itx_private, - lrwb, dbuf, lwb, lwb->lwb_write_zio); - - if (error == EIO) { - txg_wait_synced(zilog->zl_dmu_pool, txg); - return (lwb); - } - if (error != 0) { - ASSERT(error == ENOENT || error == EEXIST || - error == EALREADY); - return (lwb); - } - } - } - - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. 
- */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); -} - -itx_t * -zil_itx_create(uint64_t txtype, size_t lrsize) -{ - itx_t *itx; - - lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); - - itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); - itx->itx_lr.lrc_txtype = txtype; - itx->itx_lr.lrc_reclen = lrsize; - itx->itx_lr.lrc_seq = 0; /* defensive */ - itx->itx_sync = B_TRUE; /* default is synchronous */ - - return (itx); -} - -void -zil_itx_destroy(itx_t *itx) -{ - kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); -} - -/* - * Free up the sync and async itxs. The itxs_t has already been detached - * so no locks are needed. - */ -static void -zil_itxg_clean(itxs_t *itxs) -{ - itx_t *itx; - list_t *list; - avl_tree_t *t; - void *cookie; - itx_async_node_t *ian; - - list = &itxs->i_sync_list; - while ((itx = list_head(list)) != NULL) { - /* - * In the general case, commit itxs will not be found - * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having - * said that, it is still possible for commit itxs to be - * found here, due to the following race: - * - * - a thread calls zil_commit() which assigns the - * commit itx to a per-txg i_sync_list - * - zil_itxg_clean() is called (e.g. via spa_sync()) - * while the waiter is still on the i_sync_list - * - * There's nothing to prevent syncing the txg while the - * waiter is on the i_sync_list. This normally doesn't - * happen because spa_sync() is slower than zil_commit(), - * but if zil_commit() calls txg_wait_synced() (e.g. - * because zil_create() or zil_commit_writer_stall() is - * called) we will hit this case. 
- */ - if (itx->itx_lr.lrc_txtype == TX_COMMIT) - zil_commit_waiter_skip(itx->itx_private); - - list_remove(list, itx); - zil_itx_destroy(itx); - } - - cookie = NULL; - t = &itxs->i_async_tree; - while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { - list = &ian->ia_list; - while ((itx = list_head(list)) != NULL) { - list_remove(list, itx); - /* commit itxs should never be on the async lists. */ - ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); - } - list_destroy(list); - kmem_free(ian, sizeof (itx_async_node_t)); - } - avl_destroy(t); - - kmem_free(itxs, sizeof (itxs_t)); -} - -static int -zil_aitx_compare(const void *x1, const void *x2) -{ - const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; - const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - - return (AVL_CMP(o1, o2)); -} - -/* - * Remove all async itx with the given oid. - */ -static void -zil_remove_async(zilog_t *zilog, uint64_t oid) -{ - uint64_t otxg, txg; - itx_async_node_t *ian; - avl_tree_t *t; - avl_index_t where; - list_t clean_list; - itx_t *itx; - - ASSERT(oid != 0); - list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ - otxg = ZILTEST_TXG; - else - otxg = spa_last_synced_txg(zilog->zl_spa) + 1; - - for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { - itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_txg != txg) { - mutex_exit(&itxg->itxg_lock); - continue; - } - - /* - * Locate the object node and append its list. - */ - t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); - if (ian != NULL) - list_move_tail(&clean_list, &ian->ia_list); - mutex_exit(&itxg->itxg_lock); - } - while ((itx = list_head(&clean_list)) != NULL) { - list_remove(&clean_list, itx); - /* commit itxs should never be on the async lists. 
*/ - ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); - } - list_destroy(&clean_list); -} - -void -zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) -{ - uint64_t txg; - itxg_t *itxg; - itxs_t *itxs, *clean = NULL; - - /* - * Object ids can be re-instantiated in the next txg so - * remove any async transactions to avoid future leaks. - * This can happen if a fsync occurs on the re-instantiated - * object for a WR_INDIRECT or WR_NEED_COPY write, which gets - * the new file data and flushes a write record for the old object. - */ - if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) - zil_remove_async(zilog, itx->itx_oid); - - /* - * Ensure the data of a renamed file is committed before the rename. - */ - if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) - zil_async_to_sync(zilog, itx->itx_oid); - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) - txg = ZILTEST_TXG; - else - txg = dmu_tx_get_txg(tx); - - itxg = &zilog->zl_itxg[txg & TXG_MASK]; - mutex_enter(&itxg->itxg_lock); - itxs = itxg->itxg_itxs; - if (itxg->itxg_txg != txg) { - if (itxs != NULL) { - /* - * The zil_clean callback hasn't got around to cleaning - * this itxg. Save the itxs for release below. - * This should be rare. 
- */ - zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " - "txg %llu", itxg->itxg_txg); - clean = itxg->itxg_itxs; - } - itxg->itxg_txg = txg; - itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); - - list_create(&itxs->i_sync_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - avl_create(&itxs->i_async_tree, zil_aitx_compare, - sizeof (itx_async_node_t), - offsetof(itx_async_node_t, ia_node)); - } - if (itx->itx_sync) { - list_insert_tail(&itxs->i_sync_list, itx); - } else { - avl_tree_t *t = &itxs->i_async_tree; - uint64_t foid = - LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); - itx_async_node_t *ian; - avl_index_t where; - - ian = avl_find(t, &foid, &where); - if (ian == NULL) { - ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); - list_create(&ian->ia_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - ian->ia_foid = foid; - avl_insert(t, ian, where); - } - list_insert_tail(&ian->ia_list, itx); - } - - itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); - - /* - * We don't want to dirty the ZIL using ZILTEST_TXG, because - * zil_clean() will never be called using ZILTEST_TXG. Thus, we - * need to be careful to always dirty the ZIL using the "real" - * TXG (not itxg_txg) even when the SPA is frozen. - */ - zilog_dirty(zilog, dmu_tx_get_txg(tx)); - mutex_exit(&itxg->itxg_lock); - - /* Release the old itxs now we've dropped the lock */ - if (clean != NULL) - zil_itxg_clean(clean); -} - -/* - * If there are any in-memory intent log transactions which have now been - * synced then start up a taskq to free them. We should only do this after we - * have written out the uberblocks (i.e. txg has been comitted) so that - * don't inadvertently clean out in-memory log records that would be required - * by zil_commit(). 
- */ -void -zil_clean(zilog_t *zilog, uint64_t synced_txg) -{ - itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; - itxs_t *clean_me; - - ASSERT3U(synced_txg, <, ZILTEST_TXG); - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { - mutex_exit(&itxg->itxg_lock); - return; - } - ASSERT3U(itxg->itxg_txg, <=, synced_txg); - ASSERT3U(itxg->itxg_txg, !=, 0); - clean_me = itxg->itxg_itxs; - itxg->itxg_itxs = NULL; - itxg->itxg_txg = 0; - mutex_exit(&itxg->itxg_lock); - /* - * Preferably start a task queue to free up the old itxs but - * if taskq_dispatch can't allocate resources to do that then - * free it in-line. This should be rare. Note, using TQ_SLEEP - * created a bad performance problem. - */ - ASSERT3P(zilog->zl_dmu_pool, !=, NULL); - ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); - if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, - (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) - zil_itxg_clean(clean_me); -} - -/* - * This function will traverse the queue of itxs that need to be - * committed, and move them onto the ZIL's zl_itx_commit_list. - */ -static void -zil_get_commit_list(zilog_t *zilog) -{ - uint64_t otxg, txg; - list_t *commit_list = &zilog->zl_itx_commit_list; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ - otxg = ZILTEST_TXG; - else - otxg = spa_last_synced_txg(zilog->zl_spa) + 1; - - /* - * This is inherently racy, since there is nothing to prevent - * the last synced txg from changing. That's okay since we'll - * only commit things in the future. 
- */ - for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { - itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_txg != txg) { - mutex_exit(&itxg->itxg_lock); - continue; - } - - /* - * If we're adding itx records to the zl_itx_commit_list, - * then the zil better be dirty in this "txg". We can assert - * that here since we're holding the itxg_lock which will - * prevent spa_sync from cleaning it. Once we add the itxs - * to the zl_itx_commit_list we must commit it to disk even - * if it's unnecessary (i.e. the txg was synced). - */ - ASSERT(zilog_is_dirty_in_txg(zilog, txg) || - spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); - list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); - - mutex_exit(&itxg->itxg_lock); - } -} - -/* - * Move the async itxs for a specified object to commit into sync lists. - */ -void -zil_async_to_sync(zilog_t *zilog, uint64_t foid) -{ - uint64_t otxg, txg; - itx_async_node_t *ian; - avl_tree_t *t; - avl_index_t where; - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ - otxg = ZILTEST_TXG; - else - otxg = spa_last_synced_txg(zilog->zl_spa) + 1; - - /* - * This is inherently racy, since there is nothing to prevent - * the last synced txg from changing. - */ - for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { - itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_txg != txg) { - mutex_exit(&itxg->itxg_lock); - continue; - } - - /* - * If a foid is specified then find that node and append its - * list. Otherwise walk the tree appending all the lists - * to the sync list. We add to the end rather than the - * beginning to ensure the create has happened. 
- */ - t = &itxg->itxg_itxs->i_async_tree; - if (foid != 0) { - ian = avl_find(t, &foid, &where); - if (ian != NULL) { - list_move_tail(&itxg->itxg_itxs->i_sync_list, - &ian->ia_list); - } - } else { - void *cookie = NULL; - - while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { - list_move_tail(&itxg->itxg_itxs->i_sync_list, - &ian->ia_list); - list_destroy(&ian->ia_list); - kmem_free(ian, sizeof (itx_async_node_t)); - } - } - mutex_exit(&itxg->itxg_lock); - } -} - -/* - * This function will prune commit itxs that are at the head of the - * commit list (it won't prune past the first non-commit itx), and - * either: a) attach them to the last lwb that's still pending - * completion, or b) skip them altogether. - * - * This is used as a performance optimization to prevent commit itxs - * from generating new lwbs when it's unnecessary to do so. - */ -static void -zil_prune_commit_list(zilog_t *zilog) -{ - itx_t *itx; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - - while (itx = list_head(&zilog->zl_itx_commit_list)) { - lr_t *lrc = &itx->itx_lr; - if (lrc->lrc_txtype != TX_COMMIT) - break; - - mutex_enter(&zilog->zl_lock); - - lwb_t *last_lwb = zilog->zl_last_lwb_opened; - if (last_lwb == NULL || - last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { - /* - * All of the itxs this waiter was waiting on - * must have already completed (or there were - * never any itx's for it to wait on), so it's - * safe to skip this waiter and mark it done. 
- */ - zil_commit_waiter_skip(itx->itx_private); - } else { - zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; - } - - mutex_exit(&zilog->zl_lock); - - list_remove(&zilog->zl_itx_commit_list, itx); - zil_itx_destroy(itx); - } - - IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); -} - -static void -zil_commit_writer_stall(zilog_t *zilog) -{ - /* - * When zio_alloc_zil() fails to allocate the next lwb block on - * disk, we must call txg_wait_synced() to ensure all of the - * lwbs in the zilog's zl_lwb_list are synced and then freed (in - * zil_sync()), such that any subsequent ZIL writer (i.e. a call - * to zil_process_commit_list()) will have to call zil_create(), - * and start a new ZIL chain. - * - * Since zil_alloc_zil() failed, the lwb that was previously - * issued does not have a pointer to the "next" lwb on disk. - * Thus, if another ZIL writer thread was to allocate the "next" - * on-disk lwb, that block could be leaked in the event of a - * crash (because the previous lwb on-disk would not point to - * it). - * - * We must hold the zilog's zl_issuer_lock while we do this, to - * ensure no new threads enter zil_process_commit_list() until - * all lwb's in the zl_lwb_list have been synced and freed - * (which is achieved via the txg_wait_synced() call). - */ - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - txg_wait_synced(zilog->zl_dmu_pool, 0); - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); -} - -/* - * This function will traverse the commit list, creating new lwbs as - * needed, and committing the itxs from the commit list to these newly - * created lwbs. Additionally, as a new lwb is created, the previous - * lwb will be issued to the zio layer to be written to disk. 
- */ -static void -zil_process_commit_list(zilog_t *zilog) -{ - spa_t *spa = zilog->zl_spa; - list_t nolwb_waiters; - lwb_t *lwb; - itx_t *itx; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - - /* - * Return if there's nothing to commit before we dirty the fs by - * calling zil_create(). - */ - if (list_head(&zilog->zl_itx_commit_list) == NULL) - return; - - list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - - lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) { - lwb = zil_create(zilog); - } else { - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - } - - while (itx = list_head(&zilog->zl_itx_commit_list)) { - lr_t *lrc = &itx->itx_lr; - uint64_t txg = lrc->lrc_txg; - - ASSERT3U(txg, !=, 0); - - if (lrc->lrc_txtype == TX_COMMIT) { - DTRACE_PROBE2(zil__process__commit__itx, - zilog_t *, zilog, itx_t *, itx); - } else { - DTRACE_PROBE2(zil__process__normal__itx, - zilog_t *, zilog, itx_t *, itx); - } - - boolean_t synced = txg <= spa_last_synced_txg(spa); - boolean_t frozen = txg > spa_freeze_txg(spa); - - /* - * If the txg of this itx has already been synced out, then - * we don't need to commit this itx to an lwb. This is - * because the data of this itx will have already been - * written to the main pool. This is inherently racy, and - * it's still ok to commit an itx whose txg has already - * been synced; this will result in a write that's - * unnecessary, but will do no harm. - * - * With that said, we always want to commit TX_COMMIT itxs - * to an lwb, regardless of whether or not that itx's txg - * has been synced out. We do this to ensure any OPENED lwb - * will always have at least one zil_commit_waiter_t linked - * to the lwb. 
- * - * As a counter-example, if we skipped TX_COMMIT itx's - * whose txg had already been synced, the following - * situation could occur if we happened to be racing with - * spa_sync: - * - * 1. we commit a non-TX_COMMIT itx to an lwb, where the - * itx's txg is 10 and the last synced txg is 9. - * 2. spa_sync finishes syncing out txg 10. - * 3. we move to the next itx in the list, it's a TX_COMMIT - * whose txg is 10, so we skip it rather than committing - * it to the lwb used in (1). - * - * If the itx that is skipped in (3) is the last TX_COMMIT - * itx in the commit list, than it's possible for the lwb - * used in (1) to remain in the OPENED state indefinitely. - * - * To prevent the above scenario from occuring, ensuring - * that once an lwb is OPENED it will transition to ISSUED - * and eventually DONE, we always commit TX_COMMIT itx's to - * an lwb here, even if that itx's txg has already been - * synced. - * - * Finally, if the pool is frozen, we _always_ commit the - * itx. The point of freezing the pool is to prevent data - * from being written to the main pool via spa_sync, and - * instead rely solely on the ZIL to persistently store the - * data; i.e. when the pool is frozen, the last synced txg - * value can't be trusted. - */ - if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { - if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - } else if (lrc->lrc_txtype == TX_COMMIT) { - ASSERT3P(lwb, ==, NULL); - zil_commit_waiter_link_nolwb( - itx->itx_private, &nolwb_waiters); - } - } - - list_remove(&zilog->zl_itx_commit_list, itx); - zil_itx_destroy(itx); - } - - if (lwb == NULL) { - /* - * This indicates zio_alloc_zil() failed to allocate the - * "next" lwb on-disk. When this happens, we must stall - * the ZIL write pipeline; see the comment within - * zil_commit_writer_stall() for more details. 
- */ - zil_commit_writer_stall(zilog); - - /* - * Additionally, we have to signal and mark the "nolwb" - * waiters as "done" here, since without an lwb, we - * can't do this via zil_lwb_flush_vdevs_done() like - * normal. - */ - zil_commit_waiter_t *zcw; - while (zcw = list_head(&nolwb_waiters)) { - zil_commit_waiter_skip(zcw); - list_remove(&nolwb_waiters, zcw); - } - } else { - ASSERT(list_is_empty(&nolwb_waiters)); - ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - - /* - * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". - * - * If its "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. - * it's "empty"). - * - * If its "open" state, then it contains one or more - * itxs that eventually need to be committed to stable - * storage. In this case we intentionally do not issue - * the lwb's zio to disk yet, and instead rely on one of - * the following two mechanisms for issuing the zio: - * - * 1. Ideally, there will be more ZIL activity occuring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. - * - * 2. If there isn't sufficient ZIL activity occuring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed - * to be "full" by the time its zio is issued, and means - * the size of the lwb was "too large" given the amount - * of ZIL activity occuring on the system at that time. - * - * We do this for a couple of reasons: - * - * 1. 
To try and reduce the number of IOPs needed to - * write the same number of itxs. If an lwb has space - * available in it's buffer for more itxs, and more itxs - * will be committed relatively soon (relative to the - * latency of performing a write), then it's beneficial - * to wait for these "next" itxs. This way, more itxs - * can be committed to stable storage with fewer writes. - * - * 2. To try and use the largest lwb block size that the - * incoming rate of itxs can support. Again, this is to - * try and pack as many itxs into as few lwbs as - * possible, without significantly impacting the latency - * of each individual itx. - */ - } -} - -/* - * This function is responsible for ensuring the passed in commit waiter - * (and associated commit itx) is committed to an lwb. If the waiter is - * not already committed to an lwb, all itxs in the zilog's queue of - * itxs will be processed. The assumption is the passed in waiter's - * commit itx will found in the queue just like the other non-commit - * itxs, such that when the entire queue is processed, the waiter will - * have been commited to an lwb. - * - * The lwb associated with the passed in waiter is not guaranteed to - * have been issued by the time this function completes. If the lwb is - * not issued, we rely on future calls to zil_commit_writer() to issue - * the lwb, or the timeout mechanism found in zil_commit_waiter(). - */ -static void -zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - ASSERT(!MUTEX_HELD(&zilog->zl_lock)); - ASSERT(spa_writeable(zilog->zl_spa)); - - mutex_enter(&zilog->zl_issuer_lock); - - if (zcw->zcw_lwb != NULL || zcw->zcw_done) { - /* - * It's possible that, while we were waiting to acquire - * the "zl_issuer_lock", another thread committed this - * waiter to an lwb. If that occurs, we bail out early, - * without processing any of the zilog's queue of itxs. - * - * On certain workloads and system configurations, the - * "zl_issuer_lock" can become highly contended. 
In an - * attempt to reduce this contention, we immediately drop - * the lock if the waiter has already been processed. - * - * We've measured this optimization to reduce CPU spent - * contending on this lock by up to 5%, using a system - * with 32 CPUs, low latency storage (~50 usec writes), - * and 1024 threads performing sync writes. - */ - goto out; - } - - zil_get_commit_list(zilog); - zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); - -out: - mutex_exit(&zilog->zl_issuer_lock); -} - -static void -zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - - lwb_t *lwb = zcw->zcw_lwb; - ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); - - /* - * If the lwb has already been issued by another thread, we can - * immediately return since there's no work to be done (the - * point of this function is to issue the lwb). Additionally, we - * do this prior to acquiring the zl_issuer_lock, to avoid - * acquiring it when it's not necessary to do so. - */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) - return; - - /* - * In order to call zil_lwb_write_issue() we must hold the - * zilog's "zl_issuer_lock". We can't simply acquire that lock, - * since we're already holding the commit waiter's "zcw_lock", - * and those two locks are aquired in the opposite order - * elsewhere. - */ - mutex_exit(&zcw->zcw_lock); - mutex_enter(&zilog->zl_issuer_lock); - mutex_enter(&zcw->zcw_lock); - - /* - * Since we just dropped and re-acquired the commit waiter's - * lock, we have to re-check to see if the waiter was marked - * "done" during that process. 
If the waiter was marked "done", - * the "lwb" pointer is no longer valid (it can be free'd after - * the waiter is marked "done"), so without this check we could - * wind up with a use-after-free error below. - */ - if (zcw->zcw_done) - goto out; - - ASSERT3P(lwb, ==, zcw->zcw_lwb); - - /* - * We've already checked this above, but since we hadn't acquired - * the zilog's zl_issuer_lock, we have to perform this check a - * second time while holding the lock. - * - * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with - * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. - * - * The important thing, is we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the - * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. - * - * See the comment above the lwb_state_t structure definition for - * more details on the lwb states, and locking requirements. - */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) - goto out; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - - /* - * As described in the comments above zil_commit_waiter() and - * zil_process_commit_list(), we need to issue this lwb's zio - * since we've reached the commit waiter's timeout and it still - * hasn't been issued. - */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); - - IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); - - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. 
- * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this informaiton - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; - - if (nlwb == NULL) { - /* - * When zil_lwb_write_issue() returns NULL, this - * indicates zio_alloc_zil() failed to allocate the - * "next" lwb on-disk. When this occurs, the ZIL write - * pipeline must be stalled; see the comment within the - * zil_commit_writer_stall() function for more details. - * - * We must drop the commit waiter's lock prior to - * calling zil_commit_writer_stall() or else we can wind - * up with the following deadlock: - * - * - This thread is waiting for the txg to sync while - * holding the waiter's lock; txg_wait_synced() is - * used within txg_commit_writer_stall(). - * - * - The txg can't sync because it is waiting for this - * lwb's zio callback to call dmu_tx_commit(). - * - * - The lwb's zio callback can't call dmu_tx_commit() - * because it's blocked trying to acquire the waiter's - * lock, which occurs prior to calling dmu_tx_commit() - */ - mutex_exit(&zcw->zcw_lock); - zil_commit_writer_stall(zilog); - mutex_enter(&zcw->zcw_lock); - } - -out: - mutex_exit(&zilog->zl_issuer_lock); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); -} - -/* - * This function is responsible for performing the following two tasks: - * - * 1. its primary responsibility is to block until the given "commit - * waiter" is considered "done". - * - * 2. its secondary responsibility is to issue the zio for the lwb that - * the given "commit waiter" is waiting on, if this function has - * waited "long enough" and the lwb is still in the "open" state. 
- * - * Given a sufficient amount of itxs being generated and written using - * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() - * function. If this does not occur, this secondary responsibility will - * ensure the lwb is issued even if there is not other synchronous - * activity on the system. - * - * For more details, see zil_process_commit_list(); more specifically, - * the comment at the bottom of that function. - */ -static void -zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - ASSERT(!MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT(spa_writeable(zilog->zl_spa)); - - mutex_enter(&zcw->zcw_lock); - - /* - * The timeout is scaled based on the lwb latency to avoid - * significantly impacting the latency of each individual itx. - * For more details, see the comment at the bottom of the - * zil_process_commit_list() function. - */ - int pct = MAX(zfs_commit_timeout_pct, 1); -#if defined(illumos) || !defined(_KERNEL) - hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; - hrtime_t wakeup = gethrtime() + sleep; -#else - sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100); - sbintime_t wakeup = getsbinuptime() + sleep; -#endif - boolean_t timedout = B_FALSE; - - while (!zcw->zcw_done) { - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); - - lwb_t *lwb = zcw->zcw_lwb; - - /* - * Usually, the waiter will have a non-NULL lwb field here, - * but it's possible for it to be NULL as a result of - * zil_commit() racing with spa_sync(). - * - * When zil_clean() is called, it's possible for the itxg - * list (which may be cleaned via a taskq) to contain - * commit itxs. When this occurs, the commit waiters linked - * off of these commit itxs will not be committed to an - * lwb. Additionally, these commit waiters will not be - * marked done until zil_commit_waiter_skip() is called via - * zil_itxg_clean(). - * - * Thus, it's possible for this commit waiter (i.e. 
the - * "zcw" variable) to be found in this "in between" state; - * where it's "zcw_lwb" field is NULL, and it hasn't yet - * been skipped, so it's "zcw_done" field is still B_FALSE. - */ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); - - if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { - ASSERT3B(timedout, ==, B_FALSE); - - /* - * If the lwb hasn't been issued yet, then we - * need to wait with a timeout, in case this - * function needs to issue the lwb after the - * timeout is reached; responsibility (2) from - * the comment above this function. - */ -#if defined(illumos) || !defined(_KERNEL) - clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, - &zcw->zcw_lock, wakeup, USEC2NSEC(1), - CALLOUT_FLAG_ABSOLUTE); - - if (timeleft >= 0 || zcw->zcw_done) - continue; -#else - int wait_err = cv_timedwait_sbt(&zcw->zcw_cv, - &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE); - if (wait_err != EWOULDBLOCK || zcw->zcw_done) - continue; -#endif - - timedout = B_TRUE; - zil_commit_waiter_timeout(zilog, zcw); - - if (!zcw->zcw_done) { - /* - * If the commit waiter has already been - * marked "done", it's possible for the - * waiter's lwb structure to have already - * been freed. Thus, we can only reliably - * make these assertions if the waiter - * isn't done. - */ - ASSERT3P(lwb, ==, zcw->zcw_lwb); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); - } - } else { - /* - * If the lwb isn't open, then it must have already - * been issued. In that case, there's no need to - * use a timeout when waiting for the lwb to - * complete. - * - * Additionally, if the lwb is NULL, the waiter - * will soon be signalled and marked done via - * zil_clean() and zil_itxg_clean(), so no timeout - * is required. 
- */ - - IMPLY(lwb != NULL, - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); - cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); - } - } - - mutex_exit(&zcw->zcw_lock); -} - -static zil_commit_waiter_t * -zil_alloc_commit_waiter() -{ - zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); - - cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&zcw->zcw_node); - zcw->zcw_lwb = NULL; - zcw->zcw_done = B_FALSE; - zcw->zcw_zio_error = 0; - - return (zcw); -} - -static void -zil_free_commit_waiter(zil_commit_waiter_t *zcw) -{ - ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3B(zcw->zcw_done, ==, B_TRUE); - mutex_destroy(&zcw->zcw_lock); - cv_destroy(&zcw->zcw_cv); - kmem_cache_free(zil_zcw_cache, zcw); -} - -/* - * This function is used to create a TX_COMMIT itx and assign it. This - * way, it will be linked into the ZIL's list of synchronous itxs, and - * then later committed to an lwb (or skipped) when - * zil_process_commit_list() is called. - */ -static void -zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - - itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); - itx->itx_sync = B_TRUE; - itx->itx_private = zcw; - - zil_itx_assign(zilog, itx, tx); - - dmu_tx_commit(tx); -} - -/* - * Commit ZFS Intent Log transactions (itxs) to stable storage. - * - * When writing ZIL transactions to the on-disk representation of the - * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple - * itxs can be committed to a single lwb. Once a lwb is written and - * committed to stable storage (i.e. the lwb is written, and vdevs have - * been flushed), each itx that was committed to that lwb is also - * considered to be committed to stable storage. 
- * - * When an itx is committed to an lwb, the log record (lr_t) contained - * by the itx is copied into the lwb's zio buffer, and once this buffer - * is written to disk, it becomes an on-disk ZIL block. - * - * As itxs are generated, they're inserted into the ZIL's queue of - * uncommitted itxs. The semantics of zil_commit() are such that it will - * block until all itxs that were in the queue when it was called, are - * committed to stable storage. - * - * If "foid" is zero, this means all "synchronous" and "asynchronous" - * itxs, for all objects in the dataset, will be committed to stable - * storage prior to zil_commit() returning. If "foid" is non-zero, all - * "synchronous" itxs for all objects, but only "asynchronous" itxs - * that correspond to the foid passed in, will be committed to stable - * storage prior to zil_commit() returning. - * - * Generally speaking, when zil_commit() is called, the consumer doesn't - * actually care about _all_ of the uncommitted itxs. Instead, they're - * simply trying to waiting for a specific itx to be committed to disk, - * but the interface(s) for interacting with the ZIL don't allow such - * fine-grained communication. A better interface would allow a consumer - * to create and assign an itx, and then pass a reference to this itx to - * zil_commit(); such that zil_commit() would return as soon as that - * specific itx was committed to disk (instead of waiting for _all_ - * itxs to be committed). - * - * When a thread calls zil_commit() a special "commit itx" will be - * generated, along with a corresponding "waiter" for this commit itx. - * zil_commit() will wait on this waiter's CV, such that when the waiter - * is marked done, and signalled, zil_commit() will return. - * - * This commit itx is inserted into the queue of uncommitted itxs. 
This - * provides an easy mechanism for determining which itxs were in the - * queue prior to zil_commit() having been called, and which itxs were - * added after zil_commit() was called. - * - * The commit it is special; it doesn't have any on-disk representation. - * When a commit itx is "committed" to an lwb, the waiter associated - * with it is linked onto the lwb's list of waiters. Then, when that lwb - * completes, each waiter on the lwb's list is marked done and signalled - * -- allowing the thread waiting on the waiter to return from zil_commit(). - * - * It's important to point out a few critical factors that allow us - * to make use of the commit itxs, commit waiters, per-lwb lists of - * commit waiters, and zio completion callbacks like we're doing: - * - * 1. The list of waiters for each lwb is traversed, and each commit - * waiter is marked "done" and signalled, in the zio completion - * callback of the lwb's zio[*]. - * - * * Actually, the waiters are signalled in the zio completion - * callback of the root zio for the DKIOCFLUSHWRITECACHE commands - * that are sent to the vdevs upon completion of the lwb zio. - * - * 2. When the itxs are inserted into the ZIL's queue of uncommitted - * itxs, the order in which they are inserted is preserved[*]; as - * itxs are added to the queue, they are added to the tail of - * in-memory linked lists. - * - * When committing the itxs to lwbs (to be written to disk), they - * are committed in the same order in which the itxs were added to - * the uncommitted queue's linked list(s); i.e. the linked list of - * itxs to commit is traversed from head to tail, and each itx is - * committed to an lwb in that order. - * - * * To clarify: - * - * - the order of "sync" itxs is preserved w.r.t. other - * "sync" itxs, regardless of the corresponding objects. - * - the order of "async" itxs is preserved w.r.t. other - * "async" itxs corresponding to the same object. - * - the order of "async" itxs is *not* preserved w.r.t. 
other - * "async" itxs corresponding to different objects. - * - the order of "sync" itxs w.r.t. "async" itxs (or vice - * versa) is *not* preserved, even for itxs that correspond - * to the same object. - * - * For more details, see: zil_itx_assign(), zil_async_to_sync(), - * zil_get_commit_list(), and zil_process_commit_list(). - * - * 3. The lwbs represent a linked list of blocks on disk. Thus, any - * lwb cannot be considered committed to stable storage, until its - * "previous" lwb is also committed to stable storage. This fact, - * coupled with the fact described above, means that itxs are - * committed in (roughly) the order in which they were generated. - * This is essential because itxs are dependent on prior itxs. - * Thus, we *must not* deem an itx as being committed to stable - * storage, until *all* prior itxs have also been committed to - * stable storage. - * - * To enforce this ordering of lwb zio's, while still leveraging as - * much of the underlying storage performance as possible, we rely - * on two fundamental concepts: - * - * 1. The creation and issuance of lwb zio's is protected by - * the zilog's "zl_issuer_lock", which ensures only a single - * thread is creating and/or issuing lwb's at a time - * 2. The "previous" lwb is a child of the "current" lwb - * (leveraging the zio parent-child depenency graph) - * - * By relying on this parent-child zio relationship, we can have - * many lwb zio's concurrently issued to the underlying storage, - * but the order in which they complete will be the same order in - * which they were created. - */ -void -zil_commit(zilog_t *zilog, uint64_t foid) -{ - /* - * We should never attempt to call zil_commit on a snapshot for - * a couple of reasons: - * - * 1. A snapshot may never be modified, thus it cannot have any - * in-flight itxs that would have modified the dataset. - * - * 2. 
By design, when zil_commit() is called, a commit itx will - * be assigned to this zilog; as a result, the zilog will be - * dirtied. We must not dirty the zilog of a snapshot; there's - * checks in the code that enforce this invariant, and will - * cause a panic if it's not upheld. - */ - ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); - - if (zilog->zl_sync == ZFS_SYNC_DISABLED) - return; - - if (!spa_writeable(zilog->zl_spa)) { - /* - * If the SPA is not writable, there should never be any - * pending itxs waiting to be committed to disk. If that - * weren't true, we'd skip writing those itxs out, and - * would break the sematics of zil_commit(); thus, we're - * verifying that truth before we return to the caller. - */ - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); - for (int i = 0; i < TXG_SIZE; i++) - ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); - return; - } - - /* - * If the ZIL is suspended, we don't want to dirty it by calling - * zil_commit_itx_assign() below, nor can we write out - * lwbs like would be done in zil_commit_write(). Thus, we - * simply rely on txg_wait_synced() to maintain the necessary - * semantics, and avoid calling those functions altogether. - */ - if (zilog->zl_suspend > 0) { - txg_wait_synced(zilog->zl_dmu_pool, 0); - return; - } - - zil_commit_impl(zilog, foid); -} - -void -zil_commit_impl(zilog_t *zilog, uint64_t foid) -{ - /* - * Move the "async" itxs for the specified foid to the "sync" - * queues, such that they will be later committed (or skipped) - * to an lwb when zil_process_commit_list() is called. - * - * Since these "async" itxs must be committed prior to this - * call to zil_commit returning, we must perform this operation - * before we call zil_commit_itx_assign(). - */ - zil_async_to_sync(zilog, foid); - - /* - * We allocate a new "waiter" structure which will initially be - * linked to the commit itx using the itx's "itx_private" field. 
- * Since the commit itx doesn't represent any on-disk state, - * when it's committed to an lwb, rather than copying the its - * lr_t into the lwb's buffer, the commit itx's "waiter" will be - * added to the lwb's list of waiters. Then, when the lwb is - * committed to stable storage, each waiter in the lwb's list of - * waiters will be marked "done", and signalled. - * - * We must create the waiter and assign the commit itx prior to - * calling zil_commit_writer(), or else our specific commit itx - * is not guaranteed to be committed to an lwb prior to calling - * zil_commit_waiter(). - */ - zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); - zil_commit_itx_assign(zilog, zcw); - - zil_commit_writer(zilog, zcw); - zil_commit_waiter(zilog, zcw); - - if (zcw->zcw_zio_error != 0) { - /* - * If there was an error writing out the ZIL blocks that - * this thread is waiting on, then we fallback to - * relying on spa_sync() to write out the data this - * thread is waiting on. Obviously this has performance - * implications, but the expectation is for this to be - * an exceptional case, and shouldn't occur often. - */ - DTRACE_PROBE2(zil__commit__io__error, - zilog_t *, zilog, zil_commit_waiter_t *, zcw); - txg_wait_synced(zilog->zl_dmu_pool, 0); - } - - zil_free_commit_waiter(zcw); -} - -/* - * Called in syncing context to free committed log blocks and update log header. - */ -void -zil_sync(zilog_t *zilog, dmu_tx_t *tx) -{ - zil_header_t *zh = zil_header_in_syncing_context(zilog); - uint64_t txg = dmu_tx_get_txg(tx); - spa_t *spa = zilog->zl_spa; - uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; - lwb_t *lwb; - - /* - * We don't zero out zl_destroy_txg, so make sure we don't try - * to destroy it twice. 
- */ - if (spa_sync_pass(spa) != 1) - return; - - mutex_enter(&zilog->zl_lock); - - ASSERT(zilog->zl_stop_sync == 0); - - if (*replayed_seq != 0) { - ASSERT(zh->zh_replay_seq < *replayed_seq); - zh->zh_replay_seq = *replayed_seq; - *replayed_seq = 0; - } - - if (zilog->zl_destroy_txg == txg) { - blkptr_t blk = zh->zh_log; - - ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - - bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); - - if (zilog->zl_keep_first) { - /* - * If this block was part of log chain that couldn't - * be claimed because a device was missing during - * zil_claim(), but that device later returns, - * then this block could erroneously appear valid. - * To guard against this, assign a new GUID to the new - * log chain so it doesn't matter what blk points to. - */ - zil_init_log_chain(zilog, &blk); - zh->zh_log = blk; - } - } - - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - zh->zh_log = lwb->lwb_blk; - if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) - break; - list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); - zil_free_lwb(zilog, lwb); - - /* - * If we don't have anything left in the lwb list then - * we've had an allocation failure and we need to zero - * out the zil_header blkptr so that we don't end - * up freeing the same block twice. 
- */ - if (list_head(&zilog->zl_lwb_list) == NULL) - BP_ZERO(&zh->zh_log); - } - mutex_exit(&zilog->zl_lock); -} - -/* ARGSUSED */ -static int -zil_lwb_cons(void *vbuf, void *unused, int kmflag) -{ - lwb_t *lwb = vbuf; - list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, - sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); - mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); - return (0); -} - -/* ARGSUSED */ -static void -zil_lwb_dest(void *vbuf, void *unused) -{ - lwb_t *lwb = vbuf; - mutex_destroy(&lwb->lwb_vdev_lock); - avl_destroy(&lwb->lwb_vdev_tree); - list_destroy(&lwb->lwb_waiters); -} - -void -zil_init(void) -{ - zil_lwb_cache = kmem_cache_create("zil_lwb_cache", - sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); - - zil_zcw_cache = kmem_cache_create("zil_zcw_cache", - sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -zil_fini(void) -{ - kmem_cache_destroy(zil_zcw_cache); - kmem_cache_destroy(zil_lwb_cache); -} - -void -zil_set_sync(zilog_t *zilog, uint64_t sync) -{ - zilog->zl_sync = sync; -} - -void -zil_set_logbias(zilog_t *zilog, uint64_t logbias) -{ - zilog->zl_logbias = logbias; -} - -zilog_t * -zil_alloc(objset_t *os, zil_header_t *zh_phys) -{ - zilog_t *zilog; - - zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); - - zilog->zl_header = zh_phys; - zilog->zl_os = os; - zilog->zl_spa = dmu_objset_spa(os); - zilog->zl_dmu_pool = dmu_objset_pool(os); - zilog->zl_destroy_txg = TXG_INITIAL - 1; - zilog->zl_logbias = dmu_objset_logbias(os); - zilog->zl_sync = dmu_objset_syncprop(os); - zilog->zl_dirty_max_txg = 0; - zilog->zl_last_lwb_opened = NULL; - zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; - - mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); - - for (int i = 0; i < 
TXG_SIZE; i++) { - mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, - MUTEX_DEFAULT, NULL); - } - - list_create(&zilog->zl_lwb_list, sizeof (lwb_t), - offsetof(lwb_t, lwb_node)); - - list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - - cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); - - return (zilog); -} - -void -zil_free(zilog_t *zilog) -{ - zilog->zl_stop_sync = 1; - - ASSERT0(zilog->zl_suspend); - ASSERT0(zilog->zl_suspending); - - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - list_destroy(&zilog->zl_lwb_list); - - ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); - list_destroy(&zilog->zl_itx_commit_list); - - for (int i = 0; i < TXG_SIZE; i++) { - /* - * It's possible for an itx to be generated that doesn't dirty - * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() - * callback to remove the entry. We remove those here. - * - * Also free up the ziltest itxs. - */ - if (zilog->zl_itxg[i].itxg_itxs) - zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); - mutex_destroy(&zilog->zl_itxg[i].itxg_lock); - } - - mutex_destroy(&zilog->zl_issuer_lock); - mutex_destroy(&zilog->zl_lock); - - cv_destroy(&zilog->zl_cv_suspend); - - kmem_free(zilog, sizeof (zilog_t)); -} - -/* - * Open an intent log. - */ -zilog_t * -zil_open(objset_t *os, zil_get_data_t *get_data) -{ - zilog_t *zilog = dmu_objset_zil(os); - - ASSERT3P(zilog->zl_get_data, ==, NULL); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - - zilog->zl_get_data = get_data; - - return (zilog); -} - -/* - * Close an intent log. 
- */ -void -zil_close(zilog_t *zilog) -{ - lwb_t *lwb; - uint64_t txg; - - if (!dmu_objset_is_snapshot(zilog->zl_os)) { - zil_commit(zilog, 0); - } else { - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); - ASSERT0(zilog->zl_dirty_max_txg); - ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); - } - - mutex_enter(&zilog->zl_lock); - lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); - mutex_exit(&zilog->zl_lock); - - /* - * We need to use txg_wait_synced() to wait long enough for the - * ZIL to be clean, and to wait for all pending lwbs to be - * written out. - */ - if (txg) - txg_wait_synced(zilog->zl_dmu_pool, txg); - - if (zilog_is_dirty(zilog)) - zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); - if (txg < spa_freeze_txg(zilog->zl_spa)) - VERIFY(!zilog_is_dirty(zilog)); - - zilog->zl_get_data = NULL; - - /* - * We should have only one lwb left on the list; remove it now. - */ - mutex_enter(&zilog->zl_lock); - lwb = list_head(&zilog->zl_lwb_list); - if (lwb != NULL) { - ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - list_remove(&zilog->zl_lwb_list, lwb); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zil_free_lwb(zilog, lwb); - } - mutex_exit(&zilog->zl_lock); -} - -static char *suspend_tag = "zil suspending"; - -/* - * Suspend an intent log. While in suspended mode, we still honor - * synchronous semantics, but we rely on txg_wait_synced() to do it. - * On old version pools, we suspend the log briefly when taking a - * snapshot so that it will have an empty intent log. - * - * Long holds are not really intended to be used the way we do here -- - * held for such a short time. A concurrent caller of dsl_dataset_long_held() - * could fail. Therefore we take pains to only put a long hold if it is - * actually necessary. 
Fortunately, it will only be necessary if the - * objset is currently mounted (or the ZVOL equivalent). In that case it - * will already have a long hold, so we are not really making things any worse. - * - * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or - * zvol_state_t), and use their mechanism to prevent their hold from being - * dropped (e.g. VFS_HOLD()). However, that would be even more pain for - * very little gain. - * - * if cookiep == NULL, this does both the suspend & resume. - * Otherwise, it returns with the dataset "long held", and the cookie - * should be passed into zil_resume(). - */ -int -zil_suspend(const char *osname, void **cookiep) -{ - objset_t *os; - zilog_t *zilog; - const zil_header_t *zh; - int error; - - error = dmu_objset_hold(osname, suspend_tag, &os); - if (error != 0) - return (error); - zilog = dmu_objset_zil(os); - - mutex_enter(&zilog->zl_lock); - zh = zilog->zl_header; - - if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ - mutex_exit(&zilog->zl_lock); - dmu_objset_rele(os, suspend_tag); - return (SET_ERROR(EBUSY)); - } - - /* - * Don't put a long hold in the cases where we can avoid it. This - * is when there is no cookie so we are doing a suspend & resume - * (i.e. called from zil_vdev_offline()), and there's nothing to do - * for the suspend because it's already suspended, or there's no ZIL. - */ - if (cookiep == NULL && !zilog->zl_suspending && - (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { - mutex_exit(&zilog->zl_lock); - dmu_objset_rele(os, suspend_tag); - return (0); - } - - dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); - dsl_pool_rele(dmu_objset_pool(os), suspend_tag); - - zilog->zl_suspend++; - - if (zilog->zl_suspend > 1) { - /* - * Someone else is already suspending it. - * Just wait for them to finish. 
- */ - - while (zilog->zl_suspending) - cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); - mutex_exit(&zilog->zl_lock); - - if (cookiep == NULL) - zil_resume(os); - else - *cookiep = os; - return (0); - } - - /* - * If there is no pointer to an on-disk block, this ZIL must not - * be active (e.g. filesystem not mounted), so there's nothing - * to clean up. - */ - if (BP_IS_HOLE(&zh->zh_log)) { - ASSERT(cookiep != NULL); /* fast path already handled */ - - *cookiep = os; - mutex_exit(&zilog->zl_lock); - return (0); - } - - zilog->zl_suspending = B_TRUE; - mutex_exit(&zilog->zl_lock); - - /* - * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed - * to disk before proceeding. If we used zil_commit instead, it - * would just call txg_wait_synced(), because zl_suspend is set. - * txg_wait_synced() doesn't wait for these lwb's to be - * LWB_STATE_FLUSH_DONE before returning. - */ - zil_commit_impl(zilog, 0); - - /* - * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we - * use txg_wait_synced() to ensure the data from the zilog has - * migrated to the main pool before calling zil_destroy(). 
- */ - txg_wait_synced(zilog->zl_dmu_pool, 0); - - zil_destroy(zilog, B_FALSE); - - mutex_enter(&zilog->zl_lock); - zilog->zl_suspending = B_FALSE; - cv_broadcast(&zilog->zl_cv_suspend); - mutex_exit(&zilog->zl_lock); - - if (cookiep == NULL) - zil_resume(os); - else - *cookiep = os; - return (0); -} - -void -zil_resume(void *cookie) -{ - objset_t *os = cookie; - zilog_t *zilog = dmu_objset_zil(os); - - mutex_enter(&zilog->zl_lock); - ASSERT(zilog->zl_suspend != 0); - zilog->zl_suspend--; - mutex_exit(&zilog->zl_lock); - dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); - dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); -} - -typedef struct zil_replay_arg { - zil_replay_func_t **zr_replay; - void *zr_arg; - boolean_t zr_byteswap; - char *zr_lr; -} zil_replay_arg_t; - -static int -zil_replay_error(zilog_t *zilog, lr_t *lr, int error) -{ - char name[ZFS_MAX_DATASET_NAME_LEN]; - - zilog->zl_replaying_seq--; /* didn't actually replay this one */ - - dmu_objset_name(zilog->zl_os, name); - - cmn_err(CE_WARN, "ZFS replay transaction error %d, " - "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, - (u_longlong_t)lr->lrc_seq, - (u_longlong_t)(lr->lrc_txtype & ~TX_CI), - (lr->lrc_txtype & TX_CI) ? "CI" : ""); - - return (error); -} - -static int -zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) -{ - zil_replay_arg_t *zr = zra; - const zil_header_t *zh = zilog->zl_header; - uint64_t reclen = lr->lrc_reclen; - uint64_t txtype = lr->lrc_txtype; - int error = 0; - - zilog->zl_replaying_seq = lr->lrc_seq; - - if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ - return (0); - - if (lr->lrc_txg < claim_txg) /* already committed */ - return (0); - - /* Strip case-insensitive bit, still present in log record */ - txtype &= ~TX_CI; - - if (txtype == 0 || txtype >= TX_MAX_TYPE) - return (zil_replay_error(zilog, lr, EINVAL)); - - /* - * If this record type can be logged out of order, the object - * (lr_foid) may no longer exist. 
That's legitimate, not an error. - */ - if (TX_OOO(txtype)) { - error = dmu_object_info(zilog->zl_os, - LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); - if (error == ENOENT || error == EEXIST) - return (0); - } - - /* - * Make a copy of the data so we can revise and extend it. - */ - bcopy(lr, zr->zr_lr, reclen); - - /* - * If this is a TX_WRITE with a blkptr, suck in the data. - */ - if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { - error = zil_read_log_data(zilog, (lr_write_t *)lr, - zr->zr_lr + reclen); - if (error != 0) - return (zil_replay_error(zilog, lr, error)); - } - - /* - * The log block containing this lr may have been byteswapped - * so that we can easily examine common fields like lrc_txtype. - * However, the log is a mix of different record types, and only the - * replay vectors know how to byteswap their records. Therefore, if - * the lr was byteswapped, undo it before invoking the replay vector. - */ - if (zr->zr_byteswap) - byteswap_uint64_array(zr->zr_lr, reclen); - - /* - * We must now do two things atomically: replay this log record, - * and update the log header sequence number to reflect the fact that - * we did so. At the end of each replay function the sequence number - * is updated if we are in replay mode. - */ - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); - if (error != 0) { - /* - * The DMU's dnode layer doesn't see removes until the txg - * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error we try syncing out - * any removes then retry the transaction. Note that we - * specify B_FALSE for byteswap now, so we don't do it twice. 
- */ - txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); - if (error != 0) - return (zil_replay_error(zilog, lr, error)); - } - return (0); -} - -/* ARGSUSED */ -static int -zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zilog->zl_replay_blks++; - - return (0); -} - -/* - * If this dataset has a non-empty intent log, replay it and destroy it. - */ -void -zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) -{ - zilog_t *zilog = dmu_objset_zil(os); - const zil_header_t *zh = zilog->zl_header; - zil_replay_arg_t zr; - - if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { - zil_destroy(zilog, B_TRUE); - return; - } - - zr.zr_replay = replay_func; - zr.zr_arg = arg; - zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); - zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); - - /* - * Wait for in-progress removes to sync before starting replay. - */ - txg_wait_synced(zilog->zl_dmu_pool, 0); - - zilog->zl_replay = B_TRUE; - zilog->zl_replay_time = ddi_get_lbolt(); - ASSERT(zilog->zl_replay_blks == 0); - (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, - zh->zh_claim_txg); - kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); - - zil_destroy(zilog, B_FALSE); - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - zilog->zl_replay = B_FALSE; -} - -boolean_t -zil_replaying(zilog_t *zilog, dmu_tx_t *tx) -{ - if (zilog->zl_sync == ZFS_SYNC_DISABLED) - return (B_TRUE); - - if (zilog->zl_replay) { - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = - zilog->zl_replaying_seq; - return (B_TRUE); - } - - return (B_FALSE); -} - -/* ARGSUSED */ -int -zil_reset(const char *osname, void *arg) -{ - int error; - - error = zil_suspend(osname, NULL); - if (error != 0) - return (SET_ERROR(EEXIST)); - return (0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c deleted file mode 100644 index a026b3bfe02d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ /dev/null @@ -1,4386 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS ZIO"); -#if defined(__amd64__) -static int zio_use_uma = 1; -#else -static int zio_use_uma = 0; -#endif -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, - "Use uma(9) for ZIO allocations"); -static int zio_exclude_metadata = 0; -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, - "Exclude metadata buffers from dumps as well"); - -zio_trim_stats_t zio_trim_stats = { - { "bytes", KSTAT_DATA_UINT64, - "Number of bytes successfully TRIMmed" }, - { "success", KSTAT_DATA_UINT64, - "Number of successful TRIM requests" }, - { "unsupported", KSTAT_DATA_UINT64, - "Number of TRIM requests that failed because TRIM is not supported" }, - { "failed", KSTAT_DATA_UINT64, - "Number of TRIM requests that failed for reasons other than not supported" }, -}; - -static kstat_t *zio_trim_ksp; - -/* - * ========================================================================== - * I/O type descriptions - * ========================================================================== - */ -const char *zio_type_name[ZIO_TYPES] = { - "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", - "zio_ioctl" -}; - -boolean_t zio_dva_throttle_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RWTUN, - &zio_dva_throttle_enabled, 0, "Enable allocation throttling"); - -/* - * ========================================================================== - * I/O kmem caches - * ========================================================================== - */ -kmem_cache_t *zio_cache; -kmem_cache_t *zio_link_cache; -kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; -kmem_cache_t 
*zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; - -#ifdef _KERNEL -extern vmem_t *zio_alloc_arena; -#endif - -#define BP_SPANB(indblkshift, level) \ - (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) -#define COMPARE_META_LEVEL 0x80000000ul -/* - * The following actions directly effect the spa's sync-to-convergence logic. - * The values below define the sync pass when we start performing the action. - * Care should be taken when changing these values as they directly impact - * spa_sync() performance. Tuning these values may introduce subtle performance - * pathologies and should only be done in the context of performance analysis. - * These tunables will eventually be removed and replaced with #defines once - * enough analysis has been done to determine optimal values. - * - * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that - * regular blocks are not deferred. - */ -int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ -SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, - &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); -int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ -SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, - &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); -int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ -SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, - &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); - -/* - * An allocating zio is one that either currently has the DVA allocate - * stage set or will have it later in its lifetime. 
- */ -#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) - -boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; - -#ifdef illumos -#ifdef ZFS_DEBUG -int zio_buf_debug_limit = 16384; -#else -int zio_buf_debug_limit = 0; -#endif -#endif - -static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); - -void -zio_init(void) -{ - size_t c; - zio_cache = kmem_cache_create("zio_cache", - sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - zio_link_cache = kmem_cache_create("zio_link_cache", - sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - if (!zio_use_uma) - goto out; - - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. For larger buffers, we want a cache - * for each quarter-power of 2. - */ - for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { - size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t p2 = size; - size_t align = 0; - int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; - - while (!ISP2(p2)) - p2 &= p2 - 1; - -#ifdef illumos -#ifndef _KERNEL - /* - * If we are using watchpoints, put each buffer on its own page, - * to eliminate the performance overhead of trapping to the - * kernel when modifying a non-watched buffer that shares the - * page with a watched buffer. - */ - if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) - continue; -#endif -#endif /* illumos */ - if (size <= 4 * SPA_MINBLOCKSIZE) { - align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = MIN(p2 >> 2, PAGESIZE); - } - - if (align != 0) { - char name[36]; - (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, cflags); - - /* - * Since zio_data bufs do not appear in crash dumps, we - * pass KMC_NOTOUCH so that no allocator metadata is - * stored with the buffers. 
- */ - (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, - cflags | KMC_NOTOUCH | KMC_NODEBUG); - } - } - - while (--c != 0) { - ASSERT(zio_buf_cache[c] != NULL); - if (zio_buf_cache[c - 1] == NULL) - zio_buf_cache[c - 1] = zio_buf_cache[c]; - - ASSERT(zio_data_buf_cache[c] != NULL); - if (zio_data_buf_cache[c - 1] == NULL) - zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; - } -out: - - zio_inject_init(); - - zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", - KSTAT_TYPE_NAMED, - sizeof(zio_trim_stats) / sizeof(kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (zio_trim_ksp != NULL) { - zio_trim_ksp->ks_data = &zio_trim_stats; - kstat_install(zio_trim_ksp); - } -} - -void -zio_fini(void) -{ - size_t c; - kmem_cache_t *last_cache = NULL; - kmem_cache_t *last_data_cache = NULL; - - for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { - if (zio_buf_cache[c] != last_cache) { - last_cache = zio_buf_cache[c]; - kmem_cache_destroy(zio_buf_cache[c]); - } - zio_buf_cache[c] = NULL; - - if (zio_data_buf_cache[c] != last_data_cache) { - last_data_cache = zio_data_buf_cache[c]; - kmem_cache_destroy(zio_data_buf_cache[c]); - } - zio_data_buf_cache[c] = NULL; - } - - kmem_cache_destroy(zio_link_cache); - kmem_cache_destroy(zio_cache); - - zio_inject_fini(); - - if (zio_trim_ksp != NULL) { - kstat_delete(zio_trim_ksp); - zio_trim_ksp = NULL; - } -} - -/* - * ========================================================================== - * Allocate and free I/O buffers - * ========================================================================== - */ - -/* - * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a - * crashdump if the kernel panics, so use it judiciously. Obviously, it's - * useful to inspect ZFS metadata, but if possible, we should avoid keeping - * excess / transient data in-core during a crashdump. 
- */ -void * -zio_buf_alloc(size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - int flags = zio_exclude_metadata ? KM_NODEBUG : 0; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); - else - return (kmem_alloc(size, KM_SLEEP|flags)); -} - -/* - * Use zio_data_buf_alloc to allocate data. The data will not appear in a - * crashdump if the kernel panics. This exists so that we will limit the amount - * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount - * of kernel heap dumped to disk when the kernel panics) - */ -void * -zio_data_buf_alloc(size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); - else - return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); -} - -void -zio_buf_free(void *buf, size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - kmem_cache_free(zio_buf_cache[c], buf); - else - kmem_free(buf, size); -} - -void -zio_data_buf_free(void *buf, size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - kmem_cache_free(zio_data_buf_cache[c], buf); - else - kmem_free(buf, size); -} - -/* - * ========================================================================== - * Push and pop I/O transform buffers - * ========================================================================== - */ -void -zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, - zio_transform_func_t *transform) -{ - zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - - /* - * Ensure that anyone expecting this zio to contain a linear ABD isn't - * going to get a nasty surprise when they try to access the data. 
- */ -#ifdef illumos - IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); -#else - IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd), - abd_is_linear(data)); -#endif - - zt->zt_orig_abd = zio->io_abd; - zt->zt_orig_size = zio->io_size; - zt->zt_bufsize = bufsize; - zt->zt_transform = transform; - - zt->zt_next = zio->io_transform_stack; - zio->io_transform_stack = zt; - - zio->io_abd = data; - zio->io_size = size; -} - -void -zio_pop_transforms(zio_t *zio) -{ - zio_transform_t *zt; - - while ((zt = zio->io_transform_stack) != NULL) { - if (zt->zt_transform != NULL) - zt->zt_transform(zio, - zt->zt_orig_abd, zt->zt_orig_size); - - if (zt->zt_bufsize != 0) - abd_free(zio->io_abd); - - zio->io_abd = zt->zt_orig_abd; - zio->io_size = zt->zt_orig_size; - zio->io_transform_stack = zt->zt_next; - - kmem_free(zt, sizeof (zio_transform_t)); - } -} - -/* - * ========================================================================== - * I/O transform callbacks for subblocks and decompression - * ========================================================================== - */ -static void -zio_subblock(zio_t *zio, abd_t *data, uint64_t size) -{ - ASSERT(zio->io_size > size); - - if (zio->io_type == ZIO_TYPE_READ) - abd_copy(data, zio->io_abd, size); -} - -static void -zio_decompress(zio_t *zio, abd_t *data, uint64_t size) -{ - if (zio->io_error == 0) { - void *tmp = abd_borrow_buf(data, size); - int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_abd, tmp, zio->io_size, size); - abd_return_buf_copy(data, tmp, size); - - if (ret != 0) - zio->io_error = SET_ERROR(EIO); - } -} - -/* - * ========================================================================== - * I/O parent/child relationships and pipeline interlocks - * ========================================================================== - */ -zio_t * -zio_walk_parents(zio_t *cio, zio_link_t **zl) -{ - list_t *pl = &cio->io_parent_list; - - *zl = (*zl == NULL) ? 
list_head(pl) : list_next(pl, *zl); - if (*zl == NULL) - return (NULL); - - ASSERT((*zl)->zl_child == cio); - return ((*zl)->zl_parent); -} - -zio_t * -zio_walk_children(zio_t *pio, zio_link_t **zl) -{ - list_t *cl = &pio->io_child_list; - - ASSERT(MUTEX_HELD(&pio->io_lock)); - - *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); - if (*zl == NULL) - return (NULL); - - ASSERT((*zl)->zl_parent == pio); - return ((*zl)->zl_child); -} - -zio_t * -zio_unique_parent(zio_t *cio) -{ - zio_link_t *zl = NULL; - zio_t *pio = zio_walk_parents(cio, &zl); - - VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); - return (pio); -} - -void -zio_add_child(zio_t *pio, zio_t *cio) -{ - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); - - /* - * Logical I/Os can have logical, gang, or vdev children. - * Gang I/Os can have gang or vdev children. - * Vdev I/Os can only have vdev children. - * The following ASSERT captures all of these constraints. - */ - ASSERT3S(cio->io_child_type, <=, pio->io_child_type); - - zl->zl_parent = pio; - zl->zl_child = cio; - - mutex_enter(&pio->io_lock); - mutex_enter(&cio->io_lock); - - ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); - - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; - - list_insert_head(&pio->io_child_list, zl); - list_insert_head(&cio->io_parent_list, zl); - - pio->io_child_count++; - cio->io_parent_count++; - - mutex_exit(&cio->io_lock); - mutex_exit(&pio->io_lock); -} - -static void -zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) -{ - ASSERT(zl->zl_parent == pio); - ASSERT(zl->zl_child == cio); - - mutex_enter(&pio->io_lock); - mutex_enter(&cio->io_lock); - - list_remove(&pio->io_child_list, zl); - list_remove(&cio->io_parent_list, zl); - - pio->io_child_count--; - cio->io_parent_count--; - - mutex_exit(&cio->io_lock); - mutex_exit(&pio->io_lock); - kmem_cache_free(zio_link_cache, zl); -} - -static boolean_t -zio_wait_for_children(zio_t *zio, uint8_t 
childbits, enum zio_wait_type wait) -{ - boolean_t waiting = B_FALSE; - - mutex_enter(&zio->io_lock); - ASSERT(zio->io_stall == NULL); - for (int c = 0; c < ZIO_CHILD_TYPES; c++) { - if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) - continue; - - uint64_t *countp = &zio->io_children[c][wait]; - if (*countp != 0) { - zio->io_stage >>= 1; - ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); - zio->io_stall = countp; - waiting = B_TRUE; - break; - } - } - mutex_exit(&zio->io_lock); - return (waiting); -} - -static void -zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, - zio_t **next_to_executep) -{ - uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; - int *errorp = &pio->io_child_error[zio->io_child_type]; - - mutex_enter(&pio->io_lock); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) - *errorp = zio_worst_error(*errorp, zio->io_error); - pio->io_reexecute |= zio->io_reexecute; - ASSERT3U(*countp, >, 0); - - (*countp)--; - - if (*countp == 0 && pio->io_stall == countp) { - zio_taskq_type_t type = - pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : - ZIO_TASKQ_INTERRUPT; - pio->io_stall = NULL; - mutex_exit(&pio->io_lock); - - /* - * If we can tell the caller to execute this parent next, do - * so. Otherwise dispatch the parent zio as its own task. - * - * Having the caller execute the parent when possible reduces - * locking on the zio taskq's, reduces context switch - * overhead, and has no recursion penalty. Note that one - * read from disk typically causes at least 3 zio's: a - * zio_null(), the logical zio_read(), and then a physical - * zio. When the physical ZIO completes, we are able to call - * zio_done() on all 3 of these zio's from one invocation of - * zio_execute() by returning the parent back to - * zio_execute(). Since the parent isn't executed until this - * thread returns back to zio_execute(), the caller should do - * so promptly. 
- * - * In other cases, dispatching the parent prevents - * overflowing the stack when we have deeply nested - * parent-child relationships, as we do with the "mega zio" - * of writes for spa_sync(), and the chain of ZIL blocks. - */ - if (next_to_executep != NULL && *next_to_executep == NULL) { - *next_to_executep = pio; - } else { - zio_taskq_dispatch(pio, type, B_FALSE); - } - } else { - mutex_exit(&pio->io_lock); - } -} - -static void -zio_inherit_child_errors(zio_t *zio, enum zio_child c) -{ - if (zio->io_child_error[c] != 0 && zio->io_error == 0) - zio->io_error = zio->io_child_error[c]; -} - -int -zio_bookmark_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = x1; - const zio_t *z2 = x2; - - if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) - return (-1); - if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) - return (1); - - if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) - return (-1); - if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) - return (1); - - if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) - return (-1); - if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) - return (1); - - if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) - return (-1); - if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) - return (1); - - if (z1 < z2) - return (-1); - if (z1 > z2) - return (1); - - return (0); -} - -/* - * ========================================================================== - * Create the various types of I/O (read, write, free, etc) - * ========================================================================== - */ -static zio_t * -zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, - void *private, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, vdev_t *vd, uint64_t offset, - const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) -{ - zio_t *zio; - - 
IMPLY(type != ZIO_TYPE_FREE, psize <= SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); - ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); - - ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); - ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); - ASSERT(vd || stage == ZIO_STAGE_OPEN); - - IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); - - zio = kmem_cache_alloc(zio_cache, KM_SLEEP); - bzero(zio, sizeof (zio_t)); - - mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); -#if defined(__FreeBSD__) && defined(_KERNEL) - callout_init(&zio->io_timer, 1); -#endif - - list_create(&zio->io_parent_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_parent_node)); - list_create(&zio->io_child_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_child_node)); - metaslab_trace_init(&zio->io_alloc_list); - - if (vd != NULL) - zio->io_child_type = ZIO_CHILD_VDEV; - else if (flags & ZIO_FLAG_GANG_CHILD) - zio->io_child_type = ZIO_CHILD_GANG; - else if (flags & ZIO_FLAG_DDT_CHILD) - zio->io_child_type = ZIO_CHILD_DDT; - else - zio->io_child_type = ZIO_CHILD_LOGICAL; - - if (bp != NULL) { - zio->io_bp = (blkptr_t *)bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; - if (type != ZIO_TYPE_WRITE || - zio->io_child_type == ZIO_CHILD_DDT) - zio->io_bp = &zio->io_bp_copy; /* so caller can free */ - if (zio->io_child_type == ZIO_CHILD_LOGICAL) - zio->io_logical = zio; - if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) - pipeline |= ZIO_GANG_STAGES; - } - - zio->io_spa = spa; - zio->io_txg = txg; - zio->io_done = done; - zio->io_private = private; - zio->io_type = type; - zio->io_priority = priority; - zio->io_vd = vd; - zio->io_offset = offset; - zio->io_orig_abd = zio->io_abd = data; - zio->io_orig_size = zio->io_size = psize; - zio->io_lsize = lsize; - zio->io_orig_flags = zio->io_flags = flags; - zio->io_orig_stage = zio->io_stage = stage; - zio->io_orig_pipeline = zio->io_pipeline = 
pipeline; - zio->io_pipeline_trace = ZIO_STAGE_OPEN; - - zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); - zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); - - if (zb != NULL) - zio->io_bookmark = *zb; - - if (pio != NULL) { - if (zio->io_metaslab_class == NULL) - zio->io_metaslab_class = pio->io_metaslab_class; - if (zio->io_logical == NULL) - zio->io_logical = pio->io_logical; - if (zio->io_child_type == ZIO_CHILD_GANG) - zio->io_gang_leader = pio->io_gang_leader; - zio_add_child(pio, zio); - } - - return (zio); -} - -static void -zio_destroy(zio_t *zio) -{ -#ifdef __FreeBSD__ - KASSERT(!(callout_active(&zio->io_timer) || - callout_pending(&zio->io_timer)), ("zio_destroy: timer active")); -#endif - metaslab_trace_fini(&zio->io_alloc_list); - list_destroy(&zio->io_parent_list); - list_destroy(&zio->io_child_list); - mutex_destroy(&zio->io_lock); - cv_destroy(&zio->io_cv); - kmem_cache_free(zio_cache, zio); -} - -zio_t * -zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, - void *private, enum zio_flag flags) -{ - zio_t *zio; - - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); - - return (zio); -} - -zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) -{ - return (zio_null(NULL, spa, NULL, done, private, flags)); -} - -void -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) -{ - if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { - zfs_panic_recover("blkptr at %p has invalid TYPE %llu", - bp, (longlong_t)BP_GET_TYPE(bp)); - } - if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || - BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { - zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", - bp, (longlong_t)BP_GET_CHECKSUM(bp)); - } - if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || - BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { - zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", - bp, 
(longlong_t)BP_GET_COMPRESS(bp)); - } - if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { - zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", - bp, (longlong_t)BP_GET_LSIZE(bp)); - } - if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { - zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", - bp, (longlong_t)BP_GET_PSIZE(bp)); - } - - if (BP_IS_EMBEDDED(bp)) { - if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { - zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", - bp, (longlong_t)BPE_GET_ETYPE(bp)); - } - } - - /* - * Do not verify individual DVAs if the config is not trusted. This - * will be done once the zio is executed in vdev_mirror_map_alloc. - */ - if (!spa->spa_trust_config) - return; - - /* - * Pool-specific checks. - * - * Note: it would be nice to verify that the blk_birth and - * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() - * allows the birth time of log blocks (and dmu_sync()-ed blocks - * that are in the log) to be arbitrarily large. - */ - for (int i = 0; i < BP_GET_NDVAS(bp); i++) { - uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); - if (vdevid >= spa->spa_root_vdev->vdev_children) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "VDEV %llu", - bp, i, (longlong_t)vdevid); - continue; - } - vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; - if (vd == NULL) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "VDEV %llu", - bp, i, (longlong_t)vdevid); - continue; - } - if (vd->vdev_ops == &vdev_hole_ops) { - zfs_panic_recover("blkptr at %p DVA %u has hole " - "VDEV %llu", - bp, i, (longlong_t)vdevid); - continue; - } - if (vd->vdev_ops == &vdev_missing_ops) { - /* - * "missing" vdevs are valid during import, but we - * don't have their detailed info (e.g. asize), so - * we can't perform any more checks on them. 
- */ - continue; - } - uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); - uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); - if (BP_IS_GANG(bp)) - asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - if (offset + asize > vd->vdev_asize) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "OFFSET %llu", - bp, i, (longlong_t)offset); - } - } -} - -boolean_t -zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) -{ - uint64_t vdevid = DVA_GET_VDEV(dva); - - if (vdevid >= spa->spa_root_vdev->vdev_children) - return (B_FALSE); - - vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; - if (vd == NULL) - return (B_FALSE); - - if (vd->vdev_ops == &vdev_hole_ops) - return (B_FALSE); - - if (vd->vdev_ops == &vdev_missing_ops) { - return (B_FALSE); - } - - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = DVA_GET_ASIZE(dva); - - if (BP_IS_GANG(bp)) - asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - if (offset + asize > vd->vdev_asize) - return (B_FALSE); - - return (B_TRUE); -} - -zio_t * -zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - abd_t *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) -{ - zio_t *zio; - - zfs_blkptr_verify(spa, bp); - - zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, - data, size, size, done, private, - ZIO_TYPE_READ, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
- ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); - - return (zio); -} - -zio_t * -zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, - zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, enum zio_flag flags, - const zbookmark_phys_t *zb) -{ - zio_t *zio; - - ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && - zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && - zp->zp_compress >= ZIO_COMPRESS_OFF && - zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - DMU_OT_IS_VALID(zp->zp_type) && - zp->zp_level < 32 && - zp->zp_copies > 0 && - zp->zp_copies <= spa_max_replication(spa)); - - zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, - ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? - ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); - - zio->io_ready = ready; - zio->io_children_ready = children_ready; - zio->io_physdone = physdone; - zio->io_prop = *zp; - - /* - * Data can be NULL if we are going to call zio_write_override() to - * provide the already-allocated BP. But we may need the data to - * verify a dedup hit (if requested). In this case, don't try to - * dedup (just take the already-allocated BP verbatim). 
- */ - if (data == NULL && zio->io_prop.zp_dedup_verify) { - zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; - } - - return (zio); -} - -zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, - uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) -{ - zio_t *zio; - - zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, - ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); - - return (zio); -} - -void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) -{ - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio->io_stage == ZIO_STAGE_OPEN); - ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); - - /* - * We must reset the io_prop to match the values that existed - * when the bp was first written by dmu_sync() keeping in mind - * that nopwrite and dedup are mutually exclusive. - */ - zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; - zio->io_prop.zp_nopwrite = nopwrite; - zio->io_prop.zp_copies = copies; - zio->io_bp_override = bp; -} - -void -zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) -{ - - zfs_blkptr_verify(spa, bp); - - /* - * The check for EMBEDDED is a performance optimization. We - * process the free here (by ignoring it) rather than - * putting it on the list and then processing it in zio_free_sync(). - */ - if (BP_IS_EMBEDDED(bp)) - return; - metaslab_check_free(spa, bp); - - /* - * Frees that are for the currently-syncing txg, are not going to be - * deferred, and which will not need to do a read (i.e. not GANG or - * DEDUP), can be processed immediately. Otherwise, put them on the - * in-memory list for later processing. 
- */ - if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || - txg != spa->spa_syncing_txg || - spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { - bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); - } else { - VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, - BP_GET_PSIZE(bp), 0))); - } -} - -zio_t * -zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - uint64_t size, enum zio_flag flags) -{ - zio_t *zio; - enum zio_stage stage = ZIO_FREE_PIPELINE; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(spa_syncing_txg(spa) == txg); - ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); - - if (BP_IS_EMBEDDED(bp)) - return (zio_null(pio, spa, NULL, NULL, NULL, 0)); - - metaslab_check_free(spa, bp); - arc_freed(spa, bp); - dsl_scan_freed(spa, bp); - - if (zfs_trim_enabled) - stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | - ZIO_STAGE_VDEV_IO_ASSESS; - /* - * GANG and DEDUP blocks can induce a read (for the gang block header, - * or the DDT), so issue them asynchronously so that this thread is - * not tied up. - */ - else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) - stage |= ZIO_STAGE_ISSUE_ASYNC; - - flags |= ZIO_FLAG_DONT_QUEUE; - - zio = zio_create(pio, spa, txg, bp, NULL, size, - size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, - flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); - - return (zio); -} - -zio_t * -zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_done_func_t *done, void *private, enum zio_flag flags) -{ - zio_t *zio; - - zfs_blkptr_verify(spa, bp); - - if (BP_IS_EMBEDDED(bp)) - return (zio_null(pio, spa, NULL, NULL, NULL, 0)); - - /* - * A claim is an allocation of a specific block. Claims are needed - * to support immediate writes in the intent log. The issue is that - * immediate writes contain committed data, but in a txg that was - * *not* committed. 
Upon opening the pool after an unclean shutdown, - * the intent log claims all blocks that contain immediate write data - * so that the SPA knows they're in use. - * - * All claims *must* be resolved in the first txg -- before the SPA - * starts allocating blocks -- so that nothing is allocated twice. - * If txg == 0 we just verify that the block is claimable. - */ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, - spa_min_claim_txg(spa)); - ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); - ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ - - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, - flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); - ASSERT0(zio->io_queued_timestamp); - - return (zio); -} - -zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags) -{ - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - offset, size, done, private, priority, flags)); - } - - return (zio); -} - -zio_t * -zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) -{ - zio_t *zio; - - ASSERT(vd->vdev_children == 0); - ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); - ASSERT3U(offset + size, <=, vd->vdev_psize); - - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, - 
private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, - offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); - - zio->io_prop.zp_checksum = checksum; - - return (zio); -} - -zio_t * -zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) -{ - zio_t *zio; - - ASSERT(vd->vdev_children == 0); - ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); - ASSERT3U(offset + size, <=, vd->vdev_psize); - - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, - private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, - offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); - - zio->io_prop.zp_checksum = checksum; - - if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - /* - * zec checksums are necessarily destructive -- they modify - * the end of the write buffer to hold the verifier/checksum. - * Therefore, we must make a local copy in case the data is - * being written to multiple places in parallel. - */ - abd_t *wbuf = abd_alloc_sametype(data, size); - abd_copy(wbuf, data, size); - - zio_push_transform(zio, wbuf, size, size, NULL); - } - - return (zio); -} - -/* - * Create a child I/O to do some work for us. - */ -zio_t * -zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - abd_t *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) -{ - enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; - zio_t *zio; - - /* - * vdev child I/Os do not propagate their error to the parent. - * Therefore, for correct operation the caller *must* check for - * and handle the error in the child i/o's done callback. - * The only exceptions are i/os that we don't care about - * (OPTIONAL or REPAIR). 
- */ - ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || - done != NULL); - - if (type == ZIO_TYPE_READ && bp != NULL) { - /* - * If we have the bp, then the child should perform the - * checksum and the parent need not. This pushes error - * detection as close to the leaves as possible and - * eliminates redundant checksums in the interior nodes. - */ - pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; - pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; - } - - /* Not all IO types require vdev io done stage e.g. free */ - if (type == ZIO_TYPE_FREE && - !(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) - pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; - - if (vd->vdev_ops->vdev_op_leaf) { - ASSERT0(vd->vdev_children); - offset += VDEV_LABEL_START_SIZE; - } - - flags |= ZIO_VDEV_CHILD_FLAGS(pio); - - /* - * If we've decided to do a repair, the write is not speculative -- - * even if the original read was. - */ - if (flags & ZIO_FLAG_IO_REPAIR) - flags &= ~ZIO_FLAG_SPECULATIVE; - - /* - * If we're creating a child I/O that is not associated with a - * top-level vdev, then the child zio is not an allocating I/O. - * If this is a retried I/O then we ignore it since we will - * have already processed the original allocating I/O. 
- */ - if (flags & ZIO_FLAG_IO_ALLOCATING && - (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { - ASSERT(pio->io_metaslab_class != NULL); - ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); - ASSERT(type == ZIO_TYPE_WRITE); - ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); - ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || - pio->io_child_type == ZIO_CHILD_GANG); - - flags &= ~ZIO_FLAG_IO_ALLOCATING; - } - - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, - done, private, type, priority, flags, vd, offset, &pio->io_bookmark, - ZIO_STAGE_VDEV_IO_START >> 1, pipeline); - ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - - zio->io_physdone = pio->io_physdone; - if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) - zio->io_logical->io_phys_children++; - - return (zio); -} - -zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - zio_type_t type, zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *private) -{ - zio_t *zio; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - zio = zio_create(NULL, vd->vdev_spa, 0, NULL, - data, size, size, done, private, type, priority, - flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, - vd, offset, NULL, - ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); - - return (zio); -} - -void -zio_flush(zio_t *zio, vdev_t *vd) -{ - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); -} - -zio_t * -zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) -{ - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, - ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, - vd, offset, NULL, ZIO_STAGE_OPEN, 
ZIO_FREE_PHYS_PIPELINE)); -} - -void -zio_shrink(zio_t *zio, uint64_t size) -{ - ASSERT3P(zio->io_executor, ==, NULL); - ASSERT3P(zio->io_orig_size, ==, zio->io_size); - ASSERT3U(size, <=, zio->io_size); - - /* - * We don't shrink for raidz because of problems with the - * reconstruction when reading back less than the block size. - * Note, BP_IS_RAIDZ() assumes no compression. - */ - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - if (!BP_IS_RAIDZ(zio->io_bp)) { - /* we are not doing a raw write */ - ASSERT3U(zio->io_size, ==, zio->io_lsize); - zio->io_orig_size = zio->io_size = zio->io_lsize = size; - } -} - -/* - * ========================================================================== - * Prepare to read and write logical blocks - * ========================================================================== - */ - -static zio_t * -zio_read_bp_init(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && - zio->io_child_type == ZIO_CHILD_LOGICAL && - !(zio->io_flags & ZIO_FLAG_RAW)) { - uint64_t psize = - BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), - psize, psize, zio_decompress); - } - - if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - int psize = BPE_GET_PSIZE(bp); - void *data = abd_borrow_buf(zio->io_abd, psize); - decode_embedded_bp_compressed(bp, data); - abd_return_buf_copy(zio->io_abd, data, psize); - } else { - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - } - - if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) - zio->io_pipeline = ZIO_DDT_READ_PIPELINE; - - return (zio); -} - -static zio_t * -zio_write_bp_init(zio_t *zio) -{ - if (!IO_IS_ALLOCATING(zio)) - return (zio); - - ASSERT(zio->io_child_type != ZIO_CHILD_DDT); - - if (zio->io_bp_override) { - blkptr_t *bp = zio->io_bp; - zio_prop_t *zp = &zio->io_prop; - - ASSERT(bp->blk_birth != zio->io_txg); - ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); - - *bp = *zio->io_bp_override; - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - if (BP_IS_EMBEDDED(bp)) - return (zio); - - /* - * If we've been overridden and nopwrite is set then - * set the flag accordingly to indicate that a nopwrite - * has already occurred. 
- */ - if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { - ASSERT(!zp->zp_dedup); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); - zio->io_flags |= ZIO_FLAG_NOPWRITE; - return (zio); - } - - ASSERT(!zp->zp_nopwrite); - - if (BP_IS_HOLE(bp) || !zp->zp_dedup) - return (zio); - - ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); - - if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { - BP_SET_DEDUP(bp, 1); - zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; - return (zio); - } - - /* - * We were unable to handle this as an override bp, treat - * it as a regular write I/O. - */ - zio->io_bp_override = NULL; - *bp = zio->io_bp_orig; - zio->io_pipeline = zio->io_orig_pipeline; - } - - return (zio); -} - -static zio_t * -zio_write_compress(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - zio_prop_t *zp = &zio->io_prop; - enum zio_compress compress = zp->zp_compress; - blkptr_t *bp = zio->io_bp; - uint64_t lsize = zio->io_lsize; - uint64_t psize = zio->io_size; - int pass = 1; - - EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); - - /* - * If our children haven't all reached the ready stage, - * wait for them and then repeat this pipeline stage. - */ - if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | - ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { - return (NULL); - } - - if (!IO_IS_ALLOCATING(zio)) - return (zio); - - if (zio->io_children_ready != NULL) { - /* - * Now that all our children are ready, run the callback - * associated with this zio in case it wants to modify the - * data to be written. - */ - ASSERT3U(zp->zp_level, >, 0); - zio->io_children_ready(zio); - } - - ASSERT(zio->io_child_type != ZIO_CHILD_DDT); - ASSERT(zio->io_bp_override == NULL); - - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { - /* - * We're rewriting an existing block, which means we're - * working on behalf of spa_sync(). For spa_sync() to - * converge, it must eventually be the case that we don't - * have to allocate new blocks. 
But compression changes - * the blocksize, which forces a reallocate, and makes - * convergence take longer. Therefore, after the first - * few passes, stop compressing to ensure convergence. - */ - pass = spa_sync_pass(spa); - - ASSERT(zio->io_txg == spa_syncing_txg(spa)); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(!BP_GET_DEDUP(bp)); - - if (pass >= zfs_sync_pass_dont_compress) - compress = ZIO_COMPRESS_OFF; - - /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); - } - - /* If it's a compressed write that is not raw, compress the buffer. */ - if (compress != ZIO_COMPRESS_OFF && psize == lsize) { - void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); - if (psize == 0 || psize == lsize) { - compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); - } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && - zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && - spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { - encode_embedded_bp_compressed(bp, - cbuf, compress, lsize, psize); - BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); - BP_SET_TYPE(bp, zio->io_prop.zp_type); - BP_SET_LEVEL(bp, zio->io_prop.zp_level); - zio_buf_free(cbuf, lsize); - bp->blk_birth = zio->io_txg; - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - ASSERT(spa_feature_is_active(spa, - SPA_FEATURE_EMBEDDED_DATA)); - return (zio); - } else { - /* - * Round up compressed size up to the ashift - * of the smallest-ashift device, and zero the tail. - * This ensures that the compressed size of the BP - * (and thus compressratio property) are correct, - * in that we charge for the padding used to fill out - * the last sector. 
- */ - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)P2ROUNDUP(psize, - 1ULL << spa->spa_min_ashift); - if (rounded >= lsize) { - compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); - psize = lsize; - } else { - abd_t *cdata = abd_get_from_buf(cbuf, lsize); - abd_take_ownership_of_buf(cdata, B_TRUE); - abd_zero_off(cdata, psize, rounded - psize); - psize = rounded; - zio_push_transform(zio, cdata, - psize, lsize, NULL); - } - } - - /* - * We were unable to handle this as an override bp, treat - * it as a regular write I/O. - */ - zio->io_bp_override = NULL; - *bp = zio->io_bp_orig; - zio->io_pipeline = zio->io_orig_pipeline; - } else { - ASSERT3U(psize, !=, 0); - } - - /* - * The final pass of spa_sync() must be all rewrites, but the first - * few passes offer a trade-off: allocating blocks defers convergence, - * but newly allocated blocks are sequential, so they can be written - * to disk faster. Therefore, we allow the first few passes of - * spa_sync() to allocate new blocks, but force rewrites after that. - * There should only be a handful of blocks after pass 1 in any case. 
- */ - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && - BP_GET_PSIZE(bp) == psize && - pass >= zfs_sync_pass_rewrite) { - VERIFY3U(psize, !=, 0); - enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; - - zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; - zio->io_flags |= ZIO_FLAG_IO_REWRITE; - } else { - BP_ZERO(bp); - zio->io_pipeline = ZIO_WRITE_PIPELINE; - } - - if (psize == 0) { - if (zio->io_bp_orig.blk_birth != 0 && - spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { - BP_SET_LSIZE(bp, lsize); - BP_SET_TYPE(bp, zp->zp_type); - BP_SET_LEVEL(bp, zp->zp_level); - BP_SET_BIRTH(bp, zio->io_txg, 0); - } - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - } else { - ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); - BP_SET_LSIZE(bp, lsize); - BP_SET_TYPE(bp, zp->zp_type); - BP_SET_LEVEL(bp, zp->zp_level); - BP_SET_PSIZE(bp, psize); - BP_SET_COMPRESS(bp, compress); - BP_SET_CHECKSUM(bp, zp->zp_checksum); - BP_SET_DEDUP(bp, zp->zp_dedup); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - if (zp->zp_dedup) { - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); - zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; - } - if (zp->zp_nopwrite) { - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); - zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; - } - } - return (zio); -} - -static zio_t * -zio_free_bp_init(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (zio->io_child_type == ZIO_CHILD_LOGICAL) { - if (BP_GET_DEDUP(bp)) - zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; - } - - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - - return (zio); -} - -/* - * ========================================================================== - * Execute the I/O pipeline - * ========================================================================== - */ - -static void -zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) -{ - spa_t *spa = zio->io_spa; - zio_type_t t = 
zio->io_type; - int flags = (cutinline ? TQ_FRONT : 0); - - ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); - - /* - * If we're a config writer or a probe, the normal issue and - * interrupt threads may all be blocked waiting for the config lock. - * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. - */ - if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) - t = ZIO_TYPE_NULL; - - /* - * A similar issue exists for the L2ARC write thread until L2ARC 2.0. - */ - if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) - t = ZIO_TYPE_NULL; - - /* - * If this is a high priority I/O, then use the high priority taskq if - * available. - */ - if ((zio->io_priority == ZIO_PRIORITY_NOW || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && - spa->spa_zio_taskq[t][q + 1].stqs_count != 0) - q++; - - ASSERT3U(q, <, ZIO_TASKQ_TYPES); - - /* - * NB: We are assuming that the zio can only be dispatched - * to a single taskq at a time. It would be a grievous error - * to dispatch the zio to another taskq at the same time. 
- */ -#if defined(illumos) || !defined(_KERNEL) - ASSERT(zio->io_tqent.tqent_next == NULL); -#else - ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); -#endif - spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, - flags, &zio->io_tqent); -} - -static boolean_t -zio_taskq_member(zio_t *zio, zio_taskq_type_t q) -{ - kthread_t *executor = zio->io_executor; - spa_t *spa = zio->io_spa; - - for (zio_type_t t = 0; t < ZIO_TYPES; t++) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - uint_t i; - for (i = 0; i < tqs->stqs_count; i++) { - if (taskq_member(tqs->stqs_taskq[i], executor)) - return (B_TRUE); - } - } - - return (B_FALSE); -} - -static zio_t * -zio_issue_async(zio_t *zio) -{ - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - - return (NULL); -} - -void -zio_interrupt(zio_t *zio) -{ - zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); -} - -void -zio_delay_interrupt(zio_t *zio) -{ - /* - * The timeout_generic() function isn't defined in userspace, so - * rather than trying to implement the function, the zio delay - * functionality has been disabled for userspace builds. - */ - -#ifdef _KERNEL - /* - * If io_target_timestamp is zero, then no delay has been registered - * for this IO, thus jump to the end of this function and "skip" the - * delay; issuing it directly to the zio layer. - */ - if (zio->io_target_timestamp != 0) { - hrtime_t now = gethrtime(); - - if (now >= zio->io_target_timestamp) { - /* - * This IO has already taken longer than the target - * delay to complete, so we don't want to delay it - * any longer; we "miss" the delay and issue it - * directly to the zio layer. This is likely due to - * the target latency being set to a value less than - * the underlying hardware can satisfy (e.g. delay - * set to 1ms, but the disks take 10ms to complete an - * IO request). 
- */ - - DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, - hrtime_t, now); - - zio_interrupt(zio); - } else { - hrtime_t diff = zio->io_target_timestamp - now; - - DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, - hrtime_t, now, hrtime_t, diff); - -#ifdef __FreeBSD__ - callout_reset_sbt(&zio->io_timer, nstosbt(diff), 0, - (void (*)(void *))zio_interrupt, zio, C_HARDCLOCK); -#else - (void) timeout_generic(CALLOUT_NORMAL, - (void (*)(void *))zio_interrupt, zio, diff, 1, 0); -#endif - } - - return; - } -#endif - - DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); - zio_interrupt(zio); -} - -/* - * Execute the I/O pipeline until one of the following occurs: - * - * (1) the I/O completes - * (2) the pipeline stalls waiting for dependent child I/Os - * (3) the I/O issues, so we're waiting for an I/O completion interrupt - * (4) the I/O is delegated by vdev-level caching or aggregation - * (5) the I/O is deferred due to vdev-level queueing - * (6) the I/O is handed off to another thread. - * - * In all cases, the pipeline stops whenever there's no CPU work; it never - * burns a thread in cv_wait(). - * - * There's no locking on io_stage because there's no legitimate way - * for multiple threads to be attempting to process the same I/O. - */ -static zio_pipe_stage_t *zio_pipeline[]; - -void -zio_execute(zio_t *zio) -{ - ASSERT3U(zio->io_queued_timestamp, >, 0); - - while (zio->io_stage < ZIO_STAGE_DONE) { - enum zio_stage pipeline = zio->io_pipeline; - enum zio_stage stage = zio->io_stage; - - zio->io_executor = curthread; - - ASSERT(!MUTEX_HELD(&zio->io_lock)); - ASSERT(ISP2(stage)); - ASSERT(zio->io_stall == NULL); - - do { - stage <<= 1; - } while ((stage & pipeline) == 0); - - ASSERT(stage <= ZIO_STAGE_DONE); - - /* - * If we are in interrupt context and this pipeline stage - * will grab a config lock that is held across I/O, - * or may wait for an I/O that needs an interrupt thread - * to complete, issue async to avoid deadlock. 
- * - * For VDEV_IO_START, we cut in line so that the io will - * be sent to disk promptly. - */ - if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && - zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { - boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? - zio_requeue_io_start_cut_in_line : B_FALSE; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); - return; - } - - zio->io_stage = stage; - zio->io_pipeline_trace |= zio->io_stage; - - /* - * The zio pipeline stage returns the next zio to execute - * (typically the same as this one), or NULL if we should - * stop. - */ - zio = zio_pipeline[highbit64(stage) - 1](zio); - - if (zio == NULL) - return; - } -} - -/* - * ========================================================================== - * Initiate I/O, either sync or async - * ========================================================================== - */ -int -zio_wait(zio_t *zio) -{ - int error; - - ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); - ASSERT3P(zio->io_executor, ==, NULL); - - zio->io_waiter = curthread; - ASSERT0(zio->io_queued_timestamp); - zio->io_queued_timestamp = gethrtime(); - - zio_execute(zio); - - mutex_enter(&zio->io_lock); - while (zio->io_executor != NULL) - cv_wait(&zio->io_cv, &zio->io_lock); - mutex_exit(&zio->io_lock); - - error = zio->io_error; - zio_destroy(zio); - - return (error); -} - -void -zio_nowait(zio_t *zio) -{ - ASSERT3P(zio->io_executor, ==, NULL); - - if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { - /* - * This is a logical async I/O with no parent to wait for it. - * We add it to the spa_async_root_zio "Godfather" I/O which - * will ensure they complete prior to unloading the pool. 
- */ - spa_t *spa = zio->io_spa; - - zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); - } - - ASSERT0(zio->io_queued_timestamp); - zio->io_queued_timestamp = gethrtime(); - zio_execute(zio); -} - -/* - * ========================================================================== - * Reexecute, cancel, or suspend/resume failed I/O - * ========================================================================== - */ - -static void -zio_reexecute(zio_t *pio) -{ - zio_t *cio, *cio_next; - - ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); - ASSERT(pio->io_gang_leader == NULL); - ASSERT(pio->io_gang_tree == NULL); - - pio->io_flags = pio->io_orig_flags; - pio->io_stage = pio->io_orig_stage; - pio->io_pipeline = pio->io_orig_pipeline; - pio->io_reexecute = 0; - pio->io_flags |= ZIO_FLAG_REEXECUTED; - pio->io_pipeline_trace = 0; - pio->io_error = 0; - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_state[w] = 0; - for (int c = 0; c < ZIO_CHILD_TYPES; c++) - pio->io_child_error[c] = 0; - - if (IO_IS_ALLOCATING(pio)) - BP_ZERO(pio->io_bp); - - /* - * As we reexecute pio's children, new children could be created. - * New children go to the head of pio's io_child_list, however, - * so we will (correctly) not reexecute them. The key is that - * the remainder of pio's io_child_list, from 'cio_next' onward, - * cannot be affected by any side effects of reexecuting 'cio'. - */ - zio_link_t *zl = NULL; - mutex_enter(&pio->io_lock); - for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { - cio_next = zio_walk_children(pio, &zl); - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w]++; - mutex_exit(&pio->io_lock); - zio_reexecute(cio); - mutex_enter(&pio->io_lock); - } - mutex_exit(&pio->io_lock); - - /* - * Now that all children have been reexecuted, execute the parent. - * We don't reexecute "The Godfather" I/O here as it's the - * responsibility of the caller to wait on it. 
- */ - if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { - pio->io_queued_timestamp = gethrtime(); - zio_execute(pio); - } -} - -void -zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) -{ - if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) - fm_panic("Pool '%s' has encountered an uncorrectable I/O " - "failure and the failure mode property for this pool " - "is set to panic.", spa_name(spa)); - - zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); - - mutex_enter(&spa->spa_suspend_lock); - - if (spa->spa_suspend_zio_root == NULL) - spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - - spa->spa_suspended = reason; - - if (zio != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); - ASSERT(zio != spa->spa_suspend_zio_root); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio_unique_parent(zio) == NULL); - ASSERT(zio->io_stage == ZIO_STAGE_DONE); - zio_add_child(spa->spa_suspend_zio_root, zio); - } - - mutex_exit(&spa->spa_suspend_lock); -} - -int -zio_resume(spa_t *spa) -{ - zio_t *pio; - - /* - * Reexecute all previously suspended i/o. - */ - mutex_enter(&spa->spa_suspend_lock); - spa->spa_suspended = ZIO_SUSPEND_NONE; - cv_broadcast(&spa->spa_suspend_cv); - pio = spa->spa_suspend_zio_root; - spa->spa_suspend_zio_root = NULL; - mutex_exit(&spa->spa_suspend_lock); - - if (pio == NULL) - return (0); - - zio_reexecute(pio); - return (zio_wait(pio)); -} - -void -zio_resume_wait(spa_t *spa) -{ - mutex_enter(&spa->spa_suspend_lock); - while (spa_suspended(spa)) - cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); - mutex_exit(&spa->spa_suspend_lock); -} - -/* - * ========================================================================== - * Gang blocks. - * - * A gang block is a collection of small blocks that looks to the DMU - * like one large block. 
When zio_dva_allocate() cannot find a block - * of the requested size, due to either severe fragmentation or the pool - * being nearly full, it calls zio_write_gang_block() to construct the - * block from smaller fragments. - * - * A gang block consists of a gang header (zio_gbh_phys_t) and up to - * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like - * an indirect block: it's an array of block pointers. It consumes - * only one sector and hence is allocatable regardless of fragmentation. - * The gang header's bps point to its gang members, which hold the data. - * - * Gang blocks are self-checksumming, using the bp's - * as the verifier to ensure uniqueness of the SHA256 checksum. - * Critically, the gang block bp's blk_cksum is the checksum of the data, - * not the gang header. This ensures that data block signatures (needed for - * deduplication) are independent of how the block is physically stored. - * - * Gang blocks can be nested: a gang member may itself be a gang block. - * Thus every gang block is a tree in which root and all interior nodes are - * gang headers, and the leaves are normal blocks that contain user data. - * The root of the gang tree is called the gang leader. - * - * To perform any operation (read, rewrite, free, claim) on a gang block, - * zio_gang_assemble() first assembles the gang tree (minus data leaves) - * in the io_gang_tree field of the original logical i/o by recursively - * reading the gang leader and all gang headers below it. This yields - * an in-core tree containing the contents of every gang header and the - * bps for every constituent of the gang block. - * - * With the gang tree now assembled, zio_gang_issue() just walks the gang tree - * and invokes a callback on each bp. To free a gang block, zio_gang_issue() - * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. - * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 
- * zio_read_gang() is a wrapper around zio_read() that omits reading gang - * headers, since we already have those in io_gang_tree. zio_rewrite_gang() - * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() - * of the gang header plus zio_checksum_compute() of the data to update the - * gang header's blk_cksum as described above. - * - * The two-phase assemble/issue model solves the problem of partial failure -- - * what if you'd freed part of a gang block but then couldn't read the - * gang header for another part? Assembling the entire gang tree first - * ensures that all the necessary gang header I/O has succeeded before - * starting the actual work of free, claim, or write. Once the gang tree - * is assembled, free and claim are in-memory operations that cannot fail. - * - * In the event that a gang write fails, zio_dva_unallocate() walks the - * gang tree to immediately free (i.e. insert back into the space map) - * everything we've allocated. This ensures that we don't get ENOSPC - * errors during repeated suspend/resume cycles due to a flaky device. - * - * Gang rewrites only happen during sync-to-convergence. If we can't assemble - * the gang tree, we won't modify the block, so we can safely defer the free - * (knowing that the block is still intact). If we *can* assemble the gang - * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free - * each constituent bp and we can allocate a new block on the next sync pass. - * - * In all cases, the gang tree allows complete recovery from partial failure. 
- * ========================================================================== - */ - -static void -zio_gang_issue_func_done(zio_t *zio) -{ - abd_put(zio->io_abd); -} - -static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - if (gn != NULL) - return (pio); - - return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), - BP_GET_PSIZE(bp), zio_gang_issue_func_done, - NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), - &pio->io_bookmark)); -} - -static zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - zio_t *zio; - - if (gn != NULL) { - abd_t *gbh_abd = - abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); - zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), - &pio->io_bookmark); - /* - * As we rewrite each gang header, the pipeline will compute - * a new gang block header checksum for it; but no one will - * compute a new data checksum, so we do that here. The one - * exception is the gang leader: the pipeline already computed - * its data checksum because that stage precedes gang assembly. - * (Presently, nothing actually uses interior data checksums; - * this is just good hygiene.) - */ - if (gn != pio->io_gang_leader->io_gang_tree) { - abd_t *buf = abd_get_offset(data, offset); - - zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - buf, BP_GET_PSIZE(bp)); - - abd_put(buf); - } - /* - * If we are here to damage data for testing purposes, - * leave the GBH alone so that we can detect the damage. 
- */ - if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; - } else { - zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - abd_get_offset(data, offset), BP_GET_PSIZE(bp), - zio_gang_issue_func_done, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - } - - return (zio); -} - -/* ARGSUSED */ -static zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, - BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), - ZIO_GANG_CHILD_FLAGS(pio))); -} - -/* ARGSUSED */ -static zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, - NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); -} - -static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { - NULL, - zio_read_gang, - zio_rewrite_gang, - zio_free_gang, - zio_claim_gang, - NULL -}; - -static void zio_gang_tree_assemble_done(zio_t *zio); - -static zio_gang_node_t * -zio_gang_node_alloc(zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn; - - ASSERT(*gnpp == NULL); - - gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); - gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); - *gnpp = gn; - - return (gn); -} - -static void -zio_gang_node_free(zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn = *gnpp; - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) - ASSERT(gn->gn_child[g] == NULL); - - zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); - kmem_free(gn, sizeof (*gn)); - *gnpp = NULL; -} - -static void -zio_gang_tree_free(zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn = *gnpp; - - if (gn == NULL) - return; - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) - zio_gang_tree_free(&gn->gn_child[g]); - - zio_gang_node_free(gnpp); -} - -static void -zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - 
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); - - ASSERT(gio->io_gang_leader == gio); - ASSERT(BP_IS_GANG(bp)); - - zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, - zio_gang_tree_assemble_done, gn, gio->io_priority, - ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); -} - -static void -zio_gang_tree_assemble_done(zio_t *zio) -{ - zio_t *gio = zio->io_gang_leader; - zio_gang_node_t *gn = zio->io_private; - blkptr_t *bp = zio->io_bp; - - ASSERT(gio == zio_unique_parent(zio)); - ASSERT(zio->io_child_count == 0); - - if (zio->io_error) - return; - - /* this ABD was created from a linear buf in zio_gang_tree_assemble */ - if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); - - ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - - abd_put(zio->io_abd); - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; - if (!BP_IS_GANG(gbp)) - continue; - zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); - } -} - -static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, - uint64_t offset) -{ - zio_t *gio = pio->io_gang_leader; - zio_t *zio; - - ASSERT(BP_IS_GANG(bp) == !!gn); - ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); - ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); - - /* - * If you're a gang header, your data is in gn->gn_gbh. - * If you're a gang member, your data is in 'data' and gn == NULL. 
- */ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); - - if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; - if (BP_IS_HOLE(gbp)) - continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, - offset); - offset += BP_GET_PSIZE(gbp); - } - } - - if (gn == gio->io_gang_tree && gio->io_abd != NULL) - ASSERT3U(gio->io_size, ==, offset); - - if (zio != pio) - zio_nowait(zio); -} - -static zio_t * -zio_gang_assemble(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - - zio->io_gang_leader = zio; - - zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); - - return (zio); -} - -static zio_t * -zio_gang_issue(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - - if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, - 0); - else - zio_gang_tree_free(&zio->io_gang_tree); - - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - return (zio); -} - -static void -zio_write_gang_member_ready(zio_t *zio) -{ - zio_t *pio = zio_unique_parent(zio); - zio_t *gio = zio->io_gang_leader; - dva_t *cdva = zio->io_bp->blk_dva; - dva_t *pdva = pio->io_bp->blk_dva; - uint64_t asize; - - if (BP_IS_HOLE(zio->io_bp)) - return; - - ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); - - ASSERT(zio->io_child_type == ZIO_CHILD_GANG); - ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); - ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); - ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); - - mutex_enter(&pio->io_lock); - for (int d 
= 0; d < BP_GET_NDVAS(zio->io_bp); d++) { - ASSERT(DVA_GET_GANG(&pdva[d])); - asize = DVA_GET_ASIZE(&pdva[d]); - asize += DVA_GET_ASIZE(&cdva[d]); - DVA_SET_ASIZE(&pdva[d], asize); - } - mutex_exit(&pio->io_lock); -} - -static void -zio_write_gang_done(zio_t *zio) -{ - /* - * The io_abd field will be NULL for a zio with no data. The io_flags - * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't - * check for it here as it is cleared in zio_ready. - */ - if (zio->io_abd != NULL) - abd_put(zio->io_abd); -} - -static zio_t * -zio_write_gang_block(zio_t *pio) -{ - spa_t *spa = pio->io_spa; - metaslab_class_t *mc = spa_normal_class(spa); - blkptr_t *bp = pio->io_bp; - zio_t *gio = pio->io_gang_leader; - zio_t *zio; - zio_gang_node_t *gn, **gnpp; - zio_gbh_phys_t *gbh; - abd_t *gbh_abd; - uint64_t txg = pio->io_txg; - uint64_t resid = pio->io_size; - uint64_t lsize; - int copies = gio->io_prop.zp_copies; - int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); - zio_prop_t zp; - int error; - boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); - - int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - flags |= METASLAB_ASYNC_ALLOC; - VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], - pio)); - - /* - * The logical zio has already placed a reservation for - * 'copies' allocation slots but gang blocks may require - * additional copies. These additional copies - * (i.e. gbh_copies - copies) are guaranteed to succeed - * since metaslab_class_throttle_reserve() always allows - * additional reservations for gang blocks. - */ - VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, - pio->io_allocator, pio, flags)); - } - - error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, - bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio, pio->io_allocator); - if (error) { - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - /* - * If we failed to allocate the gang block header then - * we remove any additional allocation reservations that - * we placed here. The original reservation will - * be removed when the logical I/O goes to the ready - * stage. - */ - metaslab_class_throttle_unreserve(mc, - gbh_copies - copies, pio->io_allocator, pio); - } - pio->io_error = error; - return (pio); - } - - if (pio == gio) { - gnpp = &gio->io_gang_tree; - } else { - gnpp = pio->io_private; - ASSERT(pio->io_ready == zio_write_gang_member_ready); - } - - gn = zio_gang_node_alloc(gnpp); - gbh = gn->gn_gbh; - bzero(gbh, SPA_GANGBLOCKSIZE); - gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); - - /* - * Create the gang header. - */ - zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, - zio_write_gang_done, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - - /* - * Create and nowait the gang children. - */ - for (int g = 0; resid != 0; resid -= lsize, g++) { - lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), - SPA_MINBLOCKSIZE); - ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); - - zp.zp_checksum = gio->io_prop.zp_checksum; - zp.zp_compress = ZIO_COMPRESS_OFF; - zp.zp_type = DMU_OT_NONE; - zp.zp_level = 0; - zp.zp_copies = gio->io_prop.zp_copies; - zp.zp_dedup = B_FALSE; - zp.zp_dedup_verify = B_FALSE; - zp.zp_nopwrite = B_FALSE; - - zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - has_data ? 
abd_get_offset(pio->io_abd, pio->io_size - - resid) : NULL, lsize, lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, - zio_write_gang_done, &gn->gn_child[g], pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - /* - * Gang children won't throttle but we should - * account for their work, so reserve an allocation - * slot for them here. - */ - VERIFY(metaslab_class_throttle_reserve(mc, - zp.zp_copies, cio->io_allocator, cio, flags)); - } - zio_nowait(cio); - } - - /* - * Set pio's pipeline to just wait for zio to finish. - */ - pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - zio_nowait(zio); - - return (pio); -} - -/* - * The zio_nop_write stage in the pipeline determines if allocating a - * new bp is necessary. The nopwrite feature can handle writes in - * either syncing or open context (i.e. zil writes) and as a result is - * mutually exclusive with dedup. - * - * By leveraging a cryptographically secure checksum, such as SHA256, we - * can compare the checksums of the new data and the old to determine if - * allocating a new block is required. Note that our requirements for - * cryptographic strength are fairly weak: there can't be any accidental - * hash collisions, but we don't need to be secure against intentional - * (malicious) collisions. To trigger a nopwrite, you have to be able - * to write the file to begin with, and triggering an incorrect (hash - * collision) nopwrite is no worse than simply writing to the file. - * That said, there are no known attacks against the checksum algorithms - * used for nopwrite, assuming that the salt and the checksums - * themselves remain secret. 
- */ -static zio_t * -zio_nop_write(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - zio_prop_t *zp = &zio->io_prop; - - ASSERT(BP_GET_LEVEL(bp) == 0); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); - ASSERT(zp->zp_nopwrite); - ASSERT(!zp->zp_dedup); - ASSERT(zio->io_bp_override == NULL); - ASSERT(IO_IS_ALLOCATING(zio)); - - /* - * Check to see if the original bp and the new bp have matching - * characteristics (i.e. same checksum, compression algorithms, etc). - * If they don't then just continue with the pipeline which will - * allocate a new bp. - */ - if (BP_IS_HOLE(bp_orig) || - !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE) || - BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || - BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || - BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || - zp->zp_copies != BP_GET_NDVAS(bp_orig)) - return (zio); - - /* - * If the checksums match then reset the pipeline so that we - * avoid allocating a new bp and issuing any I/O. 
- */ - if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { - ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE); - ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); - ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); - ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); - ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, - sizeof (uint64_t)) == 0); - - *bp = *bp_orig; - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - zio->io_flags |= ZIO_FLAG_NOPWRITE; - } - - return (zio); -} - -/* - * ========================================================================== - * Dedup - * ========================================================================== - */ -static void -zio_ddt_child_read_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp; - zio_t *pio = zio_unique_parent(zio); - - mutex_enter(&pio->io_lock); - ddp = ddt_phys_select(dde, bp); - if (zio->io_error == 0) - ddt_phys_clear(ddp); /* this ddp doesn't need repair */ - - if (zio->io_error == 0 && dde->dde_repair_abd == NULL) - dde->dde_repair_abd = zio->io_abd; - else - abd_free(zio->io_abd); - mutex_exit(&pio->io_lock); -} - -static zio_t * -zio_ddt_read_start(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(BP_GET_PSIZE(bp) == zio->io_size); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - if (zio->io_child_error[ZIO_CHILD_DDT]) { - ddt_t *ddt = ddt_select(zio->io_spa, bp); - ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); - blkptr_t blk; - - ASSERT(zio->io_vsd == NULL); - zio->io_vsd = dde; - - if (ddp_self == NULL) - return (zio); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) - continue; - ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, - &blk); - zio_nowait(zio_read(zio, zio->io_spa, &blk, - abd_alloc_for_io(zio->io_size, B_TRUE), - 
zio->io_size, zio_ddt_child_read_done, dde, - zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | - ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); - } - return (zio); - } - - zio_nowait(zio_read(zio, zio->io_spa, bp, - zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); - - return (zio); -} - -static zio_t * -zio_ddt_read_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(BP_GET_PSIZE(bp) == zio->io_size); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - if (zio->io_child_error[ZIO_CHILD_DDT]) { - ddt_t *ddt = ddt_select(zio->io_spa, bp); - ddt_entry_t *dde = zio->io_vsd; - if (ddt == NULL) { - ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); - return (zio); - } - if (dde == NULL) { - zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); - } - if (dde->dde_repair_abd != NULL) { - abd_copy(zio->io_abd, dde->dde_repair_abd, - zio->io_size); - zio->io_child_error[ZIO_CHILD_DDT] = 0; - } - ddt_repair_done(ddt, dde); - zio->io_vsd = NULL; - } - - ASSERT(zio->io_vsd == NULL); - - return (zio); -} - -static boolean_t -zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) -{ - spa_t *spa = zio->io_spa; - boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); - - /* We should never get a raw, override zio */ - ASSERT(!(zio->io_bp_override && do_raw)); - - /* - * Note: we compare the original data, not the transformed data, - * because when zio->io_bp is an override bp, we will not have - * pushed the I/O transforms. That's an important optimization - * because otherwise we'd compress/encrypt all dmu_sync() data twice. 
- */ - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - zio_t *lio = dde->dde_lead_zio[p]; - - if (lio != NULL) { - return (lio->io_orig_size != zio->io_orig_size || - abd_cmp(zio->io_orig_abd, lio->io_orig_abd, - zio->io_orig_size) != 0); - } - } - - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - - if (ddp->ddp_phys_birth != 0) { - arc_buf_t *abuf = NULL; - arc_flags_t aflags = ARC_FLAG_WAIT; - int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - blkptr_t blk = *zio->io_bp; - int error; - - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); - - ddt_exit(ddt); - - /* - * Intuitively, it would make more sense to compare - * io_abd than io_orig_abd in the raw case since you - * don't want to look at any transformations that have - * happened to the data. However, for raw I/Os the - * data will actually be the same in io_abd and - * io_orig_abd, so all we have to do is issue this as - * a raw ARC read. - */ - if (do_raw) { - zio_flags |= ZIO_FLAG_RAW; - ASSERT3U(zio->io_size, ==, zio->io_orig_size); - ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, - zio->io_size)); - ASSERT3P(zio->io_transform_stack, ==, NULL); - } - - error = arc_read(NULL, spa, &blk, - arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, - zio_flags, &aflags, &zio->io_bookmark); - - if (error == 0) { - if (arc_buf_size(abuf) != zio->io_orig_size || - abd_cmp_buf(zio->io_orig_abd, abuf->b_data, - zio->io_orig_size) != 0) - error = SET_ERROR(EEXIST); - arc_buf_destroy(abuf, &abuf); - } - - ddt_enter(ddt); - return (error != 0); - } - } - - return (B_FALSE); -} - -static void -zio_ddt_child_write_ready(zio_t *zio) -{ - int p = zio->io_prop.zp_copies; - ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - zio_t *pio; - - if (zio->io_error) - return; - - ddt_enter(ddt); - - ASSERT(dde->dde_lead_zio[p] == zio); - - ddt_phys_fill(ddp, zio->io_bp); - - zio_link_t *zl = 
NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); - - ddt_exit(ddt); -} - -static void -zio_ddt_child_write_done(zio_t *zio) -{ - int p = zio->io_prop.zp_copies; - ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - - ddt_enter(ddt); - - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; - - if (zio->io_error == 0) { - zio_link_t *zl = NULL; - while (zio_walk_parents(zio, &zl) != NULL) - ddt_phys_addref(ddp); - } else { - ddt_phys_clear(ddp); - } - - ddt_exit(ddt); -} - -static void -zio_ddt_ditto_write_done(zio_t *zio) -{ - int p = DDT_PHYS_DITTO; - zio_prop_t *zp = &zio->io_prop; - blkptr_t *bp = zio->io_bp; - ddt_t *ddt = ddt_select(zio->io_spa, bp); - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - ddt_key_t *ddk = &dde->dde_key; - - ddt_enter(ddt); - - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; - - if (zio->io_error == 0) { - ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); - ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); - ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); - if (ddp->ddp_phys_birth != 0) - ddt_phys_free(ddt, ddk, ddp, zio->io_txg); - ddt_phys_fill(ddp, bp); - } - - ddt_exit(ddt); -} - -static zio_t * -zio_ddt_write(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - uint64_t txg = zio->io_txg; - zio_prop_t *zp = &zio->io_prop; - int p = zp->zp_copies; - int ditto_copies; - zio_t *cio = NULL; - zio_t *dio = NULL; - ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); - ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); - ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); - - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); - ddp = 
&dde->dde_phys[p]; - - if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { - /* - * If we're using a weak checksum, upgrade to a strong checksum - * and try again. If we're already using a strong checksum, - * we can't resolve it, so just convert to an ordinary write. - * (And automatically e-mail a paper to Nature?) - */ - if (!(zio_checksum_table[zp->zp_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP)) { - zp->zp_checksum = spa_dedup_checksum(spa); - zio_pop_transforms(zio); - zio->io_stage = ZIO_STAGE_OPEN; - BP_ZERO(bp); - } else { - zp->zp_dedup = B_FALSE; - BP_SET_DEDUP(bp, B_FALSE); - } - ASSERT(!BP_GET_DEDUP(bp)); - zio->io_pipeline = ZIO_WRITE_PIPELINE; - ddt_exit(ddt); - return (zio); - } - - ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); - ASSERT(ditto_copies < SPA_DVAS_PER_BP); - - if (ditto_copies > ddt_ditto_copies_present(dde) && - dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { - zio_prop_t czp = *zp; - - czp.zp_copies = ditto_copies; - - /* - * If we arrived here with an override bp, we won't have run - * the transform stack, so we won't have the data we need to - * generate a child i/o. So, toss the override bp and restart. - * This is safe, because using the override bp is just an - * optimization; and it's rare, so the cost doesn't matter. 
- */ - if (zio->io_bp_override) { - zio_pop_transforms(zio); - zio->io_stage = ZIO_STAGE_OPEN; - zio->io_pipeline = ZIO_WRITE_PIPELINE; - zio->io_bp_override = NULL; - BP_ZERO(bp); - ddt_exit(ddt); - return (zio); - } - - dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, - NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - - zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; - } - - if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { - if (ddp->ddp_phys_birth != 0) - ddt_bp_fill(ddp, bp, txg); - if (dde->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_lead_zio[p]); - else - ddt_phys_addref(ddp); - } else if (zio->io_bp_override) { - ASSERT(bp->blk_birth == txg); - ASSERT(BP_EQUAL(bp, zio->io_bp_override)); - ddt_phys_fill(ddp, bp); - ddt_phys_addref(ddp); - } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - - zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_lead_zio[p] = cio; - } - - ddt_exit(ddt); - - if (cio) - zio_nowait(cio); - if (dio) - zio_nowait(dio); - - return (zio); -} - -ddt_entry_t *freedde; /* for debugging */ - -static zio_t * -zio_ddt_free(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp, B_TRUE); - if (dde) { - ddp = ddt_phys_select(dde, bp); - if (ddp) - ddt_phys_decref(ddp); - } - ddt_exit(ddt); - - return (zio); -} - -/* - * ========================================================================== - * Allocate 
and free blocks - * ========================================================================== - */ - -static zio_t * -zio_io_to_allocate(spa_t *spa, int allocator) -{ - zio_t *zio; - - ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); - - zio = avl_first(&spa->spa_alloc_trees[allocator]); - if (zio == NULL) - return (NULL); - - ASSERT(IO_IS_ALLOCATING(zio)); - - /* - * Try to place a reservation for this zio. If we're unable to - * reserve then we throttle. - */ - ASSERT3U(zio->io_allocator, ==, allocator); - if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { - return (NULL); - } - - avl_remove(&spa->spa_alloc_trees[allocator], zio); - ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); - - return (zio); -} - -static zio_t * -zio_dva_throttle(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - zio_t *nio; - metaslab_class_t *mc; - - /* locate an appropriate allocation class */ - mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, - zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); - - if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || - !mc->mc_alloc_throttle_enabled || - zio->io_child_type == ZIO_CHILD_GANG || - zio->io_flags & ZIO_FLAG_NODATA) { - return (zio); - } - - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - - ASSERT3U(zio->io_queued_timestamp, >, 0); - ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - - zbookmark_phys_t *bm = &zio->io_bookmark; - /* - * We want to try to use as many allocators as possible to help improve - * performance, but we also want logically adjacent IOs to be physically - * adjacent to improve sequential read performance. We chunk each object - * into 2^20 block regions, and then hash based on the objset, object, - * level, and region to accomplish both of these goals. 
- */ - zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, - bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - zio->io_metaslab_class = mc; - avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(spa, zio->io_allocator); - mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); - - return (nio); -} - -static void -zio_allocate_dispatch(spa_t *spa, int allocator) -{ - zio_t *zio; - - mutex_enter(&spa->spa_alloc_locks[allocator]); - zio = zio_io_to_allocate(spa, allocator); - mutex_exit(&spa->spa_alloc_locks[allocator]); - if (zio == NULL) - return; - - ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); - ASSERT0(zio->io_error); - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); -} - -static zio_t * -zio_dva_allocate(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - metaslab_class_t *mc; - blkptr_t *bp = zio->io_bp; - int error; - int flags = 0; - - if (zio->io_gang_leader == NULL) { - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - zio->io_gang_leader = zio; - } - - ASSERT(BP_IS_HOLE(bp)); - ASSERT0(BP_GET_NDVAS(bp)); - ASSERT3U(zio->io_prop.zp_copies, >, 0); - ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); - ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - - if (zio->io_flags & ZIO_FLAG_NODATA) - flags |= METASLAB_DONT_THROTTLE; - if (zio->io_flags & ZIO_FLAG_GANG_CHILD) - flags |= METASLAB_GANG_CHILD; - if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) - flags |= METASLAB_ASYNC_ALLOC; - - /* - * if not already chosen, locate an appropriate allocation class - */ - mc = zio->io_metaslab_class; - if (mc == NULL) { - mc = spa_preferred_class(spa, zio->io_size, - zio->io_prop.zp_type, zio->io_prop.zp_level, - zio->io_prop.zp_zpl_smallblk); - zio->io_metaslab_class = mc; - } - - error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio, 
zio->io_allocator); - - /* - * Fallback to normal class when an alloc class is full - */ - if (error == ENOSPC && mc != spa_normal_class(spa)) { - /* - * If throttling, transfer reservation over to normal class. - * The io_allocator slot can remain the same even though we - * are switching classes. - */ - if (mc->mc_alloc_throttle_enabled && - (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { - metaslab_class_throttle_unreserve(mc, - zio->io_prop.zp_copies, zio->io_allocator, zio); - zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; - - mc = spa_normal_class(spa); - VERIFY(metaslab_class_throttle_reserve(mc, - zio->io_prop.zp_copies, zio->io_allocator, zio, - flags | METASLAB_MUST_RESERVE)); - } else { - mc = spa_normal_class(spa); - } - zio->io_metaslab_class = mc; - - error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio, zio->io_allocator); - } - - if (error != 0) { - zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " - "size %llu, error %d", spa_name(spa), zio, zio->io_size, - error); - if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) - return (zio_write_gang_block(zio)); - zio->io_error = error; - } - - return (zio); -} - -static zio_t * -zio_dva_free(zio_t *zio) -{ - metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); - - return (zio); -} - -static zio_t * -zio_dva_claim(zio_t *zio) -{ - int error; - - error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); - if (error) - zio->io_error = error; - - return (zio); -} - -/* - * Undo an allocation. This is used by zio_done() when an I/O fails - * and we want to give back the block we just allocated. - * This handles both normal blocks and gang blocks. 
- */ -static void -zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) -{ - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); - ASSERT(zio->io_bp_override == NULL); - - if (!BP_IS_HOLE(bp)) - metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); - - if (gn != NULL) { - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - zio_dva_unallocate(zio, gn->gn_child[g], - &gn->gn_gbh->zg_blkptr[g]); - } - } -} - -/* - * Try to allocate an intent log block. Return 0 on success, errno on failure. - */ -int -zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t size, boolean_t *slog) -{ - int error = 1; - zio_alloc_list_t io_alloc_list; - - ASSERT(txg > spa_syncing_txg(spa)); - - metaslab_trace_init(&io_alloc_list); - - /* - * Block pointer fields are useful to metaslabs for stats and debugging. - * Fill in the obvious ones before calling into metaslab_alloc(). - */ - BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_PSIZE(new_bp, size); - BP_SET_LEVEL(new_bp, 0); - - /* - * When allocating a zil block, we don't have information about - * the final destination of the block except the objset it's part - * of, so we just hash the objset ID to pick the allocator to get - * some parallelism. - */ - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, - cityhash4(0, 0, 0, objset) % spa->spa_alloc_count); - if (error == 0) { - *slog = TRUE; - } else { - error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, - &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) % - spa->spa_alloc_count); - if (error == 0) - *slog = FALSE; - } - metaslab_trace_fini(&io_alloc_list); - - if (error == 0) { - BP_SET_LSIZE(new_bp, size); - BP_SET_PSIZE(new_bp, size); - BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(new_bp, - spa_version(spa) >= SPA_VERSION_SLIM_ZIL - ? 
ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); - BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_LEVEL(new_bp, 0); - BP_SET_DEDUP(new_bp, 0); - BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); - } else { - zfs_dbgmsg("%s: zil block allocation failure: " - "size %llu, error %d", spa_name(spa), size, error); - } - - return (error); -} - -/* - * ========================================================================== - * Read, write and delete to physical devices - * ========================================================================== - */ - - -/* - * Issue an I/O to the underlying vdev. Typically the issue pipeline - * stops after this stage and will resume upon I/O completion. - * However, there are instances where the vdev layer may need to - * continue the pipeline when an I/O was not issued. Since the I/O - * that was sent to the vdev layer might be different than the one - * currently active in the pipeline (see vdev_queue_io()), we explicitly - * force the underlying vdev layers to call either zio_execute() or - * zio_interrupt() to ensure that the pipeline continues with the correct I/O. - */ -static zio_t * -zio_vdev_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - uint64_t align; - spa_t *spa = zio->io_spa; - int ret; - - ASSERT(zio->io_error == 0); - ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); - - if (vd == NULL) { - if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_enter(spa, SCL_ZIO, zio, RW_READER); - - /* - * The mirror_ops handle multiple DVAs in a single BP. 
- */ - vdev_mirror_ops.vdev_op_io_start(zio); - return (NULL); - } - - if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && - zio->io_priority == ZIO_PRIORITY_NOW) { - trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); - return (zio); - } - - ASSERT3P(zio->io_logical, !=, zio); - if (zio->io_type == ZIO_TYPE_WRITE) { - ASSERT(spa->spa_trust_config); - - if (zio->io_vd->vdev_removing) { - /* - * Note: the code can handle other kinds of writes, - * but we don't expect them. - */ - ASSERT(zio->io_flags & - (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); - } - } - - /* - * We keep track of time-sensitive I/Os so that the scan thread - * can quickly react to certain workloads. In particular, we care - * about non-scrubbing, top-level reads and writes with the following - * characteristics: - * - synchronous writes of user data to non-slog devices - * - any reads of user data - * When these conditions are met, adjust the timestamp of spa_last_io - * which allows the scan thread to adjust its workload accordingly. - */ - if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && - vd == vd->vdev_top && !vd->vdev_islog && - zio->io_bookmark.zb_objset != DMU_META_OBJSET && - zio->io_txg != spa_syncing_txg(spa)) { - uint64_t old = spa->spa_last_io; - uint64_t new = ddi_get_lbolt64(); - if (old != new) - (void) atomic_cas_64(&spa->spa_last_io, old, new); - } - align = 1ULL << vd->vdev_top->vdev_ashift; - - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && - P2PHASE(zio->io_size, align) != 0) { - /* Transform logical writes to be a full physical block size. 
*/ - uint64_t asize = P2ROUNDUP(zio->io_size, align); - abd_t *abuf = NULL; - if (zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE) - abuf = abd_alloc_sametype(zio->io_abd, asize); - ASSERT(vd == vd->vdev_top); - if (zio->io_type == ZIO_TYPE_WRITE) { - abd_copy(abuf, zio->io_abd, zio->io_size); - abd_zero_off(abuf, zio->io_size, asize - zio->io_size); - } - zio_push_transform(zio, abuf, asize, abuf ? asize : 0, - zio_subblock); - } - - /* - * If this is not a physical io, make sure that it is properly aligned - * before proceeding. - */ - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { - ASSERT0(P2PHASE(zio->io_offset, align)); - ASSERT0(P2PHASE(zio->io_size, align)); - } else { - /* - * For the physical io we allow alignment - * to a logical block size. - */ - uint64_t log_align = - 1ULL << vd->vdev_top->vdev_logical_ashift; - ASSERT0(P2PHASE(zio->io_offset, log_align)); - ASSERT0(P2PHASE(zio->io_size, log_align)); - } - - VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); - - /* - * If this is a repair I/O, and there's no self-healing involved -- - * that is, we're just resilvering what we expect to resilver -- - * then don't do the I/O unless zio's txg is actually in vd's DTL. - * This prevents spurious resilvering. - * - * There are a few ways that we can end up creating these spurious - * resilver i/os: - * - * 1. A resilver i/o will be issued if any DVA in the BP has a - * dirty DTL. The mirror code will issue resilver writes to - * each DVA, including the one(s) that are not on vdevs with dirty - * DTLs. - * - * 2. With nested replication, which happens when we have a - * "replacing" or "spare" vdev that's a child of a mirror or raidz. - * For example, given mirror(replacing(A+B), C), it's likely that - * only A is out of date (it's the new device). In this case, we'll - * read from C, then use the data to resilver A+B -- but we don't - * actually want to resilver B, just A. 
The top-level mirror has no - * way to know this, so instead we just discard unnecessary repairs - * as we work our way down the vdev tree. - * - * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. - * The same logic applies to any form of nested replication: ditto - * + mirror, RAID-Z + replacing, etc. - * - * However, indirect vdevs point off to other vdevs which may have - * DTL's, so we never bypass them. The child i/os on concrete vdevs - * will be properly bypassed instead. - */ - if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && - !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && - zio->io_txg != 0 && /* not a delegated i/o */ - vd->vdev_ops != &vdev_indirect_ops && - !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - zio_vdev_io_bypass(zio); - return (zio); - } - - if (vd->vdev_ops->vdev_op_leaf) { - switch (zio->io_type) { - case ZIO_TYPE_READ: - if (vdev_cache_read(zio)) - return (zio); - /* FALLTHROUGH */ - case ZIO_TYPE_WRITE: - case ZIO_TYPE_FREE: - if ((zio = vdev_queue_io(zio)) == NULL) - return (NULL); - - if (!vdev_accessible(vd, zio)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return (NULL); - } - break; - } - /* - * Note that we ignore repair writes for TRIM because they can - * conflict with normal writes. This isn't an issue because, by - * definition, we only repair blocks that aren't freed. - */ - if (zio->io_type == ZIO_TYPE_WRITE && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && - !trim_map_write_start(zio)) - return (NULL); - } - - vd->vdev_ops->vdev_op_io_start(zio); - return (NULL); -} - -static zio_t * -zio_vdev_io_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; - boolean_t unexpected_error = B_FALSE; - - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - ASSERT(zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); - - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || - zio->io_type == ZIO_TYPE_FREE)) { - - if (zio->io_type == ZIO_TYPE_WRITE && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) - trim_map_write_done(zio); - - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(vd, - zio, EIO); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_label_injection(zio, EIO); - - if (zio->io_error) { - if (zio->io_error == ENOTSUP && - zio->io_type == ZIO_TYPE_FREE) { - /* Not all devices support TRIM. */ - } else if (!vdev_accessible(vd, zio)) { - zio->io_error = SET_ERROR(ENXIO); - } else { - unexpected_error = B_TRUE; - } - } - } - - ops->vdev_op_io_done(zio); - - if (unexpected_error) - VERIFY(vdev_probe(vd, zio) == NULL); - - return (zio); -} - -/* - * This function is used to change the priority of an existing zio that is - * currently in-flight. This is used by the arc to upgrade priority in the - * event that a demand read is made for a block that is currently queued - * as a scrub or async read IO. Otherwise, the high priority read request - * would end up having to wait for the lower priority IO. 
- */ -void -zio_change_priority(zio_t *pio, zio_priority_t priority) -{ - zio_t *cio, *cio_next; - zio_link_t *zl = NULL; - - ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - - if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { - vdev_queue_change_io_priority(pio, priority); - } else { - pio->io_priority = priority; - } - - mutex_enter(&pio->io_lock); - for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { - cio_next = zio_walk_children(pio, &zl); - zio_change_priority(cio, priority); - } - mutex_exit(&pio->io_lock); -} - -/* - * For non-raidz ZIOs, we can just copy aside the bad data read from the - * disk, and use that to finish the checksum ereport later. - */ -static void -zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, - const void *good_buf) -{ - /* no processing needed */ - zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); -} - -/*ARGSUSED*/ -void -zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) -{ - void *buf = zio_buf_alloc(zio->io_size); - - abd_copy_to_buf(buf, zio->io_abd, zio->io_size); - - zcr->zcr_cbinfo = zio->io_size; - zcr->zcr_cbdata = buf; - zcr->zcr_finish = zio_vsd_default_cksum_finish; - zcr->zcr_free = zio_buf_free; -} - -static zio_t * -zio_vdev_io_assess(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_exit(zio->io_spa, SCL_ZIO, zio); - - if (zio->io_vsd != NULL) { - zio->io_vsd_ops->vsd_free(zio); - zio->io_vsd = NULL; - } - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_fault_injection(zio, EIO); - - if (zio->io_type == ZIO_TYPE_FREE && - zio->io_priority != ZIO_PRIORITY_NOW) { - switch (zio->io_error) { - case 0: - ZIO_TRIM_STAT_INCR(bytes, zio->io_size); - ZIO_TRIM_STAT_BUMP(success); - break; - case EOPNOTSUPP: - ZIO_TRIM_STAT_BUMP(unsupported); - 
break; - default: - ZIO_TRIM_STAT_BUMP(failed); - break; - } - } - - /* - * If the I/O failed, determine whether we should attempt to retry it. - * - * On retry, we cut in line in the issue queue, since we don't want - * compression/checksumming/etc. work to prevent our (cheap) IO reissue. - */ - if (zio->io_error && vd == NULL && - !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ - zio->io_error = 0; - zio->io_flags |= ZIO_FLAG_IO_RETRY | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; - zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, - zio_requeue_io_start_cut_in_line); - return (NULL); - } - - /* - * If we got an error on a leaf device, convert it to ENXIO - * if the device is not accessible at all. - */ - if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && - !vdev_accessible(vd, zio)) - zio->io_error = SET_ERROR(ENXIO); - - /* - * If we can't write to an interior vdev (mirror or RAID-Z), - * set vdev_cant_write so that we stop trying to allocate from it. - */ - if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && - vd != NULL && !vd->vdev_ops->vdev_op_leaf) { - vd->vdev_cant_write = B_TRUE; - } - - /* - * If a cache flush returns ENOTSUP or ENOTTY, we know that no future - * attempts will ever succeed. In this case we set a persistent bit so - * that we don't bother with it in the future. 
- */ - if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && - zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) - vd->vdev_nowritecache = B_TRUE; - - if (zio->io_error) - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - zio->io_physdone != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); - ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); - zio->io_physdone(zio->io_logical); - } - - return (zio); -} - -void -zio_vdev_io_reissue(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); - - zio->io_stage >>= 1; -} - -void -zio_vdev_io_redone(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); - - zio->io_stage >>= 1; -} - -void -zio_vdev_io_bypass(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); - - zio->io_flags |= ZIO_FLAG_IO_BYPASS; - zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; -} - -/* - * ========================================================================== - * Generate and verify checksums - * ========================================================================== - */ -static zio_t * -zio_checksum_generate(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - enum zio_checksum checksum; - - if (bp == NULL) { - /* - * This is zio_write_phys(). - * We're either generating a label checksum, or none at all. 
- */ - checksum = zio->io_prop.zp_checksum; - - if (checksum == ZIO_CHECKSUM_OFF) - return (zio); - - ASSERT(checksum == ZIO_CHECKSUM_LABEL); - } else { - if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { - ASSERT(!IO_IS_ALLOCATING(zio)); - checksum = ZIO_CHECKSUM_GANG_HEADER; - } else { - checksum = BP_GET_CHECKSUM(bp); - } - } - - zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); - - return (zio); -} - -static zio_t * -zio_checksum_verify(zio_t *zio) -{ - zio_bad_cksum_t info; - blkptr_t *bp = zio->io_bp; - int error; - - ASSERT(zio->io_vd != NULL); - - if (bp == NULL) { - /* - * This is zio_read_phys(). - * We're either verifying a label checksum, or nothing at all. - */ - if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) - return (zio); - - ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); - } - - if ((error = zio_checksum_error(zio, &info)) != 0) { - zio->io_error = error; - if (error == ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, zio, zio->io_offset, - zio->io_size, NULL, &info); - } - } - - return (zio); -} - -/* - * Called by RAID-Z to ensure we don't compute the checksum twice. - */ -void -zio_checksum_verified(zio_t *zio) -{ - zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; -} - -/* - * ========================================================================== - * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. - * An error of 0 indicates success. ENXIO indicates whole-device failure, - * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO - * indicate errors that are specific to one I/O, and most likely permanent. - * Any other error is presumed to be worse because we weren't expecting it. 
- * ========================================================================== - */ -int -zio_worst_error(int e1, int e2) -{ - static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; - int r1, r2; - - for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) - if (e1 == zio_error_rank[r1]) - break; - - for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) - if (e2 == zio_error_rank[r2]) - break; - - return (r1 > r2 ? e1 : e2); -} - -/* - * ========================================================================== - * I/O completion - * ========================================================================== - */ -static zio_t * -zio_ready(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - zio_t *pio, *pio_next; - zio_link_t *zl = NULL; - - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { - return (NULL); - } - - if (zio->io_ready) { - ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || - (zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); - - zio->io_ready(zio); - } - - if (bp != NULL && bp != &zio->io_bp_copy) - zio->io_bp_copy = *bp; - - if (zio->io_error != 0) { - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(zio->io_metaslab_class != NULL); - - /* - * We were unable to allocate anything, unreserve and - * issue the next I/O to allocate. - */ - metaslab_class_throttle_unreserve( - zio->io_metaslab_class, zio->io_prop.zp_copies, - zio->io_allocator, zio); - zio_allocate_dispatch(zio->io_spa, zio->io_allocator); - } - } - - mutex_enter(&zio->io_lock); - zio->io_state[ZIO_WAIT_READY] = 1; - pio = zio_walk_parents(zio, &zl); - mutex_exit(&zio->io_lock); - - /* - * As we notify zio's parents, new parents could be added. 
- * New parents go to the head of zio's io_parent_list, however, - * so we will (correctly) not notify them. The remainder of zio's - * io_parent_list, from 'pio_next' onward, cannot change because - * all parents must wait for us to be done before they can be done. - */ - for (; pio != NULL; pio = pio_next) { - pio_next = zio_walk_parents(zio, &zl); - zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); - } - - if (zio->io_flags & ZIO_FLAG_NODATA) { - if (BP_IS_GANG(bp)) { - zio->io_flags &= ~ZIO_FLAG_NODATA; - } else { - ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; - } - } - - if (zio_injection_enabled && - zio->io_spa->spa_syncing_txg == zio->io_txg) - zio_handle_ignored_writes(zio); - - return (zio); -} - -/* - * Update the allocation throttle accounting. - */ -static void -zio_dva_throttle_done(zio_t *zio) -{ - zio_t *lio = zio->io_logical; - zio_t *pio = zio_unique_parent(zio); - vdev_t *vd = zio->io_vd; - int flags = METASLAB_ASYNC_ALLOC; - - ASSERT3P(zio->io_bp, !=, NULL); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); - ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - ASSERT(vd != NULL); - ASSERT3P(vd, ==, vd->vdev_top); - ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); - ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); - ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); - ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); - - /* - * Parents of gang children can have two flavors -- ones that - * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) - * and ones that allocated the constituent blocks. The allocation - * throttle needs to know the allocating parent zio so we must find - * it here. - */ - if (pio->io_child_type == ZIO_CHILD_GANG) { - /* - * If our parent is a rewrite gang child then our grandparent - * would have been the one that performed the allocation. 
- */ - if (pio->io_flags & ZIO_FLAG_IO_REWRITE) - pio = zio_unique_parent(pio); - flags |= METASLAB_GANG_CHILD; - } - - ASSERT(IO_IS_ALLOCATING(pio)); - ASSERT3P(zio, !=, zio->io_logical); - ASSERT(zio->io_logical != NULL); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); - ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); - ASSERT(zio->io_metaslab_class != NULL); - - mutex_enter(&pio->io_lock); - metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, - pio->io_allocator, B_TRUE); - mutex_exit(&pio->io_lock); - - metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, - pio->io_allocator, pio); - - /* - * Call into the pipeline to see if there is more work that - * needs to be done. If there is work to be done it will be - * dispatched to another taskq thread. - */ - zio_allocate_dispatch(zio->io_spa, pio->io_allocator); -} - -static zio_t * -zio_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - zio_t *lio = zio->io_logical; - blkptr_t *bp = zio->io_bp; - vdev_t *vd = zio->io_vd; - uint64_t psize = zio->io_size; - zio_t *pio, *pio_next; - zio_link_t *zl = NULL; - - /* - * If our children haven't all completed, - * wait for them and then repeat this pipeline stage. - */ - if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { - return (NULL); - } - - /* - * If the allocation throttle is enabled, then update the accounting. - * We only track child I/Os that are part of an allocating async - * write. We must do this since the allocation is performed - * by the logical I/O but the actual write is done by child I/Os. - */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && - zio->io_child_type == ZIO_CHILD_VDEV) { - ASSERT(zio->io_metaslab_class != NULL); - ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); - zio_dva_throttle_done(zio); - } - - /* - * If the allocation throttle is enabled, verify that - * we have decremented the refcounts for every I/O that was throttled. 
- */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(bp != NULL); - - metaslab_group_alloc_verify(spa, zio->io_bp, zio, - zio->io_allocator); - VERIFY(zfs_refcount_not_held( - &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], - zio)); - } - - for (int c = 0; c < ZIO_CHILD_TYPES; c++) - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - ASSERT(zio->io_children[c][w] == 0); - - if (bp != NULL && !BP_IS_EMBEDDED(bp)) { - ASSERT(bp->blk_pad[0] == 0); - ASSERT(bp->blk_pad[1] == 0); - ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || - (bp == zio_unique_parent(zio)->io_bp)); - if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && - zio->io_bp_override == NULL && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); - ASSERT(BP_COUNT_GANG(bp) == 0 || - (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); - } - if (zio->io_flags & ZIO_FLAG_NOPWRITE) - VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); - } - - /* - * If there were child vdev/gang/ddt errors, they apply to us now. - */ - zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); - zio_inherit_child_errors(zio, ZIO_CHILD_GANG); - zio_inherit_child_errors(zio, ZIO_CHILD_DDT); - - /* - * If the I/O on the transformed data was successful, generate any - * checksum reports now while we still have the transformed data. 
- */ - if (zio->io_error == 0) { - while (zio->io_cksum_report != NULL) { - zio_cksum_report_t *zcr = zio->io_cksum_report; - uint64_t align = zcr->zcr_align; - uint64_t asize = P2ROUNDUP(psize, align); - char *abuf = NULL; - abd_t *adata = zio->io_abd; - - if (asize != psize) { - adata = abd_alloc_linear(asize, B_TRUE); - abd_copy(adata, zio->io_abd, psize); - abd_zero_off(adata, psize, asize - psize); - } - - if (adata != NULL) - abuf = abd_borrow_buf_copy(adata, asize); - - zio->io_cksum_report = zcr->zcr_next; - zcr->zcr_next = NULL; - zcr->zcr_finish(zcr, abuf); - zfs_ereport_free_checksum(zcr); - - if (adata != NULL) - abd_return_buf(adata, abuf, asize); - - if (asize != psize) - abd_free(adata); - } - } - - zio_pop_transforms(zio); /* note: may set zio->io_error */ - - vdev_stat_update(zio, psize); - - if (zio->io_error) { - /* - * If this I/O is attached to a particular vdev, - * generate an error message describing the I/O failure - * at the block level. We ignore these errors if the - * device is currently unavailable. - */ - if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); - - if ((zio->io_error == EIO || !(zio->io_flags & - (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && - zio == lio) { - /* - * For logical I/O requests, tell the SPA to log the - * error and generate a logical data ereport. - */ - spa_log_error(spa, zio); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, - 0, 0); - } - } - - if (zio->io_error && zio == lio) { - /* - * Determine whether zio should be reexecuted. This will - * propagate all the way to the root via zio_notify_parent(). 
- */ - ASSERT(vd == NULL && bp != NULL); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - if (IO_IS_ALLOCATING(zio) && - !(zio->io_flags & ZIO_FLAG_CANFAIL)) { - if (zio->io_error != ENOSPC) - zio->io_reexecute |= ZIO_REEXECUTE_NOW; - else - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; - } - - if ((zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_FREE) && - !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && - zio->io_error == ENXIO && - spa_load_state(spa) == SPA_LOAD_NONE && - spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; - - if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; - - /* - * Here is a possibly good place to attempt to do - * either combinatorial reconstruction or error correction - * based on checksums. It also might be a good place - * to send out preliminary ereports before we suspend - * processing. - */ - } - - /* - * If there were logical child errors, they apply to us now. - * We defer this until now to avoid conflating logical child - * errors with errors that happened to the zio itself when - * updating vdev stats and reporting FMA events above. - */ - zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - - if ((zio->io_error || zio->io_reexecute) && - IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && - !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) - zio_dva_unallocate(zio, zio->io_gang_tree, bp); - - zio_gang_tree_free(&zio->io_gang_tree); - - /* - * Godfather I/Os should never suspend. - */ - if ((zio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) - zio->io_reexecute = 0; - - if (zio->io_reexecute) { - /* - * This is a logical I/O that wants to reexecute. - * - * Reexecute is top-down. When an i/o fails, if it's not - * the root, it simply notifies its parent and sticks around. - * The parent, seeing that it still has children in zio_done(), - * does the same. 
This percolates all the way up to the root. - * The root i/o will reexecute or suspend the entire tree. - * - * This approach ensures that zio_reexecute() honors - * all the original i/o dependency relationships, e.g. - * parents not executing until children are ready. - */ - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - zio->io_gang_leader = NULL; - - mutex_enter(&zio->io_lock); - zio->io_state[ZIO_WAIT_DONE] = 1; - mutex_exit(&zio->io_lock); - - /* - * "The Godfather" I/O monitors its children but is - * not a true parent to them. It will track them through - * the pipeline but severs its ties whenever they get into - * trouble (e.g. suspended). This allows "The Godfather" - * I/O to return status without blocking. - */ - zl = NULL; - for (pio = zio_walk_parents(zio, &zl); pio != NULL; - pio = pio_next) { - zio_link_t *remove_zl = zl; - pio_next = zio_walk_parents(zio, &zl); - - if ((pio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { - zio_remove_child(pio, zio, remove_zl); - /* - * This is a rare code path, so we don't - * bother with "next_to_execute". - */ - zio_notify_parent(pio, zio, ZIO_WAIT_DONE, - NULL); - } - } - - if ((pio = zio_unique_parent(zio)) != NULL) { - /* - * We're not a root i/o, so there's nothing to do - * but notify our parent. Don't propagate errors - * upward since we haven't permanently failed yet. - */ - ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); - zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; - /* - * This is a rare code path, so we don't bother with - * "next_to_execute". - */ - zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); - } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { - /* - * We'd fail again if we reexecuted now, so suspend - * until conditions improve (e.g. device comes online). - */ - zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); - } else { - /* - * Reexecution is potentially a huge amount of work. - * Hand it off to the otherwise-unused claim taskq. 
- */ -#if defined(illumos) || !defined(_KERNEL) - ASSERT(zio->io_tqent.tqent_next == NULL); -#else - ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); -#endif - spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, - ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, - 0, &zio->io_tqent); - } - return (NULL); - } - - ASSERT(zio->io_child_count == 0); - ASSERT(zio->io_reexecute == 0); - ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); - - /* - * Report any checksum errors, since the I/O is complete. - */ - while (zio->io_cksum_report != NULL) { - zio_cksum_report_t *zcr = zio->io_cksum_report; - zio->io_cksum_report = zcr->zcr_next; - zcr->zcr_next = NULL; - zcr->zcr_finish(zcr, NULL); - zfs_ereport_free_checksum(zcr); - } - - /* - * It is the responsibility of the done callback to ensure that this - * particular zio is no longer discoverable for adoption, and as - * such, cannot acquire any new parents. - */ - if (zio->io_done) - zio->io_done(zio); - - mutex_enter(&zio->io_lock); - zio->io_state[ZIO_WAIT_DONE] = 1; - mutex_exit(&zio->io_lock); - - /* - * We are done executing this zio. We may want to execute a parent - * next. See the comment in zio_notify_parent(). 
- */ - zio_t *next_to_execute = NULL; - zl = NULL; - for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { - zio_link_t *remove_zl = zl; - pio_next = zio_walk_parents(zio, &zl); - zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); - } - - if (zio->io_waiter != NULL) { - mutex_enter(&zio->io_lock); - zio->io_executor = NULL; - cv_broadcast(&zio->io_cv); - mutex_exit(&zio->io_lock); - } else { - zio_destroy(zio); - } - - return (next_to_execute); -} - -/* - * ========================================================================== - * I/O pipeline definition - * ========================================================================== - */ -static zio_pipe_stage_t *zio_pipeline[] = { - NULL, - zio_read_bp_init, - zio_write_bp_init, - zio_free_bp_init, - zio_issue_async, - zio_write_compress, - zio_checksum_generate, - zio_nop_write, - zio_ddt_read_start, - zio_ddt_read_done, - zio_ddt_write, - zio_ddt_free, - zio_gang_assemble, - zio_gang_issue, - zio_dva_throttle, - zio_dva_allocate, - zio_dva_free, - zio_dva_claim, - zio_ready, - zio_vdev_io_start, - zio_vdev_io_done, - zio_vdev_io_assess, - zio_checksum_verify, - zio_done -}; - - - - -/* - * Compare two zbookmark_phys_t's to see which we would reach first in a - * pre-order traversal of the object tree. - * - * This is simple in every case aside from the meta-dnode object. For all other - * objects, we traverse them in order (object 1 before object 2, and so on). - * However, all of these objects are traversed while traversing object 0, since - * the data it points to is the list of objects. Thus, we need to convert to a - * canonical representation so we can compare meta-dnode bookmarks to - * non-meta-dnode bookmarks. - * - * We do this by calculating "equivalents" for each field of the zbookmark. 
- * zbookmarks outside of the meta-dnode use their own object and level, and - * calculate the level 0 equivalent (the first L0 blkid that is contained in the - * blocks this bookmark refers to) by multiplying their blkid by their span - * (the number of L0 blocks contained within one block at their level). - * zbookmarks inside the meta-dnode calculate their object equivalent - * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use - * level + 1<<31 (any value larger than a level could ever be) for their level. - * This causes them to always compare before a bookmark in their object - * equivalent, compare appropriately to bookmarks in other objects, and to - * compare appropriately to other bookmarks in the meta-dnode. - */ -int -zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, - const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) -{ - /* - * These variables represent the "equivalent" values for the zbookmark, - * after converting zbookmarks inside the meta dnode to their - * normal-object equivalents. - */ - uint64_t zb1obj, zb2obj; - uint64_t zb1L0, zb2L0; - uint64_t zb1level, zb2level; - - if (zb1->zb_object == zb2->zb_object && - zb1->zb_level == zb2->zb_level && - zb1->zb_blkid == zb2->zb_blkid) - return (0); - - /* - * BP_SPANB calculates the span in blocks. 
- */ - zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); - zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); - zb1L0 = 0; - zb1level = zb1->zb_level + COMPARE_META_LEVEL; - } else { - zb1obj = zb1->zb_object; - zb1level = zb1->zb_level; - } - - if (zb2->zb_object == DMU_META_DNODE_OBJECT) { - zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); - zb2L0 = 0; - zb2level = zb2->zb_level + COMPARE_META_LEVEL; - } else { - zb2obj = zb2->zb_object; - zb2level = zb2->zb_level; - } - - /* Now that we have a canonical representation, do the comparison. */ - if (zb1obj != zb2obj) - return (zb1obj < zb2obj ? -1 : 1); - else if (zb1L0 != zb2L0) - return (zb1L0 < zb2L0 ? -1 : 1); - else if (zb1level != zb2level) - return (zb1level > zb2level ? -1 : 1); - /* - * This can (theoretically) happen if the bookmarks have the same object - * and level, but different blkids, if the block sizes are not the same. - * There is presently no way to change the indirect block sizes - */ - return (0); -} - -/* - * This function checks the following: given that last_block is the place that - * our traversal stopped last time, does that guarantee that we've visited - * every node under subtree_root? Therefore, we can't just use the raw output - * of zbookmark_compare. We have to pass in a modified version of - * subtree_root; by incrementing the block id, and then checking whether - * last_block is before or equal to that, we can tell whether or not having - * visited last_block implies that all of subtree_root's children have been - * visited. - */ -boolean_t -zbookmark_subtree_completed(const dnode_phys_t *dnp, - const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) -{ - zbookmark_phys_t mod_zb = *subtree_root; - mod_zb.zb_blkid++; - ASSERT(last_block->zb_level == 0); - - /* The objset_phys_t isn't before anything. 
*/ - if (dnp == NULL) - return (B_FALSE); - - /* - * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the - * data block size in sectors, because that variable is only used if - * the bookmark refers to a block in the meta-dnode. Since we don't - * know without examining it what object it refers to, and there's no - * harm in passing in this value in other cases, we always pass it in. - * - * We pass in 0 for the indirect block size shift because zb2 must be - * level 0. The indirect block size is only used to calculate the span - * of the bookmark, but since the bookmark must be level 0, the span is - * always 1, so the math works out. - * - * If you make changes to how the zbookmark_compare code works, be sure - * to make sure that this code still works afterwards. - */ - return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, - 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, - last_block) <= 0); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c deleted file mode 100644 index 8924804a6fcb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ /dev/null @@ -1,475 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Checksum vectors. - * - * In the SPA, everything is checksummed. We support checksum vectors - * for three distinct reasons: - * - * 1. Different kinds of data need different levels of protection. - * For SPA metadata, we always want a very strong checksum. - * For user data, we let users make the trade-off between speed - * and checksum strength. - * - * 2. Cryptographic hash and MAC algorithms are an area of active research. - * It is likely that in future hash functions will be at least as strong - * as current best-of-breed, and may be substantially faster as well. - * We want the ability to take advantage of these new hashes as soon as - * they become available. - * - * 3. If someone develops hardware that can compute a strong hash quickly, - * we want the ability to take advantage of that hardware. - * - * Of course, we don't want a checksum upgrade to invalidate existing - * data, so we store the checksum *function* in eight bits of the bp. - * This gives us room for up to 256 different checksum functions. - * - * When writing a block, we always checksum it with the latest-and-greatest - * checksum function of the appropriate strength. When reading a block, - * we compare the expected checksum against the actual checksum, which we - * compute via the checksum function specified by BP_GET_CHECKSUM(bp). 
- * - * SALTED CHECKSUMS - * - * To enable the use of less secure hash algorithms with dedup, we - * introduce the notion of salted checksums (MACs, really). A salted - * checksum is fed both a random 256-bit value (the salt) and the data - * to be checksummed. This salt is kept secret (stored on the pool, but - * never shown to the user). Thus even if an attacker knew of collision - * weaknesses in the hash algorithm, they won't be able to mount a known - * plaintext attack on the DDT, since the actual hash value cannot be - * known ahead of time. How the salt is used is algorithm-specific - * (some might simply prefix it to the data block, others might need to - * utilize a full-blown HMAC). On disk the salt is stored in a ZAP - * object in the MOS (DMU_POOL_CHECKSUM_SALT). - * - * CONTEXT TEMPLATES - * - * Some hashing algorithms need to perform a substantial amount of - * initialization work (e.g. salted checksums above may need to pre-hash - * the salt) before being able to process data. Performing this - * redundant work for each block would be wasteful, so we instead allow - * a checksum algorithm to do the work once (the first time it's used) - * and then keep this pre-initialized context as a template inside the - * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains - * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to - * construct and destruct the pre-initialized checksum context. The - * pre-initialized context is then reused during each checksum - * invocation and passed to the checksum function. 
- */ - -/*ARGSUSED*/ -static void -abd_checksum_off(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); -} - -/*ARGSUSED*/ -void -abd_fletcher_2_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_2_incremental_native, zcp); -} - -/*ARGSUSED*/ -void -abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_2_incremental_byteswap, zcp); -} - -/*ARGSUSED*/ -void -abd_fletcher_4_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_4_incremental_native, zcp); -} - -/*ARGSUSED*/ -void -abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_4_incremental_byteswap, zcp); -} - -zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, NULL, NULL, 0, "inherit"}, - {{NULL, NULL}, NULL, NULL, 0, "on"}, - {{abd_checksum_off, abd_checksum_off}, - NULL, NULL, 0, "off"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, - "label"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, - "gang_header"}, - {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, - {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, - NULL, NULL, 0, "fletcher2"}, - {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, - 
{{abd_fletcher_4_native, abd_fletcher_4_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, - {{abd_checksum_off, abd_checksum_off}, - NULL, NULL, 0, "noparity"}, - {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, - {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, - abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, - ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, -#ifdef illumos - {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, - abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, - ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | - ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, -#endif -}; - -/* - * The flag corresponding to the "verify" in dedup=[checksum,]verify - * must be cleared first, so callers should use ZIO_CHECKSUM_MASK. - */ -spa_feature_t -zio_checksum_to_feature(enum zio_checksum cksum) -{ - VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); - - switch (cksum) { - case ZIO_CHECKSUM_SHA512: - return (SPA_FEATURE_SHA512); - case ZIO_CHECKSUM_SKEIN: - return (SPA_FEATURE_SKEIN); -#ifdef illumos - case ZIO_CHECKSUM_EDONR: - return (SPA_FEATURE_EDONR); -#endif - } - return (SPA_FEATURE_NONE); -} - -enum zio_checksum -zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) -{ - ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); - - if (child == ZIO_CHECKSUM_INHERIT) - return (parent); - - if (child == ZIO_CHECKSUM_ON) - return (ZIO_CHECKSUM_ON_VALUE); - - return (child); -} - -enum zio_checksum -zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, - enum zio_checksum parent) -{ - ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); - ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent != 
ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); - - if (child == ZIO_CHECKSUM_INHERIT) - return (parent); - - if (child == ZIO_CHECKSUM_ON) - return (spa_dedup_checksum(spa)); - - if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) - return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); - - ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & - ZCHECKSUM_FLAG_DEDUP) || - (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); - - return (child); -} - -/* - * Set the external verifier for a gang block based on , - * a tuple which is guaranteed to be unique for the life of the pool. - */ -static void -zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) -{ - dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_PHYSICAL_BIRTH(bp); - - ASSERT(BP_IS_GANG(bp)); - - ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); -} - -/* - * Set the external verifier for a label block based on its offset. - * The vdev is implicit, and the txg is unknowable at pool open time -- - * hence the logic in vdev_uberblock_load() to find the most recent copy. - */ -static void -zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) -{ - ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); -} - -/* - * Calls the template init function of a checksum which supports context - * templates and installs the template into the spa_t. - */ -static void -zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) -{ - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - - if (ci->ci_tmpl_init == NULL) - return; - if (spa->spa_cksum_tmpls[checksum] != NULL) - return; - - VERIFY(ci->ci_tmpl_free != NULL); - mutex_enter(&spa->spa_cksum_tmpls_lock); - if (spa->spa_cksum_tmpls[checksum] == NULL) { - spa->spa_cksum_tmpls[checksum] = - ci->ci_tmpl_init(&spa->spa_cksum_salt); - VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); - } - mutex_exit(&spa->spa_cksum_tmpls_lock); -} - -/* - * Generate the checksum. 
- */ -void -zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - abd_t *abd, uint64_t size) -{ - blkptr_t *bp = zio->io_bp; - uint64_t offset = zio->io_offset; - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t cksum; - spa_t *spa = zio->io_spa; - - ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(ci->ci_func[0] != NULL); - - zio_checksum_template_init(checksum, spa); - - if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - zio_eck_t *eck; - void *data = abd_to_buf(abd); - - if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; - - size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, - uint64_t); - eck = &zilc->zc_eck; - } else { - eck = (zio_eck_t *)((char *)data + size) - 1; - } - if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_checksum_gang_verifier(&eck->zec_cksum, bp); - else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&eck->zec_cksum, offset); - else - bp->blk_cksum = eck->zec_cksum; - eck->zec_magic = ZEC_MAGIC; - ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], - &cksum); - eck->zec_cksum = cksum; - } else { - ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], - &bp->blk_cksum); - } -} - -int -zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, - abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) -{ - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum; - int byteswap; - - if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) - return (SET_ERROR(EINVAL)); - - zio_checksum_template_init(checksum, spa); - - if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - zio_eck_t *eck; - zio_cksum_t verifier; - uint64_t data_size = size; - void *data = abd_borrow_buf_copy(abd, data_size); - - if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; - uint64_t nused; - - eck = &zilc->zc_eck; - if (eck->zec_magic == ZEC_MAGIC) { - nused = zilc->zc_nused; - } else if 
(eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { - nused = BSWAP_64(zilc->zc_nused); - } else { - abd_return_buf(abd, data, data_size); - return (SET_ERROR(ECKSUM)); - } - - if (nused > data_size) { - abd_return_buf(abd, data, data_size); - return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); - } else { - eck = (zio_eck_t *)((char *)data + data_size) - 1; - } - - if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_checksum_gang_verifier(&verifier, bp); - else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&verifier, offset); - else - verifier = bp->blk_cksum; - - byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); - - if (byteswap) - byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - - size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; - expected_cksum = eck->zec_cksum; - eck->zec_cksum = verifier; - abd_return_buf_copy(abd, data, data_size); - - ci->ci_func[byteswap](abd, size, - spa->spa_cksum_tmpls[checksum], &actual_cksum); - abd_copy_from_buf_off(abd, &expected_cksum, - eck_offset, sizeof (zio_cksum_t)); - - if (byteswap) { - byteswap_uint64_array(&expected_cksum, - sizeof (zio_cksum_t)); - } - } else { - byteswap = BP_SHOULD_BYTESWAP(bp); - expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](abd, size, - spa->spa_cksum_tmpls[checksum], &actual_cksum); - } - - if (info != NULL) { - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; - } - - if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) - return (SET_ERROR(ECKSUM)); - - return (0); -} - -int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) -{ - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - abd_t *data = zio->io_abd; - spa_t *spa = zio->io_spa; - - error = zio_checksum_error_impl(spa, bp, checksum, data, size, - offset, info); - - if (zio_injection_enabled && error == 0 && zio->io_error == 0) { - error = zio_handle_fault_injection(zio, ECKSUM); - if (error != 0) - info->zbc_injected = 1; - } - - return (error); -} - -/* - * Called by a spa_t that's about to be deallocated. This steps through - * all of the checksum context templates and deallocates any that were - * initialized using the algorithm-specific template init function. - */ -void -zio_checksum_templates_free(spa_t *spa) -{ - for (enum zio_checksum checksum = 0; - checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { - if (spa->spa_cksum_tmpls[checksum] != NULL) { - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - - VERIFY(ci->ci_tmpl_free != NULL); - ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); - spa->spa_cksum_tmpls[checksum] = NULL; - } - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c deleted file mode 100644 index b87303889ddb..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -typedef struct zcomp_stats { - kstat_named_t zcompstat_attempts; - kstat_named_t zcompstat_empty; - kstat_named_t zcompstat_skipped_insufficient_gain; -} zcomp_stats_t; - -static zcomp_stats_t zcomp_stats = { - { "attempts", KSTAT_DATA_UINT64 }, - { "empty", KSTAT_DATA_UINT64 }, - { "skipped_insufficient_gain", KSTAT_DATA_UINT64 } -}; - -#define ZCOMPSTAT_INCR(stat, val) \ - atomic_add_64(&zcomp_stats.stat.value.ui64, (val)); - -#define ZCOMPSTAT_BUMP(stat) ZCOMPSTAT_INCR(stat, 1); - -kstat_t *zcomp_ksp; - -/* - * If nonzero, every 1/X decompression attempts will fail, simulating - * an undetected memory error. - */ -uint64_t zio_decompress_fail_fraction = 0; - -/* - * Compression vectors. 
- */ -zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {"inherit", 0, NULL, NULL}, - {"on", 0, NULL, NULL}, - {"uncompressed", 0, NULL, NULL}, - {"lzjb", 0, lzjb_compress, lzjb_decompress}, - {"empty", 0, NULL, NULL}, - {"gzip-1", 1, gzip_compress, gzip_decompress}, - {"gzip-2", 2, gzip_compress, gzip_decompress}, - {"gzip-3", 3, gzip_compress, gzip_decompress}, - {"gzip-4", 4, gzip_compress, gzip_decompress}, - {"gzip-5", 5, gzip_compress, gzip_decompress}, - {"gzip-6", 6, gzip_compress, gzip_decompress}, - {"gzip-7", 7, gzip_compress, gzip_decompress}, - {"gzip-8", 8, gzip_compress, gzip_decompress}, - {"gzip-9", 9, gzip_compress, gzip_decompress}, - {"zle", 64, zle_compress, zle_decompress}, - {"lz4", 0, lz4_compress, lz4_decompress} -}; - -enum zio_compress -zio_compress_select(spa_t *spa, enum zio_compress child, - enum zio_compress parent) -{ - enum zio_compress result; - - ASSERT(child < ZIO_COMPRESS_FUNCTIONS); - ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); - ASSERT(parent != ZIO_COMPRESS_INHERIT); - - result = child; - if (result == ZIO_COMPRESS_INHERIT) - result = parent; - - if (result == ZIO_COMPRESS_ON) { - if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS)) - result = ZIO_COMPRESS_LZ4_ON_VALUE; - else - result = ZIO_COMPRESS_LEGACY_ON_VALUE; - } - - return (result); -} - -/*ARGSUSED*/ -static int -zio_compress_zeroed_cb(void *data, size_t len, void *private) -{ - uint64_t *end = (uint64_t *)((char *)data + len); - for (uint64_t *word = (uint64_t *)data; word < end; word++) - if (*word != 0) - return (1); - - return (0); -} - -size_t -zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) -{ - size_t c_len, d_len; - zio_compress_info_t *ci = &zio_compress_table[c]; - - ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); - ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); - - ZCOMPSTAT_BUMP(zcompstat_attempts); - - /* - * If the data is all zeroes, we don't even need to allocate - * a block for it. 
We indicate this by returning zero size. - */ - if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) { - ZCOMPSTAT_BUMP(zcompstat_empty); - return (0); - } - - if (c == ZIO_COMPRESS_EMPTY) - return (s_len); - - /* Compress at least 12.5% */ - d_len = s_len - (s_len >> 3); - - /* No compression algorithms can read from ABDs directly */ - void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); - abd_return_buf(src, tmp, s_len); - - if (c_len > d_len) { - ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain); - return (s_len); - } - - ASSERT3U(c_len, <=, d_len); - return (c_len); -} - -int -zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len) -{ - zio_compress_info_t *ci = &zio_compress_table[c]; - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) - return (SET_ERROR(EINVAL)); - - return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); -} - -int -zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len) -{ - void *tmp = abd_borrow_buf_copy(src, s_len); - int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); - abd_return_buf(src, tmp, s_len); - - /* - * Decompression shouldn't fail, because we've already verifyied - * the checksum. However, for extra protection (e.g. against bitflips - * in non-ECC RAM), we handle this error (and test it). 
- */ - ASSERT0(ret); - if (zio_decompress_fail_fraction != 0 && - spa_get_random(zio_decompress_fail_fraction) == 0) - ret = SET_ERROR(EINVAL); - - return (ret); -} - -void -zio_compress_init(void) -{ - - zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc", - KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (zcomp_ksp != NULL) { - zcomp_ksp->ks_data = &zcomp_stats; - kstat_install(zcomp_ksp); - } -} - -void -zio_compress_fini(void) -{ - if (zcomp_ksp != NULL) { - kstat_delete(zcomp_ksp); - zcomp_ksp = NULL; - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c deleted file mode 100644 index 26f59af9968f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
- */ - -/* - * ZFS fault injection - * - * To handle fault injection, we keep track of a series of zinject_record_t - * structures which describe which logical block(s) should be injected with a - * fault. These are kept in a global list. Each record corresponds to a given - * spa_t and maintains a special hold on the spa_t so that it cannot be deleted - * or exported while the injection record exists. - * - * Device level injection is done using the 'zi_guid' field. If this is set, it - * means that the error is destined for a particular device, not a piece of - * data. - * - * This is a rather poor data structure and algorithm, but we don't expect more - * than a few faults at any one time, so it should be sufficient for our needs. - */ - -#include -#include -#include -#include -#include -#include - -uint32_t zio_injection_enabled; - -/* - * Data describing each zinject handler registered on the system, and - * contains the list node linking the handler in the global zinject - * handler list. - */ -typedef struct inject_handler { - int zi_id; - spa_t *zi_spa; - zinject_record_t zi_record; - uint64_t *zi_lanes; - int zi_next_lane; - list_node_t zi_link; -} inject_handler_t; - -/* - * List of all zinject handlers registered on the system, protected by - * the inject_lock defined below. - */ -static list_t inject_handlers; - -/* - * This protects insertion into, and traversal of, the inject handler - * list defined above; as well as the inject_delay_count. Any time a - * handler is inserted or removed from the list, this lock should be - * taken as a RW_WRITER; and any time traversal is done over the list - * (without modification to it) this lock should be taken as a RW_READER. - */ -static krwlock_t inject_lock; - -/* - * This holds the number of zinject delay handlers that have been - * registered on the system. It is protected by the inject_lock defined - * above. 
Thus modifications to this count must be a RW_WRITER of the - * inject_lock, and reads of this count must be (at least) a RW_READER - * of the lock. - */ -static int inject_delay_count = 0; - -/* - * This lock is used only in zio_handle_io_delay(), refer to the comment - * in that function for more details. - */ -static kmutex_t inject_delay_mtx; - -/* - * Used to assign unique identifying numbers to each new zinject handler. - */ -static int inject_next_id = 1; - -/* - * Returns true if the given record matches the I/O in progress. - */ -static boolean_t -zio_match_handler(zbookmark_phys_t *zb, uint64_t type, - zinject_record_t *record, int error) -{ - /* - * Check for a match against the MOS, which is based on type - */ - if (zb->zb_objset == DMU_META_OBJSET && - record->zi_objset == DMU_META_OBJSET && - record->zi_object == DMU_META_DNODE_OBJECT) { - if (record->zi_type == DMU_OT_NONE || - type == record->zi_type) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); - else - return (B_FALSE); - } - - /* - * Check for an exact match. - */ - if (zb->zb_objset == record->zi_objset && - zb->zb_object == record->zi_object && - zb->zb_level == record->zi_level && - zb->zb_blkid >= record->zi_start && - zb->zb_blkid <= record->zi_end && - error == record->zi_error) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); - - return (B_FALSE); -} - -/* - * Panic the system when a config change happens in the function - * specified by tag. 
- */ -void -zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) -{ - inject_handler_t *handler; - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_type == type && - strcmp(tag, handler->zi_record.zi_func) == 0) - panic("Panic requested in function %s\n", tag); - } - - rw_exit(&inject_lock); -} - -/* - * Determine if the I/O in question should return failure. Returns the errno - * to be returned to the caller. - */ -int -zio_handle_fault_injection(zio_t *zio, int error) -{ - int ret = 0; - inject_handler_t *handler; - - /* - * Ignore I/O not associated with any logical data. - */ - if (zio->io_logical == NULL) - return (0); - - /* - * Currently, we only support fault injection on reads. - */ - if (zio->io_type != ZIO_TYPE_READ) - return (0); - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (zio->io_spa != handler->zi_spa || - handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) - continue; - - /* If this handler matches, return EIO */ - if (zio_match_handler(&zio->io_logical->io_bookmark, - zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, - &handler->zi_record, error)) { - ret = error; - break; - } - } - - rw_exit(&inject_lock); - - return (ret); -} - -/* - * Determine if the zio is part of a label update and has an injection - * handler associated with that portion of the label. Currently, we - * allow error injection in either the nvlist or the uberblock region of - * of the vdev label. 
- */ -int -zio_handle_label_injection(zio_t *zio, int error) -{ - inject_handler_t *handler; - vdev_t *vd = zio->io_vd; - uint64_t offset = zio->io_offset; - int label; - int ret = 0; - - if (offset >= VDEV_LABEL_START_SIZE && - offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) - return (0); - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - uint64_t start = handler->zi_record.zi_start; - uint64_t end = handler->zi_record.zi_end; - - if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) - continue; - - /* - * The injection region is the relative offsets within a - * vdev label. We must determine the label which is being - * updated and adjust our region accordingly. - */ - label = vdev_label_number(vd->vdev_psize, offset); - start = vdev_label_offset(vd->vdev_psize, label, start); - end = vdev_label_offset(vd->vdev_psize, label, end); - - if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && - (offset >= start && offset <= end)) { - ret = error; - break; - } - } - rw_exit(&inject_lock); - return (ret); -} - - -int -zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) -{ - inject_handler_t *handler; - int ret = 0; - - /* - * We skip over faults in the labels unless it's during - * device open (i.e. zio == NULL). 
- */ - if (zio != NULL) { - uint64_t offset = zio->io_offset; - - if (offset < VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) - return (0); - } - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) - continue; - - if (vd->vdev_guid == handler->zi_record.zi_guid) { - if (handler->zi_record.zi_failfast && - (zio == NULL || (zio->io_flags & - (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { - continue; - } - - /* Handle type specific I/O failures */ - if (zio != NULL && - handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) - continue; - - if (handler->zi_record.zi_error == error) { - /* - * For a failed open, pretend like the device - * has gone away. - */ - if (error == ENXIO) - vd->vdev_stat.vs_aux = - VDEV_AUX_OPEN_FAILED; - - /* - * Treat these errors as if they had been - * retried so that all the appropriate stats - * and FMA events are generated. - */ - if (!handler->zi_record.zi_failfast && - zio != NULL) - zio->io_flags |= ZIO_FLAG_IO_RETRY; - - ret = error; - break; - } - if (handler->zi_record.zi_error == ENXIO) { - ret = SET_ERROR(EIO); - break; - } - } - } - - rw_exit(&inject_lock); - - return (ret); -} - -/* - * Simulate hardware that ignores cache flushes. For requested number - * of seconds nix the actual writing to disk. 
- */ -void -zio_handle_ignored_writes(zio_t *zio) -{ - inject_handler_t *handler; - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa || - handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) - continue; - - /* - * Positive duration implies # of seconds, negative - * a number of txgs - */ - if (handler->zi_record.zi_timer == 0) { - if (handler->zi_record.zi_duration > 0) - handler->zi_record.zi_timer = ddi_get_lbolt64(); - else - handler->zi_record.zi_timer = zio->io_txg; - } - - /* Have a "problem" writing 60% of the time */ - if (spa_get_random(100) < 60) - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; - break; - } - - rw_exit(&inject_lock); -} - -void -spa_handle_ignored_writes(spa_t *spa) -{ - inject_handler_t *handler; - - if (zio_injection_enabled == 0) - return; - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (spa != handler->zi_spa || - handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) - continue; - - if (handler->zi_record.zi_duration > 0) { - VERIFY(handler->zi_record.zi_timer == 0 || - handler->zi_record.zi_timer + - handler->zi_record.zi_duration * hz > - ddi_get_lbolt64()); - } else { - /* duration is negative so the subtraction here adds */ - VERIFY(handler->zi_record.zi_timer == 0 || - handler->zi_record.zi_timer - - handler->zi_record.zi_duration >= - spa_syncing_txg(spa)); - } - } - - rw_exit(&inject_lock); -} - -hrtime_t -zio_handle_io_delay(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - inject_handler_t *min_handler = NULL; - hrtime_t min_target = 0; - - rw_enter(&inject_lock, RW_READER); - - /* - * inject_delay_count is a subset of zio_injection_enabled that - * is only incremented for delay handlers. 
These checks are - * mainly added to remind the reader why we're not explicitly - * checking zio_injection_enabled like the other functions. - */ - IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); - IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); - - /* - * If there aren't any inject delay handlers registered, then we - * can short circuit and simply return 0 here. A value of zero - * informs zio_delay_interrupt() that this request should not be - * delayed. This short circuit keeps us from acquiring the - * inject_delay_mutex unnecessarily. - */ - if (inject_delay_count == 0) { - rw_exit(&inject_lock); - return (0); - } - - /* - * Each inject handler has a number of "lanes" associated with - * it. Each lane is able to handle requests independently of one - * another, and at a latency defined by the inject handler - * record's zi_timer field. Thus if a handler in configured with - * a single lane with a 10ms latency, it will delay requests - * such that only a single request is completed every 10ms. So, - * if more than one request is attempted per each 10ms interval, - * the average latency of the requests will be greater than - * 10ms; but if only a single request is submitted each 10ms - * interval the average latency will be 10ms. - * - * We need to acquire this mutex to prevent multiple concurrent - * threads being assigned to the same lane of a given inject - * handler. The mutex allows us to perform the following two - * operations atomically: - * - * 1. determine the minimum handler and minimum target - * value of all the possible handlers - * 2. update that minimum handler's lane array - * - * Without atomicity, two (or more) threads could pick the same - * lane in step (1), and then conflict with each other in step - * (2). This could allow a single lane handler to process - * multiple requests simultaneously, which shouldn't be possible. 
- */ - mutex_enter(&inject_delay_mtx); - - for (inject_handler_t *handler = list_head(&inject_handlers); - handler != NULL; handler = list_next(&inject_handlers, handler)) { - if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) - continue; - - if (vd->vdev_guid != handler->zi_record.zi_guid) - continue; - - /* - * Defensive; should never happen as the array allocation - * occurs prior to inserting this handler on the list. - */ - ASSERT3P(handler->zi_lanes, !=, NULL); - - /* - * This should never happen, the zinject command should - * prevent a user from setting an IO delay with zero lanes. - */ - ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); - - ASSERT3U(handler->zi_record.zi_nlanes, >, - handler->zi_next_lane); - - /* - * We want to issue this IO to the lane that will become - * idle the soonest, so we compare the soonest this - * specific handler can complete the IO with all other - * handlers, to find the lowest value of all possible - * lanes. We then use this lane to submit the request. - * - * Since each handler has a constant value for its - * delay, we can just use the "next" lane for that - * handler; as it will always be the lane with the - * lowest value for that particular handler (i.e. the - * lane that will become idle the soonest). This saves a - * scan of each handler's lanes array. - * - * There's two cases to consider when determining when - * this specific IO request should complete. If this - * lane is idle, we want to "submit" the request now so - * it will complete after zi_timer milliseconds. Thus, - * we set the target to now + zi_timer. - * - * If the lane is busy, we want this request to complete - * zi_timer milliseconds after the lane becomes idle. - * Since the 'zi_lanes' array holds the time at which - * each lane will become idle, we use that value to - * determine when this request should complete. 
- */ - hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); - hrtime_t busy = handler->zi_record.zi_timer + - handler->zi_lanes[handler->zi_next_lane]; - hrtime_t target = MAX(idle, busy); - - if (min_handler == NULL) { - min_handler = handler; - min_target = target; - continue; - } - - ASSERT3P(min_handler, !=, NULL); - ASSERT3U(min_target, !=, 0); - - /* - * We don't yet increment the "next lane" variable since - * we still might find a lower value lane in another - * handler during any remaining iterations. Once we're - * sure we've selected the absolute minimum, we'll claim - * the lane and increment the handler's "next lane" - * field below. - */ - - if (target < min_target) { - min_handler = handler; - min_target = target; - } - } - - /* - * 'min_handler' will be NULL if no IO delays are registered for - * this vdev, otherwise it will point to the handler containing - * the lane that will become idle the soonest. - */ - if (min_handler != NULL) { - ASSERT3U(min_target, !=, 0); - min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; - - /* - * If we've used all possible lanes for this handler, - * loop back and start using the first lane again; - * otherwise, just increment the lane index. - */ - min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % - min_handler->zi_record.zi_nlanes; - } - - mutex_exit(&inject_delay_mtx); - rw_exit(&inject_lock); - - return (min_target); -} - -/* - * Create a new handler for the given record. We add it to the list, adding - * a reference to the spa_t in the process. We increment zio_injection_enabled, - * which is the switch to trigger all fault injection. - */ -int -zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) -{ - inject_handler_t *handler; - int error; - spa_t *spa; - - /* - * If this is pool-wide metadata, make sure we unload the corresponding - * spa_t, so that the next attempt to load it will trigger the fault. - * We call spa_reset() to unload the pool appropriately. 
- */ - if (flags & ZINJECT_UNLOAD_SPA) - if ((error = spa_reset(name)) != 0) - return (error); - - if (record->zi_cmd == ZINJECT_DELAY_IO) { - /* - * A value of zero for the number of lanes or for the - * delay time doesn't make sense. - */ - if (record->zi_timer == 0 || record->zi_nlanes == 0) - return (SET_ERROR(EINVAL)); - - /* - * The number of lanes is directly mapped to the size of - * an array used by the handler. Thus, to ensure the - * user doesn't trigger an allocation that's "too large" - * we cap the number of lanes here. - */ - if (record->zi_nlanes >= UINT16_MAX) - return (SET_ERROR(EINVAL)); - } - - if (!(flags & ZINJECT_NULL)) { - /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. - */ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); - - handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; - handler->zi_record = *record; - - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - handler->zi_lanes = kmem_zalloc( - sizeof (*handler->zi_lanes) * - handler->zi_record.zi_nlanes, KM_SLEEP); - handler->zi_next_lane = 0; - } else { - handler->zi_lanes = NULL; - handler->zi_next_lane = 0; - } - - rw_enter(&inject_lock, RW_WRITER); - - /* - * We can't move this increment into the conditional - * above because we need to hold the RW_WRITER lock of - * inject_lock, and we don't want to hold that while - * allocating the handler's zi_lanes array. - */ - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - ASSERT3S(inject_delay_count, >=, 0); - inject_delay_count++; - ASSERT3S(inject_delay_count, >, 0); - } - - *id = handler->zi_id = inject_next_id++; - list_insert_tail(&inject_handlers, handler); - atomic_inc_32(&zio_injection_enabled); - - rw_exit(&inject_lock); - } - - /* - * Flush the ARC, so that any attempts to read this data will end up - * going to the ZIO layer. 
Note that this is a little overkill, but - * we don't have the necessary ARC interfaces to do anything else, and - * fault injection isn't a performance critical path. - */ - if (flags & ZINJECT_FLUSH_ARC) - /* - * We must use FALSE to ensure arc_flush returns, since - * we're not preventing concurrent ARC insertions. - */ - arc_flush(NULL, FALSE); - - return (0); -} - -/* - * Returns the next record with an ID greater than that supplied to the - * function. Used to iterate over all handlers in the system. - */ -int -zio_inject_list_next(int *id, char *name, size_t buflen, - zinject_record_t *record) -{ - inject_handler_t *handler; - int ret; - - mutex_enter(&spa_namespace_lock); - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) - if (handler->zi_id > *id) - break; - - if (handler) { - *record = handler->zi_record; - *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); - ret = 0; - } else { - ret = SET_ERROR(ENOENT); - } - - rw_exit(&inject_lock); - mutex_exit(&spa_namespace_lock); - - return (ret); -} - -/* - * Clear the fault handler with the given identifier, or return ENOENT if none - * exists. 
- */ -int -zio_clear_fault(int id) -{ - inject_handler_t *handler; - - rw_enter(&inject_lock, RW_WRITER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) - if (handler->zi_id == id) - break; - - if (handler == NULL) { - rw_exit(&inject_lock); - return (SET_ERROR(ENOENT)); - } - - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - ASSERT3S(inject_delay_count, >, 0); - inject_delay_count--; - ASSERT3S(inject_delay_count, >=, 0); - } - - list_remove(&inject_handlers, handler); - rw_exit(&inject_lock); - - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - ASSERT3P(handler->zi_lanes, !=, NULL); - kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * - handler->zi_record.zi_nlanes); - } else { - ASSERT3P(handler->zi_lanes, ==, NULL); - } - - spa_inject_delref(handler->zi_spa); - kmem_free(handler, sizeof (inject_handler_t)); - atomic_dec_32(&zio_injection_enabled); - - return (0); -} - -void -zio_inject_init(void) -{ - rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&inject_handlers, sizeof (inject_handler_t), - offsetof(inject_handler_t, zi_link)); -} - -void -zio_inject_fini(void) -{ - list_destroy(&inject_handlers); - mutex_destroy(&inject_delay_mtx); - rw_destroy(&inject_lock); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c deleted file mode 100644 index 13c5673fbe26..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Zero-length encoding. This is a fast and simple algorithm to eliminate - * runs of zeroes. Each chunk of compressed data begins with a length byte, b. - * If b < n (where n is the compression parameter) then the next b + 1 bytes - * are literal values. If b >= n then the next (256 - b + 1) bytes are zero. - */ -#include -#include - -size_t -zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *s_end = src + s_len; - uchar_t *d_end = dst + d_len; - - while (src < s_end && dst < d_end - 1) { - uchar_t *first = src; - uchar_t *len = dst++; - if (src[0] == 0) { - uchar_t *last = src + (256 - n); - while (src < MIN(last, s_end) && src[0] == 0) - src++; - *len = src - first - 1 + n; - } else { - uchar_t *last = src + n; - if (d_end - dst < n) - break; - while (src < MIN(last, s_end) - 1 && (src[0] | src[1])) - *dst++ = *src++; - if (src[0]) - *dst++ = *src++; - *len = src - first - 1; - } - } - return (src == s_end ? 
dst - (uchar_t *)d_start : s_len); -} - -int -zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *s_end = src + s_len; - uchar_t *d_end = dst + d_len; - - while (src < s_end && dst < d_end) { - int len = 1 + *src++; - if (len <= n) { - while (len-- != 0) - *dst++ = *src++; - } else { - len -= n; - while (len-- != 0) - *dst++ = 0; - } - } - return (dst == d_end ? 0 : -1); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c deleted file mode 100644 index ec333e54d2a8..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. - * Copyright 2016 The MathWorks, Inc. All rights reserved. - */ - -/* - * A Zero Reference Lock (ZRL) is a reference count that can lock out new - * references only when the count is zero and only without waiting if the count - * is not already zero. 
It is similar to a read-write lock in that it allows - * multiple readers and only a single writer, but it does not allow a writer to - * block while waiting for readers to exit, and therefore the question of - * reader/writer priority is moot (no WRWANT bit). Since the equivalent of - * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it - * is perfectly safe for the same reader to acquire the same lock multiple - * times. The fact that a ZRL is reentrant for readers (through multiple calls - * to zrl_add()) makes it convenient for determining whether something is - * actively referenced without the fuss of flagging lock ownership across - * function calls. - */ -#include - -/* - * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is - * treated as zero references. - */ -#define ZRL_LOCKED -1 -#define ZRL_DESTROYED -2 - -void -zrl_init(zrlock_t *zrl) -{ - mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); - zrl->zr_refcount = 0; - cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); -#ifdef ZFS_DEBUG - zrl->zr_owner = NULL; - zrl->zr_caller = NULL; -#endif -} - -void -zrl_destroy(zrlock_t *zrl) -{ - ASSERT0(zrl->zr_refcount); - - mutex_destroy(&zrl->zr_mtx); - zrl->zr_refcount = ZRL_DESTROYED; - cv_destroy(&zrl->zr_cv); -} - -void -zrl_add_impl(zrlock_t *zrl, const char *zc) -{ - for (;;) { - uint32_t n = (uint32_t)zrl->zr_refcount; - while (n != ZRL_LOCKED) { - uint32_t cas = atomic_cas_32( - (uint32_t *)&zrl->zr_refcount, n, n + 1); - if (cas == n) { - ASSERT3S((int32_t)n, >=, 0); -#ifdef ZFS_DEBUG - if (zrl->zr_owner == curthread) { - DTRACE_PROBE2(zrlock__reentry, - zrlock_t *, zrl, uint32_t, n); - } - zrl->zr_owner = curthread; - zrl->zr_caller = zc; -#endif - return; - } - n = cas; - } - - mutex_enter(&zrl->zr_mtx); - while (zrl->zr_refcount == ZRL_LOCKED) { - cv_wait(&zrl->zr_cv, &zrl->zr_mtx); - } - mutex_exit(&zrl->zr_mtx); - } -} - -void -zrl_remove(zrlock_t *zrl) -{ - uint32_t n; - -#ifdef ZFS_DEBUG - if 
(zrl->zr_owner == curthread) { - zrl->zr_owner = NULL; - zrl->zr_caller = NULL; - } -#endif - n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); - ASSERT3S((int32_t)n, >=, 0); -} - -int -zrl_tryenter(zrlock_t *zrl) -{ - uint32_t n = (uint32_t)zrl->zr_refcount; - - if (n == 0) { - uint32_t cas = atomic_cas_32( - (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); - if (cas == 0) { -#ifdef ZFS_DEBUG - ASSERT3P(zrl->zr_owner, ==, NULL); - zrl->zr_owner = curthread; -#endif - return (1); - } - } - - ASSERT3S((int32_t)n, >, ZRL_DESTROYED); - - return (0); -} - -void -zrl_exit(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED); - - mutex_enter(&zrl->zr_mtx); -#ifdef ZFS_DEBUG - ASSERT3P(zrl->zr_owner, ==, curthread); - zrl->zr_owner = NULL; - membar_producer(); /* make sure the owner store happens first */ -#endif - zrl->zr_refcount = 0; - cv_broadcast(&zrl->zr_cv); - mutex_exit(&zrl->zr_mtx); -} - -int -zrl_refcount(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); - - int n = (int)zrl->zr_refcount; - return (n <= 0 ? 0 : n); -} - -int -zrl_is_zero(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); - - return (zrl->zr_refcount <= 0); -} - -int -zrl_is_locked(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); - - return (zrl->zr_refcount == ZRL_LOCKED); -} - -#ifdef ZFS_DEBUG -kthread_t * -zrl_owner(zrlock_t *zrl) -{ - return (zrl->zr_owner); -} -#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c deleted file mode 100644 index 76a9fa122b26..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c +++ /dev/null @@ -1,431 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. 
- * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017, 2019 by Delphix. All rights reserved. - */ - -/* - * ZTHR Infrastructure - * =================== - * - * ZTHR threads are used for isolated operations that span multiple txgs - * within a SPA. They generally exist from SPA creation/loading and until - * the SPA is exported/destroyed. The ideal requirements for an operation - * to be modeled with a zthr are the following: - * - * 1] The operation needs to run over multiple txgs. - * 2] There is be a single point of reference in memory or on disk that - * indicates whether the operation should run/is running or has - * stopped. - * - * If the operation satisfies the above then the following rules guarantee - * a certain level of correctness: - * - * 1] Any thread EXCEPT the zthr changes the work indicator from stopped - * to running but not the opposite. - * 2] Only the zthr can change the work indicator from running to stopped - * (e.g. when it is done) but not the opposite. - * - * This way a normal zthr cycle should go like this: - * - * 1] An external thread changes the work indicator from stopped to - * running and wakes up the zthr. - * 2] The zthr wakes up, checks the indicator and starts working. - * 3] When the zthr is done, it changes the indicator to stopped, allowing - * a new cycle to start. - * - * Besides being awakened by other threads, a zthr can be configured - * during creation to wakeup on it's own after a specified interval - * [see zthr_create_timer()]. - * - * Note: ZTHR threads are NOT a replacement for generic threads! Please - * ensure that they fit your use-case well before using them. 
- * - * == ZTHR creation - * - * Every zthr needs three inputs to start running: - * - * 1] A user-defined checker function (checkfunc) that decides whether - * the zthr should start working or go to sleep. The function should - * return TRUE when the zthr needs to work or FALSE to let it sleep, - * and should adhere to the following signature: - * boolean_t checkfunc_name(void *args, zthr_t *t); - * - * 2] A user-defined ZTHR function (func) which the zthr executes when - * it is not sleeping. The function should adhere to the following - * signature type: - * void func_name(void *args, zthr_t *t); - * - * 3] A void args pointer that will be passed to checkfunc and func - * implicitly by the infrastructure. - * - * The reason why the above API needs two different functions, - * instead of one that both checks and does the work, has to do with - * the zthr's internal state lock (zthr_state_lock) and the allowed - * cancellation windows. We want to hold the zthr_state_lock while - * running checkfunc but not while running func. This way the zthr - * can be cancelled while doing work and not while checking for work. - * - * To start a zthr: - * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args); - * or - * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func, - * args, max_sleep); - * - * After that you should be able to wakeup, cancel, and resume the - * zthr from another thread using the zthr_pointer. - * - * NOTE: ZTHR threads could potentially wake up spuriously and the - * user should take this into account when writing a checkfunc. - * [see ZTHR state transitions] - * - * == ZTHR cancellation - * - * ZTHR threads must be cancelled when their SPA is being exported - * or when they need to be paused so they don't interfere with other - * operations. 
- * - * To cancel a zthr: - * zthr_cancel(zthr_pointer); - * - * To resume it: - * zthr_resume(zthr_pointer); - * - * A zthr will implicitly check if it has received a cancellation - * signal every time func returns and every time it wakes up [see - * ZTHR state transitions below]. - * - * At times, waiting for the zthr's func to finish its job may take - * time. This may be very time-consuming for some operations that - * need to cancel the SPA's zthrs (e.g spa_export). For this scenario - * the user can explicitly make their ZTHR function aware of incoming - * cancellation signals using zthr_iscancelled(). A common pattern for - * that looks like this: - * - * int - * func_name(void *args, zthr_t *t) - * { - * ... ... - * while (!work_done && !zthr_iscancelled(t)) { - * ... ... - * } - * } - * - * == ZTHR cleanup - * - * Cancelling a zthr doesn't clean up its metadata (internal locks, - * function pointers to func and checkfunc, etc..). This is because - * we want to keep them around in case we want to resume the execution - * of the zthr later. Similarly for zthrs that exit themselves. - * - * To completely cleanup a zthr, cancel it first to ensure that it - * is not running and then use zthr_destroy(). - * - * == ZTHR state transitions - * - * zthr creation - * + - * | - * | woke up - * | +--------------+ sleep - * | | ^ - * | | | - * | | | FALSE - * | | | - * v v FALSE + - * cancelled? +---------> checkfunc? - * + ^ + - * | | | - * | | | TRUE - * | | | - * | | func returned v - * | +---------------+ func - * | - * | TRUE - * | - * v - * zthr stopped running - * - * == Implementation of ZTHR requests - * - * ZTHR wakeup, cancel, and resume are requests on a zthr to - * change its internal state. Requests on a zthr are serialized - * using the zthr_request_lock, while changes in its internal - * state are protected by the zthr_state_lock. A request will - * first acquire the zthr_request_lock and then immediately - * acquire the zthr_state_lock. 
We do this so that incoming - * requests are serialized using the request lock, while still - * allowing us to use the state lock for thread communication - * via zthr_cv. - */ - -#include -#include - -struct zthr { - /* running thread doing the work */ - kthread_t *zthr_thread; - - /* lock protecting internal data & invariants */ - kmutex_t zthr_state_lock; - - /* mutex that serializes external requests */ - kmutex_t zthr_request_lock; - - /* notification mechanism for requests */ - kcondvar_t zthr_cv; - - /* flag set to true if we are canceling the zthr */ - boolean_t zthr_cancel; - - /* - * maximum amount of time that the zthr is spent sleeping; - * if this is 0, the thread doesn't wake up until it gets - * signaled. - */ - hrtime_t zthr_wait_time; - - /* consumer-provided callbacks & data */ - zthr_checkfunc_t *zthr_checkfunc; - zthr_func_t *zthr_func; - void *zthr_arg; -}; - -static void -zthr_procedure(void *arg) -{ - zthr_t *t = arg; - - mutex_enter(&t->zthr_state_lock); - ASSERT3P(t->zthr_thread, ==, curthread); - - while (!t->zthr_cancel) { - if (t->zthr_checkfunc(t->zthr_arg, t)) { - mutex_exit(&t->zthr_state_lock); - t->zthr_func(t->zthr_arg, t); - mutex_enter(&t->zthr_state_lock); - } else { - /* go to sleep */ - if (t->zthr_wait_time == 0) { - cv_wait(&t->zthr_cv, &t->zthr_state_lock); - } else { - (void) cv_timedwait_hires(&t->zthr_cv, - &t->zthr_state_lock, t->zthr_wait_time, - MSEC2NSEC(1), 0); - } - } - } - - /* - * Clear out the kernel thread metadata and notify the - * zthr_cancel() thread that we've stopped running. - */ - t->zthr_thread = NULL; - t->zthr_cancel = B_FALSE; - cv_broadcast(&t->zthr_cv); - - mutex_exit(&t->zthr_state_lock); - thread_exit(); -} - -zthr_t * -zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) -{ - return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0)); -} - -/* - * Create a zthr with specified maximum sleep time. 
If the time - * in sleeping state exceeds max_sleep, a wakeup(do the check and - * start working if required) will be triggered. - */ -zthr_t * -zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, - void *arg, hrtime_t max_sleep) -{ - zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP); - mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); - - mutex_enter(&t->zthr_state_lock); - t->zthr_checkfunc = checkfunc; - t->zthr_func = func; - t->zthr_arg = arg; - t->zthr_wait_time = max_sleep; - - t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, - 0, &p0, TS_RUN, minclsyspri); - mutex_exit(&t->zthr_state_lock); - - return (t); -} - -void -zthr_destroy(zthr_t *t) -{ - ASSERT(!MUTEX_HELD(&t->zthr_state_lock)); - ASSERT(!MUTEX_HELD(&t->zthr_request_lock)); - VERIFY3P(t->zthr_thread, ==, NULL); - mutex_destroy(&t->zthr_request_lock); - mutex_destroy(&t->zthr_state_lock); - cv_destroy(&t->zthr_cv); - kmem_free(t, sizeof (*t)); -} - -/* - * Wake up the zthr if it is sleeping. If the thread has been - * cancelled that does nothing. - */ -void -zthr_wakeup(zthr_t *t) -{ - mutex_enter(&t->zthr_request_lock); - mutex_enter(&t->zthr_state_lock); - - /* - * There are 4 states that we can find the zthr when issuing - * this broadcast: - * - * [1] The common case of the thread being asleep, at which - * point the broadcast will wake it up. - * [2] The thread has been cancelled. Waking up a cancelled - * thread is a no-op. Any work that is still left to be - * done should be handled the next time the thread is - * resumed. - * [3] The thread is doing work and is already up, so this - * is basically a no-op. - * [4] The thread was just created/resumed, in which case the - * behavior is similar to [3]. 
- */ - cv_broadcast(&t->zthr_cv); - - mutex_exit(&t->zthr_state_lock); - mutex_exit(&t->zthr_request_lock); -} - -/* - * Sends a cancel request to the zthr and blocks until the zthr is - * cancelled. If the zthr is not running (e.g. has been cancelled - * already), this is a no-op. - */ -void -zthr_cancel(zthr_t *t) -{ - mutex_enter(&t->zthr_request_lock); - mutex_enter(&t->zthr_state_lock); - - /* - * Since we are holding the zthr_state_lock at this point - * we can find the state in one of the following 4 states: - * - * [1] The thread has already been cancelled, therefore - * there is nothing for us to do. - * [2] The thread is sleeping, so we broadcast the CV first - * to wake it up and then we set the flag and we are - * waiting for it to exit. - * [3] The thread is doing work, in which case we just set - * the flag and wait for it to finish. - * [4] The thread was just created/resumed, in which case - * the behavior is similar to [3]. - * - * Since requests are serialized, by the time that we get - * control back we expect that the zthr is cancelled and - * not running anymore. - */ - if (t->zthr_thread != NULL) { - t->zthr_cancel = B_TRUE; - - /* broadcast in case the zthr is sleeping */ - cv_broadcast(&t->zthr_cv); - - while (t->zthr_thread != NULL) - cv_wait(&t->zthr_cv, &t->zthr_state_lock); - - ASSERT(!t->zthr_cancel); - } - - mutex_exit(&t->zthr_state_lock); - mutex_exit(&t->zthr_request_lock); -} - -/* - * Sends a resume request to the supplied zthr. If the zthr is - * already running this is a no-op. - */ -void -zthr_resume(zthr_t *t) -{ - mutex_enter(&t->zthr_request_lock); - mutex_enter(&t->zthr_state_lock); - - ASSERT3P(&t->zthr_checkfunc, !=, NULL); - ASSERT3P(&t->zthr_func, !=, NULL); - ASSERT(!t->zthr_cancel); - - /* - * There are 4 states that we find the zthr in at this point - * given the locks that we hold: - * - * [1] The zthr was cancelled, so we spawn a new thread for - * the zthr (common case). 
- * [2] The zthr is running at which point this is a no-op. - * [3] The zthr is sleeping at which point this is a no-op. - * [4] The zthr was just spawned at which point this is a - * no-op. - */ - if (t->zthr_thread == NULL) { - t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, - 0, &p0, TS_RUN, minclsyspri); - } - - mutex_exit(&t->zthr_state_lock); - mutex_exit(&t->zthr_request_lock); -} - -/* - * This function is intended to be used by the zthr itself - * (specifically the zthr_func callback provided) to check - * if another thread has signaled it to stop running before - * doing some expensive operation. - * - * returns TRUE if we are in the middle of trying to cancel - * this thread. - * - * returns FALSE otherwise. - */ -boolean_t -zthr_iscancelled(zthr_t *t) -{ - ASSERT3P(t->zthr_thread, ==, curthread); - - /* - * The majority of the functions here grab zthr_request_lock - * first and then zthr_state_lock. This function only grabs - * the zthr_state_lock. That is because this function should - * only be called from the zthr_func to check if someone has - * issued a zthr_cancel() on the thread. If there is a zthr_cancel() - * happening concurrently, attempting to grab the request lock - * here would result in a deadlock. - * - * By grabbing only the zthr_state_lock this function is allowed - * to run concurrently with a zthr_cancel() request. - */ - mutex_enter(&t->zthr_state_lock); - boolean_t cancelled = t->zthr_cancel; - mutex_exit(&t->zthr_state_lock); - return (cancelled); -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c deleted file mode 100644 index f68670c956a1..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ /dev/null @@ -1,3347 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2006-2010 Pawel Jakub Dawidek - * All rights reserved. - * - * Portions Copyright 2010 Robert Milkowski - * - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2016 Actifio, Inc. All rights reserved. - */ - -/* Portions Copyright 2011 Martin Matuska */ - -/* - * ZFS volume emulation driver. - * - * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. - * Volumes are accessed through the symbolic links named: - * - * /dev/zvol/dsk// - * /dev/zvol/rdsk// - * - * These links are created by the /dev filesystem (sdev_zvolops.c). - * Volumes are persistent through reboot. No user command needs to be - * run before opening and using a device. - * - * FreeBSD notes. - * On FreeBSD ZVOLs are simply GEOM providers like any other storage device - * in the system. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "zfs_namecheck.h" - -#ifndef illumos -struct g_class zfs_zvol_class = { - .name = "ZFS::ZVOL", - .version = G_VERSION, -}; - -DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); - -#endif -void *zfsdev_state; -static char *zvol_tag = "zvol_tag"; - -#define ZVOL_DUMPSIZE "dumpsize" - -/* - * This lock protects the zfsdev_state structure from being modified - * while it's being used, e.g. an open that comes in before a create - * finishes. It also protects temporary opens of the dataset so that, - * e.g., an open doesn't get a spurious EBUSY. - */ -#ifdef illumos -kmutex_t zfsdev_state_lock; -#else -/* - * In FreeBSD we've replaced the upstream zfsdev_state_lock with the - * spa_namespace_lock in the ZVOL code. - */ -#define zfsdev_state_lock spa_namespace_lock -#endif -static uint32_t zvol_minors; - -#ifndef illumos -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS VOLUME"); -static int volmode = ZFS_VOLMODE_GEOM; -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0, - "Expose as GEOM providers (1), device files (2) or neither"); -static boolean_t zpool_on_zvol = B_FALSE; -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, - "Allow zpools to use zvols as vdevs (DANGEROUS)"); - -#endif -typedef struct zvol_extent { - list_node_t ze_node; - dva_t ze_dva; /* dva associated with this extent */ - uint64_t ze_nblks; /* number of blocks in extent */ -} zvol_extent_t; - -/* - * The in-core state of each volume. 
- */ -typedef struct zvol_state { -#ifndef illumos - LIST_ENTRY(zvol_state) zv_links; -#endif - char zv_name[MAXPATHLEN]; /* pool/dd name */ - uint64_t zv_volsize; /* amount of space we advertise */ - uint64_t zv_volblocksize; /* volume block size */ -#ifdef illumos - minor_t zv_minor; /* minor number */ -#else - struct cdev *zv_dev; /* non-GEOM device */ - struct g_provider *zv_provider; /* GEOM provider */ -#endif - uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_flags; /* readonly, dumpified, etc. */ - objset_t *zv_objset; /* objset handle */ -#ifdef illumos - uint32_t zv_open_count[OTYPCNT]; /* open counts */ -#endif - uint32_t zv_total_opens; /* total open count */ - uint32_t zv_sync_cnt; /* synchronous open count */ - zilog_t *zv_zilog; /* ZIL handle */ - list_t zv_extents; /* List of extents for dump */ - rangelock_t zv_rangelock; - dnode_t *zv_dn; /* dnode hold */ -#ifndef illumos - int zv_state; - int zv_volmode; /* Provide GEOM or cdev */ - struct bio_queue_head zv_queue; - struct mtx zv_queue_mtx; /* zv_queue mutex */ -#endif -} zvol_state_t; - -typedef enum { - ZVOL_ASYNC_CREATE_MINORS, - ZVOL_ASYNC_REMOVE_MINORS, - ZVOL_ASYNC_RENAME_MINORS, - ZVOL_ASYNC_MAX -} zvol_async_op_t; - -typedef struct { - zvol_async_op_t op; - char pool[ZFS_MAX_DATASET_NAME_LEN]; - char name1[ZFS_MAX_DATASET_NAME_LEN]; - char name2[ZFS_MAX_DATASET_NAME_LEN]; -} zvol_task_t; - -#ifndef illumos -static LIST_HEAD(, zvol_state) all_zvols; -#endif -/* - * zvol specific flags - */ -#define ZVOL_RDONLY 0x1 -#define ZVOL_DUMPIFIED 0x2 -#define ZVOL_EXCL 0x4 -#define ZVOL_WCE 0x8 - -/* - * zvol maximum transfer in one DMU tx. - */ -int zvol_maxphys = DMU_MAX_ACCESS/2; - -/* - * Toggle unmap functionality. - */ -boolean_t zvol_unmap_enabled = B_TRUE; - -/* - * If true, unmaps requested as synchronous are executed synchronously, - * otherwise all unmaps are asynchronous. 
- */ -boolean_t zvol_unmap_sync_enabled = B_FALSE; - -#ifndef illumos -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, - &zvol_unmap_enabled, 0, - "Enable UNMAP functionality"); - -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN, - &zvol_unmap_sync_enabled, 0, - "UNMAPs requested as sync are executed synchronously"); - -static d_open_t zvol_d_open; -static d_close_t zvol_d_close; -static d_read_t zvol_read; -static d_write_t zvol_write; -static d_ioctl_t zvol_d_ioctl; -static d_strategy_t zvol_strategy; - -static struct cdevsw zvol_cdevsw = { - .d_version = D_VERSION, - .d_open = zvol_d_open, - .d_close = zvol_d_close, - .d_read = zvol_read, - .d_write = zvol_write, - .d_ioctl = zvol_d_ioctl, - .d_strategy = zvol_strategy, - .d_name = "zvol", - .d_flags = D_DISK | D_TRACKCLOSE, -}; - -static void zvol_geom_run(zvol_state_t *zv); -static void zvol_geom_destroy(zvol_state_t *zv); -static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); -static void zvol_geom_start(struct bio *bp); -static void zvol_geom_worker(void *arg); -static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, - uint64_t len, boolean_t sync); -#endif /* !illumos */ - -extern int zfs_set_prop_nvlist(const char *, zprop_source_t, - nvlist_t *, nvlist_t *); -static int zvol_remove_zv(zvol_state_t *); -static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, - struct lwb *lwb, zio_t *zio); -static int zvol_dumpify(zvol_state_t *zv); -static int zvol_dump_fini(zvol_state_t *zv); -static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); - -static void -zvol_size_changed(zvol_state_t *zv, uint64_t volsize) -{ -#ifdef illumos - dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor); - - zv->zv_volsize = volsize; - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Size", volsize) == DDI_SUCCESS); - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Nblocks", lbtodb(volsize)) == DDI_SUCCESS); - - /* Notify specfs to 
invalidate the cached size */ - spec_size_invalidate(dev, VBLK); - spec_size_invalidate(dev, VCHR); -#else /* !illumos */ - zv->zv_volsize = volsize; - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct g_provider *pp; - - pp = zv->zv_provider; - if (pp == NULL) - return; - g_topology_lock(); - - /* - * Do not invoke resize event when initial size was zero. - * ZVOL initializes the size on first open, this is not - * real resizing. - */ - if (pp->mediasize == 0) - pp->mediasize = zv->zv_volsize; - else - g_resize_provider(pp, zv->zv_volsize); - g_topology_unlock(); - } -#endif /* illumos */ -} - -int -zvol_check_volsize(uint64_t volsize, uint64_t blocksize) -{ - if (volsize == 0) - return (SET_ERROR(EINVAL)); - - if (volsize % blocksize != 0) - return (SET_ERROR(EINVAL)); - -#ifdef _ILP32 - if (volsize - 1 > SPEC_MAXOFFSET_T) - return (SET_ERROR(EOVERFLOW)); -#endif - return (0); -} - -int -zvol_check_volblocksize(uint64_t volblocksize) -{ - if (volblocksize < SPA_MINBLOCKSIZE || - volblocksize > SPA_OLD_MAXBLOCKSIZE || - !ISP2(volblocksize)) - return (SET_ERROR(EDOM)); - - return (0); -} - -int -zvol_get_stats(objset_t *os, nvlist_t *nv) -{ - int error; - dmu_object_info_t doi; - uint64_t val; - - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); - if (error) - return (error); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); - - error = dmu_object_info(os, ZVOL_OBJ, &doi); - - if (error == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, - doi.doi_data_block_size); - } - - return (error); -} - -static zvol_state_t * -zvol_minor_lookup(const char *name) -{ -#ifdef illumos - minor_t minor; -#endif - zvol_state_t *zv; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - -#ifdef illumos - for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - continue; -#else - LIST_FOREACH(zv, &all_zvols, zv_links) { -#endif - if (strcmp(zv->zv_name, name) == 0) - return (zv); - } - - return 
(NULL); -} - -/* extent mapping arg */ -struct maparg { - zvol_state_t *ma_zv; - uint64_t ma_blks; -}; - -/*ARGSUSED*/ -static int -zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct maparg *ma = arg; - zvol_extent_t *ze; - int bs = ma->ma_zv->zv_volblocksize; - - if (bp == NULL || BP_IS_HOLE(bp) || - zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) - return (0); - - VERIFY(!BP_IS_EMBEDDED(bp)); - - VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); - ma->ma_blks++; - - /* Abort immediately if we have encountered gang blocks */ - if (BP_IS_GANG(bp)) - return (SET_ERROR(EFRAGS)); - - /* - * See if the block is at the end of the previous extent. - */ - ze = list_tail(&ma->ma_zv->zv_extents); - if (ze && - DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) && - DVA_GET_OFFSET(BP_IDENTITY(bp)) == - DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) { - ze->ze_nblks++; - return (0); - } - - dprintf_bp(bp, "%s", "next blkptr:"); - - /* start a new extent */ - ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP); - ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ - ze->ze_nblks = 1; - list_insert_tail(&ma->ma_zv->zv_extents, ze); - return (0); -} - -static void -zvol_free_extents(zvol_state_t *zv) -{ - zvol_extent_t *ze; - - while (ze = list_head(&zv->zv_extents)) { - list_remove(&zv->zv_extents, ze); - kmem_free(ze, sizeof (zvol_extent_t)); - } -} - -static int -zvol_get_lbas(zvol_state_t *zv) -{ - objset_t *os = zv->zv_objset; - struct maparg ma; - int err; - - ma.ma_zv = zv; - ma.ma_blks = 0; - zvol_free_extents(zv); - - /* commit any in-flight changes before traversing the dataset */ - txg_wait_synced(dmu_objset_pool(os), 0); - err = traverse_dataset(dmu_objset_ds(os), 0, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); - if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { - zvol_free_extents(zv); - return (err ? 
err : EIO); - } - - return (0); -} - -/* ARGSUSED */ -void -zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) -{ - zfs_creat_t *zct = arg; - nvlist_t *nvprops = zct->zct_props; - int error; - uint64_t volblocksize, volsize; - - VERIFY(nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); - if (nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) - volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); - - /* - * These properties must be removed from the list so the generic - * property setting step won't apply to them. - */ - VERIFY(nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); - (void) nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); - - error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); - ASSERT(error == 0); -} - -/* - * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we - * implement DKIOCFREE/free-long-range. 
- */ -static int -zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) -{ - zvol_state_t *zv = arg1; - lr_truncate_t *lr = arg2; - uint64_t offset, length; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - offset = lr->lr_offset; - length = lr->lr_length; - - return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); -} - -/* - * Replay a TX_WRITE ZIL transaction that didn't get committed - * after a system failure - */ -static int -zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) -{ - zvol_state_t *zv = arg1; - lr_write_t *lr = arg2; - objset_t *os = zv->zv_objset; - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - uint64_t offset, length; - dmu_tx_t *tx; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - offset = lr->lr_offset; - length = lr->lr_length; - - /* If it's a dmu_sync() block, write the whole block */ - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); - if (length < blocksize) { - offset -= offset % blocksize; - length = blocksize; - } - } - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, offset, length, data, tx); - dmu_tx_commit(tx); - } - - return (error); -} - -/* ARGSUSED */ -static int -zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) -{ - return (SET_ERROR(ENOTSUP)); -} - -/* - * Callback vectors for replaying records. - * Only TX_WRITE and TX_TRUNCATE are needed for zvol. 
- */ -zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { - zvol_replay_err, /* 0 no such transaction type */ - zvol_replay_err, /* TX_CREATE */ - zvol_replay_err, /* TX_MKDIR */ - zvol_replay_err, /* TX_MKXATTR */ - zvol_replay_err, /* TX_SYMLINK */ - zvol_replay_err, /* TX_REMOVE */ - zvol_replay_err, /* TX_RMDIR */ - zvol_replay_err, /* TX_LINK */ - zvol_replay_err, /* TX_RENAME */ - zvol_replay_write, /* TX_WRITE */ - zvol_replay_truncate, /* TX_TRUNCATE */ - zvol_replay_err, /* TX_SETATTR */ - zvol_replay_err, /* TX_ACL */ - zvol_replay_err, /* TX_CREATE_ACL */ - zvol_replay_err, /* TX_CREATE_ATTR */ - zvol_replay_err, /* TX_CREATE_ACL_ATTR */ - zvol_replay_err, /* TX_MKDIR_ACL */ - zvol_replay_err, /* TX_MKDIR_ATTR */ - zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ - zvol_replay_err, /* TX_WRITE2 */ -}; - -#ifdef illumos -int -zvol_name2minor(const char *name, minor_t *minor) -{ - zvol_state_t *zv; - - mutex_enter(&zfsdev_state_lock); - zv = zvol_minor_lookup(name); - if (minor && zv) - *minor = zv->zv_minor; - mutex_exit(&zfsdev_state_lock); - return (zv ? 0 : -1); -} -#endif /* illumos */ - -/* - * Create a minor node (plus a whole lot more) for the specified volume. 
- */ -static int -zvol_create_minor(const char *name) -{ - zfs_soft_state_t *zs; - zvol_state_t *zv; - objset_t *os; -#ifdef illumos - dmu_object_info_t doi; - minor_t minor = 0; - char chrbuf[30], blkbuf[30]; -#else - struct g_provider *pp; - struct g_geom *gp; - uint64_t mode; -#endif - int error; - -#ifndef illumos - ZFS_LOG(1, "Creating ZVOL %s...", name); -#endif - - mutex_enter(&zfsdev_state_lock); - - if (zvol_minor_lookup(name) != NULL) { - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EEXIST)); - } - - /* lie and say we're read-only */ - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); - - if (error) { - mutex_exit(&zfsdev_state_lock); - return (error); - } - -#ifdef illumos - if ((minor = zfsdev_minor_alloc()) == 0) { - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - - if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) { - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EAGAIN)); - } - (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, - (char *)name); - - (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); - - if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EAGAIN)); - } - - (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); - - if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_remove_minor_node(zfs_dip, chrbuf); - ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EAGAIN)); - } - - zs = ddi_get_soft_state(zfsdev_state, minor); - zs->zss_type = ZSST_ZVOL; - zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); -#else /* !illumos */ - - zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); - zv->zv_state = 0; - error = 
dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL); - if (error != 0 || mode == ZFS_VOLMODE_DEFAULT) - mode = volmode; - - zv->zv_volmode = mode; - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - g_topology_lock(); - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_geom_start; - gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; - pp->sectorsize = DEV_BSIZE; - pp->mediasize = 0; - pp->private = zv; - - zv->zv_provider = pp; - bioq_init(&zv->zv_queue); - mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct make_dev_args args; - - make_dev_args_init(&args); - args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; - args.mda_devsw = &zvol_cdevsw; - args.mda_cr = NULL; - args.mda_uid = UID_ROOT; - args.mda_gid = GID_OPERATOR; - args.mda_mode = 0640; - args.mda_si_drv2 = zv; - error = make_dev_s(&args, &zv->zv_dev, - "%s/%s", ZVOL_DRIVER, name); - if (error != 0) { - kmem_free(zv, sizeof(*zv)); - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (error); - } - zv->zv_dev->si_iosize_max = MAXPHYS; - } - LIST_INSERT_HEAD(&all_zvols, zv, zv_links); -#endif /* illumos */ - - (void) strlcpy(zv->zv_name, name, MAXPATHLEN); - zv->zv_min_bs = DEV_BSHIFT; -#ifdef illumos - zv->zv_minor = minor; -#endif - zv->zv_objset = os; - if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) - zv->zv_flags |= ZVOL_RDONLY; - rangelock_init(&zv->zv_rangelock, NULL, NULL); - list_create(&zv->zv_extents, sizeof (zvol_extent_t), - offsetof(zvol_extent_t, ze_node)); -#ifdef illumos - /* get and cache the blocksize */ - error = dmu_object_info(os, ZVOL_OBJ, &doi); - ASSERT(error == 0); - zv->zv_volblocksize = doi.doi_data_block_size; -#endif - - if (spa_writeable(dmu_objset_spa(os))) { - if (zil_replay_disable) - zil_destroy(dmu_objset_zil(os), B_FALSE); - else - 
zil_replay(os, zv, zvol_replay_vector); - } - dmu_objset_disown(os, FTAG); - zv->zv_objset = NULL; - - zvol_minors++; - - mutex_exit(&zfsdev_state_lock); -#ifndef illumos - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - zvol_geom_run(zv); - g_topology_unlock(); - } - - ZFS_LOG(1, "ZVOL %s created.", name); -#endif - - return (0); -} - -/* - * Remove minor node for the specified volume. - */ -static int -zvol_remove_zv(zvol_state_t *zv) -{ -#ifdef illumos - char nmbuf[20]; - minor_t minor = zv->zv_minor; -#endif - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - if (zv->zv_total_opens != 0) - return (SET_ERROR(EBUSY)); - -#ifdef illumos - (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor); - ddi_remove_minor_node(zfs_dip, nmbuf); - - (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor); - ddi_remove_minor_node(zfs_dip, nmbuf); -#else - ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); - - LIST_REMOVE(zv, zv_links); - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - g_topology_lock(); - zvol_geom_destroy(zv); - g_topology_unlock(); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - if (zv->zv_dev != NULL) - destroy_dev(zv->zv_dev); - } -#endif - - rangelock_fini(&zv->zv_rangelock); - - kmem_free(zv, sizeof (zvol_state_t)); -#ifdef illumos - ddi_soft_state_free(zfsdev_state, minor); -#endif - zvol_minors--; - return (0); -} - -int -zvol_first_open(zvol_state_t *zv) -{ - dmu_object_info_t doi; - objset_t *os; - uint64_t volsize; - int error; - uint64_t readonly; - - /* lie and say we're read-only */ - error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, - zvol_tag, &os); - if (error) - return (error); - - zv->zv_objset = os; - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - if (error) { - ASSERT(error == 0); - dmu_objset_disown(os, zvol_tag); - return (error); - } - - /* get and cache the blocksize */ - error = dmu_object_info(os, ZVOL_OBJ, &doi); - if (error) { - ASSERT(error == 0); - dmu_objset_disown(os, zvol_tag); - return (error); - } - 
zv->zv_volblocksize = doi.doi_data_block_size; - - error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); - if (error) { - dmu_objset_disown(os, zvol_tag); - return (error); - } - - zvol_size_changed(zv, volsize); - zv->zv_zilog = zil_open(os, zvol_get_data); - - VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, - NULL) == 0); - if (readonly || dmu_objset_is_snapshot(os) || - !spa_writeable(dmu_objset_spa(os))) - zv->zv_flags |= ZVOL_RDONLY; - else - zv->zv_flags &= ~ZVOL_RDONLY; - return (error); -} - -void -zvol_last_close(zvol_state_t *zv) -{ - zil_close(zv->zv_zilog); - zv->zv_zilog = NULL; - - dnode_rele(zv->zv_dn, zvol_tag); - zv->zv_dn = NULL; - - /* - * Evict cached data - */ - if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && - !(zv->zv_flags & ZVOL_RDONLY)) - txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - dmu_objset_evict_dbufs(zv->zv_objset); - - dmu_objset_disown(zv->zv_objset, zvol_tag); - zv->zv_objset = NULL; -} - -#ifdef illumos -int -zvol_prealloc(zvol_state_t *zv) -{ - objset_t *os = zv->zv_objset; - dmu_tx_t *tx; - uint64_t refd, avail, usedobjs, availobjs; - uint64_t resid = zv->zv_volsize; - uint64_t off = 0; - - /* Check the space usage before attempting to allocate the space */ - dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); - if (avail < zv->zv_volsize) - return (SET_ERROR(ENOSPC)); - - /* Free old extents if they exist */ - zvol_free_extents(zv); - - while (resid != 0) { - int error; - uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); - return (error); - } - dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); - dmu_tx_commit(tx); - off += bytes; - resid -= bytes; - } - txg_wait_synced(dmu_objset_pool(os), 0); - - return (0); -} -#endif /* illumos */ - -static int -zvol_update_volsize(objset_t *os, uint64_t 
volsize) -{ - dmu_tx_t *tx; - int error; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, - &volsize, tx); - dmu_tx_commit(tx); - - if (error == 0) - error = dmu_free_long_range(os, - ZVOL_OBJ, volsize, DMU_OBJECT_END); - return (error); -} - -void -zvol_remove_minors_impl(const char *name) -{ -#ifdef illumos - zvol_state_t *zv; - char *namebuf; - minor_t minor; - - namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP); - (void) strncpy(namebuf, name, strlen(name)); - (void) strcat(namebuf, "/"); - mutex_enter(&zfsdev_state_lock); - for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - continue; - if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0) - (void) zvol_remove_zv(zv); - } - kmem_free(namebuf, strlen(name) + 2); - - mutex_exit(&zfsdev_state_lock); -#else /* !illumos */ - zvol_state_t *zv, *tzv; - size_t namelen; - - namelen = strlen(name); - - mutex_enter(&zfsdev_state_lock); - - LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) { - if (strcmp(zv->zv_name, name) == 0 || - (strncmp(zv->zv_name, name, namelen) == 0 && - strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' || - zv->zv_name[namelen] == '@'))) { - (void) zvol_remove_zv(zv); - } - } - - mutex_exit(&zfsdev_state_lock); -#endif /* illumos */ -} - -static int -zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) -{ - uint64_t old_volsize = 0ULL; - int error = 0; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - - /* - * Reinitialize the dump area to the new size. If we - * failed to resize the dump area then restore it back to - * its original size. 
We must set the new volsize prior - * to calling dumpvp_resize() to ensure that the devices' - * size(9P) is not visible by the dump subsystem. - */ - old_volsize = zv->zv_volsize; - zvol_size_changed(zv, volsize); - -#ifdef ZVOL_DUMP - if (zv->zv_flags & ZVOL_DUMPIFIED) { - if ((error = zvol_dumpify(zv)) != 0 || - (error = dumpvp_resize()) != 0) { - int dumpify_error; - - (void) zvol_update_volsize(zv->zv_objset, old_volsize); - zvol_size_changed(zv, old_volsize); - dumpify_error = zvol_dumpify(zv); - error = dumpify_error ? dumpify_error : error; - } - } -#endif /* ZVOL_DUMP */ - -#ifdef illumos - /* - * Generate a LUN expansion event. - */ - if (error == 0) { - sysevent_id_t eid; - nvlist_t *attr; - char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - - (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV, - zv->zv_minor); - - VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); - - (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, - ESC_DEV_DLE, attr, &eid, DDI_SLEEP); - - nvlist_free(attr); - kmem_free(physpath, MAXPATHLEN); - } -#endif /* illumos */ - return (error); -} - -int -zvol_set_volsize(const char *name, uint64_t volsize) -{ - zvol_state_t *zv = NULL; - objset_t *os; - int error; - dmu_object_info_t doi; - uint64_t readonly; - boolean_t owned = B_FALSE; - - error = dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); - if (error != 0) - return (error); - if (readonly) - return (SET_ERROR(EROFS)); - - mutex_enter(&zfsdev_state_lock); - zv = zvol_minor_lookup(name); - - if (zv == NULL || zv->zv_objset == NULL) { - if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, - FTAG, &os)) != 0) { - mutex_exit(&zfsdev_state_lock); - return (error); - } - owned = B_TRUE; - if (zv != NULL) - zv->zv_objset = os; - } else { - os = zv->zv_objset; - } - - if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || - (error = 
zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0) - goto out; - - error = zvol_update_volsize(os, volsize); - - if (error == 0 && zv != NULL) - error = zvol_update_live_volsize(zv, volsize); -out: - if (owned) { - dmu_objset_disown(os, FTAG); - if (zv != NULL) - zv->zv_objset = NULL; - } - mutex_exit(&zfsdev_state_lock); - return (error); -} - -/*ARGSUSED*/ -#ifdef illumos -int -zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) -#else -static int -zvol_open(struct g_provider *pp, int flag, int count) -#endif -{ - zvol_state_t *zv; - int err = 0; -#ifdef illumos - - mutex_enter(&zfsdev_state_lock); - - zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL); - if (zv == NULL) { - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - - if (zv->zv_total_opens == 0) - err = zvol_first_open(zv); - if (err) { - mutex_exit(&zfsdev_state_lock); - return (err); - } -#else /* !illumos */ - boolean_t locked = B_FALSE; - - if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { - /* - * if zfs_geom_probe_vdev_key is set, that means that zfs is - * attempting to probe geom providers while looking for a - * replacement for a missing VDEV. In this case, the - * spa_namespace_lock will not be held, but it is still illegal - * to use a zvol as a vdev. Deadlocks can result if another - * thread has spa_namespace_lock - */ - return (EOPNOTSUPP); - } - /* - * Protect against recursively entering spa_namespace_lock - * when spa_open() is used for a pool on a (local) ZVOL(s). - * This is needed since we replaced upstream zfsdev_state_lock - * with spa_namespace_lock in the ZVOL code. - * We are using the same trick as spa_open(). - * Note that calls in zvol_first_open which need to resolve - * pool name to a spa object will enter spa_open() - * recursively, but that function already has all the - * necessary protection. 
- */ - if (!MUTEX_HELD(&zfsdev_state_lock)) { - mutex_enter(&zfsdev_state_lock); - locked = B_TRUE; - } - - zv = pp->private; - if (zv == NULL) { - if (locked) - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - - if (zv->zv_total_opens == 0) { - err = zvol_first_open(zv); - if (err) { - if (locked) - mutex_exit(&zfsdev_state_lock); - return (err); - } - pp->mediasize = zv->zv_volsize; - pp->stripeoffset = 0; - pp->stripesize = zv->zv_volblocksize; - } -#endif /* illumos */ - if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - err = SET_ERROR(EROFS); - goto out; - } - if (zv->zv_flags & ZVOL_EXCL) { - err = SET_ERROR(EBUSY); - goto out; - } -#ifdef FEXCL - if (flag & FEXCL) { - if (zv->zv_total_opens != 0) { - err = SET_ERROR(EBUSY); - goto out; - } - zv->zv_flags |= ZVOL_EXCL; - } -#endif - -#ifdef illumos - if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { - zv->zv_open_count[otyp]++; - zv->zv_total_opens++; - } - mutex_exit(&zfsdev_state_lock); -#else - zv->zv_total_opens += count; - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif - - return (err); -out: - if (zv->zv_total_opens == 0) - zvol_last_close(zv); -#ifdef illumos - mutex_exit(&zfsdev_state_lock); -#else - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif - return (err); -} - -/*ARGSUSED*/ -#ifdef illumos -int -zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - int error = 0; - - mutex_enter(&zfsdev_state_lock); - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) { - mutex_exit(&zfsdev_state_lock); -#else /* !illumos */ -static int -zvol_close(struct g_provider *pp, int flag, int count) -{ - zvol_state_t *zv; - int error = 0; - boolean_t locked = B_FALSE; - - /* See comment in zvol_open(). 
*/ - if (!MUTEX_HELD(&zfsdev_state_lock)) { - mutex_enter(&zfsdev_state_lock); - locked = B_TRUE; - } - - zv = pp->private; - if (zv == NULL) { - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif /* illumos */ - return (SET_ERROR(ENXIO)); - } - - if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_total_opens == 1); - zv->zv_flags &= ~ZVOL_EXCL; - } - - /* - * If the open count is zero, this is a spurious close. - * That indicates a bug in the kernel / DDI framework. - */ -#ifdef illumos - ASSERT(zv->zv_open_count[otyp] != 0); -#endif - ASSERT(zv->zv_total_opens != 0); - - /* - * You may get multiple opens, but only one close. - */ -#ifdef illumos - zv->zv_open_count[otyp]--; - zv->zv_total_opens--; -#else - zv->zv_total_opens -= count; -#endif - - if (zv->zv_total_opens == 0) - zvol_last_close(zv); - -#ifdef illumos - mutex_exit(&zfsdev_state_lock); -#else - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif - return (error); -} - -/* ARGSUSED */ -static void -zvol_get_done(zgd_t *zgd, int error) -{ - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - rangelock_exit(zgd->zgd_lr); - - kmem_free(zgd, sizeof (zgd_t)); -} - -/* - * Get data to generate a TX_WRITE intent log record. - */ -static int -zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zvol_state_t *zv = arg; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; /* length of user data */ - dmu_buf_t *db; - zgd_t *zgd; - int error; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. 
- */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); - error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, - DMU_READ_NO_PREFETCH); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's written out - * and its checksum is being calculated that no one can change - * the data. Contrarily to zfs_get_data we need not re-check - * blocksize after we get the lock because it cannot be changed. - */ - size = zv->zv_volblocksize; - offset = P2ALIGN(offset, size); - zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zvol_get_done, zgd); - - if (error == 0) - return (0); - } - } - - zvol_get_done(zgd, error); - - return (error); -} - -/* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. - * Otherwise we will later flush the data out via dmu_sync(). 
- */ -ssize_t zvol_immediate_write_sz = 32768; -#ifdef _KERNEL -SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, - &zvol_immediate_write_sz, 0, "Minimal size for indirect log write"); -#endif - -static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, - boolean_t sync) -{ - uint32_t blocksize = zv->zv_volblocksize; - zilog_t *zilog = zv->zv_zilog; - itx_wr_state_t write_state; - - if (zil_replaying(zilog, tx)) - return; - - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - resid >= blocksize && blocksize > zvol_immediate_write_sz) - write_state = WR_INDIRECT; - else if (sync) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; - - while (resid) { - itx_t *itx; - lr_write_t *lr; - itx_wr_state_t wr_state = write_state; - ssize_t len = resid; - - if (wr_state == WR_COPIED && resid > zil_max_copied_data(zilog)) - wr_state = WR_NEED_COPY; - else if (wr_state == WR_INDIRECT) - len = MIN(blocksize - P2PHASE(off, blocksize), resid); - - itx = zil_itx_create(TX_WRITE, sizeof (*lr) + - (wr_state == WR_COPIED ? 
len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, - off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - wr_state = WR_NEED_COPY; - } - - itx->itx_wr_state = wr_state; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - - itx->itx_private = zv; - - if (!sync && (zv->zv_sync_cnt == 0)) - itx->itx_sync = B_FALSE; - - zil_itx_assign(zilog, itx, tx); - - off += len; - resid -= len; - } -} - -#ifdef illumos -static int -zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, - uint64_t size, boolean_t doread, boolean_t isdump) -{ - vdev_disk_t *dvd; - int c; - int numerrors = 0; - - if (vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops) { - for (c = 0; c < vd->vdev_children; c++) { - int err = zvol_dumpio_vdev(vd->vdev_child[c], - addr, offset, origoffset, size, doread, isdump); - if (err != 0) { - numerrors++; - } else if (doread) { - break; - } - } - } - - if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) - return (numerrors < vd->vdev_children ? 0 : EIO); - - if (doread && !vdev_readable(vd)) - return (SET_ERROR(EIO)); - else if (!doread && !vdev_writeable(vd)) - return (SET_ERROR(EIO)); - - if (vd->vdev_ops == &vdev_raidz_ops) { - return (vdev_raidz_physio(vd, - addr, size, offset, origoffset, doread, isdump)); - } - - offset += VDEV_LABEL_START_SIZE; - - if (ddi_in_panic() || isdump) { - ASSERT(!doread); - if (doread) - return (SET_ERROR(EIO)); - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); - return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), - lbtodb(size))); - } else { - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); - return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, - offset, doread ? 
B_READ : B_WRITE)); - } -} - -static int -zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) -{ - vdev_t *vd; - int error; - zvol_extent_t *ze; - spa_t *spa = dmu_objset_spa(zv->zv_objset); - - /* Must be sector aligned, and not stradle a block boundary. */ - if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) || - P2BOUNDARY(offset, size, zv->zv_volblocksize)) { - return (SET_ERROR(EINVAL)); - } - ASSERT(size <= zv->zv_volblocksize); - - /* Locate the extent this belongs to */ - ze = list_head(&zv->zv_extents); - while (offset >= ze->ze_nblks * zv->zv_volblocksize) { - offset -= ze->ze_nblks * zv->zv_volblocksize; - ze = list_next(&zv->zv_extents, ze); - } - - if (ze == NULL) - return (SET_ERROR(EINVAL)); - - if (!ddi_in_panic()) - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); - offset += DVA_GET_OFFSET(&ze->ze_dva); - error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), - size, doread, isdump); - - if (!ddi_in_panic()) - spa_config_exit(spa, SCL_STATE, FTAG); - - return (error); -} - -int -zvol_strategy(buf_t *bp) -{ - zfs_soft_state_t *zs = NULL; -#else /* !illumos */ -void -zvol_strategy(struct bio *bp) -{ -#endif /* illumos */ - zvol_state_t *zv; - uint64_t off, volsize; - size_t resid; - char *addr; - objset_t *os; - int error = 0; -#ifdef illumos - boolean_t doread = bp->b_flags & B_READ; -#else - boolean_t doread = 0; -#endif - boolean_t is_dumpified; - boolean_t sync; - -#ifdef illumos - if (getminor(bp->b_edev) == 0) { - error = SET_ERROR(EINVAL); - } else { - zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev)); - if (zs == NULL) - error = SET_ERROR(ENXIO); - else if (zs->zss_type != ZSST_ZVOL) - error = SET_ERROR(EINVAL); - } - - if (error) { - bioerror(bp, error); - biodone(bp); - return (0); - } - - zv = zs->zss_data; - - if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) { - 
bioerror(bp, EROFS); - biodone(bp); - return (0); - } - - off = ldbtob(bp->b_blkno); -#else /* !illumos */ - if (bp->bio_to) - zv = bp->bio_to->private; - else - zv = bp->bio_dev->si_drv2; - - if (zv == NULL) { - error = SET_ERROR(ENXIO); - goto out; - } - - if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { - error = SET_ERROR(EROFS); - goto out; - } - - switch (bp->bio_cmd) { - case BIO_FLUSH: - goto sync; - case BIO_READ: - doread = 1; - case BIO_WRITE: - case BIO_DELETE: - break; - default: - error = EOPNOTSUPP; - goto out; - } - - off = bp->bio_offset; -#endif /* illumos */ - volsize = zv->zv_volsize; - - os = zv->zv_objset; - ASSERT(os != NULL); - -#ifdef illumos - bp_mapin(bp); - addr = bp->b_un.b_addr; - resid = bp->b_bcount; - - if (resid > 0 && (off < 0 || off >= volsize)) { - bioerror(bp, EIO); - biodone(bp); - return (0); - } - - is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED; - sync = ((!(bp->b_flags & B_ASYNC) && - !(zv->zv_flags & ZVOL_WCE)) || - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) && - !doread && !is_dumpified; -#else /* !illumos */ - addr = bp->bio_data; - resid = bp->bio_length; - - if (resid > 0 && (off < 0 || off >= volsize)) { - error = SET_ERROR(EIO); - goto out; - } - - is_dumpified = B_FALSE; - sync = !doread && !is_dumpified && - zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; -#endif /* illumos */ - - /* - * There must be no buffer changes when doing a dmu_sync() because - * we can't change the data whilst calculating the checksum. - */ - locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid, - doread ? 
RL_READER : RL_WRITER); - -#ifndef illumos - if (bp->bio_cmd == BIO_DELETE) { - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - } else { - zvol_log_truncate(zv, tx, off, resid, sync); - dmu_tx_commit(tx); - error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, - off, resid); - resid = 0; - } - goto unlock; - } -#endif - while (resid != 0 && off < volsize) { - size_t size = MIN(resid, zvol_maxphys); -#ifdef illumos - if (is_dumpified) { - size = MIN(size, P2END(off, zv->zv_volblocksize) - off); - error = zvol_dumpio(zv, addr, off, size, - doread, B_FALSE); - } else if (doread) { -#else - if (doread) { -#endif - error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH); - } else { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size, sync); - dmu_tx_commit(tx); - } - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - off += size; - addr += size; - resid -= size; - } -#ifndef illumos -unlock: -#endif - rangelock_exit(lr); - -#ifdef illumos - if ((bp->b_resid = resid) == bp->b_bcount) - bioerror(bp, off > volsize ? EINVAL : error); - - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - biodone(bp); - - return (0); -#else /* !illumos */ - bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length && off > volsize) - error = EINVAL; - - if (sync) { -sync: - zil_commit(zv->zv_zilog, ZVOL_OBJ); - } -out: - if (bp->bio_to) - g_io_deliver(bp, error); - else - biofinish(bp, NULL, error); -#endif /* illumos */ -} - -#ifdef illumos -/* - * Set the buffer count to the zvol maximum transfer. 
- * Using our own routine instead of the default minphys() - * means that for larger writes we write bigger buffers on X86 - * (128K instead of 56K) and flush the disk write cache less often - * (every zvol_maxphys - currently 1MB) instead of minphys (currently - * 56K on X86 and 128K on sparc). - */ -void -zvol_minphys(struct buf *bp) -{ - if (bp->b_bcount > zvol_maxphys) - bp->b_bcount = zvol_maxphys; -} - -int -zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - int error = 0; - uint64_t size; - uint64_t boff; - uint64_t resid; - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); - - if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0) - return (SET_ERROR(EINVAL)); - - boff = ldbtob(blkno); - resid = ldbtob(nblocks); - - VERIFY3U(boff + resid, <=, zv->zv_volsize); - - while (resid) { - size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); - error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE); - if (error) - break; - boff += size; - addr += size; - resid -= size; - } - - return (error); -} - -/*ARGSUSED*/ -int -zvol_read(dev_t dev, uio_t *uio, cred_t *cr) -{ - minor_t minor = getminor(dev); -#else /* !illumos */ -int -zvol_read(struct cdev *dev, struct uio *uio, int ioflag) -{ -#endif /* illumos */ - zvol_state_t *zv; - uint64_t volsize; - int error = 0; - -#ifdef illumos - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); -#else - zv = dev->si_drv2; -#endif - - volsize = zv->zv_volsize; - /* uio_loffset == volsize isn't an error as its required for EOF processing. 
*/ - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) - return (SET_ERROR(EIO)); - -#ifdef illumos - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_READ, - zvol_minphys, uio); - return (error); - } -#endif - - locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - - /* don't read past the end */ - if (bytes > volsize - uio->uio_loffset) - bytes = volsize - uio->uio_loffset; - - error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - } - rangelock_exit(lr); - - return (error); -} - -#ifdef illumos -/*ARGSUSED*/ -int -zvol_write(dev_t dev, uio_t *uio, cred_t *cr) -{ - minor_t minor = getminor(dev); -#else /* !illumos */ -int -zvol_write(struct cdev *dev, struct uio *uio, int ioflag) -{ -#endif /* illumos */ - zvol_state_t *zv; - uint64_t volsize; - int error = 0; - boolean_t sync; - -#ifdef illumos - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); -#else - zv = dev->si_drv2; -#endif - - volsize = zv->zv_volsize; - /* uio_loffset == volsize isn't an error as its required for EOF processing. 
*/ - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) - return (SET_ERROR(EIO)); - -#ifdef illumos - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_WRITE, - zvol_minphys, uio); - return (error); - } - - sync = !(zv->zv_flags & ZVOL_WCE) || -#else - sync = (ioflag & IO_SYNC) || -#endif - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - - locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, - uio->uio_loffset, uio->uio_resid, RL_WRITER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio->uio_loffset; - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - - if (bytes > volsize - off) /* don't write past the end */ - bytes = volsize - off; - - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - break; - } - error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); - if (error == 0) - zvol_log_write(zv, tx, off, bytes, sync); - dmu_tx_commit(tx); - - if (error) - break; - } - rangelock_exit(lr); - - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - return (error); -} - -#ifdef illumos -int -zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) -{ - struct uuid uuid = EFI_RESERVED; - efi_gpe_t gpe = { 0 }; - uint32_t crc; - dk_efi_t efi; - int length; - char *ptr; - - if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag)) - return (SET_ERROR(EFAULT)); - ptr = (char *)(uintptr_t)efi.dki_data_64; - length = efi.dki_length; - /* - * Some clients may attempt to request a PMBR for the - * zvol. Currently this interface will return EINVAL to - * such requests. These requests could be supported by - * adding a check for lba == 0 and consing up an appropriate - * PMBR. 
- */ - if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0) - return (SET_ERROR(EINVAL)); - - gpe.efi_gpe_StartingLBA = LE_64(34ULL); - gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1); - UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); - - if (efi.dki_lba == 1) { - efi_gpt_t gpt = { 0 }; - - gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); - gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); - gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); - gpt.efi_gpt_MyLBA = LE_64(1ULL); - gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); - gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1); - gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); - gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); - gpt.efi_gpt_SizeOfPartitionEntry = - LE_32(sizeof (efi_gpe_t)); - CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); - gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); - CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); - gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); - if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length), - flag)) - return (SET_ERROR(EFAULT)); - ptr += sizeof (gpt); - length -= sizeof (gpt); - } - if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe), - length), flag)) - return (SET_ERROR(EFAULT)); - return (0); -} - -/* - * BEGIN entry points to allow external callers access to the volume. - */ -/* - * Return the volume parameters needed for access from an external caller. - * These values are invariant as long as the volume is held open. 
- */ -int -zvol_get_volume_params(minor_t minor, uint64_t *blksize, - uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **dnode_hdl) -{ - zvol_state_t *zv; - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); - if (zv->zv_flags & ZVOL_DUMPIFIED) - return (SET_ERROR(ENXIO)); - - ASSERT(blksize && max_xfer_len && minor_hdl && - objset_hdl && zil_hdl && rl_hdl && dnode_hdl); - - *blksize = zv->zv_volblocksize; - *max_xfer_len = (uint64_t)zvol_maxphys; - *minor_hdl = zv; - *objset_hdl = zv->zv_objset; - *zil_hdl = zv->zv_zilog; - *rl_hdl = &zv->zv_rangelock; - *dnode_hdl = zv->zv_dn; - return (0); -} - -/* - * Return the current volume size to an external caller. - * The size can change while the volume is open. - */ -uint64_t -zvol_get_volume_size(void *minor_hdl) -{ - zvol_state_t *zv = minor_hdl; - - return (zv->zv_volsize); -} - -/* - * Return the current WCE setting to an external caller. - * The WCE setting can change while the volume is open. - */ -int -zvol_get_volume_wce(void *minor_hdl) -{ - zvol_state_t *zv = minor_hdl; - - return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0); -} - -/* - * Entry point for external callers to zvol_log_write - */ -void -zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid, - boolean_t sync) -{ - zvol_state_t *zv = minor_hdl; - - zvol_log_write(zv, tx, off, resid, sync); -} -/* - * END entry points to allow external callers access to the volume. - */ -#endif /* illumos */ - -/* - * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. 
- */ -static void -zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, - boolean_t sync) -{ - itx_t *itx; - lr_truncate_t *lr; - zilog_t *zilog = zv->zv_zilog; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); - lr = (lr_truncate_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = len; - - itx->itx_sync = (sync || zv->zv_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} - -#ifdef illumos -/* - * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). - * Also a dirtbag dkio ioctl for unmap/free-block functionality. - */ -/*ARGSUSED*/ -int -zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) -{ - zvol_state_t *zv; - struct dk_callback *dkc; - int error = 0; - locked_range_t *lr; - - mutex_enter(&zfsdev_state_lock); - - zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL); - - if (zv == NULL) { - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - ASSERT(zv->zv_total_opens > 0); - - switch (cmd) { - - case DKIOCINFO: - { - struct dk_cinfo dki; - - bzero(&dki, sizeof (dki)); - (void) strcpy(dki.dki_cname, "zvol"); - (void) strcpy(dki.dki_dname, "zvol"); - dki.dki_ctype = DKC_UNKNOWN; - dki.dki_unit = getminor(dev); - dki.dki_maxtransfer = - 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs); - mutex_exit(&zfsdev_state_lock); - if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) - error = SET_ERROR(EFAULT); - return (error); - } - - case DKIOCGMEDIAINFO: - { - struct dk_minfo dkm; - - bzero(&dkm, sizeof (dkm)); - dkm.dki_lbsize = 1U << zv->zv_min_bs; - dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; - dkm.dki_media_type = DK_UNKNOWN; - mutex_exit(&zfsdev_state_lock); - if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) - error = SET_ERROR(EFAULT); - return (error); - } - - case DKIOCGMEDIAINFOEXT: - { - struct dk_minfo_ext dkmext; - - bzero(&dkmext, sizeof (dkmext)); - dkmext.dki_lbsize = 1U << 
zv->zv_min_bs; - dkmext.dki_pbsize = zv->zv_volblocksize; - dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; - dkmext.dki_media_type = DK_UNKNOWN; - mutex_exit(&zfsdev_state_lock); - if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag)) - error = SET_ERROR(EFAULT); - return (error); - } - - case DKIOCGETEFI: - { - uint64_t vs = zv->zv_volsize; - uint8_t bs = zv->zv_min_bs; - - mutex_exit(&zfsdev_state_lock); - error = zvol_getefi((void *)arg, flag, vs, bs); - return (error); - } - - case DKIOCFLUSHWRITECACHE: - dkc = (struct dk_callback *)arg; - mutex_exit(&zfsdev_state_lock); - zil_commit(zv->zv_zilog, ZVOL_OBJ); - if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { - (*dkc->dkc_callback)(dkc->dkc_cookie, error); - error = 0; - } - return (error); - - case DKIOCGETWCE: - { - int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0; - if (ddi_copyout(&wce, (void *)arg, sizeof (int), - flag)) - error = SET_ERROR(EFAULT); - break; - } - case DKIOCSETWCE: - { - int wce; - if (ddi_copyin((void *)arg, &wce, sizeof (int), - flag)) { - error = SET_ERROR(EFAULT); - break; - } - if (wce) { - zv->zv_flags |= ZVOL_WCE; - mutex_exit(&zfsdev_state_lock); - } else { - zv->zv_flags &= ~ZVOL_WCE; - mutex_exit(&zfsdev_state_lock); - zil_commit(zv->zv_zilog, ZVOL_OBJ); - } - return (0); - } - - case DKIOCGGEOM: - case DKIOCGVTOC: - /* - * commands using these (like prtvtoc) expect ENOTSUP - * since we're emulating an EFI label - */ - error = SET_ERROR(ENOTSUP); - break; - - case DKIOCDUMPINIT: - lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, - RL_WRITER); - error = zvol_dumpify(zv); - rangelock_exit(lr); - break; - - case DKIOCDUMPFINI: - if (!(zv->zv_flags & ZVOL_DUMPIFIED)) - break; - lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, - RL_WRITER); - error = zvol_dump_fini(zv); - rangelock_exit(lr); - break; - - case DKIOCFREE: - { - dkioc_free_list_t *dfl; - dmu_tx_t *tx; - - if (!zvol_unmap_enabled) - break; - - if (!(flag & FKIOCTL)) { - error = 
dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP); - if (error != 0) - break; - } else { - dfl = (dkioc_free_list_t *)arg; - ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS); - if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) { - error = SET_ERROR(EINVAL); - break; - } - } - - mutex_exit(&zfsdev_state_lock); - - for (int i = 0; i < dfl->dfl_num_exts; i++) { - uint64_t start = dfl->dfl_exts[i].dfle_start, - length = dfl->dfl_exts[i].dfle_length, - end = start + length; - - /* - * Apply Postel's Law to length-checking. If they - * overshoot, just blank out until the end, if there's - * a need to blank out anything. - */ - if (start >= zv->zv_volsize) - continue; /* No need to do anything... */ - if (end > zv->zv_volsize) { - end = DMU_OBJECT_END; - length = end - start; - } - - lr = rangelock_enter(&zv->zv_rangelock, start, length, - RL_WRITER); - tx = dmu_tx_create(zv->zv_objset); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - } else { - zvol_log_truncate(zv, tx, start, length, - B_TRUE); - dmu_tx_commit(tx); - error = dmu_free_long_range(zv->zv_objset, - ZVOL_OBJ, start, length); - } - - rangelock_exit(lr); - - if (error != 0) - break; - } - - /* - * If the write-cache is disabled, 'sync' property - * is set to 'always', or if the caller is asking for - * a synchronous free, commit this operation to the zil. - * This will sync any previous uncommitted writes to the - * zvol object. - * Can be overridden by the zvol_unmap_sync_enabled tunable. 
- */ - if ((error == 0) && zvol_unmap_sync_enabled && - (!(zv->zv_flags & ZVOL_WCE) || - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) || - (dfl->dfl_flags & DF_WAIT_SYNC))) { - zil_commit(zv->zv_zilog, ZVOL_OBJ); - } - - if (!(flag & FKIOCTL)) - dfl_free(dfl); - - return (error); - } - - default: - error = SET_ERROR(ENOTTY); - break; - - } - mutex_exit(&zfsdev_state_lock); - return (error); -} -#endif /* illumos */ - -int -zvol_busy(void) -{ - return (zvol_minors != 0); -} - -void -zvol_init(void) -{ - VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t), - 1) == 0); -#ifdef illumos - mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); -#else - ZFS_LOG(1, "ZVOL Initialized."); -#endif -} - -void -zvol_fini(void) -{ -#ifdef illumos - mutex_destroy(&zfsdev_state_lock); -#endif - ddi_soft_state_fini(&zfsdev_state); - ZFS_LOG(1, "ZVOL Deinitialized."); -} - -#ifdef illumos -/*ARGSUSED*/ -static int -zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) - return (1); - return (0); -} - -/*ARGSUSED*/ -static void -zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx); -} - -static int -zvol_dump_init(zvol_state_t *zv, boolean_t resize) -{ - dmu_tx_t *tx; - int error; - objset_t *os = zv->zv_objset; - spa_t *spa = dmu_objset_spa(os); - vdev_t *vd = spa->spa_root_vdev; - nvlist_t *nv = NULL; - uint64_t version = spa_version(spa); - uint64_t checksum, compress, refresrv, vbs, dedup; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - ASSERT(vd->vdev_ops == &vdev_root_ops); - - error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0, - DMU_OBJECT_END); - if (error != 0) - return (error); - /* wait for dmu_free_long_range to actually free the blocks */ - txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - - /* - * If the pool on which 
the dump device is being initialized has more - * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is - * enabled. If so, bump that feature's counter to indicate that the - * feature is active. We also check the vdev type to handle the - * following case: - * # zpool create test raidz disk1 disk2 disk3 - * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev), - * the raidz vdev itself has 3 children. - */ - if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) { - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) - return (SET_ERROR(ENOTSUP)); - (void) dsl_sync_task(spa_name(spa), - zfs_mvdev_dump_feature_check, - zfs_mvdev_dump_activate_feature_sync, NULL, - 2, ZFS_SPACE_CHECK_RESERVED); - } - - if (!resize) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); - if (error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, - NULL); - } - if (error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - &refresrv, NULL); - } - if (error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, - NULL); - } - if (version >= SPA_VERSION_DEDUP && error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); - } - } - if (error != 0) - return (error); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - return (error); - } - - /* - * If we are resizing the dump device then we only need to - * update the refreservation to match the newly updated - * zvolsize. Otherwise, we save off the original state of the - * zvol so that we can restore them if the zvol is ever undumpified. 
- */ - if (resize) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, - &zv->zv_volsize, tx); - } else { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, - &compress, tx); - if (error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, - &checksum, tx); - } - if (error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, - &refresrv, tx); - } - if (error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, - &vbs, tx); - } - if (error == 0) { - error = dmu_object_set_blocksize( - os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx); - } - if (version >= SPA_VERSION_DEDUP && error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, - &dedup, tx); - } - if (error == 0) - zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE; - } - dmu_tx_commit(tx); - - /* - * We only need update the zvol's property if we are initializing - * the dump area for the first time. - */ - if (error == 0 && !resize) { - /* - * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum - * function. Otherwise, use the old default -- OFF. - */ - checksum = spa_feature_is_active(spa, - SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? 
ZIO_CHECKSUM_NOPARITY : - ZIO_CHECKSUM_OFF; - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - ZIO_COMPRESS_OFF) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum) == 0); - if (version >= SPA_VERSION_DEDUP) { - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_DEDUP), - ZIO_CHECKSUM_OFF) == 0); - } - - error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, - nv, NULL); - nvlist_free(nv); - } - - /* Allocate the space for the dump */ - if (error == 0) - error = zvol_prealloc(zv); - return (error); -} - -static int -zvol_dumpify(zvol_state_t *zv) -{ - int error = 0; - uint64_t dumpsize = 0; - dmu_tx_t *tx; - objset_t *os = zv->zv_objset; - - if (zv->zv_flags & ZVOL_RDONLY) - return (SET_ERROR(EROFS)); - - if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, - 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { - boolean_t resize = (dumpsize > 0); - - if ((error = zvol_dump_init(zv, resize)) != 0) { - (void) zvol_dump_fini(zv); - return (error); - } - } - - /* - * Build up our lba mapping. 
- */ - error = zvol_get_lbas(zv); - if (error) { - (void) zvol_dump_fini(zv); - return (error); - } - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - (void) zvol_dump_fini(zv); - return (error); - } - - zv->zv_flags |= ZVOL_DUMPIFIED; - error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, - &zv->zv_volsize, tx); - dmu_tx_commit(tx); - - if (error) { - (void) zvol_dump_fini(zv); - return (error); - } - - txg_wait_synced(dmu_objset_pool(os), 0); - return (0); -} - -static int -zvol_dump_fini(zvol_state_t *zv) -{ - dmu_tx_t *tx; - objset_t *os = zv->zv_objset; - nvlist_t *nv; - int error = 0; - uint64_t checksum, compress, refresrv, vbs, dedup; - uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); - - /* - * Attempt to restore the zvol back to its pre-dumpified state. - * This is a best-effort attempt as it's possible that not all - * of these properties were initialized during the dumpify process - * (i.e. error during zvol_dump_init). 
- */ - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); - dmu_tx_commit(tx); - - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs); - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); - if (version >= SPA_VERSION_DEDUP && - zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) { - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_DEDUP), dedup); - } - (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, - nv, NULL); - nvlist_free(nv); - - zvol_free_extents(zv); - zv->zv_flags &= ~ZVOL_DUMPIFIED; - (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); - /* wait for dmu_free_long_range to actually free the blocks */ - txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0) - zv->zv_volblocksize = vbs; - dmu_tx_commit(tx); - - return (0); -} -#else /* !illumos */ - -static void -zvol_geom_run(zvol_state_t *zv) -{ - struct g_provider *pp; - - pp = zv->zv_provider; - g_error_provider(pp, 0); - 
- kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, - "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER)); -} - -static void -zvol_geom_destroy(zvol_state_t *zv) -{ - struct g_provider *pp; - - g_topology_assert(); - - mtx_lock(&zv->zv_queue_mtx); - zv->zv_state = 1; - wakeup_one(&zv->zv_queue); - while (zv->zv_state != 2) - msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0); - mtx_destroy(&zv->zv_queue_mtx); - - pp = zv->zv_provider; - zv->zv_provider = NULL; - pp->private = NULL; - g_wither_geom(pp->geom, ENXIO); -} - -static int -zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) -{ - int count, error, flags; - - g_topology_assert(); - - /* - * To make it easier we expect either open or close, but not both - * at the same time. - */ - KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || - (acr <= 0 && acw <= 0 && ace <= 0), - ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", - pp->name, acr, acw, ace)); - - if (pp->private == NULL) { - if (acr <= 0 && acw <= 0 && ace <= 0) - return (0); - return (pp->error); - } - - /* - * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0, - * because GEOM already handles that and handles it a bit differently. - * GEOM allows for multiple read/exclusive consumers and ZFS allows - * only one exclusive consumer, no matter if it is reader or writer. - * I like better the way GEOM works so I'll leave it for GEOM to - * decide what to do. 
- */ - - count = acr + acw + ace; - if (count == 0) - return (0); - - flags = 0; - if (acr != 0 || ace != 0) - flags |= FREAD; - if (acw != 0) - flags |= FWRITE; - - g_topology_unlock(); - if (count > 0) - error = zvol_open(pp, flags, count); - else - error = zvol_close(pp, flags, -count); - g_topology_lock(); - return (error); -} - -static void -zvol_geom_start(struct bio *bp) -{ - zvol_state_t *zv; - boolean_t first; - - zv = bp->bio_to->private; - ASSERT(zv != NULL); - switch (bp->bio_cmd) { - case BIO_FLUSH: - if (!THREAD_CAN_SLEEP()) - goto enqueue; - zil_commit(zv->zv_zilog, ZVOL_OBJ); - g_io_deliver(bp, 0); - break; - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: - if (!THREAD_CAN_SLEEP()) - goto enqueue; - zvol_strategy(bp); - break; - case BIO_GETATTR: { - spa_t *spa = dmu_objset_spa(zv->zv_objset); - uint64_t refd, avail, usedobjs, availobjs, val; - - if (g_handleattr_int(bp, "GEOM::candelete", 1)) - return; - if (strcmp(bp->bio_attribute, "blocksavail") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - if (g_handleattr_off_t(bp, "blocksavail", - avail / DEV_BSIZE)) - return; - } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - if (g_handleattr_off_t(bp, "blocksused", - refd / DEV_BSIZE)) - return; - } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { - avail = metaslab_class_get_space(spa_normal_class(spa)); - avail -= metaslab_class_get_alloc(spa_normal_class(spa)); - if (g_handleattr_off_t(bp, "poolblocksavail", - avail / DEV_BSIZE)) - return; - } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { - refd = metaslab_class_get_alloc(spa_normal_class(spa)); - if (g_handleattr_off_t(bp, "poolblocksused", - refd / DEV_BSIZE)) - return; - } - /* FALLTHROUGH */ - } - default: - g_io_deliver(bp, EOPNOTSUPP); - break; - } - return; - -enqueue: - mtx_lock(&zv->zv_queue_mtx); - first = (bioq_first(&zv->zv_queue) == 
NULL); - bioq_insert_tail(&zv->zv_queue, bp); - mtx_unlock(&zv->zv_queue_mtx); - if (first) - wakeup_one(&zv->zv_queue); -} - -static void -zvol_geom_worker(void *arg) -{ - zvol_state_t *zv; - struct bio *bp; - - thread_lock(curthread); - sched_prio(curthread, PRIBIO); - thread_unlock(curthread); - - zv = arg; - for (;;) { - mtx_lock(&zv->zv_queue_mtx); - bp = bioq_takefirst(&zv->zv_queue); - if (bp == NULL) { - if (zv->zv_state == 1) { - zv->zv_state = 2; - wakeup(&zv->zv_state); - mtx_unlock(&zv->zv_queue_mtx); - kthread_exit(); - } - msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP, - "zvol:io", 0); - continue; - } - mtx_unlock(&zv->zv_queue_mtx); - switch (bp->bio_cmd) { - case BIO_FLUSH: - zil_commit(zv->zv_zilog, ZVOL_OBJ); - g_io_deliver(bp, 0); - break; - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: - zvol_strategy(bp); - break; - default: - g_io_deliver(bp, EOPNOTSUPP); - break; - } - } -} - -extern boolean_t dataset_name_hidden(const char *name); - -static int -zvol_create_snapshots(objset_t *os, const char *name) -{ - uint64_t cookie, obj; - char *sname; - int error, len; - - cookie = obj = 0; - sname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - -#if 0 - (void) dmu_objset_find(name, dmu_objset_prefetch, NULL, - DS_FIND_SNAPSHOTS); -#endif - - for (;;) { - len = snprintf(sname, MAXPATHLEN, "%s@", name); - if (len >= MAXPATHLEN) { - dmu_objset_rele(os, FTAG); - error = ENAMETOOLONG; - break; - } - - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - error = dmu_snapshot_list_next(os, MAXPATHLEN - len, - sname + len, &obj, &cookie, NULL); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error != 0) { - if (error == ENOENT) - error = 0; - break; - } - - error = zvol_create_minor(sname); - if (error != 0 && error != EEXIST) { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - sname, error); - break; - } - } - - kmem_free(sname, MAXPATHLEN); - return (error); -} - -int -zvol_create_minors_impl(const char *name) -{ - uint64_t 
cookie; - objset_t *os; - char *osname, *p; - int error, len; - - if (dataset_name_hidden(name)) - return (0); - - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", - name, error); - return (error); - } - if (dmu_objset_type(os) == DMU_OST_ZVOL) { - dsl_dataset_long_hold(os->os_dsl_dataset, FTAG); - dsl_pool_rele(dmu_objset_pool(os), FTAG); - error = zvol_create_minor(name); - if (error == 0 || error == EEXIST) { - error = zvol_create_snapshots(os, name); - } else { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - name, error); - } - dsl_dataset_long_rele(os->os_dsl_dataset, FTAG); - dsl_dataset_rele(os->os_dsl_dataset, FTAG); - return (error); - } - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (0); - } - - osname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) { - dmu_objset_rele(os, FTAG); - kmem_free(osname, MAXPATHLEN); - return (ENOENT); - } - p = osname + strlen(osname); - len = MAXPATHLEN - (p - osname); - -#if 0 - /* Prefetch the datasets. 
*/ - cookie = 0; - while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { - if (!dataset_name_hidden(osname)) - (void) dmu_objset_prefetch(osname, NULL); - } -#endif - - cookie = 0; - while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL, - &cookie) == 0) { - dmu_objset_rele(os, FTAG); - (void)zvol_create_minors_impl(osname); - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", - name, error); - return (error); - } - } - - dmu_objset_rele(os, FTAG); - kmem_free(osname, MAXPATHLEN); - return (0); -} - -static void -zvol_rename_minor(zvol_state_t *zv, const char *newname) -{ - struct g_geom *gp; - struct g_provider *pp; - struct cdev *dev; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - g_topology_lock(); - pp = zv->zv_provider; - ASSERT(pp != NULL); - gp = pp->geom; - ASSERT(gp != NULL); - - zv->zv_provider = NULL; - g_wither_provider(pp, ENXIO); - - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; - pp->sectorsize = DEV_BSIZE; - pp->mediasize = zv->zv_volsize; - pp->private = zv; - zv->zv_provider = pp; - g_error_provider(pp, 0); - g_topology_unlock(); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct make_dev_args args; - - if ((dev = zv->zv_dev) != NULL) { - zv->zv_dev = NULL; - destroy_dev(dev); - if (zv->zv_total_opens > 0) { - zv->zv_flags &= ~ZVOL_EXCL; - zv->zv_total_opens = 0; - zvol_last_close(zv); - } - } - - make_dev_args_init(&args); - args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; - args.mda_devsw = &zvol_cdevsw; - args.mda_cr = NULL; - args.mda_uid = UID_ROOT; - args.mda_gid = GID_OPERATOR; - args.mda_mode = 0640; - args.mda_si_drv2 = zv; - if (make_dev_s(&args, &zv->zv_dev, - "%s/%s", ZVOL_DRIVER, newname) == 0) - zv->zv_dev->si_iosize_max = MAXPHYS; - } - strlcpy(zv->zv_name, newname, sizeof(zv->zv_name)); -} - -void -zvol_rename_minors_impl(const 
char *oldname, const char *newname) -{ - char name[MAXPATHLEN]; - struct g_provider *pp; - struct g_geom *gp; - size_t oldnamelen, newnamelen; - zvol_state_t *zv; - char *namebuf; - boolean_t locked = B_FALSE; - - oldnamelen = strlen(oldname); - newnamelen = strlen(newname); - - /* See comment in zvol_open(). */ - if (!MUTEX_HELD(&zfsdev_state_lock)) { - mutex_enter(&zfsdev_state_lock); - locked = B_TRUE; - } - - LIST_FOREACH(zv, &all_zvols, zv_links) { - if (strcmp(zv->zv_name, oldname) == 0) { - zvol_rename_minor(zv, newname); - } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && - (zv->zv_name[oldnamelen] == '/' || - zv->zv_name[oldnamelen] == '@')) { - snprintf(name, sizeof(name), "%s%c%s", newname, - zv->zv_name[oldnamelen], - zv->zv_name + oldnamelen + 1); - zvol_rename_minor(zv, name); - } - } - - if (locked) - mutex_exit(&zfsdev_state_lock); -} - -static zvol_task_t * -zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2) -{ - zvol_task_t *task; - char *delim; - - task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); - task->op = op; - delim = strchr(name1, '/'); - strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); - - strlcpy(task->name1, name1, MAXNAMELEN); - if (name2 != NULL) - strlcpy(task->name2, name2, MAXNAMELEN); - - return (task); -} - -static void -zvol_task_free(zvol_task_t *task) -{ - kmem_free(task, sizeof (zvol_task_t)); -} - -/* - * The worker thread function performed asynchronously. 
- */ -static void -zvol_task_cb(void *param) -{ - zvol_task_t *task = (zvol_task_t *)param; - - switch (task->op) { - case ZVOL_ASYNC_CREATE_MINORS: - (void) zvol_create_minors_impl(task->name1); - break; - case ZVOL_ASYNC_REMOVE_MINORS: - zvol_remove_minors_impl(task->name1); - break; - case ZVOL_ASYNC_RENAME_MINORS: - zvol_rename_minors_impl(task->name1, task->name2); - break; - default: - VERIFY(0); - break; - } - - zvol_task_free(task); -} - -static void -zvol_minors_helper(spa_t *spa, zvol_async_op_t op, const char *name1, - const char *name2) -{ - zvol_task_t *task; - - if (dataset_name_hidden(name1)) - return; - if (name2 != NULL && dataset_name_hidden(name2)) - return; - task = zvol_task_alloc(op, name1, name2); - (void)taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); -} - -void -zvol_create_minors(spa_t *spa, const char *name) -{ - zvol_minors_helper(spa, ZVOL_ASYNC_CREATE_MINORS, name, NULL); -} - -void -zvol_remove_minors(spa_t *spa, const char *name) -{ - zvol_minors_helper(spa, ZVOL_ASYNC_REMOVE_MINORS, name, NULL); -} - -void -zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname) -{ - zvol_minors_helper(spa, ZVOL_ASYNC_RENAME_MINORS, oldname, newname); -} - -static int -zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td) -{ - zvol_state_t *zv = dev->si_drv2; - int err = 0; - - mutex_enter(&zfsdev_state_lock); - if (zv->zv_total_opens == 0) - err = zvol_first_open(zv); - if (err) { - mutex_exit(&zfsdev_state_lock); - return (err); - } - if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - err = SET_ERROR(EROFS); - goto out; - } - if (zv->zv_flags & ZVOL_EXCL) { - err = SET_ERROR(EBUSY); - goto out; - } -#ifdef FEXCL - if (flags & FEXCL) { - if (zv->zv_total_opens != 0) { - err = SET_ERROR(EBUSY); - goto out; - } - zv->zv_flags |= ZVOL_EXCL; - } -#endif - - zv->zv_total_opens++; - if (flags & (FSYNC | FDSYNC)) { - zv->zv_sync_cnt++; - if (zv->zv_sync_cnt == 1) - zil_async_to_sync(zv->zv_zilog, 
ZVOL_OBJ); - } - mutex_exit(&zfsdev_state_lock); - return (err); -out: - if (zv->zv_total_opens == 0) - zvol_last_close(zv); - mutex_exit(&zfsdev_state_lock); - return (err); -} - -static int -zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td) -{ - zvol_state_t *zv = dev->si_drv2; - - mutex_enter(&zfsdev_state_lock); - if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_total_opens == 1); - zv->zv_flags &= ~ZVOL_EXCL; - } - - /* - * If the open count is zero, this is a spurious close. - * That indicates a bug in the kernel / DDI framework. - */ - ASSERT(zv->zv_total_opens != 0); - - /* - * You may get multiple opens, but only one close. - */ - zv->zv_total_opens--; - if (flags & (FSYNC | FDSYNC)) - zv->zv_sync_cnt--; - - if (zv->zv_total_opens == 0) - zvol_last_close(zv); - - mutex_exit(&zfsdev_state_lock); - return (0); -} - -static int -zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) -{ - zvol_state_t *zv; - locked_range_t *lr; - off_t offset, length; - int i, error; - boolean_t sync; - - zv = dev->si_drv2; - - error = 0; - KASSERT(zv->zv_total_opens > 0, - ("Device with zero access count in zvol_d_ioctl")); - - i = IOCPARM_LEN(cmd); - switch (cmd) { - case DIOCGSECTORSIZE: - *(u_int *)data = DEV_BSIZE; - break; - case DIOCGMEDIASIZE: - *(off_t *)data = zv->zv_volsize; - break; - case DIOCGFLUSH: - zil_commit(zv->zv_zilog, ZVOL_OBJ); - break; - case DIOCGDELETE: - if (!zvol_unmap_enabled) - break; - - offset = ((off_t *)data)[0]; - length = ((off_t *)data)[1]; - if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || - offset < 0 || offset >= zv->zv_volsize || - length <= 0) { - printf("%s: offset=%jd length=%jd\n", __func__, offset, - length); - error = EINVAL; - break; - } - - lr = rangelock_enter(&zv->zv_rangelock, offset, length, - RL_WRITER); - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - sync = FALSE; - dmu_tx_abort(tx); - } else { - sync 
= (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - zvol_log_truncate(zv, tx, offset, length, sync); - dmu_tx_commit(tx); - error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, - offset, length); - } - rangelock_exit(lr); - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - break; - case DIOCGSTRIPESIZE: - *(off_t *)data = zv->zv_volblocksize; - break; - case DIOCGSTRIPEOFFSET: - *(off_t *)data = 0; - break; - case DIOCGATTR: { - spa_t *spa = dmu_objset_spa(zv->zv_objset); - struct diocgattr_arg *arg = (struct diocgattr_arg *)data; - uint64_t refd, avail, usedobjs, availobjs; - - if (strcmp(arg->name, "GEOM::candelete") == 0) - arg->value.i = 1; - else if (strcmp(arg->name, "blocksavail") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - arg->value.off = avail / DEV_BSIZE; - } else if (strcmp(arg->name, "blocksused") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - arg->value.off = refd / DEV_BSIZE; - } else if (strcmp(arg->name, "poolblocksavail") == 0) { - avail = metaslab_class_get_space(spa_normal_class(spa)); - avail -= metaslab_class_get_alloc(spa_normal_class(spa)); - arg->value.off = avail / DEV_BSIZE; - } else if (strcmp(arg->name, "poolblocksused") == 0) { - refd = metaslab_class_get_alloc(spa_normal_class(spa)); - arg->value.off = refd / DEV_BSIZE; - } else - error = ENOIOCTL; - break; - } - case FIOSEEKHOLE: - case FIOSEEKDATA: { - off_t *off = (off_t *)data; - uint64_t noff; - boolean_t hole; - - hole = (cmd == FIOSEEKHOLE); - noff = *off; - error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); - *off = noff; - break; - } - default: - error = ENOIOCTL; - } - - return (error); -} -#endif /* illumos */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c b/sys/cddl/contrib/opensolaris/uts/common/os/callb.c deleted file mode 100644 index da479087f869..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/os/callb.c +++ /dev/null @@ -1,438 +0,0 @@ -/* - * CDDL 
HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for delay() */ -#include /* For TASKQ_NAMELEN */ -#include - -#define CB_MAXNAME TASKQ_NAMELEN - -/* - * The callb mechanism provides generic event scheduling/echoing. - * A callb function is registered and called on behalf of the event. - */ -typedef struct callb { - struct callb *c_next; /* next in class or on freelist */ - kthread_id_t c_thread; /* ptr to caller's thread struct */ - char c_flag; /* info about the callb state */ - uchar_t c_class; /* this callb's class */ - kcondvar_t c_done_cv; /* signal callb completion */ - boolean_t (*c_func)(); /* cb function: returns true if ok */ - void *c_arg; /* arg to c_func */ - char c_name[CB_MAXNAME+1]; /* debug:max func name length */ -} callb_t; - -/* - * callb c_flag bitmap definitions - */ -#define CALLB_FREE 0x0 -#define CALLB_TAKEN 0x1 -#define CALLB_EXECUTING 0x2 - -/* - * Basic structure for a callb table. 
- * All callbs are organized into different class groups described - * by ct_class array. - * The callbs within a class are single-linked and normally run by a - * serial execution. - */ -typedef struct callb_table { - kmutex_t ct_lock; /* protect all callb states */ - callb_t *ct_freelist; /* free callb structures */ - int ct_busy; /* != 0 prevents additions */ - kcondvar_t ct_busy_cv; /* to wait for not busy */ - int ct_ncallb; /* num of callbs allocated */ - callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ -} callb_table_t; - -int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; - -static callb_id_t callb_add_common(boolean_t (*)(void *, int), - void *, int, char *, kthread_id_t); - -static callb_table_t callb_table; /* system level callback table */ -static callb_table_t *ct = &callb_table; -static kmutex_t callb_safe_mutex; -callb_cpr_t callb_cprinfo_safe = { - &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 }; - -/* - * Init all callb tables in the system. - */ -void -callb_init(void *dummy __unused) -{ - callb_table.ct_busy = 0; /* mark table open for additions */ - mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); -} - -void -callb_fini(void *dummy __unused) -{ - callb_t *cp; - int i; - - mutex_enter(&ct->ct_lock); - for (i = 0; i < 16; i++) { - while ((cp = ct->ct_freelist) != NULL) { - ct->ct_freelist = cp->c_next; - ct->ct_ncallb--; - kmem_free(cp, sizeof (callb_t)); - } - if (ct->ct_ncallb == 0) - break; - /* Not all callbacks finished, waiting for the rest. */ - mutex_exit(&ct->ct_lock); - tsleep(ct, 0, "callb", hz / 4); - mutex_enter(&ct->ct_lock); - } - if (ct->ct_ncallb > 0) - printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb); - mutex_exit(&ct->ct_lock); - mutex_destroy(&callb_safe_mutex); - mutex_destroy(&callb_table.ct_lock); -} - -/* - * callout_add() is called to register func() be called later. 
- */ -static callb_id_t -callb_add_common(boolean_t (*func)(void *arg, int code), - void *arg, int class, char *name, kthread_id_t t) -{ - callb_t *cp; - - ASSERT(class < NCBCLASS); - - mutex_enter(&ct->ct_lock); - while (ct->ct_busy) - cv_wait(&ct->ct_busy_cv, &ct->ct_lock); - if ((cp = ct->ct_freelist) == NULL) { - ct->ct_ncallb++; - cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); - } - ct->ct_freelist = cp->c_next; - cp->c_thread = t; - cp->c_func = func; - cp->c_arg = arg; - cp->c_class = (uchar_t)class; - cp->c_flag |= CALLB_TAKEN; -#ifdef DEBUG - if (strlen(name) > CB_MAXNAME) - cmn_err(CE_WARN, "callb_add: name of callback function '%s' " - "too long -- truncated to %d chars", - name, CB_MAXNAME); -#endif - (void) strncpy(cp->c_name, name, CB_MAXNAME); - cp->c_name[CB_MAXNAME] = '\0'; - - /* - * Insert the new callb at the head of its class list. - */ - cp->c_next = ct->ct_first_cb[class]; - ct->ct_first_cb[class] = cp; - - mutex_exit(&ct->ct_lock); - return ((callb_id_t)cp); -} - -/* - * The default function to add an entry to the callback table. Since - * it uses curthread as the thread identifier to store in the table, - * it should be used for the normal case of a thread which is calling - * to add ITSELF to the table. - */ -callb_id_t -callb_add(boolean_t (*func)(void *arg, int code), - void *arg, int class, char *name) -{ - return (callb_add_common(func, arg, class, name, curthread)); -} - -/* - * A special version of callb_add() above for use by threads which - * might be adding an entry to the table on behalf of some other - * thread (for example, one which is constructed but not yet running). - * In this version the thread id is an argument. 
- */ -callb_id_t -callb_add_thread(boolean_t (*func)(void *arg, int code), - void *arg, int class, char *name, kthread_id_t t) -{ - return (callb_add_common(func, arg, class, name, t)); -} - -/* - * callout_delete() is called to remove an entry identified by id - * that was originally placed there by a call to callout_add(). - * return -1 if fail to delete a callb entry otherwise return 0. - */ -int -callb_delete(callb_id_t id) -{ - callb_t **pp; - callb_t *me = (callb_t *)id; - - mutex_enter(&ct->ct_lock); - - for (;;) { - pp = &ct->ct_first_cb[me->c_class]; - while (*pp != NULL && *pp != me) - pp = &(*pp)->c_next; - -#ifdef DEBUG - if (*pp != me) { - cmn_err(CE_WARN, "callb delete bogus entry 0x%p", - (void *)me); - mutex_exit(&ct->ct_lock); - return (-1); - } -#endif /* DEBUG */ - - /* - * It is not allowed to delete a callb in the middle of - * executing otherwise, the callb_execute() will be confused. - */ - if (!(me->c_flag & CALLB_EXECUTING)) - break; - - cv_wait(&me->c_done_cv, &ct->ct_lock); - } - /* relink the class list */ - *pp = me->c_next; - - /* clean up myself and return the free callb to the head of freelist */ - me->c_flag = CALLB_FREE; - me->c_next = ct->ct_freelist; - ct->ct_freelist = me; - - mutex_exit(&ct->ct_lock); - return (0); -} - -/* - * class: indicates to execute all callbs in the same class; - * code: optional argument for the callb functions. 
- * return: = 0: success - * != 0: ptr to string supplied when callback was registered - */ -void * -callb_execute_class(int class, int code) -{ - callb_t *cp; - void *ret = NULL; - - ASSERT(class < NCBCLASS); - - mutex_enter(&ct->ct_lock); - - for (cp = ct->ct_first_cb[class]; - cp != NULL && ret == 0; cp = cp->c_next) { - while (cp->c_flag & CALLB_EXECUTING) - cv_wait(&cp->c_done_cv, &ct->ct_lock); - /* - * cont if the callb is deleted while we're sleeping - */ - if (cp->c_flag == CALLB_FREE) - continue; - cp->c_flag |= CALLB_EXECUTING; - -#ifdef CALLB_DEBUG - printf("callb_execute: name=%s func=%p arg=%p\n", - cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); -#endif /* CALLB_DEBUG */ - - mutex_exit(&ct->ct_lock); - /* If callback function fails, pass back client's name */ - if (!(*cp->c_func)(cp->c_arg, code)) - ret = cp->c_name; - mutex_enter(&ct->ct_lock); - - cp->c_flag &= ~CALLB_EXECUTING; - cv_broadcast(&cp->c_done_cv); - } - mutex_exit(&ct->ct_lock); - return (ret); -} - -/* - * callers make sure no recursive entries to this func. - * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. - * - * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we - * use a cv_timedwait() in case the kernel thread is blocked. - * - * Note that this is a generic callback handler for daemon CPR and - * should NOT be changed to accommodate any specific requirement in a daemon. - * Individual daemons that require changes to the handler shall write - * callback routines in their own daemon modules. - */ -boolean_t -callb_generic_cpr(void *arg, int code) -{ - callb_cpr_t *cp = (callb_cpr_t *)arg; - clock_t ret = 0; /* assume success */ - - mutex_enter(cp->cc_lockp); - - switch (code) { - case CB_CODE_CPR_CHKPT: - cp->cc_events |= CALLB_CPR_START; -#ifdef CPR_NOT_THREAD_SAFE - while (!(cp->cc_events & CALLB_CPR_SAFE)) - /* cv_timedwait() returns -1 if it times out. 
*/ - if ((ret = cv_reltimedwait(&cp->cc_callb_cv, - cp->cc_lockp, (callb_timeout_sec * hz), - TR_CLOCK_TICK)) == -1) - break; -#endif - break; - - case CB_CODE_CPR_RESUME: - cp->cc_events &= ~CALLB_CPR_START; - cv_signal(&cp->cc_stop_cv); - break; - } - mutex_exit(cp->cc_lockp); - return (ret != -1); -} - -/* - * The generic callback function associated with kernel threads which - * are always considered safe. - */ -/* ARGSUSED */ -boolean_t -callb_generic_cpr_safe(void *arg, int code) -{ - return (B_TRUE); -} -/* - * Prevent additions to callback table. - */ -void -callb_lock_table(void) -{ - mutex_enter(&ct->ct_lock); - ASSERT(ct->ct_busy == 0); - ct->ct_busy = 1; - mutex_exit(&ct->ct_lock); -} - -/* - * Allow additions to callback table. - */ -void -callb_unlock_table(void) -{ - mutex_enter(&ct->ct_lock); - ASSERT(ct->ct_busy != 0); - ct->ct_busy = 0; - cv_broadcast(&ct->ct_busy_cv); - mutex_exit(&ct->ct_lock); -} - -#ifdef illumos -/* - * Return a boolean value indicating whether a particular kernel thread is - * stopped in accordance with the cpr callback protocol. If returning - * false, also return a pointer to the thread name via the 2nd argument. - */ -boolean_t -callb_is_stopped(kthread_id_t tp, caddr_t *thread_name) -{ - callb_t *cp; - boolean_t ret_val; - - mutex_enter(&ct->ct_lock); - - for (cp = ct->ct_first_cb[CB_CL_CPR_DAEMON]; - cp != NULL && tp != cp->c_thread; cp = cp->c_next) - ; - - ret_val = (cp != NULL); - if (ret_val) { - /* - * We found the thread in the callback table and have - * provisionally set the return value to true. Now - * see if it is marked "safe" and is sleeping or stopped. 
- */ - callb_cpr_t *ccp = (callb_cpr_t *)cp->c_arg; - - *thread_name = cp->c_name; /* in case not stopped */ - mutex_enter(ccp->cc_lockp); - - if (ccp->cc_events & CALLB_CPR_SAFE) { - int retry; - - mutex_exit(ccp->cc_lockp); - for (retry = 0; retry < CALLB_MAX_RETRY; retry++) { - thread_lock(tp); - if (tp->t_state & (TS_SLEEP | TS_STOPPED)) { - thread_unlock(tp); - break; - } - thread_unlock(tp); - delay(CALLB_THREAD_DELAY); - } - ret_val = retry < CALLB_MAX_RETRY; - } else { - ret_val = - (ccp->cc_events & CALLB_CPR_ALWAYS_SAFE) != 0; - mutex_exit(ccp->cc_lockp); - } - } else { - /* - * Thread not found in callback table. Make the best - * attempt to identify the thread in the error message. - */ - ulong_t offset; - char *sym = kobj_getsymname((uintptr_t)tp->t_startpc, - &offset); - - *thread_name = sym ? sym : "*unknown*"; - } - - mutex_exit(&ct->ct_lock); - return (ret_val); -} -#endif /* illumos */ - -SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL); -SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/fm.c b/sys/cddl/contrib/opensolaris/uts/common/os/fm.c deleted file mode 100644 index 21aac7a1b49d..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/os/fm.c +++ /dev/null @@ -1,1399 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -/* - * Fault Management Architecture (FMA) Resource and Protocol Support - * - * The routines contained herein provide services to support kernel subsystems - * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). - * - * Name-Value Pair Lists - * - * The embodiment of an FMA protocol element (event, fmri or authority) is a - * name-value pair list (nvlist_t). FMA-specific nvlist construtor and - * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used - * to create an nvpair list using custom allocators. Callers may choose to - * allocate either from the kernel memory allocator, or from a preallocated - * buffer, useful in constrained contexts like high-level interrupt routines. - * - * Protocol Event and FMRI Construction - * - * Convenience routines are provided to construct nvlist events according to - * the FMA Event Protocol and Naming Schema specification for ereports and - * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. - * - * ENA Manipulation - * - * Routines to generate ENA formats 0, 1 and 2 are available as well as - * routines to increment formats 1 and 2. Individual fields within the - * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), - * fm_ena_format_get() and fm_ena_gen_get(). - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These - * values must be kept in sync with the FMA source code in usr/src/cmd/fm. 
- */ -static const char *fm_url = "http://www.sun.com/msg"; -static const char *fm_msgid = "SUNOS-8000-0G"; -static char *volatile fm_panicstr = NULL; - -#ifdef illumos -errorq_t *ereport_errorq; -#endif -void *ereport_dumpbuf; -size_t ereport_dumplen; - -static uint_t ereport_chanlen = ERPT_EVCH_MAX; -static evchan_t *ereport_chan = NULL; -static ulong_t ereport_qlen = 0; -static size_t ereport_size = 0; -static int ereport_cols = 80; - -extern void fastreboot_disable_highpil(void); - -/* - * Common fault management kstats to record ereport generation - * failures - */ - -struct erpt_kstat { - kstat_named_t erpt_dropped; /* num erpts dropped on post */ - kstat_named_t erpt_set_failed; /* num erpt set failures */ - kstat_named_t fmri_set_failed; /* num fmri set failures */ - kstat_named_t payload_set_failed; /* num payload set failures */ -}; - -static struct erpt_kstat erpt_kstat_data = { - { "erpt-dropped", KSTAT_DATA_UINT64 }, - { "erpt-set-failed", KSTAT_DATA_UINT64 }, - { "fmri-set-failed", KSTAT_DATA_UINT64 }, - { "payload-set-failed", KSTAT_DATA_UINT64 } -}; - -#ifdef illumos -/*ARGSUSED*/ -static void -fm_drain(void *private, void *data, errorq_elem_t *eep) -{ - nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep); - - if (!KERNEL_PANICKED()) - (void) fm_ereport_post(nvl, EVCH_TRYHARD); - else - fm_nvprint(nvl); -} -#endif - -void -fm_init(void) -{ - kstat_t *ksp; - -#ifdef illumos - (void) sysevent_evc_bind(FM_ERROR_CHAN, - &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND); - - (void) sysevent_evc_control(ereport_chan, - EVCH_SET_CHAN_LEN, &ereport_chanlen); -#endif - - if (ereport_qlen == 0) - ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4); - - if (ereport_size == 0) - ereport_size = ERPT_DATA_SZ; - -#ifdef illumos - ereport_errorq = errorq_nvcreate("fm_ereport_queue", - (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size, - FM_ERR_PIL, ERRORQ_VITAL); - if (ereport_errorq == NULL) - panic("failed to create required ereport error queue"); -#endif - - 
ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP); - ereport_dumplen = ereport_size; - - /* Initialize ereport allocation and generation kstats */ - ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED, - sizeof (struct erpt_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (ksp != NULL) { - ksp->ks_data = &erpt_kstat_data; - kstat_install(ksp); - } else { - cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); - - } -} - -#ifdef illumos -/* - * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of - * output so they aren't split across console lines, and return the end column. - */ -/*PRINTFLIKE4*/ -static int -fm_printf(int depth, int c, int cols, const char *format, ...) -{ - va_list ap; - int width; - char c1; - - va_start(ap, format); - width = vsnprintf(&c1, sizeof (c1), format, ap); - va_end(ap); - - if (c + width >= cols) { - console_printf("\n\r"); - c = 0; - if (format[0] != ' ' && depth > 0) { - console_printf(" "); - c++; - } - } - - va_start(ap, format); - console_vprintf(format, ap); - va_end(ap); - - return ((c + width) % cols); -} - -/* - * Recursively print a nvlist in the specified column width and return the - * column we end up in. This function is called recursively by fm_nvprint(), - * below. We generically format the entire nvpair using hexadecimal - * integers and strings, and elide any integer arrays. Arrays are basically - * used for cache dumps right now, so we suppress them so as not to overwhelm - * the amount of console output we produce at panic time. This can be further - * enhanced as FMA technology grows based upon the needs of consumers. All - * FMA telemetry is logged using the dump device transport, so the console - * output serves only as a fallback in case this procedure is unsuccessful. 
- */ -static int -fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) -{ - nvpair_t *nvp; - - for (nvp = nvlist_next_nvpair(nvl, NULL); - nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { - - data_type_t type = nvpair_type(nvp); - const char *name = nvpair_name(nvp); - - boolean_t b; - uint8_t i8; - uint16_t i16; - uint32_t i32; - uint64_t i64; - char *str; - nvlist_t *cnv; - - if (strcmp(name, FM_CLASS) == 0) - continue; /* already printed by caller */ - - c = fm_printf(d, c, cols, " %s=", name); - - switch (type) { - case DATA_TYPE_BOOLEAN: - c = fm_printf(d + 1, c, cols, " 1"); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - (void) nvpair_value_boolean_value(nvp, &b); - c = fm_printf(d + 1, c, cols, b ? "1" : "0"); - break; - - case DATA_TYPE_BYTE: - (void) nvpair_value_byte(nvp, &i8); - c = fm_printf(d + 1, c, cols, "%x", i8); - break; - - case DATA_TYPE_INT8: - (void) nvpair_value_int8(nvp, (void *)&i8); - c = fm_printf(d + 1, c, cols, "%x", i8); - break; - - case DATA_TYPE_UINT8: - (void) nvpair_value_uint8(nvp, &i8); - c = fm_printf(d + 1, c, cols, "%x", i8); - break; - - case DATA_TYPE_INT16: - (void) nvpair_value_int16(nvp, (void *)&i16); - c = fm_printf(d + 1, c, cols, "%x", i16); - break; - - case DATA_TYPE_UINT16: - (void) nvpair_value_uint16(nvp, &i16); - c = fm_printf(d + 1, c, cols, "%x", i16); - break; - - case DATA_TYPE_INT32: - (void) nvpair_value_int32(nvp, (void *)&i32); - c = fm_printf(d + 1, c, cols, "%x", i32); - break; - - case DATA_TYPE_UINT32: - (void) nvpair_value_uint32(nvp, &i32); - c = fm_printf(d + 1, c, cols, "%x", i32); - break; - - case DATA_TYPE_INT64: - (void) nvpair_value_int64(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_UINT64: - (void) nvpair_value_uint64(nvp, &i64); - c = fm_printf(d + 1, c, cols, "%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_HRTIME: - (void) nvpair_value_hrtime(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "%llx", - 
(u_longlong_t)i64); - break; - - case DATA_TYPE_STRING: - (void) nvpair_value_string(nvp, &str); - c = fm_printf(d + 1, c, cols, "\"%s\"", - str ? str : ""); - break; - - case DATA_TYPE_NVLIST: - c = fm_printf(d + 1, c, cols, "["); - (void) nvpair_value_nvlist(nvp, &cnv); - c = fm_nvprintr(cnv, d + 1, c, cols); - c = fm_printf(d + 1, c, cols, " ]"); - break; - - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "["); - (void) nvpair_value_nvlist_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) { - c = fm_nvprintr(val[i], d + 1, c, cols); - } - c = fm_printf(d + 1, c, cols, " ]"); - } - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - c = fm_printf(d + 1, c, cols, "[...]"); - break; - case DATA_TYPE_UNKNOWN: - c = fm_printf(d + 1, c, cols, ""); - break; - } - } - - return (c); -} - -void -fm_nvprint(nvlist_t *nvl) -{ - char *class; - int c = 0; - - console_printf("\r"); - - if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0) - c = fm_printf(0, c, ereport_cols, "%s", class); - - if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0) - console_printf("\n"); - - console_printf("\n"); -} - -/* - * Wrapper for panic() that first produces an FMA-style message for admins. - * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this - * is the one exception to that rule and the only error that gets messaged. - * This function is intended for use by subsystems that have detected a fatal - * error and enqueued appropriate ereports and wish to then force a panic. - */ -/*PRINTFLIKE1*/ -void -fm_panic(const char *format, ...) 
-{ - va_list ap; - - (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format); -#if defined(__i386) || defined(__amd64) - fastreboot_disable_highpil(); -#endif /* __i386 || __amd64 */ - va_start(ap, format); - vpanic(format, ap); - va_end(ap); -} - -/* - * Simply tell the caller if fm_panicstr is set, ie. an fma event has - * caused the panic. If so, something other than the default panic - * diagnosis method will diagnose the cause of the panic. - */ -int -is_fm_panic() -{ - if (fm_panicstr) - return (1); - else - return (0); -} - -/* - * Print any appropriate FMA banner message before the panic message. This - * function is called by panicsys() and prints the message for fm_panic(). - * We print the message here so that it comes after the system is quiesced. - * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix). - * The rest of the message is for the console only and not needed in the log, - * so it is printed using console_printf(). We break it up into multiple - * chunks so as to avoid overflowing any small legacy prom_printf() buffers. - */ -void -fm_banner(void) -{ - timespec_t tod; - hrtime_t now; - - if (!fm_panicstr) - return; /* panic was not initiated by fm_panic(); do nothing */ - - if (KERNEL_PANICKED()) { - tod = panic_hrestime; - now = panic_hrtime; - } else { - gethrestime(&tod); - now = gethrtime_waitfree(); - } - - cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, " - "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid); - - console_printf( -"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n" -"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n", - fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now); - - console_printf( -"PLATFORM: %s, CSN: -, HOSTNAME: %s\n" -"SOURCE: %s, REV: %s %s\n", - platform, utsname.nodename, utsname.sysname, - utsname.release, utsname.version); - - console_printf( -"DESC: Errors have been detected that require a reboot to ensure system\n" -"integrity. 
See %s/%s for more information.\n", - fm_url, fm_msgid); - - console_printf( -"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n" -"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n" -"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n"); - - console_printf("\n"); -} - -/* - * Utility function to write all of the pending ereports to the dump device. - * This function is called at either normal reboot or panic time, and simply - * iterates over the in-transit messages in the ereport sysevent channel. - */ -void -fm_ereport_dump(void) -{ - evchanq_t *chq; - sysevent_t *sep; - erpt_dump_t ed; - - timespec_t tod; - hrtime_t now; - char *buf; - size_t len; - - if (KERNEL_PANICKED()) { - tod = panic_hrestime; - now = panic_hrtime; - } else { - if (ereport_errorq != NULL) - errorq_drain(ereport_errorq); - gethrestime(&tod); - now = gethrtime_waitfree(); - } - - /* - * In the panic case, sysevent_evc_walk_init() will return NULL. - */ - if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL && - !KERNEL_PANICKED()) - return; /* event channel isn't initialized yet */ - - while ((sep = sysevent_evc_walk_step(chq)) != NULL) { - if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL) - break; - - ed.ed_magic = ERPT_MAGIC; - ed.ed_chksum = checksum32(buf, len); - ed.ed_size = (uint32_t)len; - ed.ed_pad = 0; - ed.ed_hrt_nsec = SE_TIME(sep); - ed.ed_hrt_base = now; - ed.ed_tod_base.sec = tod.tv_sec; - ed.ed_tod_base.nsec = tod.tv_nsec; - - dumpvp_write(&ed, sizeof (ed)); - dumpvp_write(buf, len); - } - - sysevent_evc_walk_fini(chq); -} -#endif - -/* - * Post an error report (ereport) to the sysevent error channel. The error - * channel must be established with a prior call to sysevent_evc_create() - * before publication may occur. 
- */ -void -fm_ereport_post(nvlist_t *ereport, int evc_flag) -{ - size_t nvl_size = 0; - evchan_t *error_chan; - sysevent_id_t eid; - - (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE); - if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { - atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); - return; - } - -#ifdef illumos - if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan, - EVCH_CREAT|EVCH_HOLD_PEND) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); - return; - } - - if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR, - SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); - (void) sysevent_evc_unbind(error_chan); - return; - } - (void) sysevent_evc_unbind(error_chan); -#else - (void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS, - ESC_DEV_DLE, ereport, &eid, DDI_SLEEP); -#endif -} - -/* - * Wrapppers for FM nvlist allocators - */ -/* ARGSUSED */ -static void * -i_fm_alloc(nv_alloc_t *nva, size_t size) -{ - return (kmem_zalloc(size, KM_SLEEP)); -} - -/* ARGSUSED */ -static void -i_fm_free(nv_alloc_t *nva, void *buf, size_t size) -{ - kmem_free(buf, size); -} - -const nv_alloc_ops_t fm_mem_alloc_ops = { - NULL, - NULL, - i_fm_alloc, - i_fm_free, - NULL -}; - -/* - * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer - * to the newly allocated nv_alloc_t structure is returned upon success or NULL - * is returned to indicate that the nv_alloc structure could not be created. - */ -nv_alloc_t * -fm_nva_xcreate(char *buf, size_t bufsz) -{ - nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); - - if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { - kmem_free(nvhdl, sizeof (nv_alloc_t)); - return (NULL); - } - - return (nvhdl); -} - -/* - * Destroy a previously allocated nv_alloc structure. The fixed buffer - * associated with nva must be freed by the caller. 
- */ -void -fm_nva_xdestroy(nv_alloc_t *nva) -{ - nv_alloc_fini(nva); - kmem_free(nva, sizeof (nv_alloc_t)); -} - -/* - * Create a new nv list. A pointer to a new nv list structure is returned - * upon success or NULL is returned to indicate that the structure could - * not be created. The newly created nv list is created and managed by the - * operations installed in nva. If nva is NULL, the default FMA nva - * operations are installed and used. - * - * When called from the kernel and nva == NULL, this function must be called - * from passive kernel context with no locks held that can prevent a - * sleeping memory allocation from occurring. Otherwise, this function may - * be called from other kernel contexts as long a valid nva created via - * fm_nva_create() is supplied. - */ -nvlist_t * -fm_nvlist_create(nv_alloc_t *nva) -{ - int hdl_alloced = 0; - nvlist_t *nvl; - nv_alloc_t *nvhdl; - - if (nva == NULL) { - nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); - - if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { - kmem_free(nvhdl, sizeof (nv_alloc_t)); - return (NULL); - } - hdl_alloced = 1; - } else { - nvhdl = nva; - } - - if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { - if (hdl_alloced) { - nv_alloc_fini(nvhdl); - kmem_free(nvhdl, sizeof (nv_alloc_t)); - } - return (NULL); - } - - return (nvl); -} - -/* - * Destroy a previously allocated nvlist structure. flag indicates whether - * or not the associated nva structure should be freed (FM_NVA_FREE) or - * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows - * it to be re-used for future nvlist creation operations. 
- */ -void -fm_nvlist_destroy(nvlist_t *nvl, int flag) -{ - nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl); - - nvlist_free(nvl); - - if (nva != NULL) { - if (flag == FM_NVA_FREE) - fm_nva_xdestroy(nva); - } -} - -int -i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) -{ - int nelem, ret = 0; - data_type_t type; - - while (ret == 0 && name != NULL) { - type = va_arg(ap, data_type_t); - switch (type) { - case DATA_TYPE_BYTE: - ret = nvlist_add_byte(payload, name, - va_arg(ap, uint_t)); - break; - case DATA_TYPE_BYTE_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_byte_array(payload, name, - va_arg(ap, uchar_t *), nelem); - break; - case DATA_TYPE_BOOLEAN_VALUE: - ret = nvlist_add_boolean_value(payload, name, - va_arg(ap, boolean_t)); - break; - case DATA_TYPE_BOOLEAN_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_boolean_array(payload, name, - va_arg(ap, boolean_t *), nelem); - break; - case DATA_TYPE_INT8: - ret = nvlist_add_int8(payload, name, - va_arg(ap, int)); - break; - case DATA_TYPE_INT8_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int8_array(payload, name, - va_arg(ap, int8_t *), nelem); - break; - case DATA_TYPE_UINT8: - ret = nvlist_add_uint8(payload, name, - va_arg(ap, uint_t)); - break; - case DATA_TYPE_UINT8_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint8_array(payload, name, - va_arg(ap, uint8_t *), nelem); - break; - case DATA_TYPE_INT16: - ret = nvlist_add_int16(payload, name, - va_arg(ap, int)); - break; - case DATA_TYPE_INT16_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int16_array(payload, name, - va_arg(ap, int16_t *), nelem); - break; - case DATA_TYPE_UINT16: - ret = nvlist_add_uint16(payload, name, - va_arg(ap, uint_t)); - break; - case DATA_TYPE_UINT16_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint16_array(payload, name, - va_arg(ap, uint16_t *), nelem); - break; - case DATA_TYPE_INT32: - ret = nvlist_add_int32(payload, name, - va_arg(ap, int32_t)); - break; - case 
DATA_TYPE_INT32_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int32_array(payload, name, - va_arg(ap, int32_t *), nelem); - break; - case DATA_TYPE_UINT32: - ret = nvlist_add_uint32(payload, name, - va_arg(ap, uint32_t)); - break; - case DATA_TYPE_UINT32_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint32_array(payload, name, - va_arg(ap, uint32_t *), nelem); - break; - case DATA_TYPE_INT64: - ret = nvlist_add_int64(payload, name, - va_arg(ap, int64_t)); - break; - case DATA_TYPE_INT64_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int64_array(payload, name, - va_arg(ap, int64_t *), nelem); - break; - case DATA_TYPE_UINT64: - ret = nvlist_add_uint64(payload, name, - va_arg(ap, uint64_t)); - break; - case DATA_TYPE_UINT64_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint64_array(payload, name, - va_arg(ap, uint64_t *), nelem); - break; - case DATA_TYPE_STRING: - ret = nvlist_add_string(payload, name, - va_arg(ap, char *)); - break; - case DATA_TYPE_STRING_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_string_array(payload, name, - va_arg(ap, char **), nelem); - break; - case DATA_TYPE_NVLIST: - ret = nvlist_add_nvlist(payload, name, - va_arg(ap, nvlist_t *)); - break; - case DATA_TYPE_NVLIST_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_nvlist_array(payload, name, - va_arg(ap, nvlist_t **), nelem); - break; - default: - ret = EINVAL; - } - - name = va_arg(ap, char *); - } - return (ret); -} - -void -fm_payload_set(nvlist_t *payload, ...) 
-{ - int ret; - const char *name; - va_list ap; - - va_start(ap, payload); - name = va_arg(ap, char *); - ret = i_fm_payload_set(payload, name, ap); - va_end(ap); - - if (ret) - atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64); -} - -/* - * Set-up and validate the members of an ereport event according to: - * - * Member name Type Value - * ==================================================== - * class string ereport - * version uint8_t 0 - * ena uint64_t - * detector nvlist_t - * ereport-payload nvlist_t - * - * We don't actually add a 'version' member to the payload. Really, - * the version quoted to us by our caller is that of the category 1 - * "ereport" event class (and we require FM_EREPORT_VERS0) but - * the payload version of the actual leaf class event under construction - * may be something else. Callers should supply a version in the varargs, - * or (better) we could take two version arguments - one for the - * ereport category 1 classification (expect FM_EREPORT_VERS0) and one - * for the leaf class. - */ -void -fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, - uint64_t ena, const nvlist_t *detector, ...) 
-{ - char ereport_class[FM_MAX_CLASS]; - const char *name; - va_list ap; - int ret; - - if (version != FM_EREPORT_VERS0) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - return; - } - - (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s", - FM_EREPORT_CLASS, erpt_class); - if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - } - - if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR, - (nvlist_t *)detector) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - } - - va_start(ap, detector); - name = va_arg(ap, const char *); - ret = i_fm_payload_set(ereport, name, ap); - va_end(ap); - - if (ret) - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); -} - -/* - * Set-up and validate the members of an hc fmri according to; - * - * Member name Type Value - * =================================================== - * version uint8_t 0 - * auth nvlist_t - * hc-name string - * hc-id string - * - * Note that auth and hc-id are optional members. - */ - -#define HC_MAXPAIRS 20 -#define HC_MAXNAMELEN 50 - -static int -fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) -{ - if (version != FM_HC_SCHEME_VERSION) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return (0); - } - - if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || - nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return (0); - } - - if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, - (nvlist_t *)auth) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return (0); - } - - return (1); -} - -void -fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, - nvlist_t *snvl, int npairs, ...) 
-{ - nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); - nvlist_t *pairs[HC_MAXPAIRS]; - va_list ap; - int i; - - if (!fm_fmri_hc_set_common(fmri, version, auth)) - return; - - npairs = MIN(npairs, HC_MAXPAIRS); - - va_start(ap, npairs); - for (i = 0; i < npairs; i++) { - const char *name = va_arg(ap, const char *); - uint32_t id = va_arg(ap, uint32_t); - char idstr[11]; - - (void) snprintf(idstr, sizeof (idstr), "%u", id); - - pairs[i] = fm_nvlist_create(nva); - if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || - nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } - va_end(ap); - - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - - for (i = 0; i < npairs; i++) - fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); - - if (snvl != NULL) { - if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } -} - -/* - * Set-up and validate the members of an dev fmri according to: - * - * Member name Type Value - * ==================================================== - * version uint8_t 0 - * auth nvlist_t - * devpath string - * [devid] string - * [target-port-l0id] string - * - * Note that auth and devid are optional members. 
- */ -void -fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth, - const char *devpath, const char *devid, const char *tpl0) -{ - int err = 0; - - if (version != DEV_SCHEME_VERSION0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version); - err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV); - - if (auth != NULL) { - err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY, - (nvlist_t *)auth); - } - - err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath); - - if (devid != NULL) - err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid); - - if (tpl0 != NULL) - err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0); - - if (err) - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - -} - -/* - * Set-up and validate the members of an cpu fmri according to: - * - * Member name Type Value - * ==================================================== - * version uint8_t 0 - * auth nvlist_t - * cpuid uint32_t - * cpumask uint8_t - * serial uint64_t - * - * Note that auth, cpumask, serial are optional members. 
- * - */ -void -fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, - uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) -{ - uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; - - if (version < CPU_SCHEME_VERSION1) { - atomic_inc_64(failedp); - return; - } - - if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { - atomic_inc_64(failedp); - return; - } - - if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, - FM_FMRI_SCHEME_CPU) != 0) { - atomic_inc_64(failedp); - return; - } - - if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, - (nvlist_t *)auth) != 0) - atomic_inc_64(failedp); - - if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) - atomic_inc_64(failedp); - - if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, - *cpu_maskp) != 0) - atomic_inc_64(failedp); - - if (serial_idp == NULL || nvlist_add_string(fmri_cpu, - FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) - atomic_inc_64(failedp); -} - -/* - * Set-up and validate the members of a mem according to: - * - * Member name Type Value - * ==================================================== - * version uint8_t 0 - * auth nvlist_t [optional] - * unum string - * serial string [optional*] - * offset uint64_t [optional] - * - * * serial is required if offset is present - */ -void -fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, - const char *unum, const char *serial, uint64_t offset) -{ - if (version != MEM_SCHEME_VERSION0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (!serial && (offset != (uint64_t)-1)) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - 
- if (auth != NULL) { - if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, - (nvlist_t *)auth) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } - - if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - } - - if (serial != NULL) { - if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, - (char **)&serial, 1) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri, - FM_FMRI_MEM_OFFSET, offset) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } -} - -void -fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, - uint64_t vdev_guid) -{ - if (version != ZFS_SCHEME_VERSION0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - } - - if (vdev_guid != 0) { - if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } -} - -uint64_t -fm_ena_increment(uint64_t ena) -{ - uint64_t new_ena; - - switch (ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); - break; - case FM_ENA_FMT2: - new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); - break; - default: - new_ena = 0; - } - - return (new_ena); -} - -uint64_t -fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) -{ - uint64_t ena = 0; - - switch (format) { - case FM_ENA_FMT1: - if (timestamp) { - ena = (uint64_t)((format & ENA_FORMAT_MASK) | - ((cpuid << 
ENA_FMT1_CPUID_SHFT) & - ENA_FMT1_CPUID_MASK) | - ((timestamp << ENA_FMT1_TIME_SHFT) & - ENA_FMT1_TIME_MASK)); - } else { - ena = (uint64_t)((format & ENA_FORMAT_MASK) | - ((cpuid << ENA_FMT1_CPUID_SHFT) & - ENA_FMT1_CPUID_MASK) | - ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) & - ENA_FMT1_TIME_MASK)); - } - break; - case FM_ENA_FMT2: - ena = (uint64_t)((format & ENA_FORMAT_MASK) | - ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); - break; - default: - break; - } - - return (ena); -} - -uint64_t -fm_ena_generate(uint64_t timestamp, uchar_t format) -{ - return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format)); -} - -uint64_t -fm_ena_generation_get(uint64_t ena) -{ - uint64_t gen; - - switch (ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; - break; - case FM_ENA_FMT2: - gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; - break; - default: - gen = 0; - break; - } - - return (gen); -} - -uchar_t -fm_ena_format_get(uint64_t ena) -{ - - return (ENA_FORMAT(ena)); -} - -uint64_t -fm_ena_id_get(uint64_t ena) -{ - uint64_t id; - - switch (ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; - break; - case FM_ENA_FMT2: - id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; - break; - default: - id = 0; - } - - return (id); -} - -uint64_t -fm_ena_time_get(uint64_t ena) -{ - uint64_t time; - - switch (ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT; - break; - case FM_ENA_FMT2: - time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; - break; - default: - time = 0; - } - - return (time); -} - -#ifdef illumos -/* - * Convert a getpcstack() trace to symbolic name+offset, and add the resulting - * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK. 
- */ -void -fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth) -{ - int i; - char *sym; - ulong_t off; - char *stkpp[FM_STK_DEPTH]; - char buf[FM_STK_DEPTH * FM_SYM_SZ]; - char *stkp = buf; - - for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) { - if ((sym = kobj_getsymname(stack[i], &off)) != NULL) - (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off); - else - (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]); - stkpp[i] = stkp; - } - - fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK, - DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL); -} -#endif - -#ifdef illumos -void -print_msg_hwerr(ctid_t ct_id, proc_t *p) -{ - uprintf("Killed process %d (%s) in contract id %d " - "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id); -} -#endif - -void -fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, - nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) -{ - nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); - nvlist_t *pairs[HC_MAXPAIRS]; - nvlist_t **hcl; - uint_t n; - int i, j; - va_list ap; - char *hcname, *hcid; - - if (!fm_fmri_hc_set_common(fmri, version, auth)) - return; - - /* - * copy the bboard nvpairs to the pairs array - */ - if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) - != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - for (i = 0; i < n; i++) { - if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, - &hcname) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - pairs[i] = fm_nvlist_create(nva); - if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || - nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { - for (j = 0; j <= i; j++) { - if (pairs[j] != NULL) - fm_nvlist_destroy(pairs[j], - FM_NVA_RETAIN); - } - atomic_inc_64( - 
&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - } - - /* - * create the pairs from passed in pairs - */ - npairs = MIN(npairs, HC_MAXPAIRS); - - va_start(ap, npairs); - for (i = n; i < npairs + n; i++) { - const char *name = va_arg(ap, const char *); - uint32_t id = va_arg(ap, uint32_t); - char idstr[11]; - (void) snprintf(idstr, sizeof (idstr), "%u", id); - pairs[i] = fm_nvlist_create(nva); - if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || - nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { - for (j = 0; j <= i; j++) { - if (pairs[j] != NULL) - fm_nvlist_destroy(pairs[j], - FM_NVA_RETAIN); - } - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - } - va_end(ap); - - /* - * Create the fmri hc list - */ - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, - npairs + n) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - for (i = 0; i < npairs + n; i++) { - fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); - } - - if (snvl != NULL) { - if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - } -} diff --git a/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c b/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c deleted file mode 100644 index 3682853de902..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include - -static void * -nv_alloc_sys(nv_alloc_t *nva, size_t size) -{ - return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg)); -} - -/*ARGSUSED*/ -static void -nv_free_sys(nv_alloc_t *nva, void *buf, size_t size) -{ - kmem_free(buf, size); -} - -static const nv_alloc_ops_t system_ops = { - NULL, /* nv_ao_init() */ - NULL, /* nv_ao_fini() */ - nv_alloc_sys, /* nv_ao_alloc() */ - nv_free_sys, /* nv_ao_free() */ - NULL /* nv_ao_reset() */ -}; - -nv_alloc_t nv_alloc_sleep_def = { - &system_ops, - (void *)KM_SLEEP -}; - -nv_alloc_t nv_alloc_nosleep_def = { - &system_ops, - (void *)KM_NOSLEEP -}; - -nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; -nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h deleted file mode 100644 index b81678ca07d2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h +++ /dev/null @@ -1,313 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2014 Garrett D'Amore - * - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright 2017 RackTop Systems. - */ - -#ifndef _SYS_ACL_H -#define _SYS_ACL_H - -#include -#include - -#if defined(_KERNEL) -/* - * When compiling OpenSolaris kernel code, this file is included instead of the - * FreeBSD one. Include the original sys/acl.h as well. - */ -#undef _SYS_ACL_H -#include_next -#define _SYS_ACL_H -#endif /* _KERNEL */ - -#ifdef __cplusplus -extern "C" { -#endif - -#define MAX_ACL_ENTRIES (1024) /* max entries of each type */ -typedef struct { - int a_type; /* the type of ACL entry */ - uid_t a_id; /* the entry in -uid or gid */ - o_mode_t a_perm; /* the permission field */ -} aclent_t; - -typedef struct ace { - uid_t a_who; /* uid or gid */ - uint32_t a_access_mask; /* read,write,... */ - uint16_t a_flags; /* see below */ - uint16_t a_type; /* allow or deny */ -} ace_t; - -#ifndef _KERNEL -typedef struct acl_info acl_t; -#endif - -/* - * The following are Defined types for an aclent_t. 
- */ -#define USER_OBJ (0x01) /* object owner */ -#define USER (0x02) /* additional users */ -#define GROUP_OBJ (0x04) /* owning group of the object */ -#define GROUP (0x08) /* additional groups */ -#define CLASS_OBJ (0x10) /* file group class and mask entry */ -#define OTHER_OBJ (0x20) /* other entry for the object */ -#define ACL_DEFAULT (0x1000) /* default flag */ -/* default object owner */ -#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) -/* default additional users */ -#define DEF_USER (ACL_DEFAULT | USER) -/* default owning group */ -#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) -/* default additional groups */ -#define DEF_GROUP (ACL_DEFAULT | GROUP) -/* default mask entry */ -#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) -/* default other entry */ -#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) - -/* - * The following are defined for ace_t. - */ -#define ACE_READ_DATA 0x00000001 -#define ACE_LIST_DIRECTORY 0x00000001 -#define ACE_WRITE_DATA 0x00000002 -#define ACE_ADD_FILE 0x00000002 -#define ACE_APPEND_DATA 0x00000004 -#define ACE_ADD_SUBDIRECTORY 0x00000004 -#define ACE_READ_NAMED_ATTRS 0x00000008 -#define ACE_WRITE_NAMED_ATTRS 0x00000010 -#define ACE_EXECUTE 0x00000020 -#define ACE_DELETE_CHILD 0x00000040 -#define ACE_READ_ATTRIBUTES 0x00000080 -#define ACE_WRITE_ATTRIBUTES 0x00000100 -#define ACE_DELETE 0x00010000 -#define ACE_READ_ACL 0x00020000 -#define ACE_WRITE_ACL 0x00040000 -#define ACE_WRITE_OWNER 0x00080000 -#define ACE_SYNCHRONIZE 0x00100000 - -#define ACE_FILE_INHERIT_ACE 0x0001 -#define ACE_DIRECTORY_INHERIT_ACE 0x0002 -#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 -#define ACE_INHERIT_ONLY_ACE 0x0008 -#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 -#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 -#define ACE_IDENTIFIER_GROUP 0x0040 -#define ACE_INHERITED_ACE 0x0080 -#define ACE_OWNER 0x1000 -#define ACE_GROUP 0x2000 -#define ACE_EVERYONE 0x4000 - -#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 -#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 -#define 
ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 -#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 - -#define ACL_AUTO_INHERIT 0x0001 -#define ACL_PROTECTED 0x0002 -#define ACL_DEFAULTED 0x0004 -#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ - ACL_DEFAULTED) - -#if defined(_KERNEL) || defined(_FAKE_KERNEL) - -/* - * These are only applicable in a CIFS context. - */ -#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 -#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 -#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 -#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 -#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 -#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 -#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A -#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B -#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C -#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D -#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E -#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F -#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 - -#define ACE_ALL_TYPES 0x001F - -typedef struct ace_object { - uid_t a_who; /* uid or gid */ - uint32_t a_access_mask; /* read,write,... 
*/ - uint16_t a_flags; /* see below */ - uint16_t a_type; /* allow or deny */ - uint8_t a_obj_type[16]; /* obj type */ - uint8_t a_inherit_obj_type[16]; /* inherit obj */ -} ace_object_t; - -#endif - -#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ - ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ - ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ - ACE_WRITE_OWNER|ACE_SYNCHRONIZE) - -#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ - ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) - -#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ - ACE_READ_NAMED_ATTRS) - -#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \ - ACE_WRITE_NAMED_ATTRS) - -#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ - ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ - ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE) -/* - * The following flags are supported by both NFSv4 ACLs and ace_t. - */ -#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \ - ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE | \ - ACE_INHERIT_ONLY_ACE | \ - ACE_INHERITED_ACE | \ - ACE_IDENTIFIER_GROUP) - -#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \ - ACE_IDENTIFIER_GROUP) -#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACL_INHERITED_ACE| \ - ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) - -/* cmd args to acl(2) for aclent_t */ -#define GETACL 1 -#define SETACL 2 -#define GETACLCNT 3 - -/* cmd's to manipulate ace acls. 
*/ -#define ACE_GETACL 4 -#define ACE_SETACL 5 -#define ACE_GETACLCNT 6 - -/* minimal acl entries from GETACLCNT */ -#define MIN_ACL_ENTRIES 4 - -#if !defined(_KERNEL) - -/* acl check errors */ -#define GRP_ERROR 1 -#define USER_ERROR 2 -#define OTHER_ERROR 3 -#define CLASS_ERROR 4 -#define DUPLICATE_ERROR 5 -#define MISS_ERROR 6 -#define MEM_ERROR 7 -#define ENTRY_ERROR 8 - - -/* - * similar to ufs_acl.h: changed to char type for user commands (tar, cpio) - * Attribute types - */ -#define UFSD_FREE ('0') /* Free entry */ -#define UFSD_ACL ('1') /* Access Control Lists */ -#define UFSD_DFACL ('2') /* reserved for future use */ -#define ACE_ACL ('3') /* ace_t style acls */ - -/* - * flag to [f]acl_get() - * controls whether a trivial acl should be returned. - */ -#define ACL_NO_TRIVIAL 0x2 - - -/* - * Flags to control acl_totext() - */ - -#define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */ -#define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */ -#define ACL_NORESOLVE 0x4 /* don't do name service lookups */ -#define ACL_SID_FMT 0x8 /* use usersid/groupsid when appropriate */ - -/* - * Legacy aclcheck errors for aclent_t ACLs - */ -#define EACL_GRP_ERROR GRP_ERROR -#define EACL_USER_ERROR USER_ERROR -#define EACL_OTHER_ERROR OTHER_ERROR -#define EACL_CLASS_ERROR CLASS_ERROR -#define EACL_DUPLICATE_ERROR DUPLICATE_ERROR -#define EACL_MISS_ERROR MISS_ERROR -#define EACL_MEM_ERROR MEM_ERROR -#define EACL_ENTRY_ERROR ENTRY_ERROR - -#define EACL_INHERIT_ERROR 9 /* invalid inherit flags */ -#define EACL_FLAGS_ERROR 10 /* unknown flag value */ -#define EACL_PERM_MASK_ERROR 11 /* unknown permission */ -#define EACL_COUNT_ERROR 12 /* invalid acl count */ - -#define EACL_INVALID_SLOT 13 /* invalid acl slot */ -#define EACL_NO_ACL_ENTRY 14 /* Entry doesn't exist */ -#define EACL_DIFF_TYPE 15 /* acls aren't same type */ - -#define EACL_INVALID_USER_GROUP 16 /* need user/group name */ -#define EACL_INVALID_STR 17 /* invalid acl string */ -#define 
EACL_FIELD_NOT_BLANK 18 /* can't have blank field */ -#define EACL_INVALID_ACCESS_TYPE 19 /* invalid access type */ -#define EACL_UNKNOWN_DATA 20 /* Unrecognized data in ACL */ -#define EACL_MISSING_FIELDS 21 /* missing fields in acl */ - -#define EACL_INHERIT_NOTDIR 22 /* Need dir for inheritance */ - -extern int aclcheck(aclent_t *, int, int *); -extern int acltomode(aclent_t *, int, mode_t *); -extern int aclfrommode(aclent_t *, int, mode_t *); -extern int aclsort(int, int, aclent_t *); -extern char *acltotext(aclent_t *, int); -extern aclent_t *aclfromtext(char *, int *); -extern void acl_free(acl_t *); -extern int acl_get(const char *, int, acl_t **); -extern int facl_get(int, int, acl_t **); -extern int acl_set(const char *, acl_t *acl); -extern int facl_set(int, acl_t *acl); -extern int acl_strip(const char *, uid_t, gid_t, mode_t); -extern int acl_trivial(const char *); -extern char *acl_totext(acl_t *, int); -extern int acl_fromtext(const char *, acl_t **); -extern int acl_check(acl_t *, int); - -#else /* !defined(_KERNEL) */ - -extern void ksort(caddr_t, int, int, int (*)(void *, void *)); -extern int cmp2acls(void *, void *); - -#endif /* !defined(_KERNEL) */ - -extern int acl(const char *path, int cmd, int cnt, void *buf); -extern int facl(int fd, int cmd, int cnt, void *buf); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ACL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h b/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h index c9857b086575..4597cfa0f10f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h @@ -115,7 +115,6 @@ extern "C" { #define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n))) #define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) #define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) -#define __NORETURN __sun_attr__((__noreturn__)) #define __CONST __sun_attr__((__const__)) #define __PURE 
__sun_attr__((__pure__)) diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h b/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h deleted file mode 100644 index f526c85872e7..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h +++ /dev/null @@ -1,830 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014 Igor Kozhukhov . - * Copyright 2017 RackTop Systems. - */ - -#ifndef _SYS_CPUVAR_H -#define _SYS_CPUVAR_H - -#include -#include /* has cpu_stat_t definition */ -#include -#include -#include /* has kcpc_ctx_t definition */ - -#include -#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) -#include -#endif - -#include -#include -#include -#include -#include -#if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL) && \ - (defined(__i386) || defined(__amd64)) -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct squeue_set_s; - -#define CPU_CACHE_COHERENCE_SIZE 64 - -/* - * For fast event tracing. 
- */ -struct ftrace_record; -typedef struct ftrace_data { - int ftd_state; /* ftrace flags */ - kmutex_t ftd_unused; /* ftrace buffer lock, unused */ - struct ftrace_record *ftd_cur; /* current record */ - struct ftrace_record *ftd_first; /* first record */ - struct ftrace_record *ftd_last; /* last record */ -} ftrace_data_t; - -struct cyc_cpu; -struct nvlist; - -/* - * Per-CPU data. - * - * Be careful adding new members: if they are not the same in all modules (e.g. - * change size depending on a #define), CTF uniquification can fail to work - * properly. Furthermore, this is transitive in that it applies recursively to - * all types pointed to by cpu_t. - */ -typedef struct cpu { - processorid_t cpu_id; /* CPU number */ - processorid_t cpu_seqid; /* sequential CPU id (0..ncpus-1) */ - volatile cpu_flag_t cpu_flags; /* flags indicating CPU state */ - struct cpu *cpu_self; /* pointer to itself */ - kthread_t *cpu_thread; /* current thread */ - kthread_t *cpu_idle_thread; /* idle thread for this CPU */ - kthread_t *cpu_pause_thread; /* pause thread for this CPU */ - klwp_id_t cpu_lwp; /* current lwp (if any) */ - klwp_id_t cpu_fpowner; /* currently loaded fpu owner */ - struct cpupart *cpu_part; /* partition with this CPU */ - struct lgrp_ld *cpu_lpl; /* pointer to this cpu's load */ - int cpu_cache_offset; /* see kmem.c for details */ - - /* - * Links to other CPUs. 
It is safe to walk these lists if - * one of the following is true: - * - cpu_lock held - * - preemption disabled via kpreempt_disable - * - PIL >= DISP_LEVEL - * - acting thread is an interrupt thread - * - all other CPUs are paused - */ - struct cpu *cpu_next; /* next existing CPU */ - struct cpu *cpu_prev; /* prev existing CPU */ - struct cpu *cpu_next_onln; /* next online (enabled) CPU */ - struct cpu *cpu_prev_onln; /* prev online (enabled) CPU */ - struct cpu *cpu_next_part; /* next CPU in partition */ - struct cpu *cpu_prev_part; /* prev CPU in partition */ - struct cpu *cpu_next_lgrp; /* next CPU in latency group */ - struct cpu *cpu_prev_lgrp; /* prev CPU in latency group */ - struct cpu *cpu_next_lpl; /* next CPU in lgrp partition */ - struct cpu *cpu_prev_lpl; - - struct cpu_pg *cpu_pg; /* cpu's processor groups */ - - void *cpu_reserved[4]; /* reserved for future use */ - - /* - * Scheduling variables. - */ - disp_t *cpu_disp; /* dispatch queue data */ - /* - * Note that cpu_disp is set before the CPU is added to the system - * and is never modified. Hence, no additional locking is needed - * beyond what's necessary to access the cpu_t structure. - */ - char cpu_runrun; /* scheduling flag - set to preempt */ - char cpu_kprunrun; /* force kernel preemption */ - pri_t cpu_chosen_level; /* priority at which cpu */ - /* was chosen for scheduling */ - kthread_t *cpu_dispthread; /* thread selected for dispatch */ - disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */ - uint8_t cpu_disp_flags; /* flags used by dispatcher */ - /* - * The following field is updated when ever the cpu_dispthread - * changes. Also in places, where the current thread(cpu_dispthread) - * priority changes. This is used in disp_lowpri_cpu() - */ - pri_t cpu_dispatch_pri; /* priority of cpu_dispthread */ - clock_t cpu_last_swtch; /* last time switched to new thread */ - - /* - * Interrupt data. 
- */ - caddr_t cpu_intr_stack; /* interrupt stack */ - kthread_t *cpu_intr_thread; /* interrupt thread list */ - uint_t cpu_intr_actv; /* interrupt levels active (bitmask) */ - int cpu_base_spl; /* priority for highest rupt active */ - - /* - * Statistics. - */ - cpu_stats_t cpu_stats; /* per-CPU statistics */ - struct kstat *cpu_info_kstat; /* kstat for cpu info */ - - uintptr_t cpu_profile_pc; /* kernel PC in profile interrupt */ - uintptr_t cpu_profile_upc; /* user PC in profile interrupt */ - uintptr_t cpu_profile_pil; /* PIL when profile interrupted */ - - ftrace_data_t cpu_ftrace; /* per cpu ftrace data */ - - clock_t cpu_deadman_counter; /* used by deadman() */ - uint_t cpu_deadman_countdown; /* used by deadman() */ - - kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */ - kcpc_ctx_t *cpu_cpc_ctx; /* performance counter context */ - - /* - * Configuration information for the processor_info system call. - */ - processor_info_t cpu_type_info; /* config info */ - time_t cpu_state_begin; /* when CPU entered current state */ - char cpu_cpr_flags; /* CPR related info */ - struct cyc_cpu *cpu_cyclic; /* per cpu cyclic subsystem data */ - struct squeue_set_s *cpu_squeue_set; /* per cpu squeue set */ - struct nvlist *cpu_props; /* pool-related properties */ - - krwlock_t cpu_ft_lock; /* DTrace: fasttrap lock */ - uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ - hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ - hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ - volatile uint16_t cpu_mstate; /* cpu microstate */ - volatile uint16_t cpu_mstate_gen; /* generation counter */ - volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */ - volatile hrtime_t cpu_acct[NCMSTATES]; /* cpu microstate data */ - hrtime_t cpu_intracct[NCMSTATES]; /* interrupt mstate data */ - hrtime_t cpu_waitrq; /* cpu run-queue wait time */ - struct loadavg_s cpu_loadavg; /* loadavg info for this cpu */ - - char *cpu_idstr; /* for printing 
and debugging */ - char *cpu_brandstr; /* for printing */ - - /* - * Sum of all device interrupt weights that are currently directed at - * this cpu. Cleared at start of interrupt redistribution. - */ - int32_t cpu_intr_weight; - void *cpu_vm_data; - - struct cpu_physid *cpu_physid; /* physical associations */ - - uint64_t cpu_curr_clock; /* current clock freq in Hz */ - char *cpu_supp_freqs; /* supported freqs in Hz */ - - uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */ - uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */ - - /* - * Interrupt load factor used by dispatcher & softcall - */ - hrtime_t cpu_intrlast; /* total interrupt time (nsec) */ - int cpu_intrload; /* interrupt load factor (0-99%) */ - - uint_t cpu_rotor; /* for cheap pseudo-random numbers */ - - struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ - - /* - * cpu_generation is updated whenever CPU goes on-line or off-line. - * Updates to cpu_generation are protected by cpu_lock. - * - * See CPU_NEW_GENERATION() macro below. - */ - volatile uint_t cpu_generation; /* tracking on/off-line */ - - /* - * New members must be added /before/ this member, as the CTF tools - * rely on this being the last field before cpu_m, so they can - * correctly calculate the offset when synthetically adding the cpu_m - * member in objects that do not have it. This fixup is required for - * uniquification to work correctly. - */ - uintptr_t cpu_m_pad; - -#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) - struct machcpu cpu_m; /* per architecture info */ -#endif -} cpu_t; - -/* - * The cpu_core structure consists of per-CPU state available in any context. - * On some architectures, this may mean that the page(s) containing the - * NCPU-sized array of cpu_core structures must be locked in the TLB -- it - * is up to the platform to assure that this is performed properly. Note that - * the structure is sized to avoid false sharing. 
- */ -#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \ - sizeof (uintptr_t) + sizeof (kmutex_t)) -#define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE - -typedef struct cpu_core { - uint16_t cpuc_dtrace_flags; /* DTrace flags */ - uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */ - uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */ - uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */ - kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */ -} cpu_core_t; - -#ifdef _KERNEL -extern cpu_core_t cpu_core[]; -#endif /* _KERNEL */ - -/* - * CPU_ON_INTR() macro. Returns non-zero if currently on interrupt stack. - * Note that this isn't a test for a high PIL. For example, cpu_intr_actv - * does not get updated when we go through sys_trap from TL>0 at high PIL. - * getpil() should be used instead to check for PIL levels. - */ -#define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1)) - -/* - * Check to see if an interrupt thread might be active at a given ipl. - * If so return true. - * We must be conservative--it is ok to give a false yes, but a false no - * will cause disaster. (But if the situation changes after we check it is - * ok--the caller is trying to ensure that an interrupt routine has been - * exited). - * This is used when trying to remove an interrupt handler from an autovector - * list in avintr.c. - */ -#define INTR_ACTIVE(cpup, level) \ - ((level) <= LOCK_LEVEL ? \ - ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup))) - -/* - * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one - * looks at it. It's meant as a cheap mechanism to be incorporated in routines - * wanting to avoid biasing, but where true randomness isn't needed (just - * something that changes). 
- */ -#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++) - -#if defined(_KERNEL) || defined(_KMEMUSER) - -#define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE) - -/* MEMBERS PROTECTED BY "atomicity": cpu_flags */ - -/* - * Flags in the CPU structure. - * - * These are protected by cpu_lock (except during creation). - * - * Offlined-CPUs have three stages of being offline: - * - * CPU_ENABLE indicates that the CPU is participating in I/O interrupts - * that can be directed at a number of different CPUs. If CPU_ENABLE - * is off, the CPU will not be given interrupts that can be sent elsewhere, - * but will still get interrupts from devices associated with that CPU only, - * and from other CPUs. - * - * CPU_OFFLINE indicates that the dispatcher should not allow any threads - * other than interrupt threads to run on that CPU. A CPU will not have - * CPU_OFFLINE set if there are any bound threads (besides interrupts). - * - * CPU_QUIESCED is set if p_offline was able to completely turn idle the - * CPU and it will not have to run interrupt threads. In this case it'll - * stay in the idle loop until CPU_QUIESCED is turned off. - * - * CPU_FROZEN is used only by CPR to mark CPUs that have been successfully - * suspended (in the suspend path), or have yet to be resumed (in the resume - * case). - * - * On some platforms CPUs can be individually powered off. - * The following flags are set for powered off CPUs: CPU_QUIESCED, - * CPU_OFFLINE, and CPU_POWEROFF. The following flags are cleared: - * CPU_RUNNING, CPU_READY, CPU_EXISTS, and CPU_ENABLE. 
- */ -#define CPU_RUNNING 0x001 /* CPU running */ -#define CPU_READY 0x002 /* CPU ready for cross-calls */ -#define CPU_QUIESCED 0x004 /* CPU will stay in idle */ -#define CPU_EXISTS 0x008 /* CPU is configured */ -#define CPU_ENABLE 0x010 /* CPU enabled for interrupts */ -#define CPU_OFFLINE 0x020 /* CPU offline via p_online */ -#define CPU_POWEROFF 0x040 /* CPU is powered off */ -#define CPU_FROZEN 0x080 /* CPU is frozen via CPR suspend */ -#define CPU_SPARE 0x100 /* CPU offline available for use */ -#define CPU_FAULTED 0x200 /* CPU offline diagnosed faulty */ - -#define FMT_CPU_FLAGS \ - "\20\12fault\11spare\10frozen" \ - "\7poweroff\6offline\5enable\4exist\3quiesced\2ready\1run" - -#define CPU_ACTIVE(cpu) (((cpu)->cpu_flags & CPU_OFFLINE) == 0) - -/* - * Flags for cpu_offline(), cpu_faulted(), and cpu_spare(). - */ -#define CPU_FORCED 0x0001 /* Force CPU offline */ - -/* - * DTrace flags. - */ -#define CPU_DTRACE_NOFAULT 0x0001 /* Don't fault */ -#define CPU_DTRACE_DROP 0x0002 /* Drop this ECB */ -#define CPU_DTRACE_BADADDR 0x0004 /* DTrace fault: bad address */ -#define CPU_DTRACE_BADALIGN 0x0008 /* DTrace fault: bad alignment */ -#define CPU_DTRACE_DIVZERO 0x0010 /* DTrace fault: divide by zero */ -#define CPU_DTRACE_ILLOP 0x0020 /* DTrace fault: illegal operation */ -#define CPU_DTRACE_NOSCRATCH 0x0040 /* DTrace fault: out of scratch */ -#define CPU_DTRACE_KPRIV 0x0080 /* DTrace fault: bad kernel access */ -#define CPU_DTRACE_UPRIV 0x0100 /* DTrace fault: bad user access */ -#define CPU_DTRACE_TUPOFLOW 0x0200 /* DTrace fault: tuple stack overflow */ -#if defined(__sparc) -#define CPU_DTRACE_FAKERESTORE 0x0400 /* pid provider hint to getreg */ -#endif -#define CPU_DTRACE_ENTRY 0x0800 /* pid provider hint to ustack() */ -#define CPU_DTRACE_BADSTACK 0x1000 /* DTrace fault: bad stack */ - -#define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \ - CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \ - CPU_DTRACE_NOSCRATCH | CPU_DTRACE_KPRIV | \ - 
CPU_DTRACE_UPRIV | CPU_DTRACE_TUPOFLOW | \ - CPU_DTRACE_BADSTACK) -#define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP) - -/* - * Dispatcher flags - * These flags must be changed only by the current CPU. - */ -#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ -#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ - -#endif /* _KERNEL || _KMEMUSER */ - -#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) - -/* - * Macros for manipulating sets of CPUs as a bitmap. Note that this - * bitmap may vary in size depending on the maximum CPU id a specific - * platform supports. This may be different than the number of CPUs - * the platform supports, since CPU ids can be sparse. We define two - * sets of macros; one for platforms where the maximum CPU id is less - * than the number of bits in a single word (32 in a 32-bit kernel, - * 64 in a 64-bit kernel), and one for platforms that require bitmaps - * of more than one word. - */ - -#define CPUSET_WORDS BT_BITOUL(NCPU) -#define CPUSET_NOTINSET ((uint_t)-1) - -#if CPUSET_WORDS > 1 - -typedef struct cpuset { - ulong_t cpub[CPUSET_WORDS]; -} cpuset_t; - -/* - * Private functions for manipulating cpusets that do not fit in a - * single word. These should not be used directly; instead the - * CPUSET_* macros should be used so the code will be portable - * across different definitions of NCPU. 
- */ -extern void cpuset_all(cpuset_t *); -extern void cpuset_all_but(cpuset_t *, uint_t); -extern int cpuset_isnull(cpuset_t *); -extern int cpuset_cmp(cpuset_t *, cpuset_t *); -extern void cpuset_only(cpuset_t *, uint_t); -extern uint_t cpuset_find(cpuset_t *); -extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *); - -#define CPUSET_ALL(set) cpuset_all(&(set)) -#define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu) -#define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu) -#define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu) -#define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu) -#define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu) -#define CPUSET_ISNULL(set) cpuset_isnull(&(set)) -#define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2)) - -/* - * Find one CPU in the cpuset. - * Sets "cpu" to the id of the found CPU, or CPUSET_NOTINSET if no cpu - * could be found. (i.e. empty set) - */ -#define CPUSET_FIND(set, cpu) { \ - cpu = cpuset_find(&(set)); \ -} - -/* - * Determine the smallest and largest CPU id in the set. Returns - * CPUSET_NOTINSET in smallest and largest when set is empty. - */ -#define CPUSET_BOUNDS(set, smallest, largest) { \ - cpuset_bounds(&(set), &(smallest), &(largest)); \ -} - -/* - * Atomic cpuset operations - * These are safe to use for concurrent cpuset manipulations. - * "xdel" and "xadd" are exclusive operations, that set "result" to "0" - * if the add or del was successful, or "-1" if not successful. - * (e.g. 
attempting to add a cpu to a cpuset that's already there, or - * deleting a cpu that's not in the cpuset) - */ - -#define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu)) -#define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu)) - -#define CPUSET_ATOMIC_XADD(set, cpu, result) \ - BT_ATOMIC_SET_EXCL((set).cpub, cpu, result) - -#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ - BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result) - - -#define CPUSET_OR(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] |= (set2).cpub[_i]; \ -} - -#define CPUSET_XOR(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] ^= (set2).cpub[_i]; \ -} - -#define CPUSET_AND(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] &= (set2).cpub[_i]; \ -} - -#define CPUSET_ZERO(set) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set).cpub[_i] = 0; \ -} - -#elif CPUSET_WORDS == 1 - -typedef ulong_t cpuset_t; /* a set of CPUs */ - -#define CPUSET(cpu) (1UL << (cpu)) - -#define CPUSET_ALL(set) ((void)((set) = ~0UL)) -#define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu))) -#define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu))) -#define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu)) -#define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu))) -#define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu))) -#define CPUSET_ISNULL(set) ((set) == 0) -#define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2)) -#define CPUSET_OR(set1, set2) ((void)((set1) |= (set2))) -#define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2))) -#define CPUSET_AND(set1, set2) ((void)((set1) &= (set2))) -#define CPUSET_ZERO(set) ((void)((set) = 0)) - -#define CPUSET_FIND(set, cpu) { \ - cpu = (uint_t)(lowbit(set) - 1); \ -} - -#define CPUSET_BOUNDS(set, smallest, largest) { \ - smallest = (uint_t)(lowbit(set) - 1); \ - largest = (uint_t)(highbit(set) - 1); \ -} - -#define 
CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu)) -#define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu)) - -#define CPUSET_ATOMIC_XADD(set, cpu, result) \ - { result = atomic_set_long_excl(&(set), (cpu)); } - -#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ - { result = atomic_clear_long_excl(&(set), (cpu)); } - -#else /* CPUSET_WORDS <= 0 */ - -#error NCPU is undefined or invalid - -#endif /* CPUSET_WORDS */ - -extern cpuset_t cpu_seqid_inuse; - -#endif /* (_KERNEL || _KMEMUSER) && _MACHDEP */ - -#define CPU_CPR_OFFLINE 0x0 -#define CPU_CPR_ONLINE 0x1 -#define CPU_CPR_IS_OFFLINE(cpu) (((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) == 0) -#define CPU_CPR_IS_ONLINE(cpu) ((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) -#define CPU_SET_CPR_FLAGS(cpu, flag) ((cpu)->cpu_cpr_flags |= flag) - -#if defined(_KERNEL) || defined(_KMEMUSER) - -extern struct cpu *cpu[]; /* indexed by CPU number */ -extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ -extern cpu_t *cpu_list; /* list of CPUs */ -extern cpu_t *cpu_active; /* list of active CPUs */ -extern int ncpus; /* number of CPUs present */ -extern int ncpus_online; /* number of CPUs not quiesced */ -extern int max_ncpus; /* max present before ncpus is known */ -extern int boot_max_ncpus; /* like max_ncpus but for real */ -extern int boot_ncpus; /* # cpus present @ boot */ -extern processorid_t max_cpuid; /* maximum CPU number */ -extern struct cpu *cpu_inmotion; /* offline or partition move target */ -extern cpu_t *clock_cpu_list; -extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */ - -#if defined(__i386) || defined(__amd64) -extern struct cpu *curcpup(void); -#define CPU (curcpup()) /* Pointer to current CPU */ -#else -#define CPU (curthread->t_cpu) /* Pointer to current CPU */ -#endif - -/* - * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id - * as the target and to grab cpu_lock instead of requiring the caller - * to grab it. 
- */ -#define CPU_CURRENT -3 - -/* - * Per-CPU statistics - * - * cpu_stats_t contains numerous system and VM-related statistics, in the form - * of gauges or monotonically-increasing event occurrence counts. - */ - -#define CPU_STATS_ENTER_K() kpreempt_disable() -#define CPU_STATS_EXIT_K() kpreempt_enable() - -#define CPU_STATS_ADD_K(class, stat, amount) \ - { kpreempt_disable(); /* keep from switching CPUs */\ - CPU_STATS_ADDQ(CPU, class, stat, amount); \ - kpreempt_enable(); \ - } - -#define CPU_STATS_ADDQ(cp, class, stat, amount) { \ - extern void __dtrace_probe___cpu_##class##info_##stat(uint_t, \ - uint64_t *, cpu_t *); \ - uint64_t *stataddr = &((cp)->cpu_stats.class.stat); \ - __dtrace_probe___cpu_##class##info_##stat((amount), \ - stataddr, cp); \ - *(stataddr) += (amount); \ -} - -#define CPU_STATS(cp, stat) \ - ((cp)->cpu_stats.stat) - -/* - * Increment CPU generation value. - * This macro should be called whenever CPU goes on-line or off-line. - * Updates to cpu_generation should be protected by cpu_lock. 
- */ -#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) - -#endif /* _KERNEL || _KMEMUSER */ - -/* - * CPU support routines (not for genassym.c) - */ -#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && defined(__STDC__) - -struct zone; - -void cpu_list_init(cpu_t *); -void cpu_add_unit(cpu_t *); -void cpu_del_unit(int cpuid); -void cpu_add_active(cpu_t *); -void cpu_kstat_init(cpu_t *); -void cpu_visibility_add(cpu_t *, struct zone *); -void cpu_visibility_remove(cpu_t *, struct zone *); -void cpu_visibility_configure(cpu_t *, struct zone *); -void cpu_visibility_unconfigure(cpu_t *, struct zone *); -void cpu_visibility_online(cpu_t *, struct zone *); -void cpu_visibility_offline(cpu_t *, struct zone *); -void cpu_create_intrstat(cpu_t *); -void cpu_delete_intrstat(cpu_t *); -int cpu_kstat_intrstat_update(kstat_t *, int); -void cpu_intr_swtch_enter(kthread_t *); -void cpu_intr_swtch_exit(kthread_t *); - -void mbox_lock_init(void); /* initialize cross-call locks */ -void mbox_init(int cpun); /* initialize cross-calls */ -void poke_cpu(int cpun); /* interrupt another CPU (to preempt) */ - -/* - * values for safe_list. Pause state that CPUs are in. 
- */ -#define PAUSE_IDLE 0 /* normal state */ -#define PAUSE_READY 1 /* paused thread ready to spl */ -#define PAUSE_WAIT 2 /* paused thread is spl-ed high */ -#define PAUSE_DIE 3 /* tell pause thread to leave */ -#define PAUSE_DEAD 4 /* pause thread has left */ - -void mach_cpu_pause(volatile char *); - -void pause_cpus(cpu_t *off_cp, void *(*func)(void *)); -void start_cpus(void); -int cpus_paused(void); - -void cpu_pause_init(void); -cpu_t *cpu_get(processorid_t cpun); /* get the CPU struct associated */ - -int cpu_online(cpu_t *cp); /* take cpu online */ -int cpu_offline(cpu_t *cp, int flags); /* take cpu offline */ -int cpu_spare(cpu_t *cp, int flags); /* take cpu to spare */ -int cpu_faulted(cpu_t *cp, int flags); /* take cpu to faulted */ -int cpu_poweron(cpu_t *cp); /* take powered-off cpu to offline */ -int cpu_poweroff(cpu_t *cp); /* take offline cpu to powered-off */ - -cpu_t *cpu_intr_next(cpu_t *cp); /* get next online CPU taking intrs */ -int cpu_intr_count(cpu_t *cp); /* count # of CPUs handling intrs */ -int cpu_intr_on(cpu_t *cp); /* CPU taking I/O interrupts? */ -void cpu_intr_enable(cpu_t *cp); /* enable I/O interrupts */ -int cpu_intr_disable(cpu_t *cp); /* disable I/O interrupts */ -void cpu_intr_alloc(cpu_t *cp, int n); /* allocate interrupt threads */ - -/* - * Routines for checking CPU states. 
- */ -int cpu_is_online(cpu_t *); /* check if CPU is online */ -int cpu_is_nointr(cpu_t *); /* check if CPU can service intrs */ -int cpu_is_active(cpu_t *); /* check if CPU can run threads */ -int cpu_is_offline(cpu_t *); /* check if CPU is offline */ -int cpu_is_poweredoff(cpu_t *); /* check if CPU is powered off */ - -int cpu_flagged_online(cpu_flag_t); /* flags show CPU is online */ -int cpu_flagged_nointr(cpu_flag_t); /* flags show CPU not handling intrs */ -int cpu_flagged_active(cpu_flag_t); /* flags show CPU scheduling threads */ -int cpu_flagged_offline(cpu_flag_t); /* flags show CPU is offline */ -int cpu_flagged_poweredoff(cpu_flag_t); /* flags show CPU is powered off */ - -/* - * The processor_info(2) state of a CPU is a simplified representation suitable - * for use by an application program. Kernel subsystems should utilize the - * internal per-CPU state as given by the cpu_flags member of the cpu structure, - * as this information may include platform- or architecture-specific state - * critical to a subsystem's disposition of a particular CPU. 
- */ -void cpu_set_state(cpu_t *); /* record/timestamp current state */ -int cpu_get_state(cpu_t *); /* get current cpu state */ -const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */ - - -void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */ -void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */ - /* frequencies */ - -int cpu_configure(int); -int cpu_unconfigure(int); -void cpu_destroy_bound_threads(cpu_t *cp); - -extern int cpu_bind_thread(kthread_t *tp, processorid_t bind, - processorid_t *obind, int *error); -extern int cpu_unbind(processorid_t cpu_id, boolean_t force); -extern void thread_affinity_set(kthread_t *t, int cpu_id); -extern void thread_affinity_clear(kthread_t *t); -extern void affinity_set(int cpu_id); -extern void affinity_clear(void); -extern void init_cpu_mstate(struct cpu *, int); -extern void term_cpu_mstate(struct cpu *); -extern void new_cpu_mstate(int, hrtime_t); -extern void get_cpu_mstate(struct cpu *, hrtime_t *); -extern void thread_nomigrate(void); -extern void thread_allowmigrate(void); -extern void weakbinding_stop(void); -extern void weakbinding_start(void); - -/* - * The following routines affect the CPUs participation in interrupt processing, - * if that is applicable on the architecture. This only affects interrupts - * which aren't directed at the processor (not cross calls). - * - * cpu_disable_intr returns non-zero if interrupts were previously enabled. - */ -int cpu_disable_intr(struct cpu *cp); /* stop issuing interrupts to cpu */ -void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ - -/* - * The mutex cpu_lock protects cpu_flags for all CPUs, as well as the ncpus - * and ncpus_online counts. - */ -extern kmutex_t cpu_lock; /* lock protecting CPU data */ - -/* - * CPU state change events - * - * Various subsystems need to know when CPUs change their state. 
They get this - * information by registering CPU state change callbacks using - * register_cpu_setup_func(). Whenever any CPU changes its state, the callback - * function is called. The callback function is passed three arguments: - * - * Event, described by cpu_setup_t - * CPU ID - * Transparent pointer passed when registering the callback - * - * The callback function is called with cpu_lock held. The return value from the - * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG - * events. For these two events, non-zero return value indicates a failure and - * prevents successful completion of the operation. - * - * New events may be added in the future. Callback functions should ignore any - * events that they do not understand. - * - * The following events provide notification callbacks: - * - * CPU_INIT A new CPU is started and added to the list of active CPUs - * This event is only used during boot - * - * CPU_CONFIG A newly inserted CPU is prepared for starting running code - * This event is called by DR code - * - * CPU_UNCONFIG CPU has been powered off and needs cleanup - * This event is called by DR code - * - * CPU_ON CPU is enabled but does not run anything yet - * - * CPU_INTR_ON CPU is enabled and has interrupts enabled - * - * CPU_OFF CPU is going offline but can still run threads - * - * CPU_CPUPART_OUT CPU is going to move out of its partition - * - * CPU_CPUPART_IN CPU is going to move to a new partition - * - * CPU_SETUP CPU is set up during boot and can run threads - */ -typedef enum { - CPU_INIT, - CPU_CONFIG, - CPU_UNCONFIG, - CPU_ON, - CPU_OFF, - CPU_CPUPART_IN, - CPU_CPUPART_OUT, - CPU_SETUP, - CPU_INTR_ON -} cpu_setup_t; - -typedef int cpu_setup_func_t(cpu_setup_t, int, void *); - -/* - * Routines used to register interest in cpu's being added to or removed - * from the system. 
- */ -extern void register_cpu_setup_func(cpu_setup_func_t *, void *); -extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); -extern void cpu_state_change_notify(int, cpu_setup_t); - -/* - * Call specified function on the given CPU - */ -typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); -extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); - - -/* - * Create various strings that describe the given CPU for the - * processor_info system call and configuration-related kstats. - */ -#define CPU_IDSTRLEN 100 - -extern void init_cpu_info(struct cpu *); -extern void populate_idstr(struct cpu *); -extern void cpu_vm_data_init(struct cpu *); -extern void cpu_vm_data_destroy(struct cpu *); - -#endif /* _KERNEL || _FAKE_KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_CPUVAR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h index b474f91ce01d..b9ee9095d17d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h @@ -50,25 +50,22 @@ extern "C" { #ifndef _ASM #include +#ifdef _KERNEL +#include +#endif #include #include -#ifdef illumos -#include -#else #include #include #include #include +#include +#include +#include #include typedef int model_t; -#endif #include -#ifdef illumos -#include -#include -#else #include -#endif /* * DTrace Universal Constants and Typedefs diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h index 0b8df9834fa6..d26a09be4caa 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h @@ -50,6 +50,7 @@ extern "C" { */ #include +#include #ifndef illumos #ifdef __sparcv9 diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h deleted file mode 
100644 index 029af540b3c7..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_FM_FS_ZFS_H -#define _SYS_FM_FS_ZFS_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_ERROR_CLASS "fs.zfs" - -#define FM_EREPORT_ZFS_CHECKSUM "checksum" -#define FM_EREPORT_ZFS_IO "io" -#define FM_EREPORT_ZFS_DATA "data" -#define FM_EREPORT_ZFS_POOL "zpool" -#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" -#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" -#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" -#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" -#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" -#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" -#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" -#define FM_EREPORT_ZFS_IO_FAILURE "io_failure" -#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" -#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" -#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" - -#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" -#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" -#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" -#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" -#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" -#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" -#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" -#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" -#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" -#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" -#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" -#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" -#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" -#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" -#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" -#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" -#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" -#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" -#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" -#define 
FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" -#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" - -#define FM_EREPORT_FAILMODE_WAIT "wait" -#define FM_EREPORT_FAILMODE_CONTINUE "continue" -#define FM_EREPORT_FAILMODE_PANIC "panic" - -#define FM_RESOURCE_REMOVED "removed" -#define FM_RESOURCE_AUTOREPLACE "autoreplace" -#define FM_RESOURCE_STATECHANGE "statechange" - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FM_FS_ZFS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h deleted file mode 100644 index f5f93421bd74..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h +++ /dev/null @@ -1,369 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_FM_PROTOCOL_H -#define _SYS_FM_PROTOCOL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL -#include -#include -#else -#include -#include -#endif - -/* FM common member names */ -#define FM_CLASS "class" -#define FM_VERSION "version" - -/* FM protocol category 1 class names */ -#define FM_EREPORT_CLASS "ereport" -#define FM_FAULT_CLASS "fault" -#define FM_DEFECT_CLASS "defect" -#define FM_RSRC_CLASS "resource" -#define FM_LIST_EVENT "list" -#define FM_IREPORT_CLASS "ireport" - -/* FM list.* event class values */ -#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect" -#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated" -#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired" -#define FM_LIST_UPDATED_CLASS FM_LIST_EVENT ".updated" -#define FM_LIST_RESOLVED_CLASS FM_LIST_EVENT ".resolved" - -/* ereport class subcategory values */ -#define FM_ERROR_CPU "cpu" -#define FM_ERROR_IO "io" - -/* ereport version and payload member names */ -#define FM_EREPORT_VERS0 0 -#define FM_EREPORT_VERSION FM_EREPORT_VERS0 - -/* ereport payload member names */ -#define FM_EREPORT_DETECTOR "detector" -#define FM_EREPORT_ENA "ena" - -/* list.* event payload member names */ -#define FM_LIST_EVENT_SIZE "list-sz" - -/* ireport.* event payload member names */ -#define FM_IREPORT_DETECTOR "detector" -#define FM_IREPORT_UUID "uuid" -#define FM_IREPORT_PRIORITY "pri" -#define FM_IREPORT_ATTRIBUTES "attr" - -/* - * list.suspect, isolated, updated, repaired and resolved - * versions/payload member 
names. - */ -#define FM_SUSPECT_UUID "uuid" -#define FM_SUSPECT_DIAG_CODE "code" -#define FM_SUSPECT_DIAG_TIME "diag-time" -#define FM_SUSPECT_DE "de" -#define FM_SUSPECT_FAULT_LIST "fault-list" -#define FM_SUSPECT_FAULT_SZ "fault-list-sz" -#define FM_SUSPECT_FAULT_STATUS "fault-status" -#define FM_SUSPECT_INJECTED "__injected" -#define FM_SUSPECT_MESSAGE "message" -#define FM_SUSPECT_RETIRE "retire" -#define FM_SUSPECT_RESPONSE "response" -#define FM_SUSPECT_SEVERITY "severity" - -#define FM_SUSPECT_VERS0 0 -#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0 - -#define FM_SUSPECT_FAULTY 0x1 -#define FM_SUSPECT_UNUSABLE 0x2 -#define FM_SUSPECT_NOT_PRESENT 0x4 -#define FM_SUSPECT_DEGRADED 0x8 -#define FM_SUSPECT_REPAIRED 0x10 -#define FM_SUSPECT_REPLACED 0x20 -#define FM_SUSPECT_ACQUITTED 0x40 - -/* fault event versions and payload member names */ -#define FM_FAULT_VERS0 0 -#define FM_FAULT_VERSION FM_FAULT_VERS0 - -#define FM_FAULT_ASRU "asru" -#define FM_FAULT_FRU "fru" -#define FM_FAULT_FRU_LABEL "fru-label" -#define FM_FAULT_CERTAINTY "certainty" -#define FM_FAULT_RESOURCE "resource" -#define FM_FAULT_LOCATION "location" - -/* resource event versions and payload member names */ -#define FM_RSRC_VERS0 0 -#define FM_RSRC_VERSION FM_RSRC_VERS0 -#define FM_RSRC_RESOURCE "resource" - -/* resource.fm.asru.* payload member names */ -#define FM_RSRC_ASRU_UUID "uuid" -#define FM_RSRC_ASRU_CODE "code" -#define FM_RSRC_ASRU_FAULTY "faulty" -#define FM_RSRC_ASRU_REPAIRED "repaired" -#define FM_RSRC_ASRU_REPLACED "replaced" -#define FM_RSRC_ASRU_ACQUITTED "acquitted" -#define FM_RSRC_ASRU_RESOLVED "resolved" -#define FM_RSRC_ASRU_UNUSABLE "unusable" -#define FM_RSRC_ASRU_EVENT "event" - -/* resource.fm.xprt.* versions and payload member names */ -#define FM_RSRC_XPRT_VERS0 0 -#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 -#define FM_RSRC_XPRT_UUID "uuid" -#define FM_RSRC_XPRT_SUBCLASS "subclass" -#define FM_RSRC_XPRT_FAULT_STATUS "fault-status" -#define 
FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru" - -/* - * FM ENA Format Macros - */ -#define ENA_FORMAT_MASK 0x3 -#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK) - -/* ENA format types */ -#define FM_ENA_FMT0 0 -#define FM_ENA_FMT1 1 -#define FM_ENA_FMT2 2 - -/* Format 1 */ -#define ENA_FMT1_GEN_MASK 0x00000000000003FCull -#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull -#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull -#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull -#define ENA_FMT1_GEN_SHFT 2 -#define ENA_FMT1_ID_SHFT 10 -#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT -#define ENA_FMT1_TIME_SHFT 20 - -/* Format 2 */ -#define ENA_FMT2_GEN_MASK 0x00000000000003FCull -#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull -#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK -#define ENA_FMT2_GEN_SHFT 2 -#define ENA_FMT2_ID_SHFT 10 -#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT - -/* Common FMRI type names */ -#define FM_FMRI_AUTHORITY "authority" -#define FM_FMRI_SCHEME "scheme" -#define FM_FMRI_SVC_AUTHORITY "svc-authority" -#define FM_FMRI_FACILITY "facility" - -/* FMRI authority-type member names */ -#define FM_FMRI_AUTH_CHASSIS "chassis-id" -#define FM_FMRI_AUTH_PRODUCT_SN "product-sn" -#define FM_FMRI_AUTH_PRODUCT "product-id" -#define FM_FMRI_AUTH_DOMAIN "domain-id" -#define FM_FMRI_AUTH_SERVER "server-id" -#define FM_FMRI_AUTH_HOST "host-id" - -#define FM_AUTH_VERS0 0 -#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0 - -/* scheme name values */ -#define FM_FMRI_SCHEME_FMD "fmd" -#define FM_FMRI_SCHEME_DEV "dev" -#define FM_FMRI_SCHEME_HC "hc" -#define FM_FMRI_SCHEME_SVC "svc" -#define FM_FMRI_SCHEME_CPU "cpu" -#define FM_FMRI_SCHEME_MEM "mem" -#define FM_FMRI_SCHEME_MOD "mod" -#define FM_FMRI_SCHEME_PKG "pkg" -#define FM_FMRI_SCHEME_LEGACY "legacy-hc" -#define FM_FMRI_SCHEME_ZFS "zfs" -#define FM_FMRI_SCHEME_SW "sw" - -/* Scheme versions */ -#define FMD_SCHEME_VERSION0 0 -#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0 -#define DEV_SCHEME_VERSION0 0 -#define FM_DEV_SCHEME_VERSION 
DEV_SCHEME_VERSION0 -#define FM_HC_VERS0 0 -#define FM_HC_SCHEME_VERSION FM_HC_VERS0 -#define CPU_SCHEME_VERSION0 0 -#define CPU_SCHEME_VERSION1 1 -#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1 -#define MEM_SCHEME_VERSION0 0 -#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0 -#define MOD_SCHEME_VERSION0 0 -#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0 -#define PKG_SCHEME_VERSION0 0 -#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 -#define LEGACY_SCHEME_VERSION0 0 -#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 -#define SVC_SCHEME_VERSION0 0 -#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0 -#define ZFS_SCHEME_VERSION0 0 -#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 -#define SW_SCHEME_VERSION0 0 -#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0 - -/* hc scheme member names */ -#define FM_FMRI_HC_SERIAL_ID "serial" -#define FM_FMRI_HC_PART "part" -#define FM_FMRI_HC_REVISION "revision" -#define FM_FMRI_HC_ROOT "hc-root" -#define FM_FMRI_HC_LIST_SZ "hc-list-sz" -#define FM_FMRI_HC_LIST "hc-list" -#define FM_FMRI_HC_SPECIFIC "hc-specific" - -/* facility member names */ -#define FM_FMRI_FACILITY_NAME "facility-name" -#define FM_FMRI_FACILITY_TYPE "facility-type" - -/* hc-list version and member names */ -#define FM_FMRI_HC_NAME "hc-name" -#define FM_FMRI_HC_ID "hc-id" - -#define HC_LIST_VERSION0 0 -#define FM_HC_LIST_VERSION HC_LIST_VERSION0 - -/* hc-specific member names */ -#define FM_FMRI_HC_SPECIFIC_OFFSET "offset" -#define FM_FMRI_HC_SPECIFIC_PHYSADDR "physaddr" - -/* fmd module scheme member names */ -#define FM_FMRI_FMD_NAME "mod-name" -#define FM_FMRI_FMD_VERSION "mod-version" - -/* dev scheme member names */ -#define FM_FMRI_DEV_ID "devid" -#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id" -#define FM_FMRI_DEV_PATH "device-path" - -/* pkg scheme member names */ -#define FM_FMRI_PKG_BASEDIR "pkg-basedir" -#define FM_FMRI_PKG_INST "pkg-inst" -#define FM_FMRI_PKG_VERSION "pkg-version" - -/* svc scheme member names */ -#define 
FM_FMRI_SVC_NAME "svc-name" -#define FM_FMRI_SVC_INSTANCE "svc-instance" -#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id" - -/* svc-authority member names */ -#define FM_FMRI_SVC_AUTH_SCOPE "scope" -#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn" - -/* cpu scheme member names */ -#define FM_FMRI_CPU_ID "cpuid" -#define FM_FMRI_CPU_SERIAL_ID "serial" -#define FM_FMRI_CPU_MASK "cpumask" -#define FM_FMRI_CPU_VID "cpuvid" -#define FM_FMRI_CPU_CPUFRU "cpufru" -#define FM_FMRI_CPU_CACHE_INDEX "cacheindex" -#define FM_FMRI_CPU_CACHE_WAY "cacheway" -#define FM_FMRI_CPU_CACHE_BIT "cachebit" -#define FM_FMRI_CPU_CACHE_TYPE "cachetype" - -#define FM_FMRI_CPU_CACHE_TYPE_L2 0 -#define FM_FMRI_CPU_CACHE_TYPE_L3 1 - -/* legacy-hc scheme member names */ -#define FM_FMRI_LEGACY_HC "component" -#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \ - FM_FMRI_LEGACY_HC"=" - -/* mem scheme member names */ -#define FM_FMRI_MEM_UNUM "unum" -#define FM_FMRI_MEM_SERIAL_ID "serial" -#define FM_FMRI_MEM_PHYSADDR "physaddr" -#define FM_FMRI_MEM_MEMCONFIG "memconfig" -#define FM_FMRI_MEM_OFFSET "offset" - -/* mod scheme member names */ -#define FM_FMRI_MOD_PKG "mod-pkg" -#define FM_FMRI_MOD_NAME "mod-name" -#define FM_FMRI_MOD_ID "mod-id" -#define FM_FMRI_MOD_DESC "mod-desc" - -/* zfs scheme member names */ -#define FM_FMRI_ZFS_POOL "pool" -#define FM_FMRI_ZFS_VDEV "vdev" - -/* sw scheme member names - extra indentation for members of an nvlist */ -#define FM_FMRI_SW_OBJ "object" -#define FM_FMRI_SW_OBJ_PATH "path" -#define FM_FMRI_SW_OBJ_ROOT "root" -#define FM_FMRI_SW_OBJ_PKG "pkg" -#define FM_FMRI_SW_SITE "site" -#define FM_FMRI_SW_SITE_TOKEN "token" -#define FM_FMRI_SW_SITE_MODULE "module" -#define FM_FMRI_SW_SITE_FILE "file" -#define FM_FMRI_SW_SITE_LINE "line" -#define FM_FMRI_SW_SITE_FUNC "func" -#define FM_FMRI_SW_CTXT "context" -#define FM_FMRI_SW_CTXT_ORIGIN "origin" -#define FM_FMRI_SW_CTXT_EXECNAME "execname" -#define FM_FMRI_SW_CTXT_PID "pid" -#define FM_FMRI_SW_CTXT_ZONE 
"zone" -#define FM_FMRI_SW_CTXT_CTID "ctid" -#define FM_FMRI_SW_CTXT_STACK "stack" - -extern nv_alloc_t *fm_nva_xcreate(char *, size_t); -extern void fm_nva_xdestroy(nv_alloc_t *); - -extern nvlist_t *fm_nvlist_create(nv_alloc_t *); -extern void fm_nvlist_destroy(nvlist_t *, int); - -#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */ -#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */ - -extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t, - const nvlist_t *, ...); -extern void fm_payload_set(nvlist_t *, ...); -extern int i_fm_payload_set(nvlist_t *, const char *, va_list); -extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *, - int, ...); -extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *, - const char *, const char *); -extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *); -extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t, - uint8_t *, const char *); -extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, - const char *, uint64_t); -extern void fm_authority_set(nvlist_t *, int, const char *, const char *, - const char *, const char *); -extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); -extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *, - nvlist_t *, int, ...); - -extern uint64_t fm_ena_increment(uint64_t); -extern uint64_t fm_ena_generate(uint64_t, uchar_t); -extern uint64_t fm_ena_generation_get(uint64_t); -extern uchar_t fm_ena_format_get(uint64_t); -extern uint64_t fm_ena_id_get(uint64_t); -extern uint64_t fm_ena_time_get(uint64_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FM_PROTOCOL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h deleted file mode 100644 index e99a370af7ae..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h +++ /dev/null @@ -1,102 +0,0 @@ 
-/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 RackTop Systems. - */ - -#ifndef _SYS_FM_UTIL_H -#define _SYS_FM_UTIL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/* - * Shared user/kernel definitions for class length, error channel name, - * and kernel event publisher string. - */ -#define FM_MAX_CLASS 100 -#define FM_ERROR_CHAN "com.sun:fm:error" -#define FM_PUB "fm" - -/* - * ereport dump device transport support - * - * Ereports are written out to the dump device at a proscribed offset from the - * end, similar to in-transit log messages. The ereports are represented as a - * erpt_dump_t header followed by ed_size bytes of packed native nvlist data. - * - * NOTE: All of these constants and the header must be defined so they have the - * same representation for *both* 32-bit and 64-bit producers and consumers. 
- */ -#define ERPT_MAGIC 0xf00d4eddU -#define ERPT_MAX_ERRS 16 -#define ERPT_DATA_SZ (6 * 1024) -#define ERPT_EVCH_MAX 256 -#define ERPT_HIWAT 64 - -typedef struct erpt_dump { - uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */ - uint32_t ed_chksum; /* checksum32() of packed nvlist data */ - uint32_t ed_size; /* ereport (nvl) fixed buf size */ - uint32_t ed_pad; /* reserved for future use */ - hrtime_t ed_hrt_nsec; /* hrtime of this ereport */ - hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */ - struct { - uint64_t sec; /* seconds since gettimeofday() Epoch */ - uint64_t nsec; /* nanoseconds past ed_tod_base.sec */ - } ed_tod_base; -} erpt_dump_t; - -#if defined(_KERNEL) || defined(_FAKE_KERNEL) -#include - -#define FM_STK_DEPTH 20 /* maximum stack depth */ -#define FM_SYM_SZ 64 /* maximum symbol size */ -#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */ - -#define FM_EREPORT_PAYLOAD_NAME_STACK "stack" - -extern errorq_t *ereport_errorq; -extern void *ereport_dumpbuf; -extern size_t ereport_dumplen; - -extern void fm_init(void); -extern void fm_nvprint(nvlist_t *); -#define fm_panic panic -extern void fm_banner(void); - -extern void fm_ereport_dump(void); -extern void fm_ereport_post(nvlist_t *, int); - -extern int is_fm_panic(); -#endif /* _KERNEL || _FAKE_KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FM_UTIL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h deleted file mode 100644 index db23bbe01b9f..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ /dev/null @@ -1,1248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Martin Matuska . All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. - * Copyright (c) 2019 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_FS_ZFS_H -#define _SYS_FS_ZFS_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Types and constants shared between userland and the kernel. - */ - -/* - * Each dataset can be one of the following types. These constants can be - * combined into masks that can be passed to various functions. - */ -typedef enum { - ZFS_TYPE_FILESYSTEM = (1 << 0), - ZFS_TYPE_SNAPSHOT = (1 << 1), - ZFS_TYPE_VOLUME = (1 << 2), - ZFS_TYPE_POOL = (1 << 3), - ZFS_TYPE_BOOKMARK = (1 << 4) -} zfs_type_t; - -/* - * NB: lzc_dataset_type should be updated whenever a new objset type is added, - * if it represents a real type of a dataset that can be created from userland. - */ -typedef enum dmu_objset_type { - DMU_OST_NONE, - DMU_OST_META, - DMU_OST_ZFS, - DMU_OST_ZVOL, - DMU_OST_OTHER, /* For testing only! */ - DMU_OST_ANY, /* Be careful! 
*/ - DMU_OST_NUMTYPES -} dmu_objset_type_t; - -#define ZFS_TYPE_DATASET \ - (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) - -/* - * All of these include the terminating NUL byte. - */ -#define ZAP_MAXNAMELEN 256 -#define ZAP_MAXVALUELEN (1024 * 8) -#define ZAP_OLDMAXVALUELEN 1024 -#define ZFS_MAX_DATASET_NAME_LEN 256 - -/* - * Dataset properties are identified by these constants and must be added to - * the end of this list to ensure that external consumers are not affected - * by the change. If you make any changes to this list, be sure to update - * the property table in usr/src/common/zfs/zfs_prop.c. - */ -typedef enum { - ZPROP_CONT = -2, - ZPROP_INVAL = -1, - ZFS_PROP_TYPE = 0, - ZFS_PROP_CREATION, - ZFS_PROP_USED, - ZFS_PROP_AVAILABLE, - ZFS_PROP_REFERENCED, - ZFS_PROP_COMPRESSRATIO, - ZFS_PROP_MOUNTED, - ZFS_PROP_ORIGIN, - ZFS_PROP_QUOTA, - ZFS_PROP_RESERVATION, - ZFS_PROP_VOLSIZE, - ZFS_PROP_VOLBLOCKSIZE, - ZFS_PROP_RECORDSIZE, - ZFS_PROP_MOUNTPOINT, - ZFS_PROP_SHARENFS, - ZFS_PROP_CHECKSUM, - ZFS_PROP_COMPRESSION, - ZFS_PROP_ATIME, - ZFS_PROP_DEVICES, - ZFS_PROP_EXEC, - ZFS_PROP_SETUID, - ZFS_PROP_READONLY, - ZFS_PROP_ZONED, - ZFS_PROP_SNAPDIR, - ZFS_PROP_ACLMODE, - ZFS_PROP_ACLINHERIT, - ZFS_PROP_CREATETXG, - ZFS_PROP_NAME, /* not exposed to the user */ - ZFS_PROP_CANMOUNT, - ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ - ZFS_PROP_XATTR, - ZFS_PROP_NUMCLONES, /* not exposed to the user */ - ZFS_PROP_COPIES, - ZFS_PROP_VERSION, - ZFS_PROP_UTF8ONLY, - ZFS_PROP_NORMALIZE, - ZFS_PROP_CASE, - ZFS_PROP_VSCAN, - ZFS_PROP_NBMAND, - ZFS_PROP_SHARESMB, - ZFS_PROP_REFQUOTA, - ZFS_PROP_REFRESERVATION, - ZFS_PROP_GUID, - ZFS_PROP_PRIMARYCACHE, - ZFS_PROP_SECONDARYCACHE, - ZFS_PROP_USEDSNAP, - ZFS_PROP_USEDDS, - ZFS_PROP_USEDCHILD, - ZFS_PROP_USEDREFRESERV, - ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ - ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ - ZFS_PROP_DEFER_DESTROY, - ZFS_PROP_USERREFS, - ZFS_PROP_LOGBIAS, - 
ZFS_PROP_UNIQUE, /* not exposed to the user */ - ZFS_PROP_OBJSETID, /* not exposed to the user */ - ZFS_PROP_DEDUP, - ZFS_PROP_MLSLABEL, - ZFS_PROP_SYNC, - ZFS_PROP_DNODESIZE, - ZFS_PROP_REFRATIO, - ZFS_PROP_WRITTEN, - ZFS_PROP_CLONES, - ZFS_PROP_LOGICALUSED, - ZFS_PROP_LOGICALREFERENCED, - ZFS_PROP_INCONSISTENT, /* not exposed to the user */ - ZFS_PROP_VOLMODE, - ZFS_PROP_FILESYSTEM_LIMIT, - ZFS_PROP_SNAPSHOT_LIMIT, - ZFS_PROP_FILESYSTEM_COUNT, - ZFS_PROP_SNAPSHOT_COUNT, - ZFS_PROP_REDUNDANT_METADATA, - ZFS_PROP_PREV_SNAP, - ZFS_PROP_RECEIVE_RESUME_TOKEN, - ZFS_PROP_REMAPTXG, /* not exposed to the user */ - ZFS_PROP_SPECIAL_SMALL_BLOCKS, - ZFS_NUM_PROPS -} zfs_prop_t; - -typedef enum { - ZFS_PROP_USERUSED, - ZFS_PROP_USERQUOTA, - ZFS_PROP_GROUPUSED, - ZFS_PROP_GROUPQUOTA, - ZFS_NUM_USERQUOTA_PROPS -} zfs_userquota_prop_t; - -extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; - -/* - * Pool properties are identified by these constants and must be added to the - * end of this list to ensure that external consumers are not affected - * by the change. If you make any changes to this list, be sure to update - * the property table in usr/src/common/zfs/zpool_prop.c. 
- */ -typedef enum { - ZPOOL_PROP_INVAL = -1, - ZPOOL_PROP_NAME, - ZPOOL_PROP_SIZE, - ZPOOL_PROP_CAPACITY, - ZPOOL_PROP_ALTROOT, - ZPOOL_PROP_HEALTH, - ZPOOL_PROP_GUID, - ZPOOL_PROP_VERSION, - ZPOOL_PROP_BOOTFS, - ZPOOL_PROP_DELEGATION, - ZPOOL_PROP_AUTOREPLACE, - ZPOOL_PROP_CACHEFILE, - ZPOOL_PROP_FAILUREMODE, - ZPOOL_PROP_LISTSNAPS, - ZPOOL_PROP_AUTOEXPAND, - ZPOOL_PROP_DEDUPDITTO, - ZPOOL_PROP_DEDUPRATIO, - ZPOOL_PROP_FREE, - ZPOOL_PROP_ALLOCATED, - ZPOOL_PROP_READONLY, - ZPOOL_PROP_COMMENT, - ZPOOL_PROP_EXPANDSZ, - ZPOOL_PROP_FREEING, - ZPOOL_PROP_FRAGMENTATION, - ZPOOL_PROP_LEAKED, - ZPOOL_PROP_MAXBLOCKSIZE, - ZPOOL_PROP_BOOTSIZE, - ZPOOL_PROP_CHECKPOINT, - ZPOOL_PROP_TNAME, - ZPOOL_PROP_MAXDNODESIZE, - ZPOOL_PROP_MULTIHOST, - ZPOOL_NUM_PROPS -} zpool_prop_t; - -/* Small enough to not hog a whole line of printout in zpool(1M). */ -#define ZPROP_MAX_COMMENT 32 - -#define ZPROP_VALUE "value" -#define ZPROP_SOURCE "source" - -typedef enum { - ZPROP_SRC_NONE = 0x1, - ZPROP_SRC_DEFAULT = 0x2, - ZPROP_SRC_TEMPORARY = 0x4, - ZPROP_SRC_LOCAL = 0x8, - ZPROP_SRC_INHERITED = 0x10, - ZPROP_SRC_RECEIVED = 0x20 -} zprop_source_t; - -#define ZPROP_SRC_ALL 0x3f - -#define ZPROP_SOURCE_VAL_RECVD "$recvd" -#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" -/* - * Dataset flag implemented as a special entry in the props zap object - * indicating that the dataset has received properties on or after - * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties - * just as it did in earlier versions, and thereafter, local properties are - * preserved. 
- */ -#define ZPROP_HAS_RECVD "$hasrecvd" - -typedef enum { - ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ - ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ -} zprop_errflags_t; - -typedef int (*zprop_func)(int, void *); - -/* - * Properties to be set on the root file system of a new pool - * are stuffed into their own nvlist, which is then included in - * the properties nvlist with the pool properties. - */ -#define ZPOOL_ROOTFS_PROPS "root-props-nvl" - -/* - * Length of 'written@' and 'written#' - */ -#define ZFS_WRITTEN_PROP_PREFIX_LEN 8 - -/* - * Dataset property functions shared between libzfs and kernel. - */ -const char *zfs_prop_default_string(zfs_prop_t); -uint64_t zfs_prop_default_numeric(zfs_prop_t); -boolean_t zfs_prop_readonly(zfs_prop_t); -boolean_t zfs_prop_visible(zfs_prop_t prop); -boolean_t zfs_prop_inheritable(zfs_prop_t); -boolean_t zfs_prop_setonce(zfs_prop_t); -const char *zfs_prop_to_name(zfs_prop_t); -zfs_prop_t zfs_name_to_prop(const char *); -boolean_t zfs_prop_user(const char *); -boolean_t zfs_prop_userquota(const char *); -int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); -int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); -uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); -boolean_t zfs_prop_valid_for_type(int, zfs_type_t); - -/* - * Pool property functions shared between libzfs and kernel. 
- */ -zpool_prop_t zpool_name_to_prop(const char *); -const char *zpool_prop_to_name(zpool_prop_t); -const char *zpool_prop_default_string(zpool_prop_t); -uint64_t zpool_prop_default_numeric(zpool_prop_t); -boolean_t zpool_prop_readonly(zpool_prop_t); -boolean_t zpool_prop_feature(const char *); -boolean_t zpool_prop_unsupported(const char *name); -int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); -int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); -uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); - -/* - * Definitions for the Delegation. - */ -typedef enum { - ZFS_DELEG_WHO_UNKNOWN = 0, - ZFS_DELEG_USER = 'u', - ZFS_DELEG_USER_SETS = 'U', - ZFS_DELEG_GROUP = 'g', - ZFS_DELEG_GROUP_SETS = 'G', - ZFS_DELEG_EVERYONE = 'e', - ZFS_DELEG_EVERYONE_SETS = 'E', - ZFS_DELEG_CREATE = 'c', - ZFS_DELEG_CREATE_SETS = 'C', - ZFS_DELEG_NAMED_SET = 's', - ZFS_DELEG_NAMED_SET_SETS = 'S' -} zfs_deleg_who_type_t; - -typedef enum { - ZFS_DELEG_NONE = 0, - ZFS_DELEG_PERM_LOCAL = 1, - ZFS_DELEG_PERM_DESCENDENT = 2, - ZFS_DELEG_PERM_LOCALDESCENDENT = 3, - ZFS_DELEG_PERM_CREATE = 4 -} zfs_deleg_inherit_t; - -#define ZFS_DELEG_PERM_UID "uid" -#define ZFS_DELEG_PERM_GID "gid" -#define ZFS_DELEG_PERM_GROUPS "groups" - -#define ZFS_MLSLABEL_DEFAULT "none" - -#define ZFS_SMB_ACL_SRC "src" -#define ZFS_SMB_ACL_TARGET "target" - -typedef enum { - ZFS_CANMOUNT_OFF = 0, - ZFS_CANMOUNT_ON = 1, - ZFS_CANMOUNT_NOAUTO = 2 -} zfs_canmount_type_t; - -typedef enum { - ZFS_LOGBIAS_LATENCY = 0, - ZFS_LOGBIAS_THROUGHPUT = 1 -} zfs_logbias_op_t; - -typedef enum zfs_share_op { - ZFS_SHARE_NFS = 0, - ZFS_UNSHARE_NFS = 1, - ZFS_SHARE_SMB = 2, - ZFS_UNSHARE_SMB = 3 -} zfs_share_op_t; - -typedef enum zfs_smb_acl_op { - ZFS_SMB_ACL_ADD, - ZFS_SMB_ACL_REMOVE, - ZFS_SMB_ACL_RENAME, - ZFS_SMB_ACL_PURGE -} zfs_smb_acl_op_t; - -typedef enum zfs_cache_type { - ZFS_CACHE_NONE = 0, - ZFS_CACHE_METADATA = 1, - ZFS_CACHE_ALL = 2 -} zfs_cache_type_t; - -typedef enum { - 
ZFS_SYNC_STANDARD = 0, - ZFS_SYNC_ALWAYS = 1, - ZFS_SYNC_DISABLED = 2 -} zfs_sync_type_t; - -typedef enum { - ZFS_VOLMODE_DEFAULT = 0, - ZFS_VOLMODE_GEOM = 1, - ZFS_VOLMODE_DEV = 2, - ZFS_VOLMODE_NONE = 3 -} zfs_volmode_t; - -typedef enum { - ZFS_DNSIZE_LEGACY = 0, - ZFS_DNSIZE_AUTO = 1, - ZFS_DNSIZE_1K = 1024, - ZFS_DNSIZE_2K = 2048, - ZFS_DNSIZE_4K = 4096, - ZFS_DNSIZE_8K = 8192, - ZFS_DNSIZE_16K = 16384 -} zfs_dnsize_type_t; - -typedef enum { - ZFS_REDUNDANT_METADATA_ALL, - ZFS_REDUNDANT_METADATA_MOST -} zfs_redundant_metadata_type_t; - -/* - * On-disk version number. - */ -#define SPA_VERSION_1 1ULL -#define SPA_VERSION_2 2ULL -#define SPA_VERSION_3 3ULL -#define SPA_VERSION_4 4ULL -#define SPA_VERSION_5 5ULL -#define SPA_VERSION_6 6ULL -#define SPA_VERSION_7 7ULL -#define SPA_VERSION_8 8ULL -#define SPA_VERSION_9 9ULL -#define SPA_VERSION_10 10ULL -#define SPA_VERSION_11 11ULL -#define SPA_VERSION_12 12ULL -#define SPA_VERSION_13 13ULL -#define SPA_VERSION_14 14ULL -#define SPA_VERSION_15 15ULL -#define SPA_VERSION_16 16ULL -#define SPA_VERSION_17 17ULL -#define SPA_VERSION_18 18ULL -#define SPA_VERSION_19 19ULL -#define SPA_VERSION_20 20ULL -#define SPA_VERSION_21 21ULL -#define SPA_VERSION_22 22ULL -#define SPA_VERSION_23 23ULL -#define SPA_VERSION_24 24ULL -#define SPA_VERSION_25 25ULL -#define SPA_VERSION_26 26ULL -#define SPA_VERSION_27 27ULL -#define SPA_VERSION_28 28ULL -#define SPA_VERSION_5000 5000ULL - -/* - * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk - * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, - * and do the appropriate changes. Also bump the version number in - * usr/src/grub/capability. - */ -#define SPA_VERSION SPA_VERSION_5000 -#define SPA_VERSION_STRING "5000" - -/* - * Symbolic names for the changes that caused a SPA_VERSION switch. - * Used in the code when checking for presence or absence of a feature. 
- * Feel free to define multiple symbolic names for each version if there - * were multiple changes to on-disk structures during that version. - * - * NOTE: When checking the current SPA_VERSION in your code, be sure - * to use spa_version() since it reports the version of the - * last synced uberblock. Checking the in-flight version can - * be dangerous in some cases. - */ -#define SPA_VERSION_INITIAL SPA_VERSION_1 -#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 -#define SPA_VERSION_SPARES SPA_VERSION_3 -#define SPA_VERSION_RAIDZ2 SPA_VERSION_3 -#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 -#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 -#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 -#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 -#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 -#define SPA_VERSION_BOOTFS SPA_VERSION_6 -#define SPA_VERSION_SLOGS SPA_VERSION_7 -#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 -#define SPA_VERSION_FUID SPA_VERSION_9 -#define SPA_VERSION_REFRESERVATION SPA_VERSION_9 -#define SPA_VERSION_REFQUOTA SPA_VERSION_9 -#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 -#define SPA_VERSION_L2CACHE SPA_VERSION_10 -#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 -#define SPA_VERSION_ORIGIN SPA_VERSION_11 -#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 -#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 -#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 -#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 -#define SPA_VERSION_USERSPACE SPA_VERSION_15 -#define SPA_VERSION_STMF_PROP SPA_VERSION_16 -#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 -#define SPA_VERSION_USERREFS SPA_VERSION_18 -#define SPA_VERSION_HOLES SPA_VERSION_19 -#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 -#define SPA_VERSION_DEDUP SPA_VERSION_21 -#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 -#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 -#define SPA_VERSION_SA SPA_VERSION_24 -#define SPA_VERSION_SCAN SPA_VERSION_25 -#define SPA_VERSION_DIR_CLONES SPA_VERSION_26 -#define 
SPA_VERSION_DEADLISTS SPA_VERSION_26 -#define SPA_VERSION_FAST_SNAP SPA_VERSION_27 -#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 -#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 -#define SPA_VERSION_FEATURES SPA_VERSION_5000 - -#define SPA_VERSION_IS_SUPPORTED(v) \ - (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ - ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) - -/* - * ZPL version - rev'd whenever an incompatible on-disk format change - * occurs. This is independent of SPA/DMU/ZAP versioning. You must - * also update the version_table[] and help message in zfs_prop.c. - * - * When changing, be sure to teach GRUB how to read the new format! - * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} - */ -#define ZPL_VERSION_1 1ULL -#define ZPL_VERSION_2 2ULL -#define ZPL_VERSION_3 3ULL -#define ZPL_VERSION_4 4ULL -#define ZPL_VERSION_5 5ULL -#define ZPL_VERSION ZPL_VERSION_5 -#define ZPL_VERSION_STRING "5" - -#define ZPL_VERSION_INITIAL ZPL_VERSION_1 -#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 -#define ZPL_VERSION_FUID ZPL_VERSION_3 -#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 -#define ZPL_VERSION_SYSATTR ZPL_VERSION_3 -#define ZPL_VERSION_USERSPACE ZPL_VERSION_4 -#define ZPL_VERSION_SA ZPL_VERSION_5 - -/* Rewind policy information */ -#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ -#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ -#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ -#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ -#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ -#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ -#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ - -typedef struct zpool_load_policy { - uint32_t zlp_rewind; /* rewind policy requested */ - uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ - uint64_t zlp_maxdata; /* max acceptable data 
errors */ - uint64_t zlp_txg; /* specific txg to load */ -} zpool_load_policy_t; - -/* - * The following are configuration names used in the nvlist describing a pool's - * configuration. New on-disk names should be prefixed with ":" - * (e.g. "org.open-zfs:") to avoid conflicting names being developed - * independently. - */ -#define ZPOOL_CONFIG_VERSION "version" -#define ZPOOL_CONFIG_POOL_NAME "name" -#define ZPOOL_CONFIG_POOL_STATE "state" -#define ZPOOL_CONFIG_POOL_TXG "txg" -#define ZPOOL_CONFIG_POOL_GUID "pool_guid" -#define ZPOOL_CONFIG_CREATE_TXG "create_txg" -#define ZPOOL_CONFIG_TOP_GUID "top_guid" -#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" -#define ZPOOL_CONFIG_TYPE "type" -#define ZPOOL_CONFIG_CHILDREN "children" -#define ZPOOL_CONFIG_ID "id" -#define ZPOOL_CONFIG_GUID "guid" -#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" -#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" -#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" -#define ZPOOL_CONFIG_PATH "path" -#define ZPOOL_CONFIG_DEVID "devid" -#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" -#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" -#define ZPOOL_CONFIG_ASHIFT "ashift" -#define ZPOOL_CONFIG_ASIZE "asize" -#define ZPOOL_CONFIG_DTL "DTL" -#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ -#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ -#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ -#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ -#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ -#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" -#define ZPOOL_CONFIG_ERRCOUNT "error_count" -#define ZPOOL_CONFIG_NOT_PRESENT "not_present" -#define ZPOOL_CONFIG_SPARES "spares" -#define ZPOOL_CONFIG_IS_SPARE "is_spare" -#define ZPOOL_CONFIG_NPARITY "nparity" -#define ZPOOL_CONFIG_HOSTID "hostid" -#define ZPOOL_CONFIG_HOSTNAME 
"hostname" -#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" -#define ZPOOL_CONFIG_UNSPARE "unspare" -#define ZPOOL_CONFIG_PHYS_PATH "phys_path" -#define ZPOOL_CONFIG_IS_LOG "is_log" -#define ZPOOL_CONFIG_L2CACHE "l2cache" -#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" -#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" -#define ZPOOL_CONFIG_IS_HOLE "is_hole" -#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" -#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" -#define ZPOOL_CONFIG_DDT_STATS "ddt_stats" -#define ZPOOL_CONFIG_SPLIT "splitcfg" -#define ZPOOL_CONFIG_ORIG_GUID "orig_guid" -#define ZPOOL_CONFIG_SPLIT_GUID "split_guid" -#define ZPOOL_CONFIG_SPLIT_LIST "guid_list" -#define ZPOOL_CONFIG_REMOVING "removing" -#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" -#define ZPOOL_CONFIG_COMMENT "comment" -#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ -#define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ -#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ -#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ -#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ -#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ -#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ -#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ -#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ -#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ -#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" -#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ -#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" -#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" -#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" -#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ -#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored 
on disk */ -#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ -#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ -#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ -#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ -#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ - -/* - * The persistent vdev state is stored as separate values rather than a single - * 'vdev_state' entry. This is because a device can be in multiple states, such - * as offline and degraded. - */ -#define ZPOOL_CONFIG_OFFLINE "offline" -#define ZPOOL_CONFIG_FAULTED "faulted" -#define ZPOOL_CONFIG_DEGRADED "degraded" -#define ZPOOL_CONFIG_REMOVED "removed" -#define ZPOOL_CONFIG_FRU "fru" -#define ZPOOL_CONFIG_AUX_STATE "aux_state" - -/* Pool load policy parameters */ -#define ZPOOL_LOAD_POLICY "load-policy" -#define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" -#define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" -#define ZPOOL_LOAD_META_THRESH "load-meta-thresh" -#define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" - -/* Rewind data discovered */ -#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" -#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" -#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" - -#define VDEV_TYPE_ROOT "root" -#define VDEV_TYPE_MIRROR "mirror" -#define VDEV_TYPE_REPLACING "replacing" -#define VDEV_TYPE_RAIDZ "raidz" -#define VDEV_TYPE_DISK "disk" -#define VDEV_TYPE_FILE "file" -#define VDEV_TYPE_MISSING "missing" -#define VDEV_TYPE_HOLE "hole" -#define VDEV_TYPE_SPARE "spare" -#define VDEV_TYPE_LOG "log" -#define VDEV_TYPE_L2CACHE "l2cache" -#define VDEV_TYPE_INDIRECT "indirect" - -/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
*/ -#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ - "com.delphix:indirect_obsolete_sm" -#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ - "com.delphix:obsolete_counts_are_precise" -#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ - "com.delphix:pool_checkpoint_sm" - -#define VDEV_TOP_ZAP_ALLOCATION_BIAS \ - "org.zfsonlinux:allocation_bias" - -/* vdev metaslab allocation bias */ -#define VDEV_ALLOC_BIAS_LOG "log" -#define VDEV_ALLOC_BIAS_SPECIAL "special" -#define VDEV_ALLOC_BIAS_DEDUP "dedup" - -#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ - "com.delphix:next_offset_to_initialize" -#define VDEV_LEAF_ZAP_INITIALIZE_STATE \ - "com.delphix:vdev_initialize_state" -#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ - "com.delphix:vdev_initialize_action_time" - -/* - * This is needed in userland to report the minimum necessary device size. - * - * Note that the zfs test suite uses 64MB vdevs. - */ -#define SPA_MINDEVSIZE (64ULL << 20) - -/* - * Set if the fragmentation has not yet been calculated. This can happen - * because the space maps have not been upgraded or the histogram feature - * is not enabled. - */ -#define ZFS_FRAG_INVALID UINT64_MAX - -/* - * The location of the pool configuration repository, shared between kernel and - * userland. - */ -#define ZPOOL_CACHE "/boot/zfs/zpool.cache" - -/* - * vdev states are ordered from least to most healthy. - * A vdev that's CANT_OPEN or below is considered unusable. - */ -typedef enum vdev_state { - VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ - VDEV_STATE_CLOSED, /* Not currently open */ - VDEV_STATE_OFFLINE, /* Not allowed to open */ - VDEV_STATE_REMOVED, /* Explicitly removed from system */ - VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ - VDEV_STATE_FAULTED, /* External request to fault device */ - VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ - VDEV_STATE_HEALTHY /* Presumed good */ -} vdev_state_t; - -#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY - -/* - * vdev aux states. 
When a vdev is in the CANT_OPEN state, the aux field - * of the vdev stats structure uses these constants to distinguish why. - */ -typedef enum vdev_aux { - VDEV_AUX_NONE, /* no error */ - VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ - VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ - VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ - VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ - VDEV_AUX_TOO_SMALL, /* vdev size is too small */ - VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ - VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ - VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ - VDEV_AUX_UNSUP_FEAT, /* unsupported features */ - VDEV_AUX_SPARED, /* hot spare used in another pool */ - VDEV_AUX_ERR_EXCEEDED, /* too many errors */ - VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ - VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ - VDEV_AUX_EXTERNAL, /* external diagnosis */ - VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ - VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ - VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ - VDEV_AUX_ACTIVE /* vdev active on a different host */ -} vdev_aux_t; - -/* - * pool state. The following states are written to disk as part of the normal - * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining - * states are software abstractions used at various levels to communicate - * pool state. - */ -typedef enum pool_state { - POOL_STATE_ACTIVE = 0, /* In active use */ - POOL_STATE_EXPORTED, /* Explicitly exported */ - POOL_STATE_DESTROYED, /* Explicitly destroyed */ - POOL_STATE_SPARE, /* Reserved for hot spare use */ - POOL_STATE_L2CACHE, /* Level 2 ARC device */ - POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ - POOL_STATE_UNAVAIL, /* Internal libzfs state */ - POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ -} pool_state_t; - -/* - * mmp state. 
The following states provide additional detail describing - * why a pool couldn't be safely imported. - */ -typedef enum mmp_state { - MMP_STATE_ACTIVE = 0, /* In active use */ - MMP_STATE_INACTIVE, /* Inactive and safe to import */ - MMP_STATE_NO_HOSTID /* System hostid is not set */ -} mmp_state_t; - -/* - * Scan Functions. - */ -typedef enum pool_scan_func { - POOL_SCAN_NONE, - POOL_SCAN_SCRUB, - POOL_SCAN_RESILVER, - POOL_SCAN_FUNCS -} pool_scan_func_t; - -/* - * Used to control scrub pause and resume. - */ -typedef enum pool_scrub_cmd { - POOL_SCRUB_NORMAL = 0, - POOL_SCRUB_PAUSE, - POOL_SCRUB_FLAGS_END -} pool_scrub_cmd_t; - -/* - * Initialize functions. - */ -typedef enum pool_initialize_func { - POOL_INITIALIZE_DO, - POOL_INITIALIZE_CANCEL, - POOL_INITIALIZE_SUSPEND, - POOL_INITIALIZE_FUNCS -} pool_initialize_func_t; - -/* - * ZIO types. Needed to interpret vdev statistics below. - */ -typedef enum zio_type { - ZIO_TYPE_NULL = 0, - ZIO_TYPE_READ, - ZIO_TYPE_WRITE, - ZIO_TYPE_FREE, - ZIO_TYPE_CLAIM, - ZIO_TYPE_IOCTL, - ZIO_TYPES -} zio_type_t; - -/* - * Pool statistics. Note: all fields should be 64-bit because this - * is passed between kernel and userland as an nvlist uint64 array. 
- */ -typedef struct pool_scan_stat { - /* values stored on disk */ - uint64_t pss_func; /* pool_scan_func_t */ - uint64_t pss_state; /* dsl_scan_state_t */ - uint64_t pss_start_time; /* scan start time */ - uint64_t pss_end_time; /* scan end time */ - uint64_t pss_to_examine; /* total bytes to scan */ - uint64_t pss_examined; /* total bytes located by scanner */ - uint64_t pss_to_process; /* total bytes to process */ - uint64_t pss_processed; /* total processed bytes */ - uint64_t pss_errors; /* scan errors */ - - /* values not stored on disk */ - uint64_t pss_pass_exam; /* examined bytes per scan pass */ - uint64_t pss_pass_start; /* start time of a scan pass */ - uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */ - /* cumulative time scrub spent paused, needed for rate calculation */ - uint64_t pss_pass_scrub_spent_paused; - - /* Sorted scrubbing new fields */ - /* Stored on disk */ - uint64_t pss_issued; /* total bytes checked by scanner */ - /* Not stored on disk */ - uint64_t pss_pass_issued; /* issued bytes per scan pass */ -} pool_scan_stat_t; - -typedef struct pool_removal_stat { - uint64_t prs_state; /* dsl_scan_state_t */ - uint64_t prs_removing_vdev; - uint64_t prs_start_time; - uint64_t prs_end_time; - uint64_t prs_to_copy; /* bytes that need to be copied */ - uint64_t prs_copied; /* bytes copied so far */ - /* - * bytes of memory used for indirect mappings. - * This includes all removed vdevs. 
- */ - uint64_t prs_mapping_memory; -} pool_removal_stat_t; - -typedef enum dsl_scan_state { - DSS_NONE, - DSS_SCANNING, - DSS_FINISHED, - DSS_CANCELED, - DSS_NUM_STATES -} dsl_scan_state_t; - -typedef enum { - CS_NONE, - CS_CHECKPOINT_EXISTS, - CS_CHECKPOINT_DISCARDING, - CS_NUM_STATES -} checkpoint_state_t; - -typedef struct pool_checkpoint_stat { - uint64_t pcs_state; /* checkpoint_state_t */ - uint64_t pcs_start_time; /* time checkpoint/discard started */ - uint64_t pcs_space; /* checkpointed space */ -} pool_checkpoint_stat_t; - -typedef enum { - VDEV_INITIALIZE_NONE, - VDEV_INITIALIZE_ACTIVE, - VDEV_INITIALIZE_CANCELED, - VDEV_INITIALIZE_SUSPENDED, - VDEV_INITIALIZE_COMPLETE -} vdev_initializing_state_t; - -/* - * Vdev statistics. Note: all fields should be 64-bit because this - * is passed between kernel and userland as an nvlist uint64 array. - */ -typedef struct vdev_stat { - hrtime_t vs_timestamp; /* time since vdev load */ - uint64_t vs_state; /* vdev state */ - uint64_t vs_aux; /* see vdev_aux_t */ - uint64_t vs_alloc; /* space allocated */ - uint64_t vs_space; /* total capacity */ - uint64_t vs_dspace; /* deflated capacity */ - uint64_t vs_rsize; /* replaceable dev size */ - uint64_t vs_esize; /* expandable dev size */ - uint64_t vs_ops[ZIO_TYPES]; /* operation count */ - uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ - uint64_t vs_read_errors; /* read errors */ - uint64_t vs_write_errors; /* write errors */ - uint64_t vs_checksum_errors; /* checksum errors */ - uint64_t vs_self_healed; /* self-healed bytes */ - uint64_t vs_scan_removing; /* removing? 
*/ - uint64_t vs_scan_processed; /* scan processed bytes */ - uint64_t vs_configured_ashift; /* TLV vdev_ashift */ - uint64_t vs_logical_ashift; /* vdev_logical_ashift */ - uint64_t vs_physical_ashift; /* vdev_physical_ashift */ - uint64_t vs_fragmentation; /* device fragmentation */ - uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ - uint64_t vs_initialize_errors; /* initializing errors */ - uint64_t vs_initialize_bytes_done; /* bytes initialized */ - uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ - uint64_t vs_initialize_state; /* vdev_initialzing_state_t */ - uint64_t vs_initialize_action_time; /* time_t */ -} vdev_stat_t; -#define VDEV_STAT_VALID(field, uint64_t_field_count) \ - ((uint64_t_field_count * sizeof(uint64_t)) >= \ - (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field))) - -/* - * DDT statistics. Note: all fields should be 64-bit because this - * is passed between kernel and userland as an nvlist uint64 array. - */ -typedef struct ddt_object { - uint64_t ddo_count; /* number of elments in ddt */ - uint64_t ddo_dspace; /* size of ddt on disk */ - uint64_t ddo_mspace; /* size of ddt in-core */ -} ddt_object_t; - -typedef struct ddt_stat { - uint64_t dds_blocks; /* blocks */ - uint64_t dds_lsize; /* logical size */ - uint64_t dds_psize; /* physical size */ - uint64_t dds_dsize; /* deflated allocated size */ - uint64_t dds_ref_blocks; /* referenced blocks */ - uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ - uint64_t dds_ref_psize; /* referenced psize * refcnt */ - uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ -} ddt_stat_t; - -typedef struct ddt_histogram { - ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ -} ddt_histogram_t; - -#define ZVOL_DRIVER "zvol" -#define ZFS_DRIVER "zfs" -#define ZFS_DEV_NAME "zfs" -#define ZFS_DEV "/dev/" ZFS_DEV_NAME -#define ZFS_DISK_ROOT "/dev/dsk" -#define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/" -#define ZFS_RDISK_ROOT "/dev/rdsk" -#define 
ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/" - -/* general zvol path */ -#define ZVOL_DIR "/dev/zvol" -/* expansion */ -#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" -/* for dump and swap */ -#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/" -#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/" - -#define ZVOL_PROP_NAME "name" -#define ZVOL_DEFAULT_BLOCKSIZE 8192 - -/* - * /dev/zfs ioctl numbers. - * - * These numbers cannot change over time. New ioctl numbers must be appended. - */ -typedef enum zfs_ioc { - /* - * Core features - 81/128 numbers reserved. - */ -#ifdef __FreeBSD__ - ZFS_IOC_FIRST = 0, -#else - ZFS_IOC_FIRST = ('Z' << 8), -#endif - ZFS_IOC = ZFS_IOC_FIRST, - ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, - ZFS_IOC_POOL_DESTROY, - ZFS_IOC_POOL_IMPORT, - ZFS_IOC_POOL_EXPORT, - ZFS_IOC_POOL_CONFIGS, - ZFS_IOC_POOL_STATS, - ZFS_IOC_POOL_TRYIMPORT, - ZFS_IOC_POOL_SCAN, - ZFS_IOC_POOL_FREEZE, - ZFS_IOC_POOL_UPGRADE, - ZFS_IOC_POOL_GET_HISTORY, - ZFS_IOC_VDEV_ADD, - ZFS_IOC_VDEV_REMOVE, - ZFS_IOC_VDEV_SET_STATE, - ZFS_IOC_VDEV_ATTACH, - ZFS_IOC_VDEV_DETACH, - ZFS_IOC_VDEV_SETPATH, - ZFS_IOC_VDEV_SETFRU, - ZFS_IOC_OBJSET_STATS, - ZFS_IOC_OBJSET_ZPLPROPS, - ZFS_IOC_DATASET_LIST_NEXT, - ZFS_IOC_SNAPSHOT_LIST_NEXT, - ZFS_IOC_SET_PROP, - ZFS_IOC_CREATE, - ZFS_IOC_DESTROY, - ZFS_IOC_ROLLBACK, - ZFS_IOC_RENAME, - ZFS_IOC_RECV, - ZFS_IOC_SEND, - ZFS_IOC_INJECT_FAULT, - ZFS_IOC_CLEAR_FAULT, - ZFS_IOC_INJECT_LIST_NEXT, - ZFS_IOC_ERROR_LOG, - ZFS_IOC_CLEAR, - ZFS_IOC_PROMOTE, - ZFS_IOC_DESTROY_SNAPS, - ZFS_IOC_SNAPSHOT, - ZFS_IOC_DSOBJ_TO_DSNAME, - ZFS_IOC_OBJ_TO_PATH, - ZFS_IOC_POOL_SET_PROPS, - ZFS_IOC_POOL_GET_PROPS, - ZFS_IOC_SET_FSACL, - ZFS_IOC_GET_FSACL, - ZFS_IOC_SHARE, - ZFS_IOC_INHERIT_PROP, - ZFS_IOC_SMB_ACL, - ZFS_IOC_USERSPACE_ONE, - ZFS_IOC_USERSPACE_MANY, - ZFS_IOC_USERSPACE_UPGRADE, - ZFS_IOC_HOLD, - ZFS_IOC_RELEASE, - ZFS_IOC_GET_HOLDS, - ZFS_IOC_OBJSET_RECVD_PROPS, - ZFS_IOC_VDEV_SPLIT, - ZFS_IOC_NEXT_OBJ, - ZFS_IOC_DIFF, - ZFS_IOC_TMP_SNAPSHOT, - ZFS_IOC_OBJ_TO_STATS, - 
ZFS_IOC_JAIL, - ZFS_IOC_UNJAIL, - ZFS_IOC_POOL_REGUID, - ZFS_IOC_SPACE_WRITTEN, - ZFS_IOC_SPACE_SNAPS, - ZFS_IOC_SEND_PROGRESS, - ZFS_IOC_POOL_REOPEN, - ZFS_IOC_LOG_HISTORY, - ZFS_IOC_SEND_NEW, - ZFS_IOC_SEND_SPACE, - ZFS_IOC_CLONE, - ZFS_IOC_BOOKMARK, - ZFS_IOC_GET_BOOKMARKS, - ZFS_IOC_DESTROY_BOOKMARKS, -#ifdef __FreeBSD__ - ZFS_IOC_NEXTBOOT, -#endif - ZFS_IOC_CHANNEL_PROGRAM, - ZFS_IOC_REMAP, - ZFS_IOC_POOL_CHECKPOINT, - ZFS_IOC_POOL_DISCARD_CHECKPOINT, - ZFS_IOC_POOL_INITIALIZE, - ZFS_IOC_POOL_SYNC, - ZFS_IOC_SET_BOOTENV, - ZFS_IOC_GET_BOOTENV, - ZFS_IOC_LAST -} zfs_ioc_t; - -/* - * ZFS-specific error codes used for returning descriptive errors - * to the userland through zfs ioctls. - * - * The enum implicitly includes all the error codes from errno.h. - * New code should use and extend this enum for errors that are - * not described precisely by generic errno codes. - * - * These numbers should not change over time. New entries should be appended. - */ -typedef enum { - ZFS_ERR_CHECKPOINT_EXISTS = 1024, - ZFS_ERR_DISCARDING_CHECKPOINT, - ZFS_ERR_NO_CHECKPOINT, - ZFS_ERR_DEVRM_IN_PROGRESS, - ZFS_ERR_VDEV_TOO_BIG, - ZFS_ERR_IOC_CMD_UNAVAIL, - ZFS_ERR_IOC_ARG_UNAVAIL, - ZFS_ERR_IOC_ARG_REQUIRED, - ZFS_ERR_IOC_ARG_BADTYPE, - ZFS_ERR_WRONG_PARENT, -} zfs_errno_t; - -/* - * Internal SPA load state. Used by FMA diagnosis engine. - */ -typedef enum { - SPA_LOAD_NONE, /* no load in progress */ - SPA_LOAD_OPEN, /* normal open */ - SPA_LOAD_IMPORT, /* import in progress */ - SPA_LOAD_TRYIMPORT, /* tryimport in progress */ - SPA_LOAD_RECOVER, /* recovery requested */ - SPA_LOAD_ERROR, /* load failed */ - SPA_LOAD_CREATE /* creation in progress */ -} spa_load_state_t; - -/* - * Bookmark name values. - */ -#define ZPOOL_ERR_LIST "error list" -#define ZPOOL_ERR_DATASET "dataset" -#define ZPOOL_ERR_OBJECT "object" - -#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) - -/* - * The following are names used in the nvlist describing - * the pool's history log. 
- */ -#define ZPOOL_HIST_RECORD "history record" -#define ZPOOL_HIST_TIME "history time" -#define ZPOOL_HIST_CMD "history command" -#define ZPOOL_HIST_WHO "history who" -#define ZPOOL_HIST_ZONE "history zone" -#define ZPOOL_HIST_HOST "history hostname" -#define ZPOOL_HIST_TXG "history txg" -#define ZPOOL_HIST_INT_EVENT "history internal event" -#define ZPOOL_HIST_INT_STR "history internal str" -#define ZPOOL_HIST_INT_NAME "internal_name" -#define ZPOOL_HIST_IOCTL "ioctl" -#define ZPOOL_HIST_INPUT_NVL "in_nvl" -#define ZPOOL_HIST_OUTPUT_NVL "out_nvl" -#define ZPOOL_HIST_DSNAME "dsname" -#define ZPOOL_HIST_DSID "dsid" -#define ZPOOL_HIST_ERRNO "errno" - -/* - * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. - */ -#define ZPOOL_INITIALIZE_COMMAND "initialize_command" -#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" - -/* - * Flags for ZFS_IOC_VDEV_SET_STATE - */ -#define ZFS_ONLINE_CHECKREMOVE 0x1 -#define ZFS_ONLINE_UNSPARE 0x2 -#define ZFS_ONLINE_FORCEFAULT 0x4 -#define ZFS_ONLINE_EXPAND 0x8 -#define ZFS_OFFLINE_TEMPORARY 0x1 - -/* - * Flags for ZFS_IOC_POOL_IMPORT - */ -#define ZFS_IMPORT_NORMAL 0x0 -#define ZFS_IMPORT_VERBATIM 0x1 -#define ZFS_IMPORT_ANY_HOST 0x2 -#define ZFS_IMPORT_MISSING_LOG 0x4 -#define ZFS_IMPORT_ONLY 0x8 -#define ZFS_IMPORT_CHECKPOINT 0x10 -#define ZFS_IMPORT_TEMP_NAME 0x20 -#define ZFS_IMPORT_SKIP_MMP 0x40 - -/* - * Channel program argument/return nvlist keys and defaults. - */ -#define ZCP_ARG_PROGRAM "program" -#define ZCP_ARG_ARGLIST "arg" -#define ZCP_ARG_SYNC "sync" -#define ZCP_ARG_INSTRLIMIT "instrlimit" -#define ZCP_ARG_MEMLIMIT "memlimit" - -#define ZCP_ARG_CLIARGV "argv" - -#define ZCP_RET_ERROR "error" -#define ZCP_RET_RETURN "return" - -#define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) -#define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) -#define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) -#define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) - -/* - * nvlist name constants. 
Facilitate restricting snapshot iteration range for - * the "list next snapshot" ioctl - */ -#define SNAP_ITER_MIN_TXG "snap_iter_min_txg" -#define SNAP_ITER_MAX_TXG "snap_iter_max_txg" - -/* - * Sysevent payload members. ZFS will generate the following sysevents with the - * given payloads: - * - * ESC_ZFS_RESILVER_START - * ESC_ZFS_RESILVER_END - * ESC_ZFS_POOL_DESTROY - * ESC_ZFS_POOL_REGUID - * - * ZFS_EV_POOL_NAME DATA_TYPE_STRING - * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 - * - * ESC_ZFS_VDEV_REMOVE - * ESC_ZFS_VDEV_CLEAR - * ESC_ZFS_VDEV_CHECK - * - * ZFS_EV_POOL_NAME DATA_TYPE_STRING - * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 - * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) - * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 - * - * ESC_ZFS_HISTORY_EVENT - * - * ZFS_EV_POOL_NAME DATA_TYPE_STRING - * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 - * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) - * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) - * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) - * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) - * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) - * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) - * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) - * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) - * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) - * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) - * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) - * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) - * - * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the - * history log nvlist. The keynames will be free of any spaces or other - * characters that could be potentially unexpected to consumers of the - * sysevents. 
- */ -#define ZFS_EV_POOL_NAME "pool_name" -#define ZFS_EV_POOL_GUID "pool_guid" -#define ZFS_EV_VDEV_PATH "vdev_path" -#define ZFS_EV_VDEV_GUID "vdev_guid" -#define ZFS_EV_HIST_TIME "history_time" -#define ZFS_EV_HIST_CMD "history_command" -#define ZFS_EV_HIST_WHO "history_who" -#define ZFS_EV_HIST_ZONE "history_zone" -#define ZFS_EV_HIST_HOST "history_hostname" -#define ZFS_EV_HIST_TXG "history_txg" -#define ZFS_EV_HIST_INT_EVENT "history_internal_event" -#define ZFS_EV_HIST_INT_STR "history_internal_str" -#define ZFS_EV_HIST_INT_NAME "history_internal_name" -#define ZFS_EV_HIST_IOCTL "history_ioctl" -#define ZFS_EV_HIST_DSNAME "history_dsname" -#define ZFS_EV_HIST_DSID "history_dsid" - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h deleted file mode 100644 index 36c9eaa7f18e..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _ZUT_H -#define _ZUT_H - -/* - * IOCTLs for the zfs unit test driver - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -#define ZUT_DRIVER "zut" -#define ZUT_DEV "/dev/zut" - -#define ZUT_VERSION_STRING "1" - -/* - * /dev/zut ioctl numbers. - */ -#define ZUT_IOC ('U' << 8) - -/* Request flags */ -#define ZUT_IGNORECASE 0x01 -#define ZUT_ACCFILTER 0x02 -#define ZUT_XATTR 0x04 -#define ZUT_EXTRDDIR 0x08 -#define ZUT_GETSTAT 0x10 - -typedef struct zut_lookup { - int zl_reqflags; - int zl_deflags; /* output */ - int zl_retcode; /* output */ - char zl_dir[MAXPATHLEN]; - char zl_file[MAXNAMELEN]; - char zl_xfile[MAXNAMELEN]; - char zl_real[MAXPATHLEN]; /* output */ - uint64_t zl_xvattrs; /* output */ - struct stat64 zl_statbuf; /* output */ -} zut_lookup_t; - -typedef struct zut_readdir { - uint64_t zr_buf; /* pointer to output buffer */ - uint64_t zr_loffset; /* output */ - char zr_dir[MAXPATHLEN]; - char zr_file[MAXNAMELEN]; - int zr_reqflags; - int zr_retcode; /* output */ - int zr_eof; /* output */ - uint_t zr_bytes; /* output */ - uint_t zr_buflen; -} zut_readdir_t; - -typedef enum zut_ioc { - ZUT_IOC_MIN_CMD = ZUT_IOC - 1, - ZUT_IOC_LOOKUP = ZUT_IOC, - ZUT_IOC_READDIR, - ZUT_IOC_MAX_CMD -} zut_ioc_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZUT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h deleted file mode 100644 index 52d6aea0a364..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h +++ /dev/null @@ -1,351 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_NVPAIR_H -#define _SYS_NVPAIR_H - -#include -#include -#include - -#if defined(_KERNEL) && !defined(_BOOT) -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - DATA_TYPE_DONTCARE = -1, - DATA_TYPE_UNKNOWN = 0, - DATA_TYPE_BOOLEAN, - DATA_TYPE_BYTE, - DATA_TYPE_INT16, - DATA_TYPE_UINT16, - DATA_TYPE_INT32, - DATA_TYPE_UINT32, - DATA_TYPE_INT64, - DATA_TYPE_UINT64, - DATA_TYPE_STRING, - DATA_TYPE_BYTE_ARRAY, - DATA_TYPE_INT16_ARRAY, - DATA_TYPE_UINT16_ARRAY, - DATA_TYPE_INT32_ARRAY, - DATA_TYPE_UINT32_ARRAY, - DATA_TYPE_INT64_ARRAY, - DATA_TYPE_UINT64_ARRAY, - DATA_TYPE_STRING_ARRAY, - DATA_TYPE_HRTIME, - DATA_TYPE_NVLIST, - DATA_TYPE_NVLIST_ARRAY, - DATA_TYPE_BOOLEAN_VALUE, - DATA_TYPE_INT8, - DATA_TYPE_UINT8, - DATA_TYPE_BOOLEAN_ARRAY, - DATA_TYPE_INT8_ARRAY, -#if !defined(_KERNEL) - DATA_TYPE_UINT8_ARRAY, - DATA_TYPE_DOUBLE -#else - DATA_TYPE_UINT8_ARRAY -#endif -} data_type_t; - -typedef struct nvpair { - int32_t nvp_size; /* size of this nvpair */ - int16_t nvp_name_sz; /* length of name string */ - int16_t nvp_reserve; /* not used */ - int32_t nvp_value_elem; /* number of elements for array types */ - data_type_t nvp_type; /* type of value */ - /* name string */ - /* aligned ptr array for string arrays */ - /* aligned array of data for value */ -} nvpair_t; - -/* nvlist header */ 
-typedef struct nvlist { - int32_t nvl_version; - uint32_t nvl_nvflag; /* persistent flags */ - uint64_t nvl_priv; /* ptr to private data if not packed */ - uint32_t nvl_flag; - int32_t nvl_pad; /* currently not used, for alignment */ -} nvlist_t; - -/* nvp implementation version */ -#define NV_VERSION 0 - -/* nvlist pack encoding */ -#define NV_ENCODE_NATIVE 0 -#define NV_ENCODE_XDR 1 - -/* nvlist persistent unique name flags, stored in nvl_nvflags */ -#define NV_UNIQUE_NAME 0x1 -#define NV_UNIQUE_NAME_TYPE 0x2 - -/* nvlist lookup pairs related flags */ -#define NV_FLAG_NOENTOK 0x1 - -/* convenience macros */ -#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul) -#define NV_ALIGN4(x) (((x) + 3) & ~3) - -#define NVP_SIZE(nvp) ((nvp)->nvp_size) -#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t)) -#define NVP_TYPE(nvp) ((nvp)->nvp_type) -#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem) -#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \ - + (nvp)->nvp_name_sz)) - -#define NVL_VERSION(nvl) ((nvl)->nvl_version) -#define NVL_SIZE(nvl) ((nvl)->nvl_size) -#define NVL_FLAG(nvl) ((nvl)->nvl_flag) - -/* NV allocator framework */ -typedef struct nv_alloc_ops nv_alloc_ops_t; - -typedef struct nv_alloc { - const nv_alloc_ops_t *nva_ops; - void *nva_arg; -} nv_alloc_t; - -struct nv_alloc_ops { - int (*nv_ao_init)(nv_alloc_t *, __va_list); - void (*nv_ao_fini)(nv_alloc_t *); - void *(*nv_ao_alloc)(nv_alloc_t *, size_t); - void (*nv_ao_free)(nv_alloc_t *, void *, size_t); - void (*nv_ao_reset)(nv_alloc_t *); -}; - -extern const nv_alloc_ops_t *nv_fixed_ops; -extern nv_alloc_t *nv_alloc_nosleep; - -#if defined(_KERNEL) && !defined(_BOOT) -extern nv_alloc_t *nv_alloc_sleep; -#endif - -int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...); -void nv_alloc_reset(nv_alloc_t *); -void nv_alloc_fini(nv_alloc_t *); - -/* list management */ -int nvlist_alloc(nvlist_t **, uint_t, int); -void nvlist_free(nvlist_t *); -int nvlist_size(nvlist_t *, size_t *, 
int); -int nvlist_pack(nvlist_t *, char **, size_t *, int, int); -int nvlist_unpack(char *, size_t, nvlist_t **, int); -int nvlist_dup(nvlist_t *, nvlist_t **, int); -int nvlist_merge(nvlist_t *, nvlist_t *, int); - -uint_t nvlist_nvflag(nvlist_t *); - -int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *); -int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *); -int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *); -int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *); -nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *); - -int nvlist_add_nvpair(nvlist_t *, nvpair_t *); -int nvlist_add_boolean(nvlist_t *, const char *); -int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); -int nvlist_add_byte(nvlist_t *, const char *, uchar_t); -int nvlist_add_int8(nvlist_t *, const char *, int8_t); -int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); -int nvlist_add_int16(nvlist_t *, const char *, int16_t); -int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); -int nvlist_add_int32(nvlist_t *, const char *, int32_t); -int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); -int nvlist_add_int64(nvlist_t *, const char *, int64_t); -int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); -int nvlist_add_string(nvlist_t *, const char *, const char *); -int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); -int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); -int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); -int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); -int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); -int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); -int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); -int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); -int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); -int 
nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); -int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); -int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t); -int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); -int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t); -#if !defined(_KERNEL) -int nvlist_add_double(nvlist_t *, const char *, double); -#endif - -int nvlist_remove(nvlist_t *, const char *, data_type_t); -int nvlist_remove_all(nvlist_t *, const char *); -int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); - -int nvlist_lookup_boolean(nvlist_t *, const char *); -int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); -int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *); -int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *); -int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *); -int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *); -int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *); -int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *); -int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *); -int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *); -int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *); -int nvlist_lookup_string(nvlist_t *, const char *, char **); -int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); -int nvlist_lookup_boolean_array(nvlist_t *, const char *, - boolean_t **, uint_t *); -int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *); -int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *); -int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *); -int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *); -int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *); -int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t 
*); -int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *); -int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *); -int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *); -int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *); -int nvlist_lookup_nvlist_array(nvlist_t *, const char *, - nvlist_t ***, uint_t *); -int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); -int nvlist_lookup_pairs(nvlist_t *, int, ...); -#if !defined(_KERNEL) -int nvlist_lookup_double(nvlist_t *, const char *, double *); -#endif - -int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); -int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, - int *, char **); -boolean_t nvlist_exists(nvlist_t *, const char *); -boolean_t nvlist_empty(nvlist_t *); - -/* processing nvpair */ -nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); -nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); -char *nvpair_name(nvpair_t *); -data_type_t nvpair_type(nvpair_t *); -int nvpair_type_is_array(nvpair_t *); -int nvpair_value_boolean_value(nvpair_t *, boolean_t *); -int nvpair_value_byte(nvpair_t *, uchar_t *); -int nvpair_value_int8(nvpair_t *, int8_t *); -int nvpair_value_uint8(nvpair_t *, uint8_t *); -int nvpair_value_int16(nvpair_t *, int16_t *); -int nvpair_value_uint16(nvpair_t *, uint16_t *); -int nvpair_value_int32(nvpair_t *, int32_t *); -int nvpair_value_uint32(nvpair_t *, uint32_t *); -int nvpair_value_int64(nvpair_t *, int64_t *); -int nvpair_value_uint64(nvpair_t *, uint64_t *); -int nvpair_value_string(nvpair_t *, char **); -int nvpair_value_nvlist(nvpair_t *, nvlist_t **); -int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); -int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); -int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); -int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); -int 
nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); -int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); -int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); -int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); -int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); -int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); -int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); -int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); -int nvpair_value_hrtime(nvpair_t *, hrtime_t *); -#if !defined(_KERNEL) -int nvpair_value_double(nvpair_t *, double *); -#endif - -nvlist_t *fnvlist_alloc(void); -void fnvlist_free(nvlist_t *); -size_t fnvlist_size(nvlist_t *); -char *fnvlist_pack(nvlist_t *, size_t *); -void fnvlist_pack_free(char *, size_t); -nvlist_t *fnvlist_unpack(char *, size_t); -nvlist_t *fnvlist_dup(nvlist_t *); -void fnvlist_merge(nvlist_t *, nvlist_t *); -size_t fnvlist_num_pairs(nvlist_t *); - -void fnvlist_add_boolean(nvlist_t *, const char *); -void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); -void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); -void fnvlist_add_int8(nvlist_t *, const char *, int8_t); -void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); -void fnvlist_add_int16(nvlist_t *, const char *, int16_t); -void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); -void fnvlist_add_int32(nvlist_t *, const char *, int32_t); -void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); -void fnvlist_add_int64(nvlist_t *, const char *, int64_t); -void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); -void fnvlist_add_string(nvlist_t *, const char *, const char *); -void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); -void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); -void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); -void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, 
uint_t); -void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); -void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); -void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); -void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); -void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); -void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); -void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); -void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); -void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); -void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); - -void fnvlist_remove(nvlist_t *, const char *); -void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); - -nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name); -boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name); -boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name); -uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name); -int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name); -int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name); -int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name); -int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name); -uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name); -uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name); -uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name); -uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name); -char *fnvlist_lookup_string(nvlist_t *nvl, const char *name); -nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name); - -boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); -uchar_t fnvpair_value_byte(nvpair_t *nvp); -int8_t fnvpair_value_int8(nvpair_t *nvp); -int16_t fnvpair_value_int16(nvpair_t *nvp); 
-int32_t fnvpair_value_int32(nvpair_t *nvp); -int64_t fnvpair_value_int64(nvpair_t *nvp); -uint8_t fnvpair_value_uint8_t(nvpair_t *nvp); -uint16_t fnvpair_value_uint16(nvpair_t *nvp); -uint32_t fnvpair_value_uint32(nvpair_t *nvp); -uint64_t fnvpair_value_uint64(nvpair_t *nvp); -char *fnvpair_value_string(nvpair_t *nvp); -nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_NVPAIR_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h deleted file mode 100644 index c9874b3e4db7..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#ifndef _NVPAIR_IMPL_H -#define _NVPAIR_IMPL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -/* - * The structures here provided for information and debugging purposes only - * may be changed in the future. 
- */ - -/* - * implementation linked list for pre-packed data - */ -typedef struct i_nvp i_nvp_t; - -struct i_nvp { - union { - /* ensure alignment */ - uint64_t _nvi_align; - - struct { - /* pointer to next nvpair */ - i_nvp_t *_nvi_next; - - /* pointer to prev nvpair */ - i_nvp_t *_nvi_prev; - - /* next pair in table bucket */ - i_nvp_t *_nvi_hashtable_next; - } _nvi; - } _nvi_un; - - /* nvpair */ - nvpair_t nvi_nvp; -}; -#define nvi_next _nvi_un._nvi._nvi_next -#define nvi_prev _nvi_un._nvi._nvi_prev -#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next - -typedef struct { - i_nvp_t *nvp_list; /* linked list of nvpairs */ - i_nvp_t *nvp_last; /* last nvpair */ - i_nvp_t *nvp_curr; /* current walker nvpair */ - nv_alloc_t *nvp_nva; /* pluggable allocator */ - uint32_t nvp_stat; /* internal state */ - - i_nvp_t **nvp_hashtable; /* table of entries used for lookup */ - uint32_t nvp_nbuckets; /* # of buckets in hash table */ - uint32_t nvp_nentries; /* # of entries in hash table */ -} nvpriv_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _NVPAIR_IMPL_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h deleted file mode 100644 index 465d8998d4e2..000000000000 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h +++ /dev/null @@ -1,427 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 RackTop Systems. - */ - -/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ - -/* - * University Copyright- Copyright (c) 1982, 1986, 1988 - * The Regents of the University of California - * All Rights Reserved - * - * University Acknowledgment- Portions of this document are derived from - * software developed by the University of California, Berkeley, and its - * contributors. - */ - -#ifndef _SYS_VNODE_H -#define _SYS_VNODE_H - -#include_next - -#define IS_DEVVP(vp) \ - ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) - -#define V_XATTRDIR 0x0000 /* attribute unnamed directory */ - -#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ - -/* - * Structure of all optional attributes. - */ -typedef struct xoptattr { - timestruc_t xoa_createtime; /* Create time of file */ - uint8_t xoa_archive; - uint8_t xoa_system; - uint8_t xoa_readonly; - uint8_t xoa_hidden; - uint8_t xoa_nounlink; - uint8_t xoa_immutable; - uint8_t xoa_appendonly; - uint8_t xoa_nodump; - uint8_t xoa_opaque; - uint8_t xoa_av_quarantined; - uint8_t xoa_av_modified; - uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; - uint8_t xoa_reparse; - uint64_t xoa_generation; - uint8_t xoa_offline; - uint8_t xoa_sparse; -} xoptattr_t; - -/* - * The xvattr structure is really a variable length structure that - * is made up of: - * - The classic vattr_t (xva_vattr) - * - a 32 bit quantity (xva_mapsize) that specifies the size of the - * attribute bitmaps in 32 bit words. 
- * - A pointer to the returned attribute bitmap (needed because the - * previous element, the requested attribute bitmap) is variable lenth. - * - The requested attribute bitmap, which is an array of 32 bit words. - * Callers use the XVA_SET_REQ() macro to set the bits corresponding to - * the attributes that are being requested. - * - The returned attribute bitmap, which is an array of 32 bit words. - * File systems that support optional attributes use the XVA_SET_RTN() - * macro to set the bits corresponding to the attributes that are being - * returned. - * - The xoptattr_t structure which contains the attribute values - * - * xva_mapsize determines how many words in the attribute bitmaps. - * Immediately following the attribute bitmaps is the xoptattr_t. - * xva_getxoptattr() is used to get the pointer to the xoptattr_t - * section. - */ - -#define XVA_MAPSIZE 3 /* Size of attr bitmaps */ -#define XVA_MAGIC 0x78766174 /* Magic # for verification */ - -/* - * The xvattr structure is an extensible structure which permits optional - * attributes to be requested/returned. File systems may or may not support - * optional attributes. They do so at their own discretion but if they do - * support optional attributes, they must register the VFSFT_XVATTR feature - * so that the optional attributes can be set/retrived. - * - * The fields of the xvattr structure are: - * - * xva_vattr - The first element of an xvattr is a legacy vattr structure - * which includes the common attributes. If AT_XVATTR is set in the va_mask - * then the entire structure is treated as an xvattr. If AT_XVATTR is not - * set, then only the xva_vattr structure can be used. - * - * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. - * - * xva_mapsize - Size of requested and returned attribute bitmaps. - * - * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. 
We need this since the - * size of the array before it, xva_reqattrmap[], could change which means - * the location of xva_rtnattrmap[] could change. This will allow unbundled - * file systems to find the location of xva_rtnattrmap[] when the sizes change. - * - * xva_reqattrmap[] - Array of requested attributes. Attributes are - * represented by a specific bit in a specific element of the attribute - * map array. Callers set the bits corresponding to the attributes - * that the caller wants to get/set. - * - * xva_rtnattrmap[] - Array of attributes that the file system was able to - * process. Not all file systems support all optional attributes. This map - * informs the caller which attributes the underlying file system was able - * to set/get. (Same structure as the requested attributes array in terms - * of each attribute corresponding to specific bits and array elements.) - * - * xva_xoptattrs - Structure containing values of optional attributes. - * These values are only valid if the corresponding bits in xva_reqattrmap - * are set and the underlying file system supports those attributes. - */ -typedef struct xvattr { - vattr_t xva_vattr; /* Embedded vattr structure */ - uint32_t xva_magic; /* Magic Number */ - uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ - uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ - uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ - uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ - xoptattr_t xva_xoptattrs; /* Optional attributes */ -} xvattr_t; - -/* - * Attributes of interest to the caller of setattr or getattr. 
- */ -#define AT_TYPE 0x00001 -#define AT_MODE 0x00002 -#define AT_UID 0x00004 -#define AT_GID 0x00008 -#define AT_FSID 0x00010 -#define AT_NODEID 0x00020 -#define AT_NLINK 0x00040 -#define AT_SIZE 0x00080 -#define AT_ATIME 0x00100 -#define AT_MTIME 0x00200 -#define AT_CTIME 0x00400 -#define AT_RDEV 0x00800 -#define AT_BLKSIZE 0x01000 -#define AT_NBLOCKS 0x02000 -/* 0x04000 */ /* unused */ -#define AT_SEQ 0x08000 -/* - * If AT_XVATTR is set then there are additional bits to process in - * the xvattr_t's attribute bitmap. If this is not set then the bitmap - * MUST be ignored. Note that this bit must be set/cleared explicitly. - * That is, setting AT_ALL will NOT set AT_XVATTR. - */ -#define AT_XVATTR 0x10000 - -#define AT_ALL (AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\ - AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\ - AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) - -#define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\ - AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE) - -#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME) - -#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\ - AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) - -/* - * Attribute bits used in the extensible attribute's (xva's) attribute - * bitmaps. Note that the bitmaps are made up of a variable length number - * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n" - * is the element in the bitmap (starting at 1). This convention is for - * the convenience of the maintainer to keep track of which element each - * attribute belongs to. - * - * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS - * MUST USE THE XAT_* DEFINES. 
- */ -#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ -#define XAT0_CREATETIME 0x00000001 /* Create time of file */ -#define XAT0_ARCHIVE 0x00000002 /* Archive */ -#define XAT0_SYSTEM 0x00000004 /* System */ -#define XAT0_READONLY 0x00000008 /* Readonly */ -#define XAT0_HIDDEN 0x00000010 /* Hidden */ -#define XAT0_NOUNLINK 0x00000020 /* Nounlink */ -#define XAT0_IMMUTABLE 0x00000040 /* immutable */ -#define XAT0_APPENDONLY 0x00000080 /* appendonly */ -#define XAT0_NODUMP 0x00000100 /* nodump */ -#define XAT0_OPAQUE 0x00000200 /* opaque */ -#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ -#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ -#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ -#define XAT0_REPARSE 0x00002000 /* FS reparse point */ -#define XAT0_GEN 0x00004000 /* object generation number */ -#define XAT0_OFFLINE 0x00008000 /* offline */ -#define XAT0_SPARSE 0x00010000 /* sparse */ - -#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \ - XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \ - XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| XAT0_AV_MODIFIED| \ - XAT0_AV_SCANSTAMP|XAT0_REPARSE|XATO_GEN|XAT0_OFFLINE|XAT0_SPARSE) - -/* Support for XAT_* optional attributes */ -#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ -#define XVA_SHFT 32 /* Used to shift index */ - -/* - * Used to pry out the index and attribute bits from the XAT_* attributes - * defined below. Note that we're masking things down to 32 bits then - * casting to uint32_t. - */ -#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) -#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) - -/* - * The following defines present a "flat namespace" so that consumers don't - * need to keep track of which element belongs to which bitmap entry. 
- * - * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER - */ -#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) -#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) -#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) -#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) -#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) -#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) -#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) -#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) -#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) -#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) -#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) -#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) -#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) -#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) -#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) -#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) -#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) - -/* - * The returned attribute map array (xva_rtnattrmap[]) is located past the - * requested attribute map array (xva_reqattrmap[]). Its location changes - * when the array sizes change. We use a separate pointer in a known location - * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is - * set in xva_init() - */ -#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) - -/* - * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap - * of requested attributes (xva_reqattrmap[]). 
- */ -#define XVA_SET_REQ(xvap, attr) { \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ - ASSERT((xvap)->xva_magic == XVA_MAGIC); \ - (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \ -} -/* - * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap - * of requested attributes (xva_reqattrmap[]). - */ -#define XVA_CLR_REQ(xvap, attr) { \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ - ASSERT((xvap)->xva_magic == XVA_MAGIC); \ - (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr); \ -} - -/* - * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap - * of returned attributes (xva_rtnattrmap[]). - */ -#define XVA_SET_RTN(xvap, attr) { \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ - ASSERT((xvap)->xva_magic == XVA_MAGIC); \ - (XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \ -} - -/* - * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[]) - * to see of the corresponding attribute bit is set. If so, returns non-zero. - */ -#define XVA_ISSET_REQ(xvap, attr) \ - ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ - ((xvap)->xva_magic == XVA_MAGIC) && \ - ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \ - ((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) - -/* - * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[]) - * to see of the corresponding attribute bit is set. If so, returns non-zero. - */ -#define XVA_ISSET_RTN(xvap, attr) \ - ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ - ((xvap)->xva_magic == XVA_MAGIC) && \ - ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \ - ((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) - -#define MODEMASK 07777 /* mode bits plus permission bits */ -#define PERMMASK 00777 /* permission bits */ - -/* - * VOP_ACCESS flags - */ -#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ - -/* - * Flags for vnode operations. 
- */ -enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ -enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ - -/* - * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations - */ - -typedef struct vsecattr { - uint_t vsa_mask; /* See below */ - int vsa_aclcnt; /* ACL entry count */ - void *vsa_aclentp; /* pointer to ACL entries */ - int vsa_dfaclcnt; /* default ACL entry count */ - void *vsa_dfaclentp; /* pointer to default ACL entries */ - size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ - uint_t vsa_aclflags; /* ACE ACL flags */ -} vsecattr_t; - -/* vsa_mask values */ -#define VSA_ACL 0x0001 -#define VSA_ACLCNT 0x0002 -#define VSA_DFACL 0x0004 -#define VSA_DFACLCNT 0x0008 -#define VSA_ACE 0x0010 -#define VSA_ACECNT 0x0020 -#define VSA_ACE_ALLTYPES 0x0040 -#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */ - -/* - * Structure used by various vnode operations to determine - * the context (pid, host, identity) of a caller. - * - * The cc_caller_id is used to identify one or more callers who invoke - * operations, possibly on behalf of others. For example, the NFS - * server could have it's own cc_caller_id which can be detected by - * vnode/vfs operations or (FEM) monitors on those operations. New - * caller IDs are generated by fs_new_caller_id(). 
- */ -typedef struct caller_context { - pid_t cc_pid; /* Process ID of the caller */ - int cc_sysid; /* System ID, used for remote calls */ - u_longlong_t cc_caller_id; /* Identifier for (set of) caller(s) */ - ulong_t cc_flags; -} caller_context_t; - -struct taskq; - -/* - * Flags for VOP_LOOKUP - * - * Defined in file.h, but also possible, FIGNORECASE and FSEARCH - * - */ -#define LOOKUP_DIR 0x01 /* want parent dir vp */ -#define LOOKUP_XATTR 0x02 /* lookup up extended attr dir */ -#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ -#define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ - -/* - * Flags for VOP_READDIR - */ -#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */ -#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */ - -/* - * Public vnode manipulation functions. - */ -#ifdef _KERNEL - -void vn_rele_async(struct vnode *vp, struct taskq *taskq); - -/* - * Extensible vnode attribute (xva) routines: - * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XATTR) - * xva_getxoptattr() returns a ponter to the xoptattr_t section of xvattr_t - */ -void xva_init(xvattr_t *); -xoptattr_t *xva_getxoptattr(xvattr_t *); /* Get ptr to xoptattr_t */ - -#define VN_RELE_ASYNC(vp, taskq) { \ - vn_rele_async(vp, taskq); \ -} - -#endif /* _KERNEL */ - -/* - * Flags to VOP_SETATTR/VOP_GETATTR. 
- */ -#define ATTR_UTIME 0x01 /* non-default utime(2) request */ -#define ATTR_EXEC 0x02 /* invocation from exec(2) */ -#define ATTR_COMM 0x04 /* yield common vp attributes */ -#define ATTR_HINT 0x08 /* information returned will be `hint' */ -#define ATTR_REAL 0x10 /* yield attributes of the real vp */ -#define ATTR_NOACLCHECK 0x20 /* Don't check ACL when checking permissions */ -#define ATTR_TRIGGER 0x40 /* Mount first if vnode is a trigger mount */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VNODE_H */ diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c index 924a59b3d656..4f9d9995cbab 100644 --- a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c +++ b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include diff --git a/sys/cddl/dev/fbt/fbt.c b/sys/cddl/dev/fbt/fbt.c index 775100c0a68b..db9b8d10347e 100644 --- a/sys/cddl/dev/fbt/fbt.c +++ b/sys/cddl/dev/fbt/fbt.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/sys/cddl/dev/profile/profile.c b/sys/cddl/dev/profile/profile.c index 449ff78ffc36..4fc9f6f22eef 100644 --- a/sys/cddl/dev/profile/profile.c +++ b/sys/cddl/dev/profile/profile.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/sys/cddl/dev/sdt/sdt.c b/sys/cddl/dev/sdt/sdt.c index 9e3053d90073..c5f1d5619761 100644 --- a/sys/cddl/dev/sdt/sdt.c +++ b/sys/cddl/dev/sdt/sdt.c @@ -44,6 +44,7 @@ #include #include +#include #include #include #include diff --git a/sys/cddl/dev/systrace/systrace.c b/sys/cddl/dev/systrace/systrace.c index 90274223c512..c455ea9482dc 100644 --- a/sys/cddl/dev/systrace/systrace.c +++ b/sys/cddl/dev/systrace/systrace.c @@ -290,7 +290,7 @@ systrace_provide(void *arg, dtrace_probedesc_t *desc) static void systrace_destroy(void *arg, dtrace_id_t id, void *parg) { -#ifdef DEBUG +#ifdef SYSTRACE_DEBUG int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); 
/* diff --git a/sys/conf/files b/sys/conf/files index 1db09c8cfae5..888cbc656556 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -133,184 +133,231 @@ cam/scsi/scsi_sg.c optional sg cam/scsi/scsi_targ_bh.c optional targbh cam/scsi/scsi_target.c optional targ cam/scsi/smp_all.c optional scbus + # shared between zfs and dtrace -cddl/compat/opensolaris/kern/opensolaris.c optional zfs | dtrace compile-with "${CDDL_C}" -cddl/compat/opensolaris/kern/opensolaris_cmn_err.c optional zfs | dtrace compile-with "${CDDL_C}" -cddl/compat/opensolaris/kern/opensolaris_kmem.c optional zfs | dtrace compile-with "${CDDL_C}" -cddl/compat/opensolaris/kern/opensolaris_misc.c optional zfs | dtrace compile-with "${CDDL_C}" +cddl/compat/opensolaris/kern/opensolaris.c optional dtrace compile-with "${CDDL_C}" cddl/compat/opensolaris/kern/opensolaris_proc.c optional zfs | dtrace compile-with "${CDDL_C}" -cddl/compat/opensolaris/kern/opensolaris_sunddi.c optional zfs | dtrace compile-with "${CDDL_C}" -cddl/compat/opensolaris/kern/opensolaris_taskq.c optional zfs | dtrace compile-with "${CDDL_C}" +contrib/openzfs/module/os/freebsd/spl/spl_misc.c optional zfs | dtrace compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c optional zfs | dtrace compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_taskq.c optional zfs | dtrace compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_kmem.c optional zfs | dtrace compile-with "${ZFS_C}" + +#zfs solaris portability layer +contrib/openzfs/module/os/freebsd/spl/acl_common.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/callb.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/list.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_acl.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_kstat.c optional 
zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_policy.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_string.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_uio.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_vfs.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_vm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_zone.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/spl/spl_zlib.c optional zfs compile-with "${ZFS_C}" + + # zfs specific -cddl/compat/opensolaris/kern/opensolaris_acl.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_dtrace.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_kobj.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_kstat.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_lookup.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_policy.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_string.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_sysevent.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_uio.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_vfs.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_vm.c optional zfs compile-with "${ZFS_C}" -cddl/compat/opensolaris/kern/opensolaris_zone.c optional zfs compile-with "${ZFS_C}" 
-cddl/contrib/opensolaris/common/acl/acl_common.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/avl/avl.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/lz4/lz4.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/unicode/u8_textprep.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zfeature_common.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zfs_comutil.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zfs_deleg.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zfs_prop.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zpool_prop.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}" 
-cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \ - warning "kernel contains CDDL licensed ZFS filesystem" -cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}" 
-cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c optional zfs 
compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c optional zfs compile-with 
"${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}" 
-cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/callb.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/fm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/list.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/zmod/zmod.c optional zfs compile-with "${ZFS_C}" + +#zfs avl +contrib/openzfs/module/avl/avl.c optional zfs compile-with "${ZFS_C}" + # zfs lua 
support -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c optional zfs compile-with "${ZFS_C}" 
-cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lapi.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lauxlib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lbaselib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lcode.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lcompat.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lcorolib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lctype.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ldebug.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ldo.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lfunc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lgc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/llex.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lmem.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lobject.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lopcodes.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lparser.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lstate.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lstring.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lstrlib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ltable.c 
optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ltablib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ltm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lvm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lzio.c optional zfs compile-with "${ZFS_C}" + +# zfs nvpair support +contrib/openzfs/module/nvpair/fnvpair.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/nvpair/nvpair.c optional zfs compile-with "${ZFS_RPC_C}" +contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/nvpair/nvpair_alloc_spl.c optional zfs compile-with "${ZFS_C}" + +#zfs platform compatibility code +contrib/openzfs/module/os/freebsd/zfs/abd_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/arc_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/crypto_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/dmu_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/hkdf.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/kmod_core.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/spa_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/spa_stats.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c optional zfs compile-with "${ZFS_C} -include $S/modules/zfs/zfs_config.h" +contrib/openzfs/module/os/freebsd/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}" 
+contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zvol_os.c optional zfs compile-with "${ZFS_C}" + +#zfs unicode support +contrib/openzfs/module/unicode/uconv.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/unicode/u8_textprep.c optional zfs compile-with "${ZFS_C}" + +#zfs checksums / zcommon +contrib/openzfs/module/zcommon/cityhash.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfeature_common.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_comutil.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_deleg.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_namecheck.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_prop.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zpool_prop.c optional zfs compile-with "${ZFS_C}" 
+contrib/openzfs/module/zcommon/zprop_common.c optional zfs compile-with "${ZFS_C}" + +#zfs core common code +contrib/openzfs/module/zfs/abd.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/aggsum.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/arc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/blkptr.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bplist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bpobj.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bptree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/btree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bqueue.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dbuf.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dataset_kstats.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/ddt.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_recv.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_redact.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \ + warning 
"kernel contains CDDL licensed ZFS filesystem" +contrib/openzfs/module/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_crypt.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_dir.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/fm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/gzip.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/lzjb.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/lz4.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/metaslab.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/mmp.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/multilist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/objlist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/pathname.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/range_tree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/refcount.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/sa.c optional zfs 
compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/sha256.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_boot.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_checkpoint.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_config.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_history.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_log_spacemap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/space_map.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/txg.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/uberblock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/unique.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_initialize.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}" 
+contrib/openzfs/module/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_rebuild.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_trim.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_set.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfeature.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_quota.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_ratelimit.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_replay.c optional zfs 
compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zstd/zfs_zstd.c optional zfs zstdio compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zil.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zle.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zrlock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zthr.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zvol.c optional zfs compile-with "${ZFS_C}" + # dtrace specific cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c optional dtrace compile-with "${DTRACE_C}" \ warning "kernel contains CDDL licensed DTRACE" diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 3d9d52eb99fe..f9e6d0925820 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -463,3 +463,13 @@ x86/x86/msi.c optional pci x86/xen/pv.c optional xenhvm x86/xen/pvcpu_enum.c optional xenhvm x86/xen/xen_pci_bus.c optional xenhvm + +contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_intel.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_sse.c optional zfs compile-with "${ZFS_C}" + +contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c optional 
zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c optional zfs compile-with "${ZFS_C}" diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index 16910042af8b..28ecab86a6ea 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -208,34 +208,82 @@ ZSTD_C= ${CC} -c -DZSTD_HEAPMODE=1 -I$S/contrib/zstd/lib/freebsd ${CFLAGS} -I$S/ ZSTD_DECOMPRESS_BLOCK_FLAGS= -fno-tree-vectorize .endif +ZINCDIR=$S/contrib/openzfs/include # Common for dtrace / zfs -CDDL_CFLAGS= -DFREEBSD_NAMECACHE -nostdinc -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S -I$S/cddl/contrib/opensolaris/common ${CFLAGS} -Wno-unknown-pragmas -Wno-missing-prototypes -Wno-undef -Wno-strict-prototypes -Wno-cast-qual -Wno-parentheses -Wno-redundant-decls -Wno-missing-braces -Wno-uninitialized -Wno-unused -Wno-inline -Wno-switch -Wno-pointer-arith -Wno-unknown-pragmas -CDDL_CFLAGS+= -include $S/cddl/compat/opensolaris/sys/debug_compat.h +CDDL_CFLAGS= \ + -DFREEBSD_NAMECACHE \ + -D_SYS_VMEM_H_ \ + -D__KERNEL \ + -D__KERNEL__ \ + -nostdinc \ + -include $S/modules/zfs/static_ccompile.h \ + -I${ZINCDIR} \ + -I${ZINCDIR}/spl \ + -I${ZINCDIR}/os/freebsd \ + -I${ZINCDIR}/os/freebsd/spl \ + -I${ZINCDIR}/os/freebsd/zfs \ + -I$S/modules/zfs \ + -I$S/contrib/openzfs/module/zstd/include \ + -I$S/contrib/openzfs/module/zstd/lib/freebsd/ \ + ${CFLAGS} \ + -Wno-unknown-pragmas \ + -Wno-missing-prototypes \ + -Wno-undef \ + -Wno-strict-prototypes \ + -Wno-cast-qual \ + -Wno-parentheses \ + -Wno-redundant-decls \ + -Wno-missing-braces \ + -Wno-uninitialized \ + -Wno-unused \ + -Wno-inline \ + -Wno-switch \ + -Wno-pointer-arith \ + -Wno-unknown-pragmas \ + -Wno-duplicate-decl-specifier \ + -include ${ZINCDIR}/os/freebsd/spl/sys/ccompile.h \ + -I$S/cddl/contrib/opensolaris/uts/common \ + -I$S -I$S/cddl/compat/opensolaris CDDL_C= ${CC} -c ${CDDL_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} # Special flags for managing the compat compiles for ZFS -ZFS_CFLAGS= 
-DBUILDING_ZFS -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs -ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs/lua -ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/zmod -ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/lz4 -ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/zfs -ZFS_CFLAGS+= ${CDDL_CFLAGS} +ZFS_CFLAGS+= ${CDDL_CFLAGS} -DBUILDING_ZFS -DHAVE_UIO_ZEROCOPY \ + -DWITH_NETDUMP -D__KERNEL__ -D_SYS_CONDVAR_H_ -DSMP \ + -DIN_FREEBSD_BASE -DHAVE_KSID + +.if ${MACHINE_ARCH} == "amd64" +ZFS_CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F \ + -DHAVE_SSSE3 -DHAVE_AVX512BW +.endif + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +ZFS_CFLAGS+= -DBITS_PER_LONG=32 +.else +ZFS_CFLAGS+= -DBITS_PER_LONG=64 +.endif + + ZFS_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${ZFS_CFLAGS} ZFS_C= ${CC} -c ${ZFS_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} +ZFS_RPC_C= ${CC} -c ${ZFS_CFLAGS} -DHAVE_RPC_TYPES ${WERROR} ${PROF} ${.IMPSRC} ZFS_S= ${CC} -c ${ZFS_ASM_CFLAGS} ${WERROR} ${.IMPSRC} + + # Special flags for managing the compat compiles for DTrace DTRACE_CFLAGS= -DBUILDING_DTRACE ${CDDL_CFLAGS} -I$S/cddl/dev/dtrace -I$S/cddl/dev/dtrace/${MACHINE_CPUARCH} .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/intel -I$S/cddl/dev/dtrace/x86 .endif -DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP +DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP -I$S/cddl/compat/opensolaris +DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common DTRACE_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${DTRACE_CFLAGS} DTRACE_C= ${CC} -c ${DTRACE_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} DTRACE_S= ${CC} -c ${DTRACE_ASM_CFLAGS} ${WERROR} ${.IMPSRC} # Special flags for managing the compat compiles for DTrace/FBT -FBT_CFLAGS= -DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt 
-I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S ${CDDL_CFLAGS} +FBT_CFLAGS= -DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt ${CDDL_CFLAGS} -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" FBT_CFLAGS+= -I$S/cddl/dev/fbt/x86 .endif diff --git a/sys/conf/kmod.mk b/sys/conf/kmod.mk index bb1baa82f58d..e60e49dc9eb5 100644 --- a/sys/conf/kmod.mk +++ b/sys/conf/kmod.mk @@ -532,6 +532,22 @@ OBJS_DEPEND_GUESS+= ${SRCS:M*.h} OBJS_DEPEND_GUESS+= opt_global.h .endif +ZINCDIR=${SYSDIR}/contrib/openzfs/include +OPENZFS_CFLAGS= \ + -D_SYS_VMEM_H_ \ + -D__KERNEL__ \ + -nostdinc \ + -DSMP \ + -I${ZINCDIR} \ + -I${ZINCDIR}/spl \ + -I${ZINCDIR}/os/freebsd \ + -I${ZINCDIR}/os/freebsd/spl \ + -I${ZINCDIR}/os/freebsd/zfs \ + -I${SYSDIR}/cddl/compat/opensolaris \ + -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ + -include ${ZINCDIR}/os/freebsd/spl/sys/ccompile.h + + .include .include .include diff --git a/sys/modules/Makefile b/sys/modules/Makefile index dc4e04736670..9b0bd33df49c 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -563,7 +563,7 @@ _mpr= mpr SUBDIR+= tests .endif -.if ${MK_ZFS} != "no" || defined(ALL_MODULES) +.if ${MK_ZFS} != "no" || (defined(ALL_MODULES) && ${MACHINE_CPUARCH} != "powerpc") SUBDIR+= zfs .endif diff --git a/sys/modules/dtrace/dtaudit/Makefile b/sys/modules/dtrace/dtaudit/Makefile index aea5bd590ac6..72cdf03f4bc3 100644 --- a/sys/modules/dtrace/dtaudit/Makefile +++ b/sys/modules/dtrace/dtaudit/Makefile @@ -8,9 +8,7 @@ KMOD= dtaudit SRCS= audit_dtrace.c \ vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include diff --git a/sys/modules/dtrace/dtmalloc/Makefile b/sys/modules/dtrace/dtmalloc/Makefile index d43d302a53b8..910f8f360e80 100644 --- a/sys/modules/dtrace/dtmalloc/Makefile +++ 
b/sys/modules/dtrace/dtmalloc/Makefile @@ -8,9 +8,7 @@ KMOD= dtmalloc SRCS= dtmalloc.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include diff --git a/sys/modules/dtrace/dtnfscl/Makefile b/sys/modules/dtrace/dtnfscl/Makefile index 3b6134348019..6184ad183fc6 100644 --- a/sys/modules/dtrace/dtnfscl/Makefile +++ b/sys/modules/dtrace/dtnfscl/Makefile @@ -8,9 +8,7 @@ KMOD= dtnfscl SRCS= nfs_clkdtrace.c \ vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include diff --git a/sys/modules/dtrace/dtrace/Makefile b/sys/modules/dtrace/dtrace/Makefile index 84960728aaac..0bedcaa12fe7 100644 --- a/sys/modules/dtrace/dtrace/Makefile +++ b/sys/modules/dtrace/dtrace/Makefile @@ -20,9 +20,11 @@ SRCS= dtrace.c \ .PATH: ${SYSDIR}/cddl/dev/dtrace/x86 SRCS+= dis_tables.c \ instr_size.c -CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel \ - -I${SYSDIR}/cddl/dev/dtrace/x86 +CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel \ + -I${SYSDIR}/cddl/dev/dtrace/x86 + .endif +CFLAGS+= ${OPENZFS_CFLAGS} SRCS+= bus_if.h device_if.h vnode_if.h @@ -56,7 +58,7 @@ dtrace_asm.o: assym.inc .include CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h - +CFLAGS.dtrace_asm.S+= -D_SYS_ERRNO_H_ -D_SYS_PARAM_H_ -DLOCORE CWARNFLAGS+= -Wno-parentheses CWARNFLAGS+= -Wno-uninitialized CWARNFLAGS+= -Wno-cast-qual diff --git a/sys/modules/dtrace/fasttrap/Makefile b/sys/modules/dtrace/fasttrap/Makefile index 4f1ecc4839be..52fe1ef46e60 100644 --- a/sys/modules/dtrace/fasttrap/Makefile +++ b/sys/modules/dtrace/fasttrap/Makefile @@ -6,12 +6,10 @@ SYSDIR?= ${SRCTOP}/sys KMOD= fasttrap SRCS= fasttrap.c fasttrap_isa.c -SRCS+= vnode_if.h +SRCS+= vnode_if.h opt_global.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - 
-I${SYSDIR}/cddl/contrib/opensolaris/uts/common/dtrace \ - -I${SYSDIR} +CFLAGS+= -include ${.OBJDIR}/opt_global.h +CFLAGS+= ${OPENZFS_CFLAGS} .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel diff --git a/sys/modules/dtrace/fbt/Makefile b/sys/modules/dtrace/fbt/Makefile index 68156981d077..288c8cafa817 100644 --- a/sys/modules/dtrace/fbt/Makefile +++ b/sys/modules/dtrace/fbt/Makefile @@ -8,6 +8,7 @@ KMOD= fbt SRCS= fbt.c fbt_isa.c SRCS+= vnode_if.h + .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" CFLAGS+= -I${SYSDIR}/cddl/dev/fbt/x86 .PATH: ${SYSDIR}/cddl/dev/fbt/x86 @@ -16,10 +17,8 @@ CFLAGS+= -I${SYSDIR}/cddl/dev/fbt/${MACHINE_CPUARCH} .PATH: ${SYSDIR}/cddl/dev/fbt/${MACHINE_CPUARCH} .endif -CFLAGS+= -I${SYSDIR}/cddl/dev/fbt \ - -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -I${SYSDIR}/cddl/dev/fbt .include diff --git a/sys/modules/dtrace/profile/Makefile b/sys/modules/dtrace/profile/Makefile index c33427564880..aa36f9a2dfba 100644 --- a/sys/modules/dtrace/profile/Makefile +++ b/sys/modules/dtrace/profile/Makefile @@ -8,9 +8,7 @@ KMOD= profile SRCS= profile.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include diff --git a/sys/modules/dtrace/prototype/Makefile b/sys/modules/dtrace/prototype/Makefile index 83ec176d0306..476c567a4dc0 100644 --- a/sys/modules/dtrace/prototype/Makefile +++ b/sys/modules/dtrace/prototype/Makefile @@ -8,9 +8,7 @@ KMOD= prototype SRCS= prototype.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include diff --git a/sys/modules/dtrace/sdt/Makefile b/sys/modules/dtrace/sdt/Makefile index 79c0e5f0f383..2f6432e4a71f 100644 --- 
a/sys/modules/dtrace/sdt/Makefile +++ b/sys/modules/dtrace/sdt/Makefile @@ -8,10 +8,7 @@ KMOD= sdt SRCS= sdt.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include - CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h diff --git a/sys/modules/dtrace/systrace/Makefile b/sys/modules/dtrace/systrace/Makefile index 0c682f4e3d5e..3e122f70da25 100644 --- a/sys/modules/dtrace/systrace/Makefile +++ b/sys/modules/dtrace/systrace/Makefile @@ -8,10 +8,7 @@ KMOD= systrace SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common/dtrace \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include diff --git a/sys/modules/dtrace/systrace_freebsd32/Makefile b/sys/modules/dtrace/systrace_freebsd32/Makefile index e20f59f27e53..4661633f9a62 100644 --- a/sys/modules/dtrace/systrace_freebsd32/Makefile +++ b/sys/modules/dtrace/systrace_freebsd32/Makefile @@ -8,9 +8,8 @@ KMOD= systrace_freebsd32 SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} -DFREEBSD32_SYSTRACE +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -DFREEBSD32_SYSTRACE .include diff --git a/sys/modules/dtrace/systrace_linux/Makefile b/sys/modules/dtrace/systrace_linux/Makefile index 037dd3d03849..7dbd88ffb5f0 100644 --- a/sys/modules/dtrace/systrace_linux/Makefile +++ b/sys/modules/dtrace/systrace_linux/Makefile @@ -9,9 +9,8 @@ KMOD= systrace_linux SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} -DLINUX_SYSTRACE +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -DLINUX_SYSTRACE .include diff --git a/sys/modules/dtrace/systrace_linux32/Makefile b/sys/modules/dtrace/systrace_linux32/Makefile index 
52a71100bc04..81aa1b7de9df 100644 --- a/sys/modules/dtrace/systrace_linux32/Makefile +++ b/sys/modules/dtrace/systrace_linux32/Makefile @@ -9,9 +9,8 @@ KMOD= systrace_linux32 SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} -DLINUX32_SYSTRACE +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -DLINUX32_SYSTRACE .include diff --git a/sys/modules/opensolaris/Makefile b/sys/modules/opensolaris/Makefile index 3c8fa5794b39..1b5dd6dffcfb 100644 --- a/sys/modules/opensolaris/Makefile +++ b/sys/modules/opensolaris/Makefile @@ -3,14 +3,16 @@ SYSDIR?= ${SRCTOP}/sys .PATH: ${SYSDIR}/cddl/compat/opensolaris/kern +.PATH: ${SYSDIR}/contrib/openzfs/module/os/freebsd/spl KMOD= opensolaris -SRCS= opensolaris.c \ - opensolaris_cmn_err.c \ - opensolaris_kmem.c \ - opensolaris_misc.c \ +SRCS= vnode_if.h \ + opensolaris.c \ opensolaris_proc.c \ - opensolaris_sunddi.c + spl_cmn_err.c \ + spl_kmem.c \ + spl_misc.c \ + spl_sunddi.c _A=${SYSDIR}/cddl/contrib/opensolaris/common/atomic .if exists(${_A}/${MACHINE_CPUARCH}/opensolaris_atomic.S) @@ -23,9 +25,7 @@ SRCS+= opensolaris_atomic.S SRCS+= opensolaris_atomic.c .endif -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} EXPORT_SYMS= cpu_core diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile index efde78f28722..5c020620e32b 100644 --- a/sys/modules/zfs/Makefile +++ b/sys/modules/zfs/Makefile @@ -1,118 +1,344 @@ # $FreeBSD$ -SYSDIR?=${SRCTOP}/sys +SRCDIR=${SRCTOP}/sys/contrib/openzfs/module +INCDIR=${SRCTOP}/sys/contrib/openzfs/include KMOD= zfs -SRCS= bus_if.h device_if.h vnode_if.h opt_kstack_pages.h +.PATH: ${SRCDIR}/avl \ + ${SRCDIR}/lua \ + ${SRCDIR}/nvpair \ + ${SRCDIR}/os/freebsd/spl \ + ${SRCDIR}/os/freebsd/zfs \ + ${SRCDIR}/unicode \ + ${SRCDIR}/zcommon \ + ${SRCDIR}/zfs \ + ${SRCDIR}/zstd \ + ${SRCDIR}/zstd/lib -SUNW= 
${SYSDIR}/cddl/contrib/opensolaris -.PATH: ${SUNW}/common/acl -SRCS+= acl_common.c -.PATH: ${SUNW}/common/avl -SRCS+= avl.c -.PATH: ${SUNW}/common/nvpair -SRCS+= opensolaris_nvpair.c -SRCS+= opensolaris_nvpair_alloc_fixed.c -SRCS+= opensolaris_fnvpair.c -.PATH: ${SYSDIR}/cddl/contrib/opensolaris/common/unicode -SRCS+= u8_textprep.c -.PATH: ${SUNW}/common/lz4 -SRCS+= lz4.c +CFLAGS+= -I${INCDIR} +CFLAGS+= -I${INCDIR}/spl +CFLAGS+= -I${INCDIR}/os/freebsd +CFLAGS+= -I${INCDIR}/os/freebsd/spl +CFLAGS+= -I${INCDIR}/os/freebsd/zfs +CFLAGS+= -I${SRCDIR}/zstd/include +CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/static_ccompile.h +CFLAGS+= -I${.CURDIR} -.PATH: ${SYSDIR}/cddl/compat/opensolaris/kern -SRCS+= opensolaris_acl.c -SRCS+= opensolaris_dtrace.c -SRCS+= opensolaris_kobj.c -SRCS+= opensolaris_kstat.c -SRCS+= opensolaris_lookup.c -SRCS+= opensolaris_policy.c -SRCS+= opensolaris_string.c -SRCS+= opensolaris_sysevent.c -SRCS+= opensolaris_taskq.c -SRCS+= opensolaris_uio.c -SRCS+= opensolaris_vfs.c -SRCS+= opensolaris_vm.c -SRCS+= opensolaris_zone.c +CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ + -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \ + -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DIN_FREEBSD_BASE -DHAVE_KSID -_A=${SYSDIR}/cddl/contrib/opensolaris/common/atomic -.if exists(${_A}/${MACHINE_CPUARCH}/opensolaris_atomic.S) -.PATH: ${_A}/${MACHINE_CPUARCH} -SRCS+= opensolaris_atomic.S -.elif exists(${_A}/${MACHINE_ARCH}/opensolaris_atomic.S) -.PATH: ${_A}/${MACHINE_ARCH} -SRCS+= opensolaris_atomic.S -.else -SRCS+= opensolaris_atomic.c +.if ${MACHINE_ARCH} == "amd64" +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_AVX512BW -DHAVE_SSSE3 .endif -.PATH: ${SUNW}/uts/common/fs -SRCS+= vnode.c - -.PATH: ${SUNW}/uts/common/os -SRCS+= callb.c -SRCS+= fm.c -SRCS+= list.c -SRCS+= nvpair_alloc_system.c - -.PATH: ${SUNW}/uts/common/zmod -SRCS+= 
zmod.c - -.PATH: ${SYSDIR}/crypto/sha2 -SRCS+= sha256c.c sha512c.c - -.PATH: ${SYSDIR}/crypto/skein -SRCS+= skein.c skein_block.c - -.PATH: ${SUNW}/common/zfs -.include "${SUNW}/uts/common/Makefile.files" -.PATH: ${SUNW}/uts/common/fs/zfs -ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/} -SRCS+= ${ZFS_SRCS} -SRCS+= vdev_geom.c -SRCS+= trim_map.c -.PATH: ${SUNW}/uts/common/fs/zfs/lua -LUA_SRCS= ${LUA_OBJS:C/.o$/.c/} -SRCS+= ${LUA_SRCS} - -# Use FreeBSD's namecache. -CFLAGS+=-DFREEBSD_NAMECACHE - -CFLAGS+=-I${SYSDIR}/cddl/compat/opensolaris -CFLAGS+=-I${SUNW}/uts/common/fs/zfs -CFLAGS+=-I${SUNW}/uts/common/fs/zfs/lua -CFLAGS+=-I${SUNW}/uts/common/zmod -CFLAGS+=-I${SUNW}/uts/common -CFLAGS+=-I${SYSDIR} -CFLAGS+=-I${SUNW}/common/zfs -CFLAGS+=-I${SUNW}/common/lz4 -CFLAGS+=-I${SUNW}/common -CFLAGS+=-DBUILDING_ZFS -CFLAGS.gcc+=-fms-extensions - -.if ${MACHINE_ARCH} == "powerpc64" -CFLAGS.gcc+=-mminimal-toc +.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true" +# kernel must also be built with this option for this to work +CFLAGS+= -DDEBUG_VFS_LOCKS +.endif + +.if defined(WITH_GCOV) && ${WITH_GCOV} == "true" +CFLAGS+= -fprofile-arcs -ftest-coverage .endif -.ifdef ZFS_DEBUG -CFLAGS+=-DDEBUG=1 DEBUG_FLAGS=-g + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +CFLAGS+= -DBITS_PER_LONG=32 +.else +CFLAGS+= -DBITS_PER_LONG=64 .endif +SRCS= vnode_if.h device_if.h bus_if.h + +# avl +SRCS+= avl.c + +#lua +SRCS+= lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +#nvpair +SRCS+= nvpair.c \ + fnvpair.c \ + nvpair_alloc_spl.c \ + nvpair_alloc_fixed.c + +#os/freebsd/spl +SRCS+= acl_common.c \ + btree.c \ + callb.c \ + list.c \ + spl_acl.c \ + spl_cmn_err.c \ + spl_dtrace.c \ + spl_kmem.c \ + spl_kstat.c \ 
+ spl_misc.c \ + spl_policy.c \ + spl_string.c \ + spl_sunddi.c \ + spl_sysevent.c \ + spl_taskq.c \ + spl_uio.c \ + spl_vfs.c \ + spl_vm.c \ + spl_zone.c \ + sha256c.c \ + sha512c.c \ + spl_procfs_list.c \ + spl_zlib.c + + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +SRCS+= spl_atomic.c +.endif + +#os/freebsd/zfs +SRCS+= abd_os.c \ + crypto_os.c \ + dmu_os.c \ + hkdf.c \ + kmod_core.c \ + spa_os.c \ + sysctl_os.c \ + vdev_file.c \ + vdev_label_os.c \ + vdev_geom.c \ + zfs_acl.c \ + zfs_ctldir.c \ + zfs_dir.c \ + zfs_ioctl_compat.c \ + zfs_ioctl_os.c \ + zfs_log.c \ + zfs_replay.c \ + zfs_vfsops.c \ + zfs_vnops.c \ + zfs_znode.c \ + zio_crypt.c \ + zvol_os.c + +#unicode +SRCS+= uconv.c \ + u8_textprep.c + +#zcommon +SRCS+= zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_avx512.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zpool_prop.c \ + zprop_common.c + +#zfs +SRCS+= abd.c \ + aggsum.c \ + arc.c \ + arc_os.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + bptree.c \ + bqueue.c \ + dataset_kstats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_dataset.c \ + dsl_deadlist.c \ + dsl_deleg.c \ + dsl_bookmark.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_destroy.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_userhold.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + 
spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + uberblock.c \ + unique.c \ + vdev.c \ + vdev_cache.c \ + vdev_indirect.c \ + vdev_indirect_births.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + vdev_rebuild.c \ + vdev_removal.c \ + vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_debug.c \ + zfs_file_os.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_ioctl.c \ + zfs_onexit.c \ + zfs_quota.c \ + zfs_ratelimit.c \ + zfs_rlock.c \ + zfs_sa.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c \ + zvol.c + +SRCS+= zfs_zstd.c \ + zstd.c + .include -CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h -CWARNFLAGS+=-Wno-missing-prototypes -CWARNFLAGS+=-Wno-undef -CWARNFLAGS+=-Wno-strict-prototypes -CWARNFLAGS+=-Wno-cast-qual -CWARNFLAGS+=-Wno-parentheses -CWARNFLAGS+=-Wno-redundant-decls -CWARNFLAGS+=-Wno-missing-braces -CWARNFLAGS+=-Wno-uninitialized -CWARNFLAGS+=-Wno-unused -CWARNFLAGS+=-Wno-inline -CWARNFLAGS+=-Wno-switch -CWARNFLAGS+=-Wno-pointer-arith +CFLAGS.gcc+= -Wno-pointer-to-int-cast + +CFLAGS.lapi.c= -Wno-cast-qual +CFLAGS.lcompat.c= -Wno-cast-qual +CFLAGS.lobject.c= -Wno-cast-qual +CFLAGS.ltable.c= -Wno-cast-qual +CFLAGS.lvm.c= -Wno-cast-qual +CFLAGS.nvpair.c= -Wno-cast-qual -DHAVE_RPC_TYPES +CFLAGS.spl_string.c= -Wno-cast-qual +CFLAGS.spl_vm.c= -Wno-cast-qual +CFLAGS.spl_zlib.c= -Wno-cast-qual +CFLAGS.abd.c= -Wno-cast-qual +CFLAGS.zfs_log.c= -Wno-cast-qual +CFLAGS.zfs_vnops.c= -Wno-pointer-arith +CFLAGS.u8_textprep.c= -Wno-cast-qual 
+CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zprop_common.c= -Wno-cast-qual +CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.dmu.c= -Wno-cast-qual +CFLAGS.dmu_traverse.c= -Wno-cast-qual +CFLAGS.dsl_dir.c= -Wno-cast-qual +CFLAGS.dsl_deadlist.c= -Wno-cast-qual +CFLAGS.dsl_prop.c= -Wno-cast-qual +CFLAGS.fm.c= -Wno-cast-qual +CFLAGS.lz4.c= -Wno-cast-qual +CFLAGS.spa.c= -Wno-cast-qual +CFLAGS.spa_misc.c= -Wno-cast-qual +CFLAGS.sysctl_os.c= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS.vdev_raidz.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.zap_leaf.c= -Wno-cast-qual +CFLAGS.zap_micro.c= -Wno-cast-qual +CFLAGS.zcp.c= -Wno-cast-qual +CFLAGS.zfs_fm.c= -Wno-cast-qual +CFLAGS.zfs_ioctl.c= -Wno-cast-qual +CFLAGS.zil.c= -Wno-cast-qual +CFLAGS.zio.c= -Wno-cast-qual +CFLAGS.zrlock.c= -Wno-cast-qual +CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zstd.c= -fno-tree-vectorize +.if ${MACHINE_CPUARCH} == "aarch64" +CFLAGS.zstd.c+= -include ${SRCDIR}/zstd/include/aarch64_compat.h +.endif diff --git a/sys/modules/zfs/static_ccompile.h b/sys/modules/zfs/static_ccompile.h new file mode 100644 index 000000000000..8c3b89a8aa90 --- /dev/null +++ b/sys/modules/zfs/static_ccompile.h @@ -0,0 +1,29 @@ +/* + * $FreeBSD$ + */ + +#ifndef _SPL_NVLIST_H_ +#define _SPL_NVLIST_H_ + +#ifdef INVARIANTS +#define ZFS_DEBUG +#endif + +#define nvlist_add_nvlist spl_nvlist_add_nvlist +#define nvlist_add_nvlist_array spl_nvlist_add_nvlist_array +#define nvlist_add_nvpair 
spl_nvlist_add_nvpair +#define nvlist_add_string spl_nvlist_add_string +#define nvlist_add_string_array spl_nvlist_add_string_array +#define nvlist_empty spl_nvlist_empty +#define nvlist_exists spl_nvlist_exists +#define nvlist_free spl_nvlist_free +#define nvlist_next_nvpair spl_nvlist_next_nvpair +#define nvlist_pack spl_nvlist_pack +#define nvlist_prev_nvpair spl_nvlist_prev_nvpair +#define nvlist_remove_nvpair spl_nvlist_remove_nvpair +#define nvlist_size spl_nvlist_size +#define nvlist_unpack spl_nvlist_unpack + +#define nvpair_type spl_nvpair_type +#define nvpair_name spl_nvpair_name +#endif diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h new file mode 100644 index 000000000000..6d73d1b8a341 --- /dev/null +++ b/sys/modules/zfs/zfs_config.h @@ -0,0 +1,711 @@ +/* + * $FreeBSD$ + */ + +/* zfs_config.h. Generated from zfs_config.h.in by configure. */ +/* zfs_config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to 1 if translation of program messages to the user's native + language is requested. 
*/ +/* #undef ENABLE_NLS */ + +/* bio_end_io_t wants 1 arg */ +/* #undef HAVE_1ARG_BIO_END_IO_T */ + +/* lookup_bdev() wants 1 arg */ +/* #undef HAVE_1ARG_LOOKUP_BDEV */ + +/* submit_bio() wants 1 arg */ +/* #undef HAVE_1ARG_SUBMIT_BIO */ + +/* bdi_setup_and_register() wants 2 args */ +/* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ + +/* lookup_bdev() wants 2 args */ +/* #undef HAVE_2ARGS_LOOKUP_BDEV */ + +/* vfs_getattr wants 2 args */ +/* #undef HAVE_2ARGS_VFS_GETATTR */ + +/* zlib_deflate_workspacesize() wants 2 args */ +/* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ + +/* bdi_setup_and_register() wants 3 args */ +/* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ + +/* vfs_getattr wants 3 args */ +/* #undef HAVE_3ARGS_VFS_GETATTR */ + +/* vfs_getattr wants 4 args */ +/* #undef HAVE_4ARGS_VFS_GETATTR */ + +/* kernel has access_ok with 'type' parameter */ +/* #undef HAVE_ACCESS_OK_TYPE */ + +/* posix_acl has refcount_t */ +/* #undef HAVE_ACL_REFCOUNT */ + +/* Define if host toolchain supports AES */ +#define HAVE_AES 1 + +#ifdef __amd64__ +#ifndef RESCUE +/* Define if host toolchain supports AVX */ +#define HAVE_AVX 1 +#endif + +/* Define if host toolchain supports AVX2 */ +#define HAVE_AVX2 1 + +/* Define if host toolchain supports AVX512BW */ +#define HAVE_AVX512BW 1 + +/* Define if host toolchain supports AVX512CD */ +#define HAVE_AVX512CD 1 + +/* Define if host toolchain supports AVX512DQ */ +#define HAVE_AVX512DQ 1 + +/* Define if host toolchain supports AVX512ER */ +#define HAVE_AVX512ER 1 + +/* Define if host toolchain supports AVX512F */ +#define HAVE_AVX512F 1 + +/* Define if host toolchain supports AVX512IFMA */ +#define HAVE_AVX512IFMA 1 + +/* Define if host toolchain supports AVX512PF */ +#define HAVE_AVX512PF 1 + +/* Define if host toolchain supports AVX512VBMI */ +#define HAVE_AVX512VBMI 1 + +/* Define if host toolchain supports AVX512VL */ +#define HAVE_AVX512VL 1 +#endif + +/* bio->bi_opf is defined */ +/* #undef HAVE_BIO_BI_OPF */ + +/* 
bio->bi_status exists */ +/* #undef HAVE_BIO_BI_STATUS */ + +/* bio has bi_iter */ +/* #undef HAVE_BIO_BVEC_ITER */ + +/* bio_set_dev() is available */ +/* #undef HAVE_BIO_SET_DEV */ + +/* bio_set_dev() GPL-only */ +/* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ + +/* bio_set_op_attrs is available */ +/* #undef HAVE_BIO_SET_OP_ATTRS */ + +/* blkdev_reread_part() exists */ +/* #undef HAVE_BLKDEV_REREAD_PART */ + +/* blkg_tryget() is available */ +/* #undef HAVE_BLKG_TRYGET */ + +/* blkg_tryget() GPL-only */ +/* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ + +/* blk_alloc_queue() expects request function */ +/* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ + +/* blk queue backing_dev_info is dynamic */ +/* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ + +/* blk_queue_flag_clear() exists */ +/* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ + +/* blk_queue_flag_set() exists */ +/* #undef HAVE_BLK_QUEUE_FLAG_SET */ + +/* blk_queue_flush() is available */ +/* #undef HAVE_BLK_QUEUE_FLUSH */ + +/* blk_queue_flush() is GPL-only */ +/* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ + +/* blk_queue_secdiscard() is available */ +/* #undef HAVE_BLK_QUEUE_SECDISCARD */ + +/* blk_queue_secure_erase() is available */ +/* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ + +/* blk_queue_write_cache() exists */ +/* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ + +/* blk_queue_write_cache() is GPL-only */ +/* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ + +/* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the + CoreFoundation framework. */ +/* #undef HAVE_CFLOCALECOPYCURRENT */ + +/* Define to 1 if you have the Mac OS X function + CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ +/* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ + +/* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in + the CoreFoundation framework. 
*/ +/* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ + +/* clear_inode() is available */ +/* #undef HAVE_CLEAR_INODE */ + +/* dentry uses const struct dentry_operations */ +/* #undef HAVE_CONST_DENTRY_OPERATIONS */ + +/* current_time() exists */ +/* #undef HAVE_CURRENT_TIME */ + +/* Define if the GNU dcgettext() function is already present or preinstalled. + */ +/* #undef HAVE_DCGETTEXT */ + +/* DECLARE_EVENT_CLASS() is available */ +/* #undef HAVE_DECLARE_EVENT_CLASS */ + +/* sops->dirty_inode() wants flags */ +/* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* d_make_root() is available */ +/* #undef HAVE_D_MAKE_ROOT */ + +/* d_prune_aliases() is available */ +/* #undef HAVE_D_PRUNE_ALIASES */ + +/* dops->d_revalidate() operation takes nameidata */ +/* #undef HAVE_D_REVALIDATE_NAMEIDATA */ + +/* eops->encode_fh() wants child and parent inodes */ +/* #undef HAVE_ENCODE_FH_WITH_INODE */ + +/* sops->evict_inode() exists */ +/* #undef HAVE_EVICT_INODE */ + +/* fops->aio_fsync() exists */ +/* #undef HAVE_FILE_AIO_FSYNC */ + +/* file_dentry() is available */ +/* #undef HAVE_FILE_DENTRY */ + +/* file_inode() is available */ +/* #undef HAVE_FILE_INODE */ + +/* iops->follow_link() cookie */ +/* #undef HAVE_FOLLOW_LINK_COOKIE */ + +/* iops->follow_link() nameidata */ +/* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ + +/* fops->fsync() with range */ +/* #undef HAVE_FSYNC_RANGE */ + +/* fops->fsync() without dentry */ +/* #undef HAVE_FSYNC_WITHOUT_DENTRY */ + +/* generic_start_io_acct()/generic_end_io_acct() available */ +/* #undef HAVE_GENERIC_IO_ACCT_3ARG */ + +/* generic_start_io_acct()/generic_end_io_acct() 4 arg available */ +/* #undef HAVE_GENERIC_IO_ACCT_4ARG */ + +/* generic_readlink is global */ +/* #undef HAVE_GENERIC_READLINK */ + +/* generic_setxattr() exists */ +/* #undef HAVE_GENERIC_SETXATTR */ + +/* generic_write_checks() takes kiocb */ +/* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ + +/* Define if the 
GNU gettext() function is already present or preinstalled. */ +/* #undef HAVE_GETTEXT */ + +/* get_disk_and_module() is available */ +/* #undef HAVE_GET_DISK_AND_MODULE */ + +/* iops->get_link() cookie */ +/* #undef HAVE_GET_LINK_COOKIE */ + +/* iops->get_link() delayed */ +/* #undef HAVE_GET_LINK_DELAYED */ + +/* group_info->gid exists */ +/* #undef HAVE_GROUP_INFO_GID */ + +/* Define if you have the iconv() function and it works. */ +#define HAVE_ICONV 1 + +/* yes */ +/* #undef HAVE_INODE_LOCK_SHARED */ + +/* inode_set_flags() exists */ +/* #undef HAVE_INODE_SET_FLAGS */ + +/* inode_set_iversion() exists */ +/* #undef HAVE_INODE_SET_IVERSION */ + +/* inode->i_*time's are timespec64 */ +/* #undef HAVE_INODE_TIMESPEC64_TIMES */ + +/* timestamp_truncate() exists */ +/* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* in_compat_syscall() is available */ +/* #undef HAVE_IN_COMPAT_SYSCALL */ + +/* yes */ +/* #undef HAVE_IO_SCHEDULE_TIMEOUT */ + +/* Define to 1 if you have the `issetugid' function. 
*/ +#define HAVE_ISSETUGID 1 + +/* kernel has kernel_fpu_* functions */ +/* #undef HAVE_KERNEL_FPU */ + +/* kernel has asm/fpu/api.h */ +/* #undef HAVE_KERNEL_FPU_API_HEADER */ + +/* kernel fpu internal */ +/* #undef HAVE_KERNEL_FPU_INTERNAL */ + +/* uncached_acl_sentinel() exists */ +/* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ + +/* kernel does stack verification */ +/* #undef HAVE_KERNEL_OBJTOOL */ + +/* kernel_read() take loff_t pointer */ +/* #undef HAVE_KERNEL_READ_PPOS */ + +/* timer_list.function gets a timer_list */ +/* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ + +/* struct timer_list has a flags member */ +/* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ + +/* timer_setup() is available */ +/* #undef HAVE_KERNEL_TIMER_SETUP */ + +/* kernel_write() take loff_t pointer */ +/* #undef HAVE_KERNEL_WRITE_PPOS */ + +/* kmem_cache_create_usercopy() exists */ +/* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ + +/* kstrtoul() exists */ +/* #undef HAVE_KSTRTOUL */ + +/* ktime_get_coarse_real_ts64() exists */ +/* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ + +/* ktime_get_raw_ts64() exists */ +/* #undef HAVE_KTIME_GET_RAW_TS64 */ + +/* kvmalloc exists */ +/* #undef HAVE_KVMALLOC */ + +/* kernel has large stacks */ +/* #undef HAVE_LARGE_STACKS */ + +/* Define if you have libaio */ +/* #undef HAVE_LIBAIO */ + +/* Define if you have libblkid */ +/* #undef HAVE_LIBBLKID */ + +/* Define if you have libssl */ +#define HAVE_LIBSSL 1 + +/* Define to 1 if you have the `tirpc' library (-ltirpc). 
*/ +/* #undef HAVE_LIBTIRPC */ + +/* Define if you have libudev */ +/* #undef HAVE_LIBUDEV */ + +/* Define if udev_device_get_is_initialized is available */ +/* #undef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED */ + +/* Define if you have libuuid */ +/* #undef HAVE_LIBUUID */ + +/* lseek_execute() is available */ +/* #undef HAVE_LSEEK_EXECUTE */ + +/* makedev() is declared in sys/mkdev.h */ +/* #undef HAVE_MAKEDEV_IN_MKDEV */ + +/* makedev() is declared in sys/sysmacros.h */ +/* #undef HAVE_MAKEDEV_IN_SYSMACROS */ + +/* Noting that make_request_fn() returns blk_qc_t */ +/* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ + +/* Noting that make_request_fn() returns void */ +/* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* iops->create()/mkdir()/mknod() take umode_t */ +/* #undef HAVE_MKDIR_UMODE_T */ + +/* Define to 1 if you have the `mlockall' function. */ +#define HAVE_MLOCKALL 1 + +/* Define if host toolchain supports MOVBE */ +#define HAVE_MOVBE 1 + +/* new_sync_read()/new_sync_write() are available */ +/* #undef HAVE_NEW_SYNC_READ */ + +/* iops->getattr() takes a path */ +/* #undef HAVE_PATH_IOPS_GETATTR */ + +/* Define if host toolchain supports PCLMULQDQ */ +#define HAVE_PCLMULQDQ 1 + +/* posix_acl_chmod() exists */ +/* #undef HAVE_POSIX_ACL_CHMOD */ + +/* posix_acl_from_xattr() needs user_ns */ +/* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ + +/* posix_acl_release() is available */ +/* #undef HAVE_POSIX_ACL_RELEASE */ + +/* posix_acl_release() is GPL-only */ +/* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ + +/* posix_acl_valid() wants user namespace */ +/* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ + +/* proc_ops structure exists */ +/* #undef HAVE_PROC_OPS_STRUCT */ + +/* iops->put_link() cookie */ +/* #undef HAVE_PUT_LINK_COOKIE */ + +/* iops->put_link() delayed */ +/* #undef HAVE_PUT_LINK_DELAYED */ + +/* iops->put_link() nameidata */ +/* #undef HAVE_PUT_LINK_NAMEIDATA */ + +/* If available, 
contains the Python version number currently in use. */ +#define HAVE_PYTHON "3.7" + +/* qat is enabled and existed */ +/* #undef HAVE_QAT */ + +/* iops->rename() wants flags */ +/* #undef HAVE_RENAME_WANTS_FLAGS */ + +/* REQ_DISCARD is defined */ +/* #undef HAVE_REQ_DISCARD */ + +/* REQ_FLUSH is defined */ +/* #undef HAVE_REQ_FLUSH */ + +/* REQ_OP_DISCARD is defined */ +/* #undef HAVE_REQ_OP_DISCARD */ + +/* REQ_OP_FLUSH is defined */ +/* #undef HAVE_REQ_OP_FLUSH */ + +/* REQ_OP_SECURE_ERASE is defined */ +/* #undef HAVE_REQ_OP_SECURE_ERASE */ + +/* REQ_PREFLUSH is defined */ +/* #undef HAVE_REQ_PREFLUSH */ + +/* struct rw_semaphore has member activity */ +/* #undef HAVE_RWSEM_ACTIVITY */ + +/* struct rw_semaphore has atomic_long_t member count */ +/* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ + +/* linux/sched/signal.h exists */ +/* #undef HAVE_SCHED_SIGNAL_HEADER */ + +/* setattr_prepare() is available */ +/* #undef HAVE_SETATTR_PREPARE */ + +/* iops->set_acl() exists */ +/* #undef HAVE_SET_ACL */ + +/* set_cached_acl() is usable */ +/* #undef HAVE_SET_CACHED_ACL_USABLE */ + +/* struct shrink_control exists */ +/* #undef HAVE_SHRINK_CONTROL_STRUCT */ + +/* new shrinker callback wants 2 args */ +/* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ + +/* ->count_objects exists */ +/* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ + +#if defined(__amd64__) || defined(__i386__) +/* Define if host toolchain supports SSE */ +#define HAVE_SSE 1 + +/* Define if host toolchain supports SSE2 */ +#define HAVE_SSE2 1 + +/* Define if host toolchain supports SSE3 */ +#define HAVE_SSE3 1 + +/* Define if host toolchain supports SSE4.1 */ +#define HAVE_SSE4_1 1 + +/* Define if host toolchain supports SSE4.2 */ +#define HAVE_SSE4_2 1 + +/* Define if host toolchain supports SSSE3 */ +#define HAVE_SSSE3 1 +#endif + +/* STACK_FRAME_NON_STANDARD is defined */ +/* #undef HAVE_STACK_FRAME_NON_STANDARD */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlcat' function. */ +#define HAVE_STRLCAT 1 + +/* Define to 1 if you have the `strlcpy' function. */ +#define HAVE_STRLCPY 1 + +/* super_setup_bdi_name() exits */ +/* #undef HAVE_SUPER_SETUP_BDI_NAME */ + +/* super_block->s_user_ns exists */ +/* #undef HAVE_SUPER_USER_NS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* i_op->tmpfile() exists */ +/* #undef HAVE_TMPFILE */ + +/* totalhigh_pages() exists */ +/* #undef HAVE_TOTALHIGH_PAGES */ + +/* kernel has totalram_pages() */ +/* #undef HAVE_TOTALRAM_PAGES_FUNC */ + +/* kernel has __kernel_fpu_* functions */ +/* #undef HAVE_UNDERSCORE_KERNEL_FPU */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_UNISTD_H 1 + +/* iops->getattr() takes a vfsmount */ +/* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ + +/* aops->direct_IO() uses iovec */ +/* #undef HAVE_VFS_DIRECT_IO_IOVEC */ + +/* aops->direct_IO() uses iov_iter without rw */ +/* #undef HAVE_VFS_DIRECT_IO_ITER */ + +/* aops->direct_IO() uses iov_iter with offset */ +/* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ + +/* aops->direct_IO() uses iov_iter with rw and offset */ +/* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ + +/* fops->iterate() is available */ +/* #undef HAVE_VFS_ITERATE */ + +/* fops->iterate_shared() is available */ +/* #undef HAVE_VFS_ITERATE_SHARED */ + +/* fops->readdir() is available */ +/* #undef HAVE_VFS_READDIR */ + +/* fops->read/write_iter() are available */ +/* #undef HAVE_VFS_RW_ITERATE */ + +/* __vmalloc page flags exists */ +/* #undef HAVE_VMALLOC_PAGE_KERNEL */ + +/* yes */ +/* #undef HAVE_WAIT_ON_BIT_ACTION */ + +/* wait_queue_entry_t exists */ +/* #undef HAVE_WAIT_QUEUE_ENTRY_T */ + +/* wq_head->head and wq_entry->entry exist */ +/* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ + +/* xattr_handler->get() wants dentry */ +/* #undef HAVE_XATTR_GET_DENTRY */ + +/* xattr_handler->get() wants both dentry and inode */ +/* #undef HAVE_XATTR_GET_DENTRY_INODE */ + +/* xattr_handler->get() wants xattr_handler */ +/* #undef HAVE_XATTR_GET_HANDLER */ + +/* xattr_handler has name */ +/* #undef HAVE_XATTR_HANDLER_NAME */ + +/* xattr_handler->list() wants dentry */ +/* #undef HAVE_XATTR_LIST_DENTRY */ + +/* xattr_handler->list() wants xattr_handler */ +/* #undef HAVE_XATTR_LIST_HANDLER */ + +/* xattr_handler->list() wants simple */ +/* #undef HAVE_XATTR_LIST_SIMPLE */ + +/* xattr_handler->set() wants dentry */ +/* #undef HAVE_XATTR_SET_DENTRY */ + +/* xattr_handler->set() wants both dentry and inode */ +/* #undef HAVE_XATTR_SET_DENTRY_INODE */ + +/* xattr_handler->set() wants xattr_handler */ +/* #undef HAVE_XATTR_SET_HANDLER */ + +/* Define if you have zlib */ +#define HAVE_ZLIB 1 + +/* 
__posix_acl_chmod() exists */ +/* #undef HAVE___POSIX_ACL_CHMOD */ + +/* Define as const if the declaration of iconv() needs const. */ +#define ICONV_CONST + +/* kernel exports FPU functions */ +/* #undef KERNEL_EXPORTS_X86_FPU */ + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* make_request_fn() return type */ +/* #undef MAKE_REQUEST_FN_RET */ + +/* hardened module_param_call */ +/* #undef MODULE_PARAM_CALL_CONST */ + +/* struct shrink_control has nid */ +/* #undef SHRINK_CONTROL_HAS_NID */ + +/* Defined for legacy compatibility. */ +#define SPL_META_ALIAS ZFS_META_ALIAS + +/* Defined for legacy compatibility. */ +#define SPL_META_RELEASE ZFS_META_RELEASE + +/* Defined for legacy compatibility. */ +#define SPL_META_VERSION ZFS_META_VERSION + +/* True if ZFS is to be compiled for a FreeBSD system */ +#define SYSTEM_FREEBSD 1 + +/* True if ZFS is to be compiled for a Linux system */ +/* #undef SYSTEM_LINUX */ + +/* zfs debugging enabled */ +/* #define ZFS_DEBUG 1 */ + +/* /dev/zfs minor */ +/* #undef ZFS_DEVICE_MINOR */ + +/* enum node_stat_item contains NR_FILE_PAGES */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ + +/* enum node_stat_item contains NR_INACTIVE_ANON */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ + +/* enum node_stat_item contains NR_INACTIVE_FILE */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ + +/* enum node_stat_item contains NR_SLAB_RECLAIMABLE */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE */ + +/* enum zone_stat_item contains NR_FILE_PAGES */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ + +/* enum zone_stat_item contains NR_INACTIVE_ANON */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ + +/* enum zone_stat_item contains NR_INACTIVE_FILE */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ + +/* enum zone_stat_item contains NR_SLAB_RECLAIMABLE */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_SLAB_RECLAIMABLE */ + +/* 
global_node_page_state() exists */ +/* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ + +/* global_zone_page_state() exists */ +/* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ + +/* Define to 1 if GPL-only symbols can be used */ +/* #undef ZFS_IS_GPL_COMPATIBLE */ + +/* Define the project alias string. */ +#define ZFS_META_ALIAS "zfs-0.8.0-1" + +/* Define the project author. */ +#define ZFS_META_AUTHOR "OpenZFS on Linux" + +/* Define the project release date. */ +/* #undef ZFS_META_DATA */ + +/* Define the maximum compatible kernel version. */ +#define ZFS_META_KVER_MAX "5.6" + +/* Define the minimum compatible kernel version. */ +#define ZFS_META_KVER_MIN "3.10" + +/* Define the project license. */ +#define ZFS_META_LICENSE "CDDL" + +/* Define the libtool library 'age' version information. */ +/* #undef ZFS_META_LT_AGE */ + +/* Define the libtool library 'current' version information. */ +/* #undef ZFS_META_LT_CURRENT */ + +/* Define the libtool library 'revision' version information. */ +/* #undef ZFS_META_LT_REVISION */ + +/* Define the project name. */ +#define ZFS_META_NAME "zfs" + +/* Define the project release. */ +#define ZFS_META_RELEASE "1" + +/* Define the project version. */ +#define ZFS_META_VERSION "0.8.0" + diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h new file mode 100644 index 000000000000..6709f54ffa7b --- /dev/null +++ b/sys/modules/zfs/zfs_gitrev.h @@ -0,0 +1,5 @@ +/* + * $FreeBSD$ + */ + +#define ZFS_META_GITREV "zfs-0.7.0-3175-g184df27ee" diff --git a/sys/vm/vm.h b/sys/vm/vm.h index ca114a9aa166..675f93b62da2 100644 --- a/sys/vm/vm.h +++ b/sys/vm/vm.h @@ -112,7 +112,9 @@ typedef struct vm_object *vm_object_t; * Define it here for "applications" that include vm headers (e.g., * genassym). */ +#ifndef HAVE_BOOLEAN typedef int boolean_t; +#endif /* * The exact set of memory attributes is machine dependent. 
However, diff --git a/tests/sys/cddl/zfs/bin/file_write.c b/tests/sys/cddl/zfs/bin/file_write.c index 1c276589ec37..b1ef836879e6 100644 --- a/tests/sys/cddl/zfs/bin/file_write.c +++ b/tests/sys/cddl/zfs/bin/file_write.c @@ -28,6 +28,7 @@ #pragma ident "@(#)file_write.c 1.4 07/10/09 SMI" #include "file_common.h" +#include #include static unsigned char bigbuffer[BIGBUFFERSIZE]; @@ -180,14 +181,15 @@ main(int argc, char **argv) } noffset = lseek(bigfd, offset, SEEK_SET); if (noffset != offset) { - (void) printf("lseek %s (%lld/%lld) failed [%s]%d.Aborting!\n", + (void) printf("lseek %s (%"PRId64"/%"PRId64") " + "failed [%s]%d. Aborting!\n", filename, offset, noffset, strerror(errno), errno); exit(errno); } if (verbose) { (void) printf("%s: block_size = %d, write_count = %d, " - "offset = %lld, data = %s%d\n", filename, block_size, + "offset = %"PRId64", data = %s%d\n", filename, block_size, write_count, offset, (fillchar == 0) ? "0->" : "", (fillchar == 0) ? DATA_RANGE : fillchar); @@ -197,17 +199,17 @@ main(int argc, char **argv) ssize_t n; if ((n = write(bigfd, &bigbuffer, block_size)) == -1) { - (void) printf("write failed (%ld), good_writes = %lld, " + (void) printf("write failed (%ld), " + "good_writes = %"PRId64", " "error: %s[%d]\n", (long)n, good_writes, - strerror(errno), - errno); + strerror(errno), errno); exit(errno); } good_writes++; } if (verbose) { - (void) printf("Success: good_writes = %lld (%lld)\n", + (void) printf("Success: good_writes = %"PRId64" (%"PRId64")\n", good_writes, (good_writes * block_size)); } diff --git a/tools/boot/rootgen.sh b/tools/boot/rootgen.sh index a42a12fc4420..eb340d96ce83 100755 --- a/tools/boot/rootgen.sh +++ b/tools/boot/rootgen.sh @@ -107,12 +107,12 @@ mk_nogeli_gpt_zfs_legacy() { cpsys ${src} ${mntpt} # need to make a couple of tweaks cat >> ${mntpt}/boot/loader.conf <> ${mntpt}/boot/loader.conf <> ${mntpt}/boot/loader.conf <> ${mntpt}/boot/loader.conf <> ${mntpt}/boot/loader.conf <> ${mntpt}/boot/loader.conf <> 
${mntpt}/boot/loader.conf <> ${mntpt}/boot/loader.conf < ${mntpt}/boot/loader.conf < diff --git a/usr.sbin/fstyp/zfs.c b/usr.sbin/fstyp/zfs.c index 52d05a4df55a..96ff0485d71e 100644 --- a/usr.sbin/fstyp/zfs.c +++ b/usr.sbin/fstyp/zfs.c @@ -28,9 +28,7 @@ __FBSDID("$FreeBSD$"); #include -#include #include -#include #include #include #include